diff options
228 files changed, 12811 insertions, 5333 deletions
diff --git a/Documentation/devicetree/bindings/arm/l2c2x0.txt b/Documentation/devicetree/bindings/arm/l2c2x0.txt index 917199f17965..d9650c1788f4 100644 --- a/Documentation/devicetree/bindings/arm/l2c2x0.txt +++ b/Documentation/devicetree/bindings/arm/l2c2x0.txt @@ -90,6 +90,9 @@ Optional properties: - arm,standby-mode: L2 standby mode enable. Value <0> (forcibly disable), <1> (forcibly enable), property absent (OS specific behavior, preferably retain firmware settings) +- arm,early-bresp-disable : Disable the CA9 optimization Early BRESP (PL310) +- arm,full-line-zero-disable : Disable the CA9 optimization Full line of zero + write (PL310) Example: diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index fd106899afd1..4029943887a3 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -110,17 +110,18 @@ Type: system ioctl Parameters: machine type identifier (KVM_VM_*) Returns: a VM fd that can be used to control the new virtual machine. -The new VM has no virtual cpus and no memory. An mmap() of a VM fd -will access the virtual machine's physical address space; offset zero -corresponds to guest physical address zero. Use of mmap() on a VM fd -is discouraged if userspace memory allocation (KVM_CAP_USER_MEMORY) is -available. -You most certainly want to use 0 as machine type. +The new VM has no virtual cpus and no memory. +You probably want to use 0 as machine type. In order to create user controlled virtual machines on S390, check KVM_CAP_S390_UCONTROL and use the flag KVM_VM_S390_UCONTROL as privileged user (CAP_SYS_ADMIN). +To use hardware assisted virtualization on MIPS (VZ ASE) rather than +the default trap & emulate implementation (which changes the virtual +memory layout to fit in user mode), check KVM_CAP_MIPS_VZ and use the +flag KVM_VM_MIPS_VZ. + 4.3 KVM_GET_MSR_INDEX_LIST @@ -1321,130 +1322,6 @@ The flags bitmap is defined as: /* the host supports the ePAPR idle hcall #define KVM_PPC_PVINFO_FLAGS_EV_IDLE (1<<0) -4.48 KVM_ASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Assigns a host PCI device to the VM. - -struct kvm_assigned_pci_dev { - __u32 assigned_dev_id; - __u32 busnr; - __u32 devfn; - __u32 flags; - __u32 segnr; - union { - __u32 reserved[11]; - }; -}; - -The PCI device is specified by the triple segnr, busnr, and devfn. -Identification in succeeding service requests is done via assigned_dev_id. The -following flags are specified: - -/* Depends on KVM_CAP_IOMMU */ -#define KVM_DEV_ASSIGN_ENABLE_IOMMU (1 << 0) -/* The following two depend on KVM_CAP_PCI_2_3 */ -#define KVM_DEV_ASSIGN_PCI_2_3 (1 << 1) -#define KVM_DEV_ASSIGN_MASK_INTX (1 << 2) - -If KVM_DEV_ASSIGN_PCI_2_3 is set, the kernel will manage legacy INTx interrupts -via the PCI-2.3-compliant device-level mask, thus enable IRQ sharing with other -assigned devices or host devices. KVM_DEV_ASSIGN_MASK_INTX specifies the -guest's view on the INTx mask, see KVM_ASSIGN_SET_INTX_MASK for details. - -The KVM_DEV_ASSIGN_ENABLE_IOMMU flag is a mandatory option to ensure -isolation of the device. Usages not specifying this flag are deprecated. - -Only PCI header type 0 devices with PCI BAR resources are supported by -device assignment. The user requesting this ioctl must have read/write -access to the PCI sysfs resource files associated with the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.49 KVM_DEASSIGN_PCI_DEVICE (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Ends PCI device assignment, releasing all associated resources. - -See KVM_ASSIGN_PCI_DEVICE for the data structure. Only assigned_dev_id is -used in kvm_assigned_pci_dev to identify the device. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - -4.50 KVM_ASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Assigns an IRQ to a passed-through device. - -struct kvm_assigned_irq { - __u32 assigned_dev_id; - __u32 host_irq; /* ignored (legacy field) */ - __u32 guest_irq; - __u32 flags; - union { - __u32 reserved[12]; - }; -}; - -The following flags are defined: - -#define KVM_DEV_IRQ_HOST_INTX (1 << 0) -#define KVM_DEV_IRQ_HOST_MSI (1 << 1) -#define KVM_DEV_IRQ_HOST_MSIX (1 << 2) - -#define KVM_DEV_IRQ_GUEST_INTX (1 << 8) -#define KVM_DEV_IRQ_GUEST_MSI (1 << 9) -#define KVM_DEV_IRQ_GUEST_MSIX (1 << 10) - -It is not valid to specify multiple types per host or guest IRQ. However, the -IRQ type of host and guest can differ or can even be null. - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - - -4.51 KVM_DEASSIGN_DEV_IRQ (deprecated) - -Capability: KVM_CAP_ASSIGN_DEV_IRQ -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_irq (in) -Returns: 0 on success, -1 on error - -Ends an IRQ assignment to a passed-through device. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id, flags must correspond to the IRQ type specified on -KVM_ASSIGN_DEV_IRQ. Partial deassignment of host or guest IRQ is allowed. - - 4.52 KVM_SET_GSI_ROUTING Capability: KVM_CAP_IRQ_ROUTING @@ -1531,52 +1408,6 @@ struct kvm_irq_routing_hv_sint { __u32 sint; }; -4.53 KVM_ASSIGN_SET_MSIX_NR (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_nr (in) -Returns: 0 on success, -1 on error - -Set the number of MSI-X interrupts for an assigned device. The number is -reset again by terminating the MSI-X assignment of the device via -KVM_DEASSIGN_DEV_IRQ. Calling this service more than once at any earlier -point will fail. - -struct kvm_assigned_msix_nr { - __u32 assigned_dev_id; - __u16 entry_nr; - __u16 padding; -}; - -#define KVM_MAX_MSIX_PER_DEV 256 - - -4.54 KVM_ASSIGN_SET_MSIX_ENTRY (deprecated) - -Capability: none -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_msix_entry (in) -Returns: 0 on success, -1 on error - -Specifies the routing of an MSI-X assigned device interrupt to a GSI. Setting -the GSI vector to zero means disabling the interrupt. - -struct kvm_assigned_msix_entry { - __u32 assigned_dev_id; - __u32 gsi; - __u16 entry; /* The index of entry in the MSI-X table */ - __u16 padding[3]; -}; - -Errors: - ENOTTY: kernel does not support this ioctl - - Other error conditions may be defined by individual device types or - have their standard meanings. - 4.55 KVM_SET_TSC_KHZ @@ -1728,40 +1559,6 @@ should skip processing the bitmap and just invalidate everything. It must be set to the number of set bits in the bitmap. -4.61 KVM_ASSIGN_SET_INTX_MASK (deprecated) - -Capability: KVM_CAP_PCI_2_3 -Architectures: x86 -Type: vm ioctl -Parameters: struct kvm_assigned_pci_dev (in) -Returns: 0 on success, -1 on error - -Allows userspace to mask PCI INTx interrupts from the assigned device. The -kernel will not deliver INTx interrupts to the guest between setting and -clearing of KVM_ASSIGN_SET_INTX_MASK via this interface. This enables use of -and emulation of PCI 2.3 INTx disable command register behavior. - -This may be used for both PCI 2.3 devices supporting INTx disable natively and -older devices lacking this support. Userspace is responsible for emulating the -read value of the INTx disable bit in the guest visible PCI command register. -When modifying the INTx disable state, userspace should precede updating the -physical device command register by calling this ioctl to inform the kernel of -the new intended INTx mask state. - -Note that the kernel uses the device INTx disable bit to internally manage the -device interrupt state for PCI 2.3 devices. Reads of this register may -therefore not match the expected value. Writes should always use the guest -intended INTx disable value rather than attempting to read-copy-update the -current physical device state. Races between user and kernel updates to the -INTx disable bit are handled lazily in the kernel. It's possible the device -may generate unintended interrupts, but they will not be injected into the -guest. - -See KVM_ASSIGN_DEV_IRQ for the data structure. The target device is specified -by assigned_dev_id. In the flags field, only KVM_DEV_ASSIGN_MASK_INTX is -evaluated. - - 4.62 KVM_CREATE_SPAPR_TCE Capability: KVM_CAP_SPAPR_TCE @@ -2068,11 +1865,23 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_ENTRYLO0 | 64 MIPS | KVM_REG_MIPS_CP0_ENTRYLO1 | 64 MIPS | KVM_REG_MIPS_CP0_CONTEXT | 64 + MIPS | KVM_REG_MIPS_CP0_CONTEXTCONFIG| 32 MIPS | KVM_REG_MIPS_CP0_USERLOCAL | 64 + MIPS | KVM_REG_MIPS_CP0_XCONTEXTCONFIG| 64 MIPS | KVM_REG_MIPS_CP0_PAGEMASK | 32 + MIPS | KVM_REG_MIPS_CP0_PAGEGRAIN | 32 + MIPS | KVM_REG_MIPS_CP0_SEGCTL0 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL1 | 64 + MIPS | KVM_REG_MIPS_CP0_SEGCTL2 | 64 + MIPS | KVM_REG_MIPS_CP0_PWBASE | 64 + MIPS | KVM_REG_MIPS_CP0_PWFIELD | 64 + MIPS | KVM_REG_MIPS_CP0_PWSIZE | 64 MIPS | KVM_REG_MIPS_CP0_WIRED | 32 + MIPS | KVM_REG_MIPS_CP0_PWCTL | 32 MIPS | KVM_REG_MIPS_CP0_HWRENA | 32 MIPS | KVM_REG_MIPS_CP0_BADVADDR | 64 + MIPS | KVM_REG_MIPS_CP0_BADINSTR | 32 + MIPS | KVM_REG_MIPS_CP0_BADINSTRP | 32 MIPS | KVM_REG_MIPS_CP0_COUNT | 32 MIPS | KVM_REG_MIPS_CP0_ENTRYHI | 64 MIPS | KVM_REG_MIPS_CP0_COMPARE | 32 @@ -2089,6 +1898,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_CONFIG4 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32 MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32 + MIPS | KVM_REG_MIPS_CP0_XCONTEXT | 64 MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64 @@ -2096,6 +1906,7 @@ registers, find a list below: MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64 MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64 + MIPS | KVM_REG_MIPS_CP0_MAAR(0..63) | 64 MIPS | KVM_REG_MIPS_COUNT_CTL | 64 MIPS | KVM_REG_MIPS_COUNT_RESUME | 64 MIPS | KVM_REG_MIPS_COUNT_HZ | 64 @@ -2162,6 +1973,10 @@ hardware, host kernel, guest, and whether XPA is present in the guest, i.e. with the RI and XI bits (if they exist) in bits 63 and 62 respectively, and the PFNX field starting at bit 30. +MIPS MAARs (see KVM_REG_MIPS_CP0_MAAR(*) above) have the following id bit +patterns: + 0x7030 0000 0001 01 <reg:8> + MIPS KVM control registers (see above) have the following id bit patterns: 0x7030 0000 0002 <reg:16> @@ -4164,6 +3979,23 @@ to take care of that. This capability can be enabled dynamically even if VCPUs were already created and are running. +7.9 KVM_CAP_S390_GS + +Architectures: s390 +Parameters: none +Returns: 0 on success; -EINVAL if the machine does not support + guarded storage; -EBUSY if a VCPU has already been created. + +Allows use of guarded storage for the KVM guest. + +7.10 KVM_CAP_S390_AIS + +Architectures: s390 +Parameters: none + +Allow use of adapter-interruption suppression. +Returns: 0 on success; -EBUSY if a VCPU has already been created. + 8. Other capabilities. ---------------------- @@ -4210,3 +4042,118 @@ This capability, if KVM_CHECK_EXTENSION indicates that it is available, means that that the kernel can support guests using the hashed page table MMU defined in Power ISA V3.00 (as implemented in the POWER9 processor), including in-memory segment tables. + +8.5 KVM_CAP_MIPS_VZ + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that full hardware assisted virtualization capabilities +of the hardware are available for use through KVM. An appropriate +KVM_VM_MIPS_* type must be passed to KVM_CREATE_VM to create a VM which +utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using full hardware assisted virtualization +capabilities of the hardware. This is useful to check after creating a VM with +KVM_VM_MIPS_DEFAULT. + +The value returned by KVM_CHECK_EXTENSION should be compared against known +values (see below). All other values are reserved. This is to allow for the +possibility of other hardware assisted virtualization implementations which +may be incompatible with the MIPS VZ ASE. + + 0: The trap & emulate implementation is in use to run guest code in user + mode. Guest virtual memory segments are rearranged to fit the guest in the + user mode address space. + + 1: The MIPS VZ ASE is in use, providing full hardware assisted + virtualization, including standard guest virtual memory segments. + +8.6 KVM_CAP_MIPS_TE + +Architectures: mips + +This capability, if KVM_CHECK_EXTENSION on the main kvm handle indicates that +it is available, means that the trap & emulate implementation is available to +run guest code in user mode, even if KVM_CAP_MIPS_VZ indicates that hardware +assisted virtualisation is also available. KVM_VM_MIPS_TE (0) must be passed +to KVM_CREATE_VM to create a VM which utilises it. + +If KVM_CHECK_EXTENSION on a kvm VM handle indicates that this capability is +available, it means that the VM is using trap & emulate. + +8.7 KVM_CAP_MIPS_64BIT + +Architectures: mips + +This capability indicates the supported architecture type of the guest, i.e. the +supported register and address width. + +The values returned when this capability is checked by KVM_CHECK_EXTENSION on a +kvm VM handle correspond roughly to the CP0_Config.AT register field, and should +be checked specifically against known values (see below). All other values are +reserved. + + 0: MIPS32 or microMIPS32. + Both registers and addresses are 32-bits wide. + It will only be possible to run 32-bit guest code. + + 1: MIPS64 or microMIPS64 with access only to 32-bit compatibility segments. + Registers are 64-bits wide, but addresses are 32-bits wide. + 64-bit guest code may run but cannot access MIPS64 memory segments. + It will also be possible to run 32-bit guest code. + + 2: MIPS64 or microMIPS64 with access to all address segments. + Both registers and addresses are 64-bits wide. + It will be possible to run 64-bit or 32-bit guest code. + +8.8 KVM_CAP_X86_GUEST_MWAIT + +Architectures: x86 + +This capability indicates that guest using memory monotoring instructions +(MWAIT/MWAITX) to stop the virtual CPU will not cause a VM exit. As such time +spent while virtual CPU is halted in this way will then be accounted for as +guest running time on the host (as opposed to e.g. HLT). + +8.9 KVM_CAP_ARM_USER_IRQ + +Architectures: arm, arm64 +This capability, if KVM_CHECK_EXTENSION indicates that it is available, means +that if userspace creates a VM without an in-kernel interrupt controller, it +will be notified of changes to the output level of in-kernel emulated devices, +which can generate virtual interrupts, presented to the VM. +For such VMs, on every return to userspace, the kernel +updates the vcpu's run->s.regs.device_irq_level field to represent the actual +output level of the device. + +Whenever kvm detects a change in the device output level, kvm guarantees at +least one return to userspace before running the VM. This exit could either +be a KVM_EXIT_INTR or any other exit event, like KVM_EXIT_MMIO. This way, +userspace can always sample the device output level and re-compute the state of +the userspace interrupt controller. Userspace should always check the state +of run->s.regs.device_irq_level on every kvm exit. +The value in run->s.regs.device_irq_level can represent both level and edge +triggered interrupt signals, depending on the device. Edge triggered interrupt +signals will exit to userspace with the bit in run->s.regs.device_irq_level +set exactly once per edge signal. + +The field run->s.regs.device_irq_level is available independent of +run->kvm_valid_regs or run->kvm_dirty_regs bits. + +If KVM_CAP_ARM_USER_IRQ is supported, the KVM_CHECK_EXTENSION ioctl returns a +number larger than 0 indicating the version of this capability is implemented +and thereby which bits in in run->s.regs.device_irq_level can signal values. + +Currently the following bits are defined for the device_irq_level bitmap: + + KVM_CAP_ARM_USER_IRQ >= 1: + + KVM_ARM_DEV_EL1_VTIMER - EL1 virtual timer + KVM_ARM_DEV_EL1_PTIMER - EL1 physical timer + KVM_ARM_DEV_PMU - ARM PMU overflow interrupt signal + +Future versions of kvm may implement additional events. These will get +indicated by returning a higher number from KVM_CHECK_EXTENSION and will be +listed above. diff --git a/Documentation/virtual/kvm/arm/hyp-abi.txt b/Documentation/virtual/kvm/arm/hyp-abi.txt new file mode 100644 index 000000000000..a20a0bee268d --- /dev/null +++ b/Documentation/virtual/kvm/arm/hyp-abi.txt @@ -0,0 +1,53 @@ +* Internal ABI between the kernel and HYP + +This file documents the interaction between the Linux kernel and the +hypervisor layer when running Linux as a hypervisor (for example +KVM). It doesn't cover the interaction of the kernel with the +hypervisor when running as a guest (under Xen, KVM or any other +hypervisor), or any hypervisor-specific interaction when the kernel is +used as a host. + +On arm and arm64 (without VHE), the kernel doesn't run in hypervisor +mode, but still needs to interact with it, allowing a built-in +hypervisor to be either installed or torn down. + +In order to achieve this, the kernel must be booted at HYP (arm) or +EL2 (arm64), allowing it to install a set of stubs before dropping to +SVC/EL1. These stubs are accessible by using a 'hvc #0' instruction, +and only act on individual CPUs. + +Unless specified otherwise, any built-in hypervisor must implement +these functions (see arch/arm{,64}/include/asm/virt.h): + +* r0/x0 = HVC_SET_VECTORS + r1/x1 = vectors + + Set HVBAR/VBAR_EL2 to 'vectors' to enable a hypervisor. 'vectors' + must be a physical address, and respect the alignment requirements + of the architecture. Only implemented by the initial stubs, not by + Linux hypervisors. + +* r0/x0 = HVC_RESET_VECTORS + + Turn HYP/EL2 MMU off, and reset HVBAR/VBAR_EL2 to the initials + stubs' exception vector value. This effectively disables an existing + hypervisor. + +* r0/x0 = HVC_SOFT_RESTART + r1/x1 = restart address + x2 = x0's value when entering the next payload (arm64) + x3 = x1's value when entering the next payload (arm64) + x4 = x2's value when entering the next payload (arm64) + + Mask all exceptions, disable the MMU, move the arguments into place + (arm64 only), and jump to the restart address while at HYP/EL2. This + hypercall is not expected to return to its caller. + +Any other value of r0/x0 triggers a hypervisor-specific handling, +which is not documented here. + +The return value of a stub hypercall is held by r0/x0, and is 0 on +success, and HVC_STUB_ERR on error. A stub hypercall is allowed to +clobber any of the caller-saved registers (x0-x18 on arm64, r0-r3 and +ip on arm). It is thus recommended to use a function call to perform +the hypercall. diff --git a/Documentation/virtual/kvm/devices/s390_flic.txt b/Documentation/virtual/kvm/devices/s390_flic.txt index 6b0e115301c8..c2518cea8ab4 100644 --- a/Documentation/virtual/kvm/devices/s390_flic.txt +++ b/Documentation/virtual/kvm/devices/s390_flic.txt @@ -14,6 +14,8 @@ FLIC provides support to - purge one pending floating I/O interrupt (KVM_DEV_FLIC_CLEAR_IO_IRQ) - enable/disable for the guest transparent async page faults - register and modify adapter interrupt sources (KVM_DEV_FLIC_ADAPTER_*) +- modify AIS (adapter-interruption-suppression) mode state (KVM_DEV_FLIC_AISM) +- inject adapter interrupts on a specified adapter (KVM_DEV_FLIC_AIRQ_INJECT) Groups: KVM_DEV_FLIC_ENQUEUE @@ -64,12 +66,18 @@ struct kvm_s390_io_adapter { __u8 isc; __u8 maskable; __u8 swap; - __u8 pad; + __u8 flags; }; id contains the unique id for the adapter, isc the I/O interruption subclass - to use, maskable whether this adapter may be masked (interrupts turned off) - and swap whether the indicators need to be byte swapped. + to use, maskable whether this adapter may be masked (interrupts turned off), + swap whether the indicators need to be byte swapped, and flags contains + further characteristics of the adapter. + Currently defined values for 'flags' are: + - KVM_S390_ADAPTER_SUPPRESSIBLE: adapter is subject to AIS + (adapter-interrupt-suppression) facility. This flag only has an effect if + the AIS capability is enabled. + Unknown flag values are ignored. KVM_DEV_FLIC_ADAPTER_MODIFY @@ -101,6 +109,33 @@ struct kvm_s390_io_adapter_req { release a userspace page for the translated address specified in addr from the list of mappings + KVM_DEV_FLIC_AISM + modify the adapter-interruption-suppression mode for a given isc if the + AIS capability is enabled. Takes a kvm_s390_ais_req describing: + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; +}; + + isc contains the target I/O interruption subclass, mode the target + adapter-interruption-suppression mode. The following modes are + currently supported: + - KVM_S390_AIS_MODE_ALL: ALL-Interruptions Mode, i.e. airq injection + is always allowed; + - KVM_S390_AIS_MODE_SINGLE: SINGLE-Interruption Mode, i.e. airq + injection is only allowed once and the following adapter interrupts + will be suppressed until the mode is set again to ALL-Interruptions + or SINGLE-Interruption mode. + + KVM_DEV_FLIC_AIRQ_INJECT + Inject adapter interrupts on a specified adapter. + attr->attr contains the unique id for the adapter, which allows for + adapter-specific checks and actions. + For adapters subject to AIS, handle the airq injection suppression for + an isc according to the adapter-interruption-suppression mode on condition + that the AIS capability is enabled. + Note: The KVM_SET_DEVICE_ATTR/KVM_GET_DEVICE_ATTR device ioctls executed on FLIC with an unknown group or attribute gives the error code EINVAL (instead of ENXIO, as specified in the API documentation). It is not possible to conclude diff --git a/Documentation/virtual/kvm/devices/vfio.txt b/Documentation/virtual/kvm/devices/vfio.txt index ef51740c67ca..528c77c8022c 100644 --- a/Documentation/virtual/kvm/devices/vfio.txt +++ b/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,21 @@ Groups: KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt index b6cda49f2ba4..575ccb022aac 100644 --- a/Documentation/virtual/kvm/devices/vm.txt +++ b/Documentation/virtual/kvm/devices/vm.txt @@ -140,7 +140,8 @@ struct kvm_s390_vm_cpu_subfunc { u8 kmo[16]; # valid with Message-Security-Assist-Extension 4 u8 pcc[16]; # valid with Message-Security-Assist-Extension 4 u8 ppno[16]; # valid with Message-Security-Assist-Extension 5 - u8 reserved[1824]; # reserved for future instructions + u8 kma[16]; # valid with Message-Security-Assist-Extension 8 + u8 reserved[1808]; # reserved for future instructions }; Parameters: address of a buffer to load the subfunction blocks from. diff --git a/Documentation/virtual/kvm/hypercalls.txt b/Documentation/virtual/kvm/hypercalls.txt index feaaa634f154..a890529c63ed 100644 --- a/Documentation/virtual/kvm/hypercalls.txt +++ b/Documentation/virtual/kvm/hypercalls.txt @@ -28,6 +28,11 @@ S390: property inside the device tree's /hypervisor node. For more information refer to Documentation/virtual/kvm/ppc-pv.txt +MIPS: + KVM hypercalls use the HYPCALL instruction with code 0 and the hypercall + number in $2 (v0). Up to four arguments may be placed in $4-$7 (a0-a3) and + the return value is placed in $2 (v0). + KVM Hypercalls Documentation =========================== The template for each hypercall is: diff --git a/MAINTAINERS b/MAINTAINERS index b948dfaaacd9..08360bb0468b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -5417,10 +5417,12 @@ F: Documentation/filesystems/caching/ F: fs/fscache/ F: include/linux/fscache*.h -FS-CRYPTO: FILE SYSTEM LEVEL ENCRYPTION SUPPORT +FSCRYPT: FILE SYSTEM LEVEL ENCRYPTION SUPPORT M: Theodore Y. Ts'o <tytso@mit.edu> M: Jaegeuk Kim <jaegeuk@kernel.org> -L: linux-fsdevel@vger.kernel.org +L: linux-fscrypt@vger.kernel.org +Q: https://patchwork.kernel.org/project/linux-fscrypt/list/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tytso/fscrypt.git S: Supported F: fs/crypto/ F: include/linux/fscrypt*.h diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8a7ab5e73af9..4c6816be23a9 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -27,6 +27,7 @@ config ARM select GENERIC_ALLOCATOR select GENERIC_ATOMIC64 if (CPU_V7M || CPU_V6 || !CPU_32v6K || !AEABI) select GENERIC_CLOCKEVENTS_BROADCAST if SMP + select GENERIC_CPU_AUTOPROBE select GENERIC_EARLY_IOREMAP select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_PROBE diff --git a/arch/arm/boot/compressed/head.S b/arch/arm/boot/compressed/head.S index 9150f9732785..7c711ba61417 100644 --- a/arch/arm/boot/compressed/head.S +++ b/arch/arm/boot/compressed/head.S @@ -422,7 +422,17 @@ dtb_check_done: cmp r0, #HYP_MODE bne 1f - bl __hyp_get_vectors + /* + * Compute the address of the hyp vectors after relocation. + * This requires some arithmetic since we cannot directly + * reference __hyp_stub_vectors in a PC-relative way. + * Call __hyp_set_vectors with the new address so that we + * can HVC again after the copy. + */ +0: adr r0, 0b + movw r1, #:lower16:__hyp_stub_vectors - 0b + movt r1, #:upper16:__hyp_stub_vectors - 0b + add r0, r0, r1 sub r0, r0, r5 add r0, r0, r10 bl __hyp_set_vectors diff --git a/arch/arm/boot/dts/r7s72100.dtsi b/arch/arm/boot/dts/r7s72100.dtsi index b8aa256bd515..1cf2bd038090 100644 --- a/arch/arm/boot/dts/r7s72100.dtsi +++ b/arch/arm/boot/dts/r7s72100.dtsi @@ -177,6 +177,7 @@ compatible = "arm,cortex-a9"; reg = <0>; clock-frequency = <400000000>; + next-level-cache = <&L2>; }; }; @@ -368,6 +369,16 @@ <0xe8202000 0x1000>; }; + L2: cache-controller@3ffff000 { + compatible = "arm,pl310-cache"; + reg = <0x3ffff000 0x1000>; + interrupts = <GIC_SPI 8 IRQ_TYPE_LEVEL_HIGH>; + arm,early-bresp-disable; + arm,full-line-zero-disable; + cache-unified; + cache-level = <2>; + }; + i2c0: i2c@fcfee000 { #address-cells = <1>; #size-cells = <0>; diff --git a/arch/arm/include/asm/cpufeature.h b/arch/arm/include/asm/cpufeature.h new file mode 100644 index 000000000000..6d425191d01d --- /dev/null +++ b/arch/arm/include/asm/cpufeature.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2017 Linaro Ltd. <ard.biesheuvel@linaro.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef __ASM_CPUFEATURE_H +#define __ASM_CPUFEATURE_H + +#include <linux/log2.h> +#include <asm/hwcap.h> + +/* + * Due to the fact that ELF_HWCAP is a 32-bit type on ARM, and given the number + * of optional CPU features it defines, ARM's CPU hardware capability bits have + * been distributed over separate elf_hwcap and elf_hwcap2 variables, each of + * which covers a subset of the available CPU features. + * + * Currently, only a few of those are suitable for automatic module loading + * (which is the primary use case of this facility) and those happen to be all + * covered by HWCAP2. So let's only cover those via the cpu_feature() + * convenience macro for now (which is used by module_cpu_feature_match()). + * However, all capabilities are exposed via the modalias, and can be matched + * using an explicit MODULE_DEVICE_TABLE() that uses __hwcap_feature() directly. + */ +#define MAX_CPU_FEATURES 64 +#define __hwcap_feature(x) ilog2(HWCAP_ ## x) +#define __hwcap2_feature(x) (32 + ilog2(HWCAP2_ ## x)) +#define cpu_feature(x) __hwcap2_feature(x) + +static inline bool cpu_have_feature(unsigned int num) +{ + return num < 32 ? elf_hwcap & BIT(num) : elf_hwcap2 & BIT(num - 32); +} + +#endif diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h index 5c17d2dec777..8f967d1373f6 100644 --- a/arch/arm/include/asm/fixmap.h +++ b/arch/arm/include/asm/fixmap.h @@ -41,7 +41,7 @@ static const enum fixed_addresses __end_of_fixed_addresses = #define FIXMAP_PAGE_COMMON (L_PTE_YOUNG | L_PTE_PRESENT | L_PTE_XN | L_PTE_DIRTY) -#define FIXMAP_PAGE_NORMAL (FIXMAP_PAGE_COMMON | L_PTE_MT_WRITEBACK) +#define FIXMAP_PAGE_NORMAL (pgprot_kernel | L_PTE_XN) #define FIXMAP_PAGE_RO (FIXMAP_PAGE_NORMAL | L_PTE_RDONLY) /* Used by set_fixmap_(io|nocache), both meant for mapping a device */ diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h index 8ef05381984b..14d68a4d826f 100644 --- a/arch/arm/include/asm/kvm_asm.h +++ b/arch/arm/include/asm/kvm_asm.h @@ -33,7 +33,7 @@ #define ARM_EXCEPTION_IRQ 5 #define ARM_EXCEPTION_FIQ 6 #define ARM_EXCEPTION_HVC 7 - +#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR /* * The rr_lo_hi macro swaps a pair of registers depending on * current endianness. It is used in conjunction with ldrd and strd @@ -72,10 +72,11 @@ extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern void __init_stage2_translation(void); -extern void __kvm_hyp_reset(unsigned long); - extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_read_vmcr(void); +extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); + #endif #endif /* __ARM_KVM_ASM_H__ */ diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h index 31ee468ce667..f0e66577ce05 100644 --- a/arch/arm/include/asm/kvm_host.h +++ b/arch/arm/include/asm/kvm_host.h @@ -30,7 +30,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 32 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HAVE_ONE_REG #define KVM_HALT_POLL_NS_DEFAULT 500000 @@ -45,7 +44,7 @@ #define KVM_MAX_VCPUS VGIC_V2_MAX_CPUS #endif -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) u32 *kvm_vcpu_reg(struct kvm_vcpu *vcpu, u8 reg_num, u32 mode); int __attribute_const__ kvm_target_cpu(void); @@ -270,12 +269,6 @@ static inline void __cpu_init_stage2(void) kvm_call_hyp(__init_stage2_translation); } -static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, - phys_addr_t phys_idmap_start) -{ - kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr); -} - static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) { return 0; diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h index 95f38dcd611d..fa6f2174276b 100644 --- a/arch/arm/include/asm/kvm_mmu.h +++ b/arch/arm/include/asm/kvm_mmu.h @@ -56,7 +56,6 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm/include/asm/module.h b/arch/arm/include/asm/module.h index 464748b9fd7d..ed2319663a1e 100644 --- a/arch/arm/include/asm/module.h +++ b/arch/arm/include/asm/module.h @@ -18,13 +18,18 @@ enum { }; #endif +struct mod_plt_sec { + struct elf32_shdr *plt; + int plt_count; +}; + struct mod_arch_specific { #ifdef CONFIG_ARM_UNWIND struct unwind_table *unwind[ARM_SEC_MAX]; #endif #ifdef CONFIG_ARM_MODULE_PLTS - struct elf32_shdr *plt; - int plt_count; + struct mod_plt_sec core; + struct mod_plt_sec init; #endif }; diff --git a/arch/arm/include/asm/proc-fns.h b/arch/arm/include/asm/proc-fns.h index 8877ad5ffe10..f2e1af45bd6f 100644 --- a/arch/arm/include/asm/proc-fns.h +++ b/arch/arm/include/asm/proc-fns.h @@ -43,7 +43,7 @@ extern struct processor { /* * Special stuff for a reset */ - void (*reset)(unsigned long addr) __attribute__((noreturn)); + void (*reset)(unsigned long addr, bool hvc) __attribute__((noreturn)); /* * Idle the processor */ @@ -88,7 +88,7 @@ extern void cpu_set_pte_ext(pte_t *ptep, pte_t pte); #else extern void cpu_set_pte_ext(pte_t *ptep, pte_t pte, unsigned int ext); #endif -extern void cpu_reset(unsigned long addr) __attribute__((noreturn)); +extern void cpu_reset(unsigned long addr, bool hvc) __attribute__((noreturn)); /* These three are private to arch/arm/kernel/suspend.c */ extern void cpu_do_suspend(void *); diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h index 6dae1956c74d..141144f333a2 100644 --- a/arch/arm/include/asm/virt.h +++ b/arch/arm/include/asm/virt.h @@ -53,7 +53,7 @@ static inline void sync_boot_mode(void) } void __hyp_set_vectors(unsigned long phys_vector_base); -unsigned long __hyp_get_vectors(void); +void __hyp_reset_vectors(void); #else #define __boot_cpu_mode (SVC_MODE) #define sync_boot_mode() @@ -94,6 +94,18 @@ extern char __hyp_text_start[]; extern char __hyp_text_end[]; #endif +#else + +/* Only assembly code should need those */ + +#define HVC_SET_VECTORS 0 +#define HVC_SOFT_RESTART 1 +#define HVC_RESET_VECTORS 2 + +#define HVC_STUB_HCALL_NR 3 + #endif /* __ASSEMBLY__ */ +#define HVC_STUB_ERR 0xbadca11 + #endif /* ! VIRT_H */ diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index 6ebd3e6a1fd1..a88726359e5f 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -27,6 +27,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -114,6 +116,8 @@ struct kvm_debug_exit_arch { }; struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index dea3e965fe88..833c991075a1 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -30,11 +30,6 @@ #endif #ifdef CONFIG_DYNAMIC_FTRACE -#ifdef CONFIG_OLD_MCOUNT -#define OLD_MCOUNT_ADDR ((unsigned long) mcount) -#define OLD_FTRACE_ADDR ((unsigned long) ftrace_caller_old) - -#define OLD_NOP 0xe1a00000 /* mov r0, r0 */ static int __ftrace_modify_code(void *data) { @@ -52,6 +47,12 @@ void arch_ftrace_update_code(int command) stop_machine(__ftrace_modify_code, &command, NULL); } +#ifdef CONFIG_OLD_MCOUNT +#define OLD_MCOUNT_ADDR ((unsigned long) mcount) +#define OLD_FTRACE_ADDR ((unsigned long) ftrace_caller_old) + +#define OLD_NOP 0xe1a00000 /* mov r0, r0 */ + static unsigned long ftrace_nop_replace(struct dyn_ftrace *rec) { return rec->arch.old_mcount ? OLD_NOP : NOP; diff --git a/arch/arm/kernel/hyp-stub.S b/arch/arm/kernel/hyp-stub.S index 15d073ae5da2..ec7e7377d423 100644 --- a/arch/arm/kernel/hyp-stub.S +++ b/arch/arm/kernel/hyp-stub.S @@ -125,7 +125,7 @@ ENTRY(__hyp_stub_install_secondary) * (see safe_svcmode_maskall). */ @ Now install the hypervisor stub: - adr r7, __hyp_stub_vectors + W(adr) r7, __hyp_stub_vectors mcr p15, 4, r7, c12, c0, 0 @ set hypervisor vector base (HVBAR) @ Disable all traps, so we don't get any nasty surprise @@ -202,9 +202,23 @@ ARM_BE8(orr r7, r7, #(1 << 25)) @ HSCTLR.EE ENDPROC(__hyp_stub_install_secondary) __hyp_stub_do_trap: - cmp r0, #-1 - mrceq p15, 4, r0, c12, c0, 0 @ get HVBAR - mcrne p15, 4, r0, c12, c0, 0 @ set HVBAR + teq r0, #HVC_SET_VECTORS + bne 1f + mcr p15, 4, r1, c12, c0, 0 @ set HVBAR + b __hyp_stub_exit + +1: teq r0, #HVC_SOFT_RESTART + bne 1f + bx r1 + +1: teq r0, #HVC_RESET_VECTORS + beq __hyp_stub_exit + + ldr r0, =HVC_STUB_ERR + __ERET + +__hyp_stub_exit: + mov r0, #0 __ERET ENDPROC(__hyp_stub_do_trap) @@ -230,15 +244,26 @@ ENDPROC(__hyp_stub_do_trap) * so you will need to set that to something sensible at the new hypervisor's * initialisation entry point. */ -ENTRY(__hyp_get_vectors) - mov r0, #-1 -ENDPROC(__hyp_get_vectors) - @ fall through ENTRY(__hyp_set_vectors) + mov r1, r0 + mov r0, #HVC_SET_VECTORS __HVC(0) ret lr ENDPROC(__hyp_set_vectors) +ENTRY(__hyp_soft_restart) + mov r1, r0 + mov r0, #HVC_SOFT_RESTART + __HVC(0) + ret lr +ENDPROC(__hyp_soft_restart) + +ENTRY(__hyp_reset_vectors) + mov r0, #HVC_RESET_VECTORS + __HVC(0) + ret lr +ENDPROC(__hyp_reset_vectors) + #ifndef ZIMAGE .align 2 .L__boot_cpu_mode_offset: @@ -246,7 +271,7 @@ ENDPROC(__hyp_set_vectors) #endif .align 5 -__hyp_stub_vectors: +ENTRY(__hyp_stub_vectors) __hyp_stub_reset: W(b) . __hyp_stub_und: W(b) . __hyp_stub_svc: W(b) . diff --git a/arch/arm/kernel/module-plts.c b/arch/arm/kernel/module-plts.c index 3a5cba90c971..3d0c2e4dda1d 100644 --- a/arch/arm/kernel/module-plts.c +++ b/arch/arm/kernel/module-plts.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 Linaro Ltd. <ard.biesheuvel@linaro.org> + * Copyright (C) 2014-2017 Linaro Ltd. <ard.biesheuvel@linaro.org> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,9 +31,17 @@ struct plt_entries { u32 lit[PLT_ENT_COUNT]; }; +static bool in_init(const struct module *mod, unsigned long loc) +{ + return loc - (u32)mod->init_layout.base < mod->init_layout.size; +} + u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) { - struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr; + struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : + &mod->arch.init; + + struct plt_entries *plt = (struct plt_entries *)pltsec->plt->sh_addr; int idx = 0; /* @@ -41,9 +49,9 @@ u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) * relocations are sorted, this will be the last entry we allocated. * (if one exists). */ - if (mod->arch.plt_count > 0) { - plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT; - idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT; + if (pltsec->plt_count > 0) { + plt += (pltsec->plt_count - 1) / PLT_ENT_COUNT; + idx = (pltsec->plt_count - 1) % PLT_ENT_COUNT; if (plt->lit[idx] == val) return (u32)&plt->ldr[idx]; @@ -53,8 +61,8 @@ u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) plt++; } - mod->arch.plt_count++; - BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size); + pltsec->plt_count++; + BUG_ON(pltsec->plt_count * PLT_ENT_SIZE > pltsec->plt->sh_size); if (!idx) /* Populate a new set of entries */ @@ -129,7 +137,7 @@ static bool duplicate_rel(Elf32_Addr base, const Elf32_Rel *rel, int num) /* Count how many PLT entries we may need */ static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, - const Elf32_Rel *rel, int num) + const Elf32_Rel *rel, int num, Elf32_Word dstidx) { unsigned int ret = 0; const Elf32_Sym *s; @@ -144,13 +152,17 @@ static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, case R_ARM_THM_JUMP24: /* * We only have to consider branch targets that resolve - * to undefined symbols. This is not simply a heuristic, - * it is a fundamental limitation, since the PLT itself - * is part of the module, and needs to be within range - * as well, so modules can never grow beyond that limit. + * to symbols that are defined in a different section. + * This is not simply a heuristic, it is a fundamental + * limitation, since there is no guaranteed way to emit + * PLT entries sufficiently close to the branch if the + * section size exceeds the range of a branch + * instruction. So ignore relocations against defined + * symbols if they live in the same section as the + * relocation target. */ s = syms + ELF32_R_SYM(rel[i].r_info); - if (s->st_shndx != SHN_UNDEF) + if (s->st_shndx == dstidx) break; /* @@ -161,7 +173,12 @@ static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, * So we need to support them, but there is no need to * take them into consideration when trying to optimize * this code. So let's only check for duplicates when - * the addend is zero. + * the addend is zero. (Note that calls into the core + * module via init PLT entries could involve section + * relative symbol references with non-zero addends, for + * which we may end up emitting duplicates, but the init + * PLT is released along with the rest of the .init + * region as soon as module loading completes.) */ if (!is_zero_addend_relocation(base, rel + i) || !duplicate_rel(base, rel, i)) @@ -174,7 +191,8 @@ static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { - unsigned long plts = 0; + unsigned long core_plts = 0; + unsigned long init_plts = 0; Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; Elf32_Sym *syms = NULL; @@ -184,13 +202,15 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, */ for (s = sechdrs; s < sechdrs_end; ++s) { if (strcmp(".plt", secstrings + s->sh_name) == 0) - mod->arch.plt = s; + mod->arch.core.plt = s; + else if (strcmp(".init.plt", secstrings + s->sh_name) == 0) + mod->arch.init.plt = s; else if (s->sh_type == SHT_SYMTAB) syms = (Elf32_Sym *)s->sh_addr; } - if (!mod->arch.plt) { - pr_err("%s: module PLT section missing\n", mod->name); + if (!mod->arch.core.plt || !mod->arch.init.plt) { + pr_err("%s: module PLT section(s) missing\n", mod->name); return -ENOEXEC; } if (!syms) { @@ -213,16 +233,29 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, /* sort by type and symbol index */ sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL); - plts += count_plts(syms, dstsec->sh_addr, rels, numrels); + if (strncmp(secstrings + dstsec->sh_name, ".init", 5) != 0) + core_plts += count_plts(syms, dstsec->sh_addr, rels, + numrels, s->sh_info); + else + init_plts += count_plts(syms, dstsec->sh_addr, rels, + numrels, s->sh_info); } - mod->arch.plt->sh_type = SHT_NOBITS; - mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE, - sizeof(struct plt_entries)); - mod->arch.plt_count = 0; - - pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size); + mod->arch.core.plt->sh_type = SHT_NOBITS; + mod->arch.core.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.core.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.core.plt->sh_size = round_up(core_plts * PLT_ENT_SIZE, + sizeof(struct plt_entries)); + mod->arch.core.plt_count = 0; + + mod->arch.init.plt->sh_type = SHT_NOBITS; + mod->arch.init.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.init.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.init.plt->sh_size = round_up(init_plts * PLT_ENT_SIZE, + sizeof(struct plt_entries)); + mod->arch.init.plt_count = 0; + + pr_debug("%s: plt=%x, init.plt=%x\n", __func__, + mod->arch.core.plt->sh_size, mod->arch.init.plt->sh_size); return 0; } diff --git a/arch/arm/kernel/module.lds b/arch/arm/kernel/module.lds index 05881e2b414c..eacb5c67f61e 100644 --- a/arch/arm/kernel/module.lds +++ b/arch/arm/kernel/module.lds @@ -1,3 +1,4 @@ SECTIONS { .plt : { BYTE(0) } + .init.plt : { BYTE(0) } } diff --git a/arch/arm/kernel/reboot.c b/arch/arm/kernel/reboot.c index 3fa867a2aae6..3b2aa9a9fe26 100644 --- a/arch/arm/kernel/reboot.c +++ b/arch/arm/kernel/reboot.c @@ -12,10 +12,11 @@ #include <asm/cacheflush.h> #include <asm/idmap.h> +#include <asm/virt.h> #include "reboot.h" -typedef void (*phys_reset_t)(unsigned long); +typedef void (*phys_reset_t)(unsigned long, bool); /* * Function pointers to optional machine specific functions @@ -51,7 +52,9 @@ static void __soft_restart(void *addr) /* Switch to the identity mapping. */ phys_reset = (phys_reset_t)virt_to_idmap(cpu_reset); - phys_reset((unsigned long)addr); + + /* original stub should be restored by kvm */ + phys_reset((unsigned long)addr, is_hyp_mode_available()); /* Should never get here. */ BUG(); diff --git a/arch/arm/kernel/setup.c b/arch/arm/kernel/setup.c index f4e54503afa9..32e1a9513dc7 100644 --- a/arch/arm/kernel/setup.c +++ b/arch/arm/kernel/setup.c @@ -80,7 +80,7 @@ __setup("fpe=", fpe_setup); extern void init_default_cache_policy(unsigned long); extern void paging_init(const struct machine_desc *desc); -extern void early_paging_init(const struct machine_desc *); +extern void early_mm_init(const struct machine_desc *); extern void adjust_lowmem_bounds(void); extern enum reboot_mode reboot_mode; extern void setup_dma_zone(const struct machine_desc *desc); @@ -1088,7 +1088,7 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); #ifdef CONFIG_MMU - early_paging_init(mdesc); + early_mm_init(mdesc); #endif setup_dma_zone(mdesc); xen_early_init(); diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 314eb6abe1ff..8a31906bdc9b 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -53,7 +53,6 @@ __asm__(".arch_extension virt"); static DEFINE_PER_CPU(unsigned long, kvm_arm_hyp_stack_page); static kvm_cpu_context_t __percpu *kvm_host_cpu_state; -static unsigned long hyp_default_vectors; /* Per-CPU variable containing the currently running vcpu. */ static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_arm_running_vcpu); @@ -209,9 +208,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_ARM_SET_DEVICE_ADDR: r = 1; break; @@ -230,6 +226,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) else r = kvm->arch.vgic.msis_require_devid; break; + case KVM_CAP_ARM_USER_IRQ: + /* + * 1: EL1_VTIMER, EL1_PTIMER, and PMU. + * (bump this number if adding more devices) + */ + r = 1; + break; default: r = kvm_arch_dev_ioctl_check_extension(kvm, ext); break; @@ -351,15 +354,14 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) vcpu->arch.host_cpu_context = this_cpu_ptr(kvm_host_cpu_state); kvm_arm_set_running_vcpu(vcpu); + + kvm_vgic_load(vcpu); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { - /* - * The arch-generic KVM code expects the cpu field of a vcpu to be -1 - * if the vcpu is no longer assigned to a cpu. This is used for the - * optimized make_all_cpus_request path. - */ + kvm_vgic_put(vcpu); + vcpu->cpu = -1; kvm_arm_set_running_vcpu(NULL); @@ -517,13 +519,7 @@ static int kvm_vcpu_first_run_init(struct kvm_vcpu *vcpu) return ret; } - /* - * Enable the arch timers only if we have an in-kernel VGIC - * and it has been properly initialized, since we cannot handle - * interrupts from the virtual timer with a userspace gic. - */ - if (irqchip_in_kernel(kvm) && vgic_initialized(kvm)) - ret = kvm_timer_enable(vcpu); + ret = kvm_timer_enable(vcpu); return ret; } @@ -633,16 +629,23 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) * non-preemptible context. */ preempt_disable(); + kvm_pmu_flush_hwstate(vcpu); + kvm_timer_flush_hwstate(vcpu); kvm_vgic_flush_hwstate(vcpu); local_irq_disable(); /* - * Re-check atomic conditions + * If we have a singal pending, or need to notify a userspace + * irqchip about timer or PMU level changes, then we exit (and + * update the timer level state in kvm_timer_update_run + * below). */ - if (signal_pending(current)) { + if (signal_pending(current) || + kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { ret = -EINTR; run->exit_reason = KVM_EXIT_INTR; } @@ -714,6 +717,12 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = handle_exit(vcpu, run, ret); } + /* Tell userspace about in-kernel device output levels */ + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) { + kvm_timer_update_run(vcpu); + kvm_pmu_update_run(vcpu); + } + if (vcpu->sigset_active) sigprocmask(SIG_SETMASK, &sigsaved, NULL); return ret; @@ -1112,8 +1121,16 @@ static void cpu_init_hyp_mode(void *dummy) kvm_arm_init_debug(); } +static void cpu_hyp_reset(void) +{ + if (!is_kernel_in_hyp_mode()) + __hyp_reset_vectors(); +} + static void cpu_hyp_reinit(void) { + cpu_hyp_reset(); + if (is_kernel_in_hyp_mode()) { /* * __cpu_init_stage2() is safe to call even if the PM @@ -1121,21 +1138,13 @@ static void cpu_hyp_reinit(void) */ __cpu_init_stage2(); } else { - if (__hyp_get_vectors() == hyp_default_vectors) - cpu_init_hyp_mode(NULL); + cpu_init_hyp_mode(NULL); } if (vgic_present) kvm_vgic_init_cpu_hardware(); } -static void cpu_hyp_reset(void) -{ - if (!is_kernel_in_hyp_mode()) - __cpu_reset_hyp_mode(hyp_default_vectors, - kvm_get_idmap_start()); -} - static void _kvm_arch_hardware_enable(void *discard) { if (!__this_cpu_read(kvm_arm_hardware_enabled)) { @@ -1319,12 +1328,6 @@ static int init_hyp_mode(void) goto out_err; /* - * It is probably enough to obtain the default on one - * CPU. It's unlikely to be different on the others. - */ - hyp_default_vectors = __hyp_get_vectors(); - - /* * Allocate stack pages for Hypervisor-mode */ for_each_possible_cpu(cpu) { diff --git a/arch/arm/kvm/coproc.c b/arch/arm/kvm/coproc.c index 3e5e4194ef86..2c14b69511e9 100644 --- a/arch/arm/kvm/coproc.c +++ b/arch/arm/kvm/coproc.c @@ -40,6 +40,24 @@ * Co-processor emulation *****************************************************************************/ +static bool write_to_read_only(struct kvm_vcpu *vcpu, + const struct coproc_params *params) +{ + WARN_ONCE(1, "CP15 write to read-only register\n"); + print_cp_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + +static bool read_from_write_only(struct kvm_vcpu *vcpu, + const struct coproc_params *params) +{ + WARN_ONCE(1, "CP15 read to write-only register\n"); + print_cp_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; @@ -502,15 +520,15 @@ static int emulate_cp15(struct kvm_vcpu *vcpu, if (likely(r->access(vcpu, params, r))) { /* Skip instruction, since it was emulated */ kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; } - /* If access function fails, it should complain. */ } else { + /* If access function fails, it should complain. */ kvm_err("Unsupported guest CP15 access at: %08lx\n", *vcpu_pc(vcpu)); print_cp_instr(params); + kvm_inject_undefined(vcpu); } - kvm_inject_undefined(vcpu); + return 1; } diff --git a/arch/arm/kvm/coproc.h b/arch/arm/kvm/coproc.h index eef1759c2b65..3a41b7d1eb86 100644 --- a/arch/arm/kvm/coproc.h +++ b/arch/arm/kvm/coproc.h @@ -81,24 +81,6 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, return true; } -static inline bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - kvm_debug("CP15 write to read-only register at: %08lx\n", - *vcpu_pc(vcpu)); - print_cp_instr(params); - return false; -} - -static inline bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct coproc_params *params) -{ - kvm_debug("CP15 read to write-only register at: %08lx\n", - *vcpu_pc(vcpu)); - print_cp_instr(params); - return false; -} - /* Reset functions */ static inline void reset_unknown(struct kvm_vcpu *vcpu, const struct coproc_reg *r) diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index 96af65a30d78..5fd7968cdae9 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -160,6 +160,14 @@ int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run, case ARM_EXCEPTION_DATA_ABORT: kvm_inject_vabt(vcpu); return 1; + case ARM_EXCEPTION_HYP_GONE: + /* + * HYP has been reset to the hyp-stub. This happens + * when a guest is pre-empted by kvm_reboot()'s + * shutdown call. + */ + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + return 0; default: kvm_pr_unimpl("Unsupported exception type: %d", exception_index); diff --git a/arch/arm/kvm/hyp/hyp-entry.S b/arch/arm/kvm/hyp/hyp-entry.S index 96beb53934c9..95a2faefc070 100644 --- a/arch/arm/kvm/hyp/hyp-entry.S +++ b/arch/arm/kvm/hyp/hyp-entry.S @@ -126,11 +126,29 @@ hyp_hvc: */ pop {r0, r1, r2} - /* Check for __hyp_get_vectors */ - cmp r0, #-1 - mrceq p15, 4, r0, c12, c0, 0 @ get HVBAR - beq 1f + /* + * Check if we have a kernel function, which is guaranteed to be + * bigger than the maximum hyp stub hypercall + */ + cmp r0, #HVC_STUB_HCALL_NR + bhs 1f + /* + * Not a kernel function, treat it as a stub hypercall. + * Compute the physical address for __kvm_handle_stub_hvc + * (as the code lives in the idmaped page) and branch there. + * We hijack ip (r12) as a tmp register. + */ + push {r1} + ldr r1, =kimage_voffset + ldr r1, [r1] + ldr ip, =__kvm_handle_stub_hvc + sub ip, ip, r1 + pop {r1} + + bx ip + +1: push {lr} mov lr, r0 @@ -142,7 +160,7 @@ THUMB( orr lr, #1) blx lr @ Call the HYP function pop {lr} -1: eret + eret guest_trap: load_vcpu r0 @ Load VCPU pointer to r0 diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S index bf89c919efc1..570ed4a9c261 100644 --- a/arch/arm/kvm/init.S +++ b/arch/arm/kvm/init.S @@ -23,6 +23,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_arm.h> #include <asm/kvm_mmu.h> +#include <asm/virt.h> /******************************************************************** * Hypervisor initialization @@ -39,6 +40,10 @@ * - Setup the page tables * - Enable the MMU * - Profit! (or eret, if you only care about the code). + * + * Another possibility is to get a HYP stub hypercall. + * We discriminate between the two by checking if r0 contains a value + * that is less than HVC_STUB_HCALL_NR. */ .text @@ -58,6 +63,10 @@ __kvm_hyp_init: W(b) . __do_hyp_init: + @ Check for a stub hypercall + cmp r0, #HVC_STUB_HCALL_NR + blo __kvm_handle_stub_hvc + @ Set stack pointer mov sp, r0 @@ -112,20 +121,46 @@ __do_hyp_init: eret - @ r0 : stub vectors address -ENTRY(__kvm_hyp_reset) +ENTRY(__kvm_handle_stub_hvc) + cmp r0, #HVC_SOFT_RESTART + bne 1f + + /* The target is expected in r1 */ + msr ELR_hyp, r1 + mrs r0, cpsr + bic r0, r0, #MODE_MASK + orr r0, r0, #HYP_MODE +THUMB( orr r0, r0, #PSR_T_BIT ) + msr spsr_cxsf, r0 + b reset + +1: cmp r0, #HVC_RESET_VECTORS + bne 1f + +reset: /* We're now in idmap, disable MMU */ mrc p15, 4, r1, c1, c0, 0 @ HSCTLR - ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) - bic r1, r1, r2 + ldr r0, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I) + bic r1, r1, r0 mcr p15, 4, r1, c1, c0, 0 @ HSCTLR - /* Install stub vectors */ - mcr p15, 4, r0, c12, c0, 0 @ HVBAR - isb + /* + * Install stub vectors, using ardb's VA->PA trick. + */ +0: adr r0, 0b @ PA(0) + movw r1, #:lower16:__hyp_stub_vectors - 0b @ VA(stub) - VA(0) + movt r1, #:upper16:__hyp_stub_vectors - 0b + add r1, r1, r0 @ PA(stub) + mcr p15, 4, r1, c12, c0, 0 @ HVBAR + b exit + +1: ldr r0, =HVC_STUB_ERR + eret +exit: + mov r0, #0 eret -ENDPROC(__kvm_hyp_reset) +ENDPROC(__kvm_handle_stub_hvc) .ltorg diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S index b1bd316f14c0..80a1d6cd261c 100644 --- a/arch/arm/kvm/interrupts.S +++ b/arch/arm/kvm/interrupts.S @@ -37,10 +37,6 @@ * in Hyp mode (see init_hyp_mode in arch/arm/kvm/arm.c). Return values are * passed in r0 (strictly 32bit). * - * A function pointer with a value of 0xffffffff has a special meaning, - * and is used to implement __hyp_get_vectors in the same way as in - * arch/arm/kernel/hyp_stub.S. - * * The calling convention follows the standard AAPCS: * r0 - r3: caller save * r12: caller save diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 582a972371cf..313ee646480f 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -1524,7 +1524,8 @@ static int handle_hva_to_gpa(struct kvm *kvm, unsigned long start, unsigned long end, int (*handler)(struct kvm *kvm, - gpa_t gpa, void *data), + gpa_t gpa, u64 size, + void *data), void *data) { struct kvm_memslots *slots; @@ -1536,7 +1537,7 @@ static int handle_hva_to_gpa(struct kvm *kvm, /* we only care about the pages that the guest sees */ kvm_for_each_memslot(memslot, slots) { unsigned long hva_start, hva_end; - gfn_t gfn, gfn_end; + gfn_t gpa; hva_start = max(start, memslot->userspace_addr); hva_end = min(end, memslot->userspace_addr + @@ -1544,25 +1545,16 @@ static int handle_hva_to_gpa(struct kvm *kvm, if (hva_start >= hva_end) continue; - /* - * {gfn(page) | page intersects with [hva_start, hva_end)} = - * {gfn_start, gfn_start+1, ..., gfn_end-1}. - */ - gfn = hva_to_gfn_memslot(hva_start, memslot); - gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); - - for (; gfn < gfn_end; ++gfn) { - gpa_t gpa = gfn << PAGE_SHIFT; - ret |= handler(kvm, gpa, data); - } + gpa = hva_to_gfn_memslot(hva_start, memslot) << PAGE_SHIFT; + ret |= handler(kvm, gpa, (u64)(hva_end - hva_start), data); } return ret; } -static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_unmap_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { - unmap_stage2_range(kvm, gpa, PAGE_SIZE); + unmap_stage2_range(kvm, gpa, size); return 0; } @@ -1589,10 +1581,11 @@ int kvm_unmap_hva_range(struct kvm *kvm, return 0; } -static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_set_spte_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { pte_t *pte = (pte_t *)data; + WARN_ON(size != PAGE_SIZE); /* * We can always call stage2_set_pte with KVM_S2PTE_FLAG_LOGGING_ACTIVE * flag clear because MMU notifiers will have unmapped a huge PMD before @@ -1618,11 +1611,12 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte) handle_hva_to_gpa(kvm, hva, end, &kvm_set_spte_handler, &stage2_pte); } -static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { pmd_t *pmd; pte_t *pte; + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); pmd = stage2_get_pmd(kvm, NULL, gpa); if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; @@ -1637,11 +1631,12 @@ static int kvm_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) return stage2_ptep_test_and_clear_young(pte); } -static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, void *data) +static int kvm_test_age_hva_handler(struct kvm *kvm, gpa_t gpa, u64 size, void *data) { pmd_t *pmd; pte_t *pte; + WARN_ON(size != PAGE_SIZE && size != PMD_SIZE); pmd = stage2_get_pmd(kvm, NULL, gpa); if (!pmd || pmd_none(*pmd)) /* Nothing there */ return 0; @@ -1686,11 +1681,6 @@ phys_addr_t kvm_get_idmap_vector(void) return hyp_idmap_vector; } -phys_addr_t kvm_get_idmap_start(void) -{ - return hyp_idmap_start; -} - static int kvm_map_idmap_text(pgd_t *pgd) { int err; diff --git a/arch/arm/kvm/psci.c b/arch/arm/kvm/psci.c index c2b131527a64..a08d7a93aebb 100644 --- a/arch/arm/kvm/psci.c +++ b/arch/arm/kvm/psci.c @@ -208,9 +208,10 @@ int kvm_psci_version(struct kvm_vcpu *vcpu) static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { - int ret = 1; + struct kvm *kvm = vcpu->kvm; unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); unsigned long val; + int ret = 1; switch (psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: @@ -230,7 +231,9 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) break; case PSCI_0_2_FN_CPU_ON: case PSCI_0_2_FN64_CPU_ON: + mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); break; case PSCI_0_2_FN_AFFINITY_INFO: case PSCI_0_2_FN64_AFFINITY_INFO: @@ -279,6 +282,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); unsigned long val; @@ -288,7 +292,9 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: + mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); break; default: val = PSCI_RET_NOT_SUPPORTED; diff --git a/arch/arm/mach-shmobile/setup-r7s72100.c b/arch/arm/mach-shmobile/setup-r7s72100.c index d46639fc6849..319ca9508ec6 100644 --- a/arch/arm/mach-shmobile/setup-r7s72100.c +++ b/arch/arm/mach-shmobile/setup-r7s72100.c @@ -26,6 +26,8 @@ static const char *const r7s72100_boards_compat_dt[] __initconst = { }; DT_MACHINE_START(R7S72100_DT, "Generic R7S72100 (Flattened Device Tree)") + .l2c_aux_val = 0, + .l2c_aux_mask = ~0, .init_early = shmobile_init_delay, .init_late = shmobile_init_late, .dt_compat = r7s72100_boards_compat_dt, diff --git a/arch/arm/mm/cache-l2x0.c b/arch/arm/mm/cache-l2x0.c index 2290be390f87..808efbb89b88 100644 --- a/arch/arm/mm/cache-l2x0.c +++ b/arch/arm/mm/cache-l2x0.c @@ -57,6 +57,9 @@ static unsigned long sync_reg_offset = L2X0_CACHE_SYNC; struct l2x0_regs l2x0_saved_regs; +static bool l2x0_bresp_disable; +static bool l2x0_flz_disable; + /* * Common code for all cache controllers. */ @@ -620,7 +623,7 @@ static void __init l2c310_enable(void __iomem *base, unsigned num_lock) u32 aux = l2x0_saved_regs.aux_ctrl; if (rev >= L310_CACHE_ID_RTL_R2P0) { - if (cortex_a9) { + if (cortex_a9 && !l2x0_bresp_disable) { aux |= L310_AUX_CTRL_EARLY_BRESP; pr_info("L2C-310 enabling early BRESP for Cortex-A9\n"); } else if (aux & L310_AUX_CTRL_EARLY_BRESP) { @@ -629,7 +632,7 @@ static void __init l2c310_enable(void __iomem *base, unsigned num_lock) } } - if (cortex_a9) { + if (cortex_a9 && !l2x0_flz_disable) { u32 aux_cur = readl_relaxed(base + L2X0_AUX_CTRL); u32 acr = get_auxcr(); @@ -1200,6 +1203,12 @@ static void __init l2c310_of_parse(const struct device_node *np, *aux_mask &= ~L2C_AUX_CTRL_PARITY_ENABLE; } + if (of_property_read_bool(np, "arm,early-bresp-disable")) + l2x0_bresp_disable = true; + + if (of_property_read_bool(np, "arm,full-line-zero-disable")) + l2x0_flz_disable = true; + prefetch = l2x0_saved_regs.prefetch_ctrl; ret = of_property_read_u32(np, "arm,double-linefill", &val); diff --git a/arch/arm/mm/dump.c b/arch/arm/mm/dump.c index 21192d6eda40..35ff45470dbf 100644 --- a/arch/arm/mm/dump.c +++ b/arch/arm/mm/dump.c @@ -17,6 +17,7 @@ #include <linux/mm.h> #include <linux/seq_file.h> +#include <asm/domain.h> #include <asm/fixmap.h> #include <asm/memory.h> #include <asm/pgtable.h> @@ -43,6 +44,7 @@ struct pg_state { unsigned long start_address; unsigned level; u64 current_prot; + const char *current_domain; }; struct prot_bits { @@ -216,7 +218,8 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, size_t } } -static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u64 val) +static void note_page(struct pg_state *st, unsigned long addr, + unsigned int level, u64 val, const char *domain) { static const char units[] = "KMGTPE"; u64 prot = val & pg_level[level].mask; @@ -224,8 +227,10 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u if (!st->level) { st->level = level; st->current_prot = prot; + st->current_domain = domain; seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } else if (prot != st->current_prot || level != st->level || + domain != st->current_domain || addr >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; @@ -240,6 +245,8 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u unit++; } seq_printf(st->seq, "%9lu%c", delta, *unit); + if (st->current_domain) + seq_printf(st->seq, " %s", st->current_domain); if (pg_level[st->level].bits) dump_prot(st, pg_level[st->level].bits, pg_level[st->level].num); seq_printf(st->seq, "\n"); @@ -251,11 +258,13 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u } st->start_address = addr; st->current_prot = prot; + st->current_domain = domain; st->level = level; } } -static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) +static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start, + const char *domain) { pte_t *pte = pte_offset_kernel(pmd, 0); unsigned long addr; @@ -263,25 +272,50 @@ static void walk_pte(struct pg_state *st, pmd_t *pmd, unsigned long start) for (i = 0; i < PTRS_PER_PTE; i++, pte++) { addr = start + i * PAGE_SIZE; - note_page(st, addr, 4, pte_val(*pte)); + note_page(st, addr, 4, pte_val(*pte), domain); } } +static const char *get_domain_name(pmd_t *pmd) +{ +#ifndef CONFIG_ARM_LPAE + switch (pmd_val(*pmd) & PMD_DOMAIN_MASK) { + case PMD_DOMAIN(DOMAIN_KERNEL): + return "KERNEL "; + case PMD_DOMAIN(DOMAIN_USER): + return "USER "; + case PMD_DOMAIN(DOMAIN_IO): + return "IO "; + case PMD_DOMAIN(DOMAIN_VECTORS): + return "VECTORS"; + default: + return "unknown"; + } +#endif + return NULL; +} + static void walk_pmd(struct pg_state *st, pud_t *pud, unsigned long start) { pmd_t *pmd = pmd_offset(pud, 0); unsigned long addr; unsigned i; + const char *domain; for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { addr = start + i * PMD_SIZE; + domain = get_domain_name(pmd); if (pmd_none(*pmd) || pmd_large(*pmd) || !pmd_present(*pmd)) - note_page(st, addr, 3, pmd_val(*pmd)); + note_page(st, addr, 3, pmd_val(*pmd), domain); else - walk_pte(st, pmd, addr); + walk_pte(st, pmd, addr, domain); - if (SECTION_SIZE < PMD_SIZE && pmd_large(pmd[1])) - note_page(st, addr + SECTION_SIZE, 3, pmd_val(pmd[1])); + if (SECTION_SIZE < PMD_SIZE && pmd_large(pmd[1])) { + addr += SECTION_SIZE; + pmd++; + domain = get_domain_name(pmd); + note_page(st, addr, 3, pmd_val(*pmd), domain); + } } } @@ -296,7 +330,7 @@ static void walk_pud(struct pg_state *st, pgd_t *pgd, unsigned long start) if (!pud_none(*pud)) { walk_pmd(st, pud, addr); } else { - note_page(st, addr, 2, pud_val(*pud)); + note_page(st, addr, 2, pud_val(*pud), NULL); } } } @@ -317,11 +351,11 @@ static void walk_pgd(struct seq_file *m) if (!pgd_none(*pgd)) { walk_pud(&st, pgd, addr); } else { - note_page(&st, addr, 1, pgd_val(*pgd)); + note_page(&st, addr, 1, pgd_val(*pgd), NULL); } } - note_page(&st, 0, 0, 0); + note_page(&st, 0, 0, 0, NULL); } static int ptdump_show(struct seq_file *m, void *v) diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index 1d8558ff9827..ad80548325fe 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -709,34 +709,37 @@ void set_section_perms(struct section_perm *perms, int n, bool set, } +/** + * update_sections_early intended to be called only through stop_machine + * framework and executed by only one CPU while all other CPUs will spin and + * wait, so no locking is required in this function. + */ static void update_sections_early(struct section_perm perms[], int n) { struct task_struct *t, *s; - read_lock(&tasklist_lock); for_each_process(t) { if (t->flags & PF_KTHREAD) continue; for_each_thread(t, s) set_section_perms(perms, n, true, s->mm); } - read_unlock(&tasklist_lock); set_section_perms(perms, n, true, current->active_mm); set_section_perms(perms, n, true, &init_mm); } -int __fix_kernmem_perms(void *unused) +static int __fix_kernmem_perms(void *unused) { update_sections_early(nx_perms, ARRAY_SIZE(nx_perms)); return 0; } -void fix_kernmem_perms(void) +static void fix_kernmem_perms(void) { stop_machine(__fix_kernmem_perms, NULL, NULL); } -int __mark_rodata_ro(void *unused) +static int __mark_rodata_ro(void *unused) { update_sections_early(ro_perms, ARRAY_SIZE(ro_perms)); return 0; diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 4e016d7f37b3..31af3cb59a60 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -87,6 +87,8 @@ struct cachepolicy { #define s2_policy(policy) 0 #endif +unsigned long kimage_voffset __ro_after_init; + static struct cachepolicy cache_policies[] __initdata = { { .policy = "uncached", @@ -414,6 +416,11 @@ void __set_fixmap(enum fixed_addresses idx, phys_addr_t phys, pgprot_t prot) FIXADDR_END); BUG_ON(idx >= __end_of_fixed_addresses); + /* we only support device mappings until pgprot_kernel has been set */ + if (WARN_ON(pgprot_val(prot) != pgprot_val(FIXMAP_PAGE_IO) && + pgprot_val(pgprot_kernel) == 0)) + return; + if (pgprot_val(prot)) set_pte_at(NULL, vaddr, pte, pfn_pte(phys >> PAGE_SHIFT, prot)); @@ -1492,7 +1499,7 @@ pgtables_remap lpae_pgtables_remap_asm; * early_paging_init() recreates boot time page table setup, allowing machines * to switch over to a high (>4G) address space on LPAE systems */ -void __init early_paging_init(const struct machine_desc *mdesc) +static void __init early_paging_init(const struct machine_desc *mdesc) { pgtables_remap *lpae_pgtables_remap; unsigned long pa_pgd; @@ -1560,7 +1567,7 @@ void __init early_paging_init(const struct machine_desc *mdesc) #else -void __init early_paging_init(const struct machine_desc *mdesc) +static void __init early_paging_init(const struct machine_desc *mdesc) { long long offset; @@ -1616,7 +1623,6 @@ void __init paging_init(const struct machine_desc *mdesc) { void *zero_page; - build_mem_type_table(); prepare_page_table(); map_lowmem(); memblock_set_current_limit(arm_lowmem_limit); @@ -1635,4 +1641,13 @@ void __init paging_init(const struct machine_desc *mdesc) empty_zero_page = virt_to_page(zero_page); __flush_dcache_page(NULL, empty_zero_page); + + /* Compute the virt/idmap offset, mostly for the sake of KVM */ + kimage_voffset = (unsigned long)&kimage_voffset - virt_to_idmap(&kimage_voffset); +} + +void __init early_mm_init(const struct machine_desc *mdesc) +{ + build_mem_type_table(); + early_paging_init(mdesc); } diff --git a/arch/arm/mm/proc-v7.S b/arch/arm/mm/proc-v7.S index d00d52c9de3e..01d64c0b2563 100644 --- a/arch/arm/mm/proc-v7.S +++ b/arch/arm/mm/proc-v7.S @@ -39,13 +39,14 @@ ENTRY(cpu_v7_proc_fin) ENDPROC(cpu_v7_proc_fin) /* - * cpu_v7_reset(loc) + * cpu_v7_reset(loc, hyp) * * Perform a soft reset of the system. Put the CPU into the * same state as it would be if it had been reset, and branch * to what would be the reset vector. * * - loc - location to jump to for soft reset + * - hyp - indicate if restart occurs in HYP mode * * This code must be executed using a flat identity mapping with * caches disabled. @@ -53,11 +54,15 @@ ENDPROC(cpu_v7_proc_fin) .align 5 .pushsection .idmap.text, "ax" ENTRY(cpu_v7_reset) - mrc p15, 0, r1, c1, c0, 0 @ ctrl register - bic r1, r1, #0x1 @ ...............m - THUMB( bic r1, r1, #1 << 30 ) @ SCTLR.TE (Thumb exceptions) - mcr p15, 0, r1, c1, c0, 0 @ disable MMU + mrc p15, 0, r2, c1, c0, 0 @ ctrl register + bic r2, r2, #0x1 @ ...............m + THUMB( bic r2, r2, #1 << 30 ) @ SCTLR.TE (Thumb exceptions) + mcr p15, 0, r2, c1, c0, 0 @ disable MMU isb +#ifdef CONFIG_ARM_VIRT_EXT + teq r1, #0 + bne __hyp_soft_restart +#endif bx r0 ENDPROC(cpu_v7_reset) .popsection diff --git a/arch/arm/mm/proc-v7m.S b/arch/arm/mm/proc-v7m.S index 8dea61640cc1..47a5acc64433 100644 --- a/arch/arm/mm/proc-v7m.S +++ b/arch/arm/mm/proc-v7m.S @@ -135,9 +135,11 @@ __v7m_setup_cont: dsb mov r6, lr @ save LR ldr sp, =init_thread_union + THREAD_START_SP + stmia sp, {r0-r3, r12} cpsie i svc #0 1: cpsid i + ldmia sp, {r0-r3, r12} str r5, [r12, #11 * 4] @ restore the original SVC vector entry mov lr, r6 @ restore LR @@ -147,10 +149,10 @@ __v7m_setup_cont: @ Configure caches (if implemented) teq r8, #0 - stmneia r12, {r0-r6, lr} @ v7m_invalidate_l1 touches r0-r6 + stmneia sp, {r0-r6, lr} @ v7m_invalidate_l1 touches r0-r6 blne v7m_invalidate_l1 teq r8, #0 @ re-evalutae condition - ldmneia r12, {r0-r6, lr} + ldmneia sp, {r0-r6, lr} @ Configure the System Control Register to ensure 8-byte stack alignment @ Note the STKALIGN bit is either RW or RAO. diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index ec3553eb9349..26a64d0f9ab9 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -28,7 +28,7 @@ #define ARM_EXCEPTION_EL1_SERROR 1 #define ARM_EXCEPTION_TRAP 2 /* The hyp-stub will return this for any kvm_call_hyp() call */ -#define ARM_EXCEPTION_HYP_GONE 3 +#define ARM_EXCEPTION_HYP_GONE HVC_STUB_ERR #define KVM_ARM64_DEBUG_DIRTY_SHIFT 0 #define KVM_ARM64_DEBUG_DIRTY (1 << KVM_ARM64_DEBUG_DIRTY_SHIFT) @@ -47,7 +47,6 @@ struct kvm_vcpu; extern char __kvm_hyp_init[]; extern char __kvm_hyp_init_end[]; -extern char __kvm_hyp_reset[]; extern char __kvm_hyp_vector[]; @@ -59,6 +58,8 @@ extern void __kvm_tlb_flush_local_vmid(struct kvm_vcpu *vcpu); extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu); extern u64 __vgic_v3_get_ich_vtr_el2(void); +extern u64 __vgic_v3_read_vmcr(void); +extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); extern u32 __kvm_get_mdcr_el2(void); diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e7705e7bb07b..5e19165c5fa8 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -31,7 +31,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED #define KVM_USER_MEM_SLOTS 512 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 #include <kvm/arm_vgic.h> @@ -42,7 +41,7 @@ #define KVM_VCPU_MAX_FEATURES 4 -#define KVM_REQ_VCPU_EXIT 8 +#define KVM_REQ_VCPU_EXIT (8 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) int __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); @@ -362,13 +361,6 @@ static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr, __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr); } -void __kvm_hyp_teardown(void); -static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr, - phys_addr_t phys_idmap_start) -{ - kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start); -} - static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) {} diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index 2bc6ffa7b89b..a89cc22abadc 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -155,7 +155,6 @@ void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu); phys_addr_t kvm_mmu_get_httbr(void); phys_addr_t kvm_get_idmap_vector(void); -phys_addr_t kvm_get_idmap_start(void); int kvm_mmu_init(void); void kvm_clear_hyp_idmap(void); diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h index 439f6b5d31f6..c5f89442785c 100644 --- a/arch/arm64/include/asm/virt.h +++ b/arch/arm64/include/asm/virt.h @@ -19,25 +19,38 @@ #define __ASM__VIRT_H /* - * The arm64 hcall implementation uses x0 to specify the hcall type. A value - * less than 0xfff indicates a special hcall, such as get/set vector. - * Any other value is used as a pointer to the function to call. + * The arm64 hcall implementation uses x0 to specify the hcall + * number. A value less than HVC_STUB_HCALL_NR indicates a special + * hcall, such as set vector. Any other value is handled in a + * hypervisor specific way. + * + * The hypercall is allowed to clobber any of the caller-saved + * registers (x0-x18), so it is advisable to use it through the + * indirection of a function call (as implemented in hyp-stub.S). */ -/* HVC_GET_VECTORS - Return the value of the vbar_el2 register. */ -#define HVC_GET_VECTORS 0 - /* * HVC_SET_VECTORS - Set the value of the vbar_el2 register. * * @x1: Physical address of the new vector table. */ -#define HVC_SET_VECTORS 1 +#define HVC_SET_VECTORS 0 /* * HVC_SOFT_RESTART - CPU soft reset, used by the cpu_soft_restart routine. */ -#define HVC_SOFT_RESTART 2 +#define HVC_SOFT_RESTART 1 + +/* + * HVC_RESET_VECTORS - Restore the vectors to the original HYP stubs + */ +#define HVC_RESET_VECTORS 2 + +/* Max number of HYP stub hypercalls */ +#define HVC_STUB_HCALL_NR 3 + +/* Error returned when an invalid stub number is passed into x0 */ +#define HVC_STUB_ERR 0xbadca11 #define BOOT_CPU_MODE_EL1 (0xe11) #define BOOT_CPU_MODE_EL2 (0xe12) @@ -61,7 +74,7 @@ extern u32 __boot_cpu_mode[2]; void __hyp_set_vectors(phys_addr_t phys_vector_base); -phys_addr_t __hyp_get_vectors(void); +void __hyp_reset_vectors(void); /* Reports the availability of HYP mode */ static inline bool is_hyp_mode_available(void) diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h index c2860358ae3e..869ee480deed 100644 --- a/arch/arm64/include/uapi/asm/kvm.h +++ b/arch/arm64/include/uapi/asm/kvm.h @@ -39,6 +39,8 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + #define KVM_REG_SIZE(id) \ (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) @@ -143,6 +145,8 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW (1 << 17) struct kvm_sync_regs { + /* Used with KVM_CAP_ARM_USER_IRQ */ + __u64 device_irq_level; }; struct kvm_arch_memory_slot { diff --git a/arch/arm64/kernel/hyp-stub.S b/arch/arm64/kernel/hyp-stub.S index d3b5f75e652e..e1261fbaa374 100644 --- a/arch/arm64/kernel/hyp-stub.S +++ b/arch/arm64/kernel/hyp-stub.S @@ -55,18 +55,7 @@ ENDPROC(__hyp_stub_vectors) .align 11 el1_sync: - mrs x30, esr_el2 - lsr x30, x30, #ESR_ELx_EC_SHIFT - - cmp x30, #ESR_ELx_EC_HVC64 - b.ne 9f // Not an HVC trap - - cmp x0, #HVC_GET_VECTORS - b.ne 1f - mrs x0, vbar_el2 - b 9f - -1: cmp x0, #HVC_SET_VECTORS + cmp x0, #HVC_SET_VECTORS b.ne 2f msr vbar_el2, x1 b 9f @@ -79,10 +68,15 @@ el1_sync: mov x1, x3 br x4 // no return +3: cmp x0, #HVC_RESET_VECTORS + beq 9f // Nothing to reset! + /* Someone called kvm_call_hyp() against the hyp-stub... */ -3: mov x0, #ARM_EXCEPTION_HYP_GONE + ldr x0, =HVC_STUB_ERR + eret -9: eret +9: mov x0, xzr + eret ENDPROC(el1_sync) .macro invalid_vector label @@ -121,19 +115,15 @@ ENDPROC(\label) * initialisation entry point. */ -ENTRY(__hyp_get_vectors) - str lr, [sp, #-16]! - mov x0, #HVC_GET_VECTORS - hvc #0 - ldr lr, [sp], #16 - ret -ENDPROC(__hyp_get_vectors) - ENTRY(__hyp_set_vectors) - str lr, [sp, #-16]! mov x1, x0 mov x0, #HVC_SET_VECTORS hvc #0 - ldr lr, [sp], #16 ret ENDPROC(__hyp_set_vectors) + +ENTRY(__hyp_reset_vectors) + mov x0, #HVC_RESET_VECTORS + hvc #0 + ret +ENDPROC(__hyp_reset_vectors) diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S index 6b29d3d9e1f2..839425c24b1c 100644 --- a/arch/arm64/kvm/hyp-init.S +++ b/arch/arm64/kvm/hyp-init.S @@ -22,6 +22,7 @@ #include <asm/kvm_mmu.h> #include <asm/pgtable-hwdef.h> #include <asm/sysreg.h> +#include <asm/virt.h> .text .pushsection .hyp.idmap.text, "ax" @@ -58,6 +59,9 @@ __invalid: * x2: HYP vectors */ __do_hyp_init: + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.lo __kvm_handle_stub_hvc msr ttbr0_el2, x0 @@ -119,23 +123,45 @@ __do_hyp_init: eret ENDPROC(__kvm_hyp_init) +ENTRY(__kvm_handle_stub_hvc) + cmp x0, #HVC_SOFT_RESTART + b.ne 1f + + /* This is where we're about to jump, staying at EL2 */ + msr elr_el2, x1 + mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT | PSR_MODE_EL2h) + msr spsr_el2, x0 + + /* Shuffle the arguments, and don't come back */ + mov x0, x2 + mov x1, x3 + mov x2, x4 + b reset + +1: cmp x0, #HVC_RESET_VECTORS + b.ne 1f +reset: /* - * Reset kvm back to the hyp stub. + * Reset kvm back to the hyp stub. Do not clobber x0-x4 in + * case we coming via HVC_SOFT_RESTART. */ -ENTRY(__kvm_hyp_reset) - /* We're now in idmap, disable MMU */ - mrs x0, sctlr_el2 - ldr x1, =SCTLR_ELx_FLAGS - bic x0, x0, x1 // Clear SCTL_M and etc - msr sctlr_el2, x0 + mrs x5, sctlr_el2 + ldr x6, =SCTLR_ELx_FLAGS + bic x5, x5, x6 // Clear SCTL_M and etc + msr sctlr_el2, x5 isb /* Install stub vectors */ - adr_l x0, __hyp_stub_vectors - msr vbar_el2, x0 + adr_l x5, __hyp_stub_vectors + msr vbar_el2, x5 + mov x0, xzr + eret +1: /* Bad stub call */ + ldr x0, =HVC_STUB_ERR eret -ENDPROC(__kvm_hyp_reset) + +ENDPROC(__kvm_handle_stub_hvc) .ltorg diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S index 2726635dceba..952f6cb9cf72 100644 --- a/arch/arm64/kvm/hyp.S +++ b/arch/arm64/kvm/hyp.S @@ -36,15 +36,12 @@ * passed in x0. * * A function pointer with a value less than 0xfff has a special meaning, - * and is used to implement __hyp_get_vectors in the same way as in + * and is used to implement hyp stubs in the same way as in * arch/arm64/kernel/hyp_stub.S. - * HVC behaves as a 'bl' call and will clobber lr. */ ENTRY(__kvm_call_hyp) alternative_if_not ARM64_HAS_VIRT_HOST_EXTN - str lr, [sp, #-16]! hvc #0 - ldr lr, [sp], #16 ret alternative_else_nop_endif b __vhe_hyp_call diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 5e9052f087f2..5170ce1021da 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -32,17 +32,17 @@ * Shuffle the parameters before calling the function * pointed to in x0. Assumes parameters in x[1,2,3]. */ + str lr, [sp, #-16]! mov lr, x0 mov x0, x1 mov x1, x2 mov x2, x3 blr lr + ldr lr, [sp], #16 .endm ENTRY(__vhe_hyp_call) - str lr, [sp, #-16]! do_el2_call - ldr lr, [sp], #16 /* * We used to rely on having an exception return to get * an implicit isb. In the E2H case, we don't have it anymore. @@ -53,21 +53,6 @@ ENTRY(__vhe_hyp_call) ret ENDPROC(__vhe_hyp_call) -/* - * Compute the idmap address of __kvm_hyp_reset based on the idmap - * start passed as a parameter, and jump there. - * - * x0: HYP phys_idmap_start - */ -ENTRY(__kvm_hyp_teardown) - mov x4, x0 - adr_l x3, __kvm_hyp_reset - - /* insert __kvm_hyp_reset()s offset into phys_idmap_start */ - bfi x4, x3, #0, #PAGE_SHIFT - br x4 -ENDPROC(__kvm_hyp_teardown) - el1_sync: // Guest trapped into EL2 stp x0, x1, [sp, #-16]! @@ -87,10 +72,24 @@ alternative_endif /* Here, we're pretty sure the host called HVC. */ ldp x0, x1, [sp], #16 - cmp x0, #HVC_GET_VECTORS - b.ne 1f - mrs x0, vbar_el2 - b 2f + /* Check for a stub HVC call */ + cmp x0, #HVC_STUB_HCALL_NR + b.hs 1f + + /* + * Compute the idmap address of __kvm_handle_stub_hvc and + * jump there. Since we use kimage_voffset, do not use the + * HYP VA for __kvm_handle_stub_hvc, but the kernel VA instead + * (by loading it from the constant pool). + * + * Preserve x0-x4, which may contain stub parameters. + */ + ldr x5, =__kvm_handle_stub_hvc + ldr_l x6, kimage_voffset + + /* x5 = __pa(x5) */ + sub x5, x5, x6 + br x5 1: /* @@ -99,7 +98,7 @@ alternative_endif kern_hyp_va x0 do_el2_call -2: eret + eret el1_trap: /* diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index 26b0e77878b5..efbe9e8e7a78 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -55,6 +55,15 @@ * 64bit interface. */ +static bool read_from_write_only(struct kvm_vcpu *vcpu, + const struct sys_reg_params *params) +{ + WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); + print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; @@ -460,35 +469,35 @@ static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) vcpu_sys_reg(vcpu, PMCR_EL0) = val; } -static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) +static bool check_pmu_access_disabled(struct kvm_vcpu *vcpu, u64 flags) { u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); + bool enabled = (reg & flags) || vcpu_mode_priv(vcpu); - return !((reg & ARMV8_PMU_USERENR_EN) || vcpu_mode_priv(vcpu)); + if (!enabled) + kvm_inject_undefined(vcpu); + + return !enabled; } -static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) +static bool pmu_access_el0_disabled(struct kvm_vcpu *vcpu) { - u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_EN); +} - return !((reg & (ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN)) - || vcpu_mode_priv(vcpu)); +static bool pmu_write_swinc_el0_disabled(struct kvm_vcpu *vcpu) +{ + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_SW | ARMV8_PMU_USERENR_EN); } static bool pmu_access_cycle_counter_el0_disabled(struct kvm_vcpu *vcpu) { - u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); - - return !((reg & (ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN)) - || vcpu_mode_priv(vcpu)); + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_CR | ARMV8_PMU_USERENR_EN); } static bool pmu_access_event_counter_el0_disabled(struct kvm_vcpu *vcpu) { - u64 reg = vcpu_sys_reg(vcpu, PMUSERENR_EL0); - - return !((reg & (ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN)) - || vcpu_mode_priv(vcpu)); + return check_pmu_access_disabled(vcpu, ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_EN); } static bool access_pmcr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -567,8 +576,10 @@ static bool pmu_counter_idx_valid(struct kvm_vcpu *vcpu, u64 idx) pmcr = vcpu_sys_reg(vcpu, PMCR_EL0); val = (pmcr >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; - if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) + if (idx >= val && idx != ARMV8_PMU_CYCLE_IDX) { + kvm_inject_undefined(vcpu); return false; + } return true; } @@ -707,8 +718,10 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_arm_pmu_v3_ready(vcpu)) return trap_raz_wi(vcpu, p, r); - if (!vcpu_mode_priv(vcpu)) + if (!vcpu_mode_priv(vcpu)) { + kvm_inject_undefined(vcpu); return false; + } if (p->is_write) { u64 val = p->regval & mask; @@ -759,16 +772,15 @@ static bool access_pmswinc(struct kvm_vcpu *vcpu, struct sys_reg_params *p, if (!kvm_arm_pmu_v3_ready(vcpu)) return trap_raz_wi(vcpu, p, r); + if (!p->is_write) + return read_from_write_only(vcpu, p); + if (pmu_write_swinc_el0_disabled(vcpu)) return false; - if (p->is_write) { - mask = kvm_pmu_valid_counter_mask(vcpu); - kvm_pmu_software_increment(vcpu, p->regval & mask); - return true; - } - - return false; + mask = kvm_pmu_valid_counter_mask(vcpu); + kvm_pmu_software_increment(vcpu, p->regval & mask); + return true; } static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -778,8 +790,10 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, return trap_raz_wi(vcpu, p, r); if (p->is_write) { - if (!vcpu_mode_priv(vcpu)) + if (!vcpu_mode_priv(vcpu)) { + kvm_inject_undefined(vcpu); return false; + } vcpu_sys_reg(vcpu, PMUSERENR_EL0) = p->regval & ARMV8_PMU_USERENR_MASK; @@ -793,31 +807,23 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* Silly macro to expand the DBG{BCR,BVR,WVR,WCR}n_EL1 registers in one go */ #define DBG_BCR_BVR_WCR_WVR_EL1(n) \ - /* DBGBVRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b100), \ + { SYS_DESC(SYS_DBGBVRn_EL1(n)), \ trap_bvr, reset_bvr, n, 0, get_bvr, set_bvr }, \ - /* DBGBCRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b101), \ + { SYS_DESC(SYS_DBGBCRn_EL1(n)), \ trap_bcr, reset_bcr, n, 0, get_bcr, set_bcr }, \ - /* DBGWVRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b110), \ + { SYS_DESC(SYS_DBGWVRn_EL1(n)), \ trap_wvr, reset_wvr, n, 0, get_wvr, set_wvr }, \ - /* DBGWCRn_EL1 */ \ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm((n)), Op2(0b111), \ + { SYS_DESC(SYS_DBGWCRn_EL1(n)), \ trap_wcr, reset_wcr, n, 0, get_wcr, set_wcr } /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ - /* PMEVCNTRn_EL0 */ \ - { Op0(0b11), Op1(0b011), CRn(0b1110), \ - CRm((0b1000 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ + { SYS_DESC(SYS_PMEVCNTRn_EL0(n)), \ access_pmu_evcntr, reset_unknown, (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ - /* PMEVTYPERn_EL0 */ \ - { Op0(0b11), Op1(0b011), CRn(0b1110), \ - CRm((0b1100 | (((n) >> 3) & 0x3))), Op2(((n) & 0x7)), \ + { SYS_DESC(SYS_PMEVTYPERn_EL0(n)), \ access_pmu_evtyper, reset_unknown, (PMEVTYPER0_EL0 + n), } static bool access_cntp_tval(struct kvm_vcpu *vcpu, @@ -887,24 +893,14 @@ static bool access_cntp_cval(struct kvm_vcpu *vcpu, * more demanding guest... */ static const struct sys_reg_desc sys_reg_descs[] = { - /* DC ISW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b0110), Op2(0b010), - access_dcsw }, - /* DC CSW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1010), Op2(0b010), - access_dcsw }, - /* DC CISW */ - { Op0(0b01), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b010), - access_dcsw }, + { SYS_DESC(SYS_DC_ISW), access_dcsw }, + { SYS_DESC(SYS_DC_CSW), access_dcsw }, + { SYS_DESC(SYS_DC_CISW), access_dcsw }, DBG_BCR_BVR_WCR_WVR_EL1(0), DBG_BCR_BVR_WCR_WVR_EL1(1), - /* MDCCINT_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000), - trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, - /* MDSCR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010), - trap_debug_regs, reset_val, MDSCR_EL1, 0 }, + { SYS_DESC(SYS_MDCCINT_EL1), trap_debug_regs, reset_val, MDCCINT_EL1, 0 }, + { SYS_DESC(SYS_MDSCR_EL1), trap_debug_regs, reset_val, MDSCR_EL1, 0 }, DBG_BCR_BVR_WCR_WVR_EL1(2), DBG_BCR_BVR_WCR_WVR_EL1(3), DBG_BCR_BVR_WCR_WVR_EL1(4), @@ -920,179 +916,77 @@ static const struct sys_reg_desc sys_reg_descs[] = { DBG_BCR_BVR_WCR_WVR_EL1(14), DBG_BCR_BVR_WCR_WVR_EL1(15), - /* MDRAR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), - trap_raz_wi }, - /* OSLAR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b100), - trap_raz_wi }, - /* OSLSR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0001), Op2(0b100), - trap_oslsr_el1 }, - /* OSDLR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0011), Op2(0b100), - trap_raz_wi }, - /* DBGPRCR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0001), CRm(0b0100), Op2(0b100), - trap_raz_wi }, - /* DBGCLAIMSET_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0111), CRm(0b1000), Op2(0b110), - trap_raz_wi }, - /* DBGCLAIMCLR_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0111), CRm(0b1001), Op2(0b110), - trap_raz_wi }, - /* DBGAUTHSTATUS_EL1 */ - { Op0(0b10), Op1(0b000), CRn(0b0111), CRm(0b1110), Op2(0b110), - trap_dbgauthstatus_el1 }, - - /* MDCCSR_EL1 */ - { Op0(0b10), Op1(0b011), CRn(0b0000), CRm(0b0001), Op2(0b000), - trap_raz_wi }, - /* DBGDTR_EL0 */ - { Op0(0b10), Op1(0b011), CRn(0b0000), CRm(0b0100), Op2(0b000), - trap_raz_wi }, - /* DBGDTR[TR]X_EL0 */ - { Op0(0b10), Op1(0b011), CRn(0b0000), CRm(0b0101), Op2(0b000), - trap_raz_wi }, - - /* DBGVCR32_EL2 */ - { Op0(0b10), Op1(0b100), CRn(0b0000), CRm(0b0111), Op2(0b000), - NULL, reset_val, DBGVCR32_EL2, 0 }, - - /* MPIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b101), - NULL, reset_mpidr, MPIDR_EL1 }, - /* SCTLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b000), - access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, - /* CPACR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b010), - NULL, reset_val, CPACR_EL1, 0 }, - /* TTBR0_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b000), - access_vm_reg, reset_unknown, TTBR0_EL1 }, - /* TTBR1_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b001), - access_vm_reg, reset_unknown, TTBR1_EL1 }, - /* TCR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0010), CRm(0b0000), Op2(0b010), - access_vm_reg, reset_val, TCR_EL1, 0 }, - - /* AFSR0_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b000), - access_vm_reg, reset_unknown, AFSR0_EL1 }, - /* AFSR1_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0001), Op2(0b001), - access_vm_reg, reset_unknown, AFSR1_EL1 }, - /* ESR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0101), CRm(0b0010), Op2(0b000), - access_vm_reg, reset_unknown, ESR_EL1 }, - /* FAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0110), CRm(0b0000), Op2(0b000), - access_vm_reg, reset_unknown, FAR_EL1 }, - /* PAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0111), CRm(0b0100), Op2(0b000), - NULL, reset_unknown, PAR_EL1 }, - - /* PMINTENSET_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b001), - access_pminten, reset_unknown, PMINTENSET_EL1 }, - /* PMINTENCLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1001), CRm(0b1110), Op2(0b010), - access_pminten, NULL, PMINTENSET_EL1 }, - - /* MAIR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0010), Op2(0b000), - access_vm_reg, reset_unknown, MAIR_EL1 }, - /* AMAIR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1010), CRm(0b0011), Op2(0b000), - access_vm_reg, reset_amair_el1, AMAIR_EL1 }, - - /* VBAR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), - NULL, reset_val, VBAR_EL1, 0 }, - - /* ICC_SGI1R_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b101), - access_gic_sgi }, - /* ICC_SRE_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101), - access_gic_sre }, - - /* CONTEXTIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b001), - access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, - /* TPIDR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1101), CRm(0b0000), Op2(0b100), - NULL, reset_unknown, TPIDR_EL1 }, - - /* CNTKCTL_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b1110), CRm(0b0001), Op2(0b000), - NULL, reset_val, CNTKCTL_EL1, 0}, - - /* CSSELR_EL1 */ - { Op0(0b11), Op1(0b010), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, CSSELR_EL1 }, - - /* PMCR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b000), - access_pmcr, reset_pmcr, }, - /* PMCNTENSET_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b001), - access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, - /* PMCNTENCLR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b010), - access_pmcnten, NULL, PMCNTENSET_EL0 }, - /* PMOVSCLR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b011), - access_pmovs, NULL, PMOVSSET_EL0 }, - /* PMSWINC_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b100), - access_pmswinc, reset_unknown, PMSWINC_EL0 }, - /* PMSELR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b101), - access_pmselr, reset_unknown, PMSELR_EL0 }, - /* PMCEID0_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b110), - access_pmceid }, - /* PMCEID1_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1100), Op2(0b111), - access_pmceid }, - /* PMCCNTR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b000), - access_pmu_evcntr, reset_unknown, PMCCNTR_EL0 }, - /* PMXEVTYPER_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b001), - access_pmu_evtyper }, - /* PMXEVCNTR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1101), Op2(0b010), - access_pmu_evcntr }, - /* PMUSERENR_EL0 - * This register resets as unknown in 64bit mode while it resets as zero + { SYS_DESC(SYS_MDRAR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_OSLAR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_OSLSR_EL1), trap_oslsr_el1 }, + { SYS_DESC(SYS_OSDLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGPRCR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGCLAIMSET_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGCLAIMCLR_EL1), trap_raz_wi }, + { SYS_DESC(SYS_DBGAUTHSTATUS_EL1), trap_dbgauthstatus_el1 }, + + { SYS_DESC(SYS_MDCCSR_EL0), trap_raz_wi }, + { SYS_DESC(SYS_DBGDTR_EL0), trap_raz_wi }, + // DBGDTR[TR]X_EL0 share the same encoding + { SYS_DESC(SYS_DBGDTRTX_EL0), trap_raz_wi }, + + { SYS_DESC(SYS_DBGVCR32_EL2), NULL, reset_val, DBGVCR32_EL2, 0 }, + + { SYS_DESC(SYS_MPIDR_EL1), NULL, reset_mpidr, MPIDR_EL1 }, + { SYS_DESC(SYS_SCTLR_EL1), access_vm_reg, reset_val, SCTLR_EL1, 0x00C50078 }, + { SYS_DESC(SYS_CPACR_EL1), NULL, reset_val, CPACR_EL1, 0 }, + { SYS_DESC(SYS_TTBR0_EL1), access_vm_reg, reset_unknown, TTBR0_EL1 }, + { SYS_DESC(SYS_TTBR1_EL1), access_vm_reg, reset_unknown, TTBR1_EL1 }, + { SYS_DESC(SYS_TCR_EL1), access_vm_reg, reset_val, TCR_EL1, 0 }, + + { SYS_DESC(SYS_AFSR0_EL1), access_vm_reg, reset_unknown, AFSR0_EL1 }, + { SYS_DESC(SYS_AFSR1_EL1), access_vm_reg, reset_unknown, AFSR1_EL1 }, + { SYS_DESC(SYS_ESR_EL1), access_vm_reg, reset_unknown, ESR_EL1 }, + { SYS_DESC(SYS_FAR_EL1), access_vm_reg, reset_unknown, FAR_EL1 }, + { SYS_DESC(SYS_PAR_EL1), NULL, reset_unknown, PAR_EL1 }, + + { SYS_DESC(SYS_PMINTENSET_EL1), access_pminten, reset_unknown, PMINTENSET_EL1 }, + { SYS_DESC(SYS_PMINTENCLR_EL1), access_pminten, NULL, PMINTENSET_EL1 }, + + { SYS_DESC(SYS_MAIR_EL1), access_vm_reg, reset_unknown, MAIR_EL1 }, + { SYS_DESC(SYS_AMAIR_EL1), access_vm_reg, reset_amair_el1, AMAIR_EL1 }, + + { SYS_DESC(SYS_VBAR_EL1), NULL, reset_val, VBAR_EL1, 0 }, + + { SYS_DESC(SYS_ICC_SGI1R_EL1), access_gic_sgi }, + { SYS_DESC(SYS_ICC_SRE_EL1), access_gic_sre }, + + { SYS_DESC(SYS_CONTEXTIDR_EL1), access_vm_reg, reset_val, CONTEXTIDR_EL1, 0 }, + { SYS_DESC(SYS_TPIDR_EL1), NULL, reset_unknown, TPIDR_EL1 }, + + { SYS_DESC(SYS_CNTKCTL_EL1), NULL, reset_val, CNTKCTL_EL1, 0}, + + { SYS_DESC(SYS_CSSELR_EL1), NULL, reset_unknown, CSSELR_EL1 }, + + { SYS_DESC(SYS_PMCR_EL0), access_pmcr, reset_pmcr, }, + { SYS_DESC(SYS_PMCNTENSET_EL0), access_pmcnten, reset_unknown, PMCNTENSET_EL0 }, + { SYS_DESC(SYS_PMCNTENCLR_EL0), access_pmcnten, NULL, PMCNTENSET_EL0 }, + { SYS_DESC(SYS_PMOVSCLR_EL0), access_pmovs, NULL, PMOVSSET_EL0 }, + { SYS_DESC(SYS_PMSWINC_EL0), access_pmswinc, reset_unknown, PMSWINC_EL0 }, + { SYS_DESC(SYS_PMSELR_EL0), access_pmselr, reset_unknown, PMSELR_EL0 }, + { SYS_DESC(SYS_PMCEID0_EL0), access_pmceid }, + { SYS_DESC(SYS_PMCEID1_EL0), access_pmceid }, + { SYS_DESC(SYS_PMCCNTR_EL0), access_pmu_evcntr, reset_unknown, PMCCNTR_EL0 }, + { SYS_DESC(SYS_PMXEVTYPER_EL0), access_pmu_evtyper }, + { SYS_DESC(SYS_PMXEVCNTR_EL0), access_pmu_evcntr }, + /* + * PMUSERENR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b000), - access_pmuserenr, reset_val, PMUSERENR_EL0, 0 }, - /* PMOVSSET_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1001), CRm(0b1110), Op2(0b011), - access_pmovs, reset_unknown, PMOVSSET_EL0 }, - - /* TPIDR_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b010), - NULL, reset_unknown, TPIDR_EL0 }, - /* TPIDRRO_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1101), CRm(0b0000), Op2(0b011), - NULL, reset_unknown, TPIDRRO_EL0 }, - - /* CNTP_TVAL_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b000), - access_cntp_tval }, - /* CNTP_CTL_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b001), - access_cntp_ctl }, - /* CNTP_CVAL_EL0 */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b0010), Op2(0b010), - access_cntp_cval }, + { SYS_DESC(SYS_PMUSERENR_EL0), access_pmuserenr, reset_val, PMUSERENR_EL0, 0 }, + { SYS_DESC(SYS_PMOVSSET_EL0), access_pmovs, reset_unknown, PMOVSSET_EL0 }, + + { SYS_DESC(SYS_TPIDR_EL0), NULL, reset_unknown, TPIDR_EL0 }, + { SYS_DESC(SYS_TPIDRRO_EL0), NULL, reset_unknown, TPIDRRO_EL0 }, + + { SYS_DESC(SYS_CNTP_TVAL_EL0), access_cntp_tval }, + { SYS_DESC(SYS_CNTP_CTL_EL0), access_cntp_ctl }, + { SYS_DESC(SYS_CNTP_CVAL_EL0), access_cntp_cval }, /* PMEVCNTRn_EL0 */ PMU_PMEVCNTR_EL0(0), @@ -1158,22 +1052,15 @@ static const struct sys_reg_desc sys_reg_descs[] = { PMU_PMEVTYPER_EL0(28), PMU_PMEVTYPER_EL0(29), PMU_PMEVTYPER_EL0(30), - /* PMCCFILTR_EL0 - * This register resets as unknown in 64bit mode while it resets as zero + /* + * PMCCFILTR_EL0 resets as unknown in 64bit mode while it resets as zero * in 32bit mode. Here we choose to reset it as zero for consistency. */ - { Op0(0b11), Op1(0b011), CRn(0b1110), CRm(0b1111), Op2(0b111), - access_pmu_evtyper, reset_val, PMCCFILTR_EL0, 0 }, - - /* DACR32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0011), CRm(0b0000), Op2(0b000), - NULL, reset_unknown, DACR32_EL2 }, - /* IFSR32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0101), CRm(0b0000), Op2(0b001), - NULL, reset_unknown, IFSR32_EL2 }, - /* FPEXC32_EL2 */ - { Op0(0b11), Op1(0b100), CRn(0b0101), CRm(0b0011), Op2(0b000), - NULL, reset_val, FPEXC32_EL2, 0x70 }, + { SYS_DESC(SYS_PMCCFILTR_EL0), access_pmu_evtyper, reset_val, PMCCFILTR_EL0, 0 }, + + { SYS_DESC(SYS_DACR32_EL2), NULL, reset_unknown, DACR32_EL2 }, + { SYS_DESC(SYS_IFSR32_EL2), NULL, reset_unknown, IFSR32_EL2 }, + { SYS_DESC(SYS_FPEXC32_EL2), NULL, reset_val, FPEXC32_EL2, 0x70 }, }; static bool trap_dbgidr(struct kvm_vcpu *vcpu, @@ -1557,6 +1444,22 @@ int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +static void perform_access(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + /* + * Not having an accessor means that we have configured a trap + * that we don't know how to handle. This certainly qualifies + * as a gross bug that should be fixed right away. + */ + BUG_ON(!r->access); + + /* Skip instruction if instructed so */ + if (likely(r->access(vcpu, params, r))) + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); +} + /* * emulate_cp -- tries to match a sys_reg access in a handling table, and * call the corresponding trap handler. @@ -1580,20 +1483,8 @@ static int emulate_cp(struct kvm_vcpu *vcpu, r = find_reg(params, table, num); if (r) { - /* - * Not having an accessor means that we have - * configured a trap that we don't know how to - * handle. This certainly qualifies as a gross bug - * that should be fixed right away. - */ - BUG_ON(!r->access); - - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - /* Handled */ - return 0; - } + perform_access(vcpu, params, r); + return 0; } /* Not handled */ @@ -1660,20 +1551,25 @@ static int kvm_handle_cp_64(struct kvm_vcpu *vcpu, params.regval |= vcpu_get_reg(vcpu, Rt2) << 32; } - if (!emulate_cp(vcpu, ¶ms, target_specific, nr_specific)) - goto out; - if (!emulate_cp(vcpu, ¶ms, global, nr_global)) - goto out; - - unhandled_cp_access(vcpu, ¶ms); + /* + * Try to emulate the coprocessor access using the target + * specific table first, and using the global table afterwards. + * If either of the tables contains a handler, handle the + * potential register operation in the case of a read and return + * with success. + */ + if (!emulate_cp(vcpu, ¶ms, target_specific, nr_specific) || + !emulate_cp(vcpu, ¶ms, global, nr_global)) { + /* Split up the value between registers for the read side */ + if (!params.is_write) { + vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); + vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); + } -out: - /* Split up the value between registers for the read side */ - if (!params.is_write) { - vcpu_set_reg(vcpu, Rt, lower_32_bits(params.regval)); - vcpu_set_reg(vcpu, Rt2, upper_32_bits(params.regval)); + return 1; } + unhandled_cp_access(vcpu, ¶ms); return 1; } @@ -1763,26 +1659,13 @@ static int emulate_sys_reg(struct kvm_vcpu *vcpu, r = find_reg(params, sys_reg_descs, ARRAY_SIZE(sys_reg_descs)); if (likely(r)) { - /* - * Not having an accessor means that we have - * configured a trap that we don't know how to - * handle. This certainly qualifies as a gross bug - * that should be fixed right away. - */ - BUG_ON(!r->access); - - if (likely(r->access(vcpu, params, r))) { - /* Skip instruction, since it was emulated */ - kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); - return 1; - } - /* If access function fails, it should complain. */ + perform_access(vcpu, params, r); } else { kvm_err("Unsupported guest sys_reg access at: %lx\n", *vcpu_pc(vcpu)); print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); } - kvm_inject_undefined(vcpu); return 1; } @@ -1932,44 +1815,25 @@ FUNCTION_INVARIANT(aidr_el1) /* ->val is filled in by kvm_sys_reg_table_init() */ static struct sys_reg_desc invariant_sys_regs[] = { - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b000), - NULL, get_midr_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0000), Op2(0b110), - NULL, get_revidr_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b000), - NULL, get_id_pfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b001), - NULL, get_id_pfr1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b010), - NULL, get_id_dfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b011), - NULL, get_id_afr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b100), - NULL, get_id_mmfr0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b101), - NULL, get_id_mmfr1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b110), - NULL, get_id_mmfr2_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0001), Op2(0b111), - NULL, get_id_mmfr3_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b000), - NULL, get_id_isar0_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b001), - NULL, get_id_isar1_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b010), - NULL, get_id_isar2_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b011), - NULL, get_id_isar3_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b100), - NULL, get_id_isar4_el1 }, - { Op0(0b11), Op1(0b000), CRn(0b0000), CRm(0b0010), Op2(0b101), - NULL, get_id_isar5_el1 }, - { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b001), - NULL, get_clidr_el1 }, - { Op0(0b11), Op1(0b001), CRn(0b0000), CRm(0b0000), Op2(0b111), - NULL, get_aidr_el1 }, - { Op0(0b11), Op1(0b011), CRn(0b0000), CRm(0b0000), Op2(0b001), - NULL, get_ctr_el0 }, + { SYS_DESC(SYS_MIDR_EL1), NULL, get_midr_el1 }, + { SYS_DESC(SYS_REVIDR_EL1), NULL, get_revidr_el1 }, + { SYS_DESC(SYS_ID_PFR0_EL1), NULL, get_id_pfr0_el1 }, + { SYS_DESC(SYS_ID_PFR1_EL1), NULL, get_id_pfr1_el1 }, + { SYS_DESC(SYS_ID_DFR0_EL1), NULL, get_id_dfr0_el1 }, + { SYS_DESC(SYS_ID_AFR0_EL1), NULL, get_id_afr0_el1 }, + { SYS_DESC(SYS_ID_MMFR0_EL1), NULL, get_id_mmfr0_el1 }, + { SYS_DESC(SYS_ID_MMFR1_EL1), NULL, get_id_mmfr1_el1 }, + { SYS_DESC(SYS_ID_MMFR2_EL1), NULL, get_id_mmfr2_el1 }, + { SYS_DESC(SYS_ID_MMFR3_EL1), NULL, get_id_mmfr3_el1 }, + { SYS_DESC(SYS_ID_ISAR0_EL1), NULL, get_id_isar0_el1 }, + { SYS_DESC(SYS_ID_ISAR1_EL1), NULL, get_id_isar1_el1 }, + { SYS_DESC(SYS_ID_ISAR2_EL1), NULL, get_id_isar2_el1 }, + { SYS_DESC(SYS_ID_ISAR3_EL1), NULL, get_id_isar3_el1 }, + { SYS_DESC(SYS_ID_ISAR4_EL1), NULL, get_id_isar4_el1 }, + { SYS_DESC(SYS_ID_ISAR5_EL1), NULL, get_id_isar5_el1 }, + { SYS_DESC(SYS_CLIDR_EL1), NULL, get_clidr_el1 }, + { SYS_DESC(SYS_AIDR_EL1), NULL, get_aidr_el1 }, + { SYS_DESC(SYS_CTR_EL0), NULL, get_ctr_el0 }, }; static int reg_from_user(u64 *val, const void __user *uaddr, u64 id) diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9c6ffd0f0196..060f5348ef25 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -83,24 +83,6 @@ static inline bool read_zero(struct kvm_vcpu *vcpu, return true; } -static inline bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) -{ - kvm_debug("sys_reg write to read-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; -} - -static inline bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) -{ - kvm_debug("sys_reg read to write-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; -} - /* Reset functions */ static inline void reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) @@ -147,4 +129,9 @@ const struct sys_reg_desc *find_reg_by_id(u64 id, #define CRm(_x) .CRm = _x #define Op2(_x) .Op2 = _x +#define SYS_DESC(reg) \ + Op0(sys_reg_Op0(reg)), Op1(sys_reg_Op1(reg)), \ + CRn(sys_reg_CRn(reg)), CRm(sys_reg_CRm(reg)), \ + Op2(sys_reg_Op2(reg)) + #endif /* __ARM64_KVM_SYS_REGS_LOCAL_H__ */ diff --git a/arch/arm64/kvm/sys_regs_generic_v8.c b/arch/arm64/kvm/sys_regs_generic_v8.c index 46af7186bca6..969ade1d333d 100644 --- a/arch/arm64/kvm/sys_regs_generic_v8.c +++ b/arch/arm64/kvm/sys_regs_generic_v8.c @@ -52,9 +52,7 @@ static void reset_actlr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) * Important: Must be sorted ascending by Op0, Op1, CRn, CRm, Op2 */ static const struct sys_reg_desc genericv8_sys_regs[] = { - /* ACTLR_EL1 */ - { Op0(0b11), Op1(0b000), CRn(0b0001), CRm(0b0000), Op2(0b001), - access_actlr, reset_actlr, ACTLR_EL1 }, + { SYS_DESC(SYS_ACTLR_EL1), access_actlr, reset_actlr, ACTLR_EL1 }, }; static const struct sys_reg_desc genericv8_cp15_regs[] = { diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index d6d545a45118..4e9ebf65d071 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1686,6 +1686,7 @@ config CPU_CAVIUM_OCTEON select USB_EHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select USB_OHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select MIPS_L1_CACHE_SHIFT_7 + select HAVE_KVM help The Cavium Octeon processor is a highly integrated chip containing many ethernet hardware widgets for networking tasks. The processor diff --git a/arch/mips/include/asm/cpu-features.h b/arch/mips/include/asm/cpu-features.h index e961c8a7ea66..494d38274142 100644 --- a/arch/mips/include/asm/cpu-features.h +++ b/arch/mips/include/asm/cpu-features.h @@ -444,6 +444,10 @@ # define cpu_has_msa 0 #endif +#ifndef cpu_has_ufr +# define cpu_has_ufr (cpu_data[0].options & MIPS_CPU_UFR) +#endif + #ifndef cpu_has_fre # define cpu_has_fre (cpu_data[0].options & MIPS_CPU_FRE) #endif @@ -528,6 +532,9 @@ #ifndef cpu_guest_has_htw #define cpu_guest_has_htw (cpu_data[0].guest.options & MIPS_CPU_HTW) #endif +#ifndef cpu_guest_has_mvh +#define cpu_guest_has_mvh (cpu_data[0].guest.options & MIPS_CPU_MVH) +#endif #ifndef cpu_guest_has_msa #define cpu_guest_has_msa (cpu_data[0].guest.ases & MIPS_ASE_MSA) #endif @@ -543,6 +550,9 @@ #ifndef cpu_guest_has_maar #define cpu_guest_has_maar (cpu_data[0].guest.options & MIPS_CPU_MAAR) #endif +#ifndef cpu_guest_has_userlocal +#define cpu_guest_has_userlocal (cpu_data[0].guest.options & MIPS_CPU_ULRI) +#endif /* * Guest dynamic capabilities diff --git a/arch/mips/include/asm/cpu-info.h b/arch/mips/include/asm/cpu-info.h index edbe2734a1bf..be3b4c25f335 100644 --- a/arch/mips/include/asm/cpu-info.h +++ b/arch/mips/include/asm/cpu-info.h @@ -33,6 +33,7 @@ struct guest_info { unsigned long ases_dyn; unsigned long long options; unsigned long long options_dyn; + int tlbsize; u8 conf; u8 kscratch_mask; }; @@ -109,6 +110,7 @@ struct cpuinfo_mips { struct guest_info guest; unsigned int gtoffset_mask; unsigned int guestid_mask; + unsigned int guestid_cache; } __attribute__((aligned(SMP_CACHE_BYTES))); extern struct cpuinfo_mips cpu_data[]; diff --git a/arch/mips/include/asm/cpu.h b/arch/mips/include/asm/cpu.h index 9a8372484edc..98f59307e6a3 100644 --- a/arch/mips/include/asm/cpu.h +++ b/arch/mips/include/asm/cpu.h @@ -415,6 +415,7 @@ enum cpu_type_enum { #define MIPS_CPU_GUESTCTL2 MBIT_ULL(50) /* CPU has VZ GuestCtl2 register */ #define MIPS_CPU_GUESTID MBIT_ULL(51) /* CPU uses VZ ASE GuestID feature */ #define MIPS_CPU_DRG MBIT_ULL(52) /* CPU has VZ Direct Root to Guest (DRG) */ +#define MIPS_CPU_UFR MBIT_ULL(53) /* CPU supports User mode FR switching */ /* * CPU ASE encodings diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h index 05e785fc061d..2998479fd4e8 100644 --- a/arch/mips/include/asm/kvm_host.h +++ b/arch/mips/include/asm/kvm_host.h @@ -10,6 +10,7 @@ #ifndef __MIPS_KVM_HOST_H__ #define __MIPS_KVM_HOST_H__ +#include <linux/cpumask.h> #include <linux/mutex.h> #include <linux/hrtimer.h> #include <linux/interrupt.h> @@ -33,12 +34,23 @@ #define KVM_REG_MIPS_CP0_ENTRYLO0 MIPS_CP0_64(2, 0) #define KVM_REG_MIPS_CP0_ENTRYLO1 MIPS_CP0_64(3, 0) #define KVM_REG_MIPS_CP0_CONTEXT MIPS_CP0_64(4, 0) +#define KVM_REG_MIPS_CP0_CONTEXTCONFIG MIPS_CP0_32(4, 1) #define KVM_REG_MIPS_CP0_USERLOCAL MIPS_CP0_64(4, 2) +#define KVM_REG_MIPS_CP0_XCONTEXTCONFIG MIPS_CP0_64(4, 3) #define KVM_REG_MIPS_CP0_PAGEMASK MIPS_CP0_32(5, 0) #define KVM_REG_MIPS_CP0_PAGEGRAIN MIPS_CP0_32(5, 1) +#define KVM_REG_MIPS_CP0_SEGCTL0 MIPS_CP0_64(5, 2) +#define KVM_REG_MIPS_CP0_SEGCTL1 MIPS_CP0_64(5, 3) +#define KVM_REG_MIPS_CP0_SEGCTL2 MIPS_CP0_64(5, 4) +#define KVM_REG_MIPS_CP0_PWBASE MIPS_CP0_64(5, 5) +#define KVM_REG_MIPS_CP0_PWFIELD MIPS_CP0_64(5, 6) +#define KVM_REG_MIPS_CP0_PWSIZE MIPS_CP0_64(5, 7) #define KVM_REG_MIPS_CP0_WIRED MIPS_CP0_32(6, 0) +#define KVM_REG_MIPS_CP0_PWCTL MIPS_CP0_32(6, 6) #define KVM_REG_MIPS_CP0_HWRENA MIPS_CP0_32(7, 0) #define KVM_REG_MIPS_CP0_BADVADDR MIPS_CP0_64(8, 0) +#define KVM_REG_MIPS_CP0_BADINSTR MIPS_CP0_32(8, 1) +#define KVM_REG_MIPS_CP0_BADINSTRP MIPS_CP0_32(8, 2) #define KVM_REG_MIPS_CP0_COUNT MIPS_CP0_32(9, 0) #define KVM_REG_MIPS_CP0_ENTRYHI MIPS_CP0_64(10, 0) #define KVM_REG_MIPS_CP0_COMPARE MIPS_CP0_32(11, 0) @@ -55,6 +67,7 @@ #define KVM_REG_MIPS_CP0_CONFIG4 MIPS_CP0_32(16, 4) #define KVM_REG_MIPS_CP0_CONFIG5 MIPS_CP0_32(16, 5) #define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7) +#define KVM_REG_MIPS_CP0_MAARI MIPS_CP0_64(17, 2) #define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0) #define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0) #define KVM_REG_MIPS_CP0_KSCRATCH1 MIPS_CP0_64(31, 2) @@ -70,9 +83,13 @@ /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 0 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #define KVM_HALT_POLL_NS_DEFAULT 500000 +#ifdef CONFIG_KVM_MIPS_VZ +extern unsigned long GUESTID_MASK; +extern unsigned long GUESTID_FIRST_VERSION; +extern unsigned long GUESTID_VERSION_MASK; +#endif /* @@ -145,6 +162,16 @@ struct kvm_vcpu_stat { u64 fpe_exits; u64 msa_disabled_exits; u64 flush_dcache_exits; +#ifdef CONFIG_KVM_MIPS_VZ + u64 vz_gpsi_exits; + u64 vz_gsfc_exits; + u64 vz_hc_exits; + u64 vz_grr_exits; + u64 vz_gva_exits; + u64 vz_ghfc_exits; + u64 vz_gpa_exits; + u64 vz_resvd_exits; +#endif u64 halt_successful_poll; u64 halt_attempted_poll; u64 halt_poll_invalid; @@ -157,6 +184,8 @@ struct kvm_arch_memory_slot { struct kvm_arch { /* Guest physical mm */ struct mm_struct gpa_mm; + /* Mask of CPUs needing GPA ASID flush */ + cpumask_t asid_flush_mask; }; #define N_MIPS_COPROC_REGS 32 @@ -214,6 +243,11 @@ struct mips_coproc { #define MIPS_CP0_CONFIG4_SEL 4 #define MIPS_CP0_CONFIG5_SEL 5 +#define MIPS_CP0_GUESTCTL2 10 +#define MIPS_CP0_GUESTCTL2_SEL 5 +#define MIPS_CP0_GTOFFSET 12 +#define MIPS_CP0_GTOFFSET_SEL 7 + /* Resume Flags */ #define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST (1<<1) /* Resume host? */ @@ -229,6 +263,7 @@ enum emulation_result { EMULATE_WAIT, /* WAIT instruction */ EMULATE_PRIV_FAIL, EMULATE_EXCEPT, /* A guest exception has been generated */ + EMULATE_HYPERCALL, /* HYPCALL instruction */ }; #define mips3_paddr_to_tlbpfn(x) \ @@ -276,13 +311,18 @@ struct kvm_mmu_memory_cache { struct kvm_vcpu_arch { void *guest_ebase; int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu); + + /* Host registers preserved across guest mode execution */ unsigned long host_stack; unsigned long host_gp; + unsigned long host_pgd; + unsigned long host_entryhi; /* Host CP0 registers used when handling exits from guest */ unsigned long host_cp0_badvaddr; unsigned long host_cp0_epc; u32 host_cp0_cause; + u32 host_cp0_guestctl0; u32 host_cp0_badinstr; u32 host_cp0_badinstrp; @@ -340,7 +380,23 @@ struct kvm_vcpu_arch { /* Cache some mmu pages needed inside spinlock regions */ struct kvm_mmu_memory_cache mmu_page_cache; +#ifdef CONFIG_KVM_MIPS_VZ + /* vcpu's vzguestid is different on each host cpu in an smp system */ + u32 vzguestid[NR_CPUS]; + + /* wired guest TLB entries */ + struct kvm_mips_tlb *wired_tlb; + unsigned int wired_tlb_limit; + unsigned int wired_tlb_used; + + /* emulated guest MAAR registers */ + unsigned long maar[6]; +#endif + + /* Last CPU the VCPU state was loaded on */ int last_sched_cpu; + /* Last CPU the VCPU actually executed guest code on */ + int last_exec_cpu; /* WAIT executed */ int wait; @@ -349,78 +405,6 @@ struct kvm_vcpu_arch { u8 msa_enabled; }; - -#define kvm_read_c0_guest_index(cop0) (cop0->reg[MIPS_CP0_TLB_INDEX][0]) -#define kvm_write_c0_guest_index(cop0, val) (cop0->reg[MIPS_CP0_TLB_INDEX][0] = val) -#define kvm_read_c0_guest_entrylo0(cop0) (cop0->reg[MIPS_CP0_TLB_LO0][0]) -#define kvm_write_c0_guest_entrylo0(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO0][0] = (val)) -#define kvm_read_c0_guest_entrylo1(cop0) (cop0->reg[MIPS_CP0_TLB_LO1][0]) -#define kvm_write_c0_guest_entrylo1(cop0, val) (cop0->reg[MIPS_CP0_TLB_LO1][0] = (val)) -#define kvm_read_c0_guest_context(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0]) -#define kvm_write_c0_guest_context(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][0] = (val)) -#define kvm_read_c0_guest_userlocal(cop0) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2]) -#define kvm_write_c0_guest_userlocal(cop0, val) (cop0->reg[MIPS_CP0_TLB_CONTEXT][2] = (val)) -#define kvm_read_c0_guest_pagemask(cop0) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0]) -#define kvm_write_c0_guest_pagemask(cop0, val) (cop0->reg[MIPS_CP0_TLB_PG_MASK][0] = (val)) -#define kvm_read_c0_guest_wired(cop0) (cop0->reg[MIPS_CP0_TLB_WIRED][0]) -#define kvm_write_c0_guest_wired(cop0, val) (cop0->reg[MIPS_CP0_TLB_WIRED][0] = (val)) -#define kvm_read_c0_guest_hwrena(cop0) (cop0->reg[MIPS_CP0_HWRENA][0]) -#define kvm_write_c0_guest_hwrena(cop0, val) (cop0->reg[MIPS_CP0_HWRENA][0] = (val)) -#define kvm_read_c0_guest_badvaddr(cop0) (cop0->reg[MIPS_CP0_BAD_VADDR][0]) -#define kvm_write_c0_guest_badvaddr(cop0, val) (cop0->reg[MIPS_CP0_BAD_VADDR][0] = (val)) -#define kvm_read_c0_guest_count(cop0) (cop0->reg[MIPS_CP0_COUNT][0]) -#define kvm_write_c0_guest_count(cop0, val) (cop0->reg[MIPS_CP0_COUNT][0] = (val)) -#define kvm_read_c0_guest_entryhi(cop0) (cop0->reg[MIPS_CP0_TLB_HI][0]) -#define kvm_write_c0_guest_entryhi(cop0, val) (cop0->reg[MIPS_CP0_TLB_HI][0] = (val)) -#define kvm_read_c0_guest_compare(cop0) (cop0->reg[MIPS_CP0_COMPARE][0]) -#define kvm_write_c0_guest_compare(cop0, val) (cop0->reg[MIPS_CP0_COMPARE][0] = (val)) -#define kvm_read_c0_guest_status(cop0) (cop0->reg[MIPS_CP0_STATUS][0]) -#define kvm_write_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] = (val)) -#define kvm_read_c0_guest_intctl(cop0) (cop0->reg[MIPS_CP0_STATUS][1]) -#define kvm_write_c0_guest_intctl(cop0, val) (cop0->reg[MIPS_CP0_STATUS][1] = (val)) -#define kvm_read_c0_guest_cause(cop0) (cop0->reg[MIPS_CP0_CAUSE][0]) -#define kvm_write_c0_guest_cause(cop0, val) (cop0->reg[MIPS_CP0_CAUSE][0] = (val)) -#define kvm_read_c0_guest_epc(cop0) (cop0->reg[MIPS_CP0_EXC_PC][0]) -#define kvm_write_c0_guest_epc(cop0, val) (cop0->reg[MIPS_CP0_EXC_PC][0] = (val)) -#define kvm_read_c0_guest_prid(cop0) (cop0->reg[MIPS_CP0_PRID][0]) -#define kvm_write_c0_guest_prid(cop0, val) (cop0->reg[MIPS_CP0_PRID][0] = (val)) -#define kvm_read_c0_guest_ebase(cop0) (cop0->reg[MIPS_CP0_PRID][1]) -#define kvm_write_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] = (val)) -#define kvm_read_c0_guest_config(cop0) (cop0->reg[MIPS_CP0_CONFIG][0]) -#define kvm_read_c0_guest_config1(cop0) (cop0->reg[MIPS_CP0_CONFIG][1]) -#define kvm_read_c0_guest_config2(cop0) (cop0->reg[MIPS_CP0_CONFIG][2]) -#define kvm_read_c0_guest_config3(cop0) (cop0->reg[MIPS_CP0_CONFIG][3]) -#define kvm_read_c0_guest_config4(cop0) (cop0->reg[MIPS_CP0_CONFIG][4]) -#define kvm_read_c0_guest_config5(cop0) (cop0->reg[MIPS_CP0_CONFIG][5]) -#define kvm_read_c0_guest_config7(cop0) (cop0->reg[MIPS_CP0_CONFIG][7]) -#define kvm_write_c0_guest_config(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][0] = (val)) -#define kvm_write_c0_guest_config1(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][1] = (val)) -#define kvm_write_c0_guest_config2(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][2] = (val)) -#define kvm_write_c0_guest_config3(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][3] = (val)) -#define kvm_write_c0_guest_config4(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][4] = (val)) -#define kvm_write_c0_guest_config5(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][5] = (val)) -#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val)) -#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0]) -#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val)) -#define kvm_read_c0_guest_kscratch1(cop0) (cop0->reg[MIPS_CP0_DESAVE][2]) -#define kvm_read_c0_guest_kscratch2(cop0) (cop0->reg[MIPS_CP0_DESAVE][3]) -#define kvm_read_c0_guest_kscratch3(cop0) (cop0->reg[MIPS_CP0_DESAVE][4]) -#define kvm_read_c0_guest_kscratch4(cop0) (cop0->reg[MIPS_CP0_DESAVE][5]) -#define kvm_read_c0_guest_kscratch5(cop0) (cop0->reg[MIPS_CP0_DESAVE][6]) -#define kvm_read_c0_guest_kscratch6(cop0) (cop0->reg[MIPS_CP0_DESAVE][7]) -#define kvm_write_c0_guest_kscratch1(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][2] = (val)) -#define kvm_write_c0_guest_kscratch2(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][3] = (val)) -#define kvm_write_c0_guest_kscratch3(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][4] = (val)) -#define kvm_write_c0_guest_kscratch4(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][5] = (val)) -#define kvm_write_c0_guest_kscratch5(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][6] = (val)) -#define kvm_write_c0_guest_kscratch6(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][7] = (val)) - -/* - * Some of the guest registers may be modified asynchronously (e.g. from a - * hrtimer callback in hard irq context) and therefore need stronger atomicity - * guarantees than other registers. - */ - static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg, unsigned long val) { @@ -471,26 +455,286 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg, } while (unlikely(!temp)); } -#define kvm_set_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] |= (val)) -#define kvm_clear_c0_guest_status(cop0, val) (cop0->reg[MIPS_CP0_STATUS][0] &= ~(val)) +/* Guest register types, used in accessor build below */ +#define __KVMT32 u32 +#define __KVMTl unsigned long -/* Cause can be modified asynchronously from hardirq hrtimer callback */ -#define kvm_set_c0_guest_cause(cop0, val) \ - _kvm_atomic_set_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val) -#define kvm_clear_c0_guest_cause(cop0, val) \ - _kvm_atomic_clear_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], val) -#define kvm_change_c0_guest_cause(cop0, change, val) \ - _kvm_atomic_change_c0_guest_reg(&cop0->reg[MIPS_CP0_CAUSE][0], \ - change, val) - -#define kvm_set_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] |= (val)) -#define kvm_clear_c0_guest_ebase(cop0, val) (cop0->reg[MIPS_CP0_PRID][1] &= ~(val)) -#define kvm_change_c0_guest_ebase(cop0, change, val) \ +/* + * __BUILD_KVM_$ops_SAVED(): kvm_$op_sw_gc0_$reg() + * These operate on the saved guest C0 state in RAM. + */ + +/* Generate saved context simple accessors */ +#define __BUILD_KVM_RW_SAVED(name, type, _reg, sel) \ +static inline __KVMT##type kvm_read_sw_gc0_##name(struct mips_coproc *cop0) \ +{ \ + return cop0->reg[(_reg)][(sel)]; \ +} \ +static inline void kvm_write_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + cop0->reg[(_reg)][(sel)] = val; \ +} + +/* Generate saved context bitwise modifiers */ +#define __BUILD_KVM_SET_SAVED(name, type, _reg, sel) \ +static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + cop0->reg[(_reg)][(sel)] |= val; \ +} \ +static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + cop0->reg[(_reg)][(sel)] &= ~val; \ +} \ +static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + unsigned long _mask = mask; \ + cop0->reg[(_reg)][(sel)] &= ~_mask; \ + cop0->reg[(_reg)][(sel)] |= val & _mask; \ +} + +/* Generate saved context atomic bitwise modifiers */ +#define __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel) \ +static inline void kvm_set_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + _kvm_atomic_set_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val); \ +} \ +static inline void kvm_clear_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + _kvm_atomic_clear_c0_guest_reg(&cop0->reg[(_reg)][(sel)], val); \ +} \ +static inline void kvm_change_sw_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + _kvm_atomic_change_c0_guest_reg(&cop0->reg[(_reg)][(sel)], mask, \ + val); \ +} + +/* + * __BUILD_KVM_$ops_VZ(): kvm_$op_vz_gc0_$reg() + * These operate on the VZ guest C0 context in hardware. + */ + +/* Generate VZ guest context simple accessors */ +#define __BUILD_KVM_RW_VZ(name, type, _reg, sel) \ +static inline __KVMT##type kvm_read_vz_gc0_##name(struct mips_coproc *cop0) \ +{ \ + return read_gc0_##name(); \ +} \ +static inline void kvm_write_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + write_gc0_##name(val); \ +} + +/* Generate VZ guest context bitwise modifiers */ +#define __BUILD_KVM_SET_VZ(name, type, _reg, sel) \ +static inline void kvm_set_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + set_gc0_##name(val); \ +} \ +static inline void kvm_clear_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + clear_gc0_##name(val); \ +} \ +static inline void kvm_change_vz_gc0_##name(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + change_gc0_##name(mask, val); \ +} + +/* Generate VZ guest context save/restore to/from saved context */ +#define __BUILD_KVM_SAVE_VZ(name, _reg, sel) \ +static inline void kvm_restore_gc0_##name(struct mips_coproc *cop0) \ +{ \ + write_gc0_##name(cop0->reg[(_reg)][(sel)]); \ +} \ +static inline void kvm_save_gc0_##name(struct mips_coproc *cop0) \ +{ \ + cop0->reg[(_reg)][(sel)] = read_gc0_##name(); \ +} + +/* + * __BUILD_KVM_$ops_WRAP(): kvm_$op_$name1() -> kvm_$op_$name2() + * These wrap a set of operations to provide them with a different name. + */ + +/* Generate simple accessor wrapper */ +#define __BUILD_KVM_RW_WRAP(name1, name2, type) \ +static inline __KVMT##type kvm_read_##name1(struct mips_coproc *cop0) \ +{ \ + return kvm_read_##name2(cop0); \ +} \ +static inline void kvm_write_##name1(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + kvm_write_##name2(cop0, val); \ +} + +/* Generate bitwise modifier wrapper */ +#define __BUILD_KVM_SET_WRAP(name1, name2, type) \ +static inline void kvm_set_##name1(struct mips_coproc *cop0, \ + __KVMT##type val) \ { \ - kvm_clear_c0_guest_ebase(cop0, change); \ - kvm_set_c0_guest_ebase(cop0, ((val) & (change))); \ + kvm_set_##name2(cop0, val); \ +} \ +static inline void kvm_clear_##name1(struct mips_coproc *cop0, \ + __KVMT##type val) \ +{ \ + kvm_clear_##name2(cop0, val); \ +} \ +static inline void kvm_change_##name1(struct mips_coproc *cop0, \ + __KVMT##type mask, \ + __KVMT##type val) \ +{ \ + kvm_change_##name2(cop0, mask, val); \ } +/* + * __BUILD_KVM_$ops_SW(): kvm_$op_c0_guest_$reg() -> kvm_$op_sw_gc0_$reg() + * These generate accessors operating on the saved context in RAM, and wrap them + * with the common guest C0 accessors (for use by common emulation code). + */ + +#define __BUILD_KVM_RW_SW(name, type, _reg, sel) \ + __BUILD_KVM_RW_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_RW_WRAP(c0_guest_##name, sw_gc0_##name, type) + +#define __BUILD_KVM_SET_SW(name, type, _reg, sel) \ + __BUILD_KVM_SET_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type) + +#define __BUILD_KVM_ATOMIC_SW(name, type, _reg, sel) \ + __BUILD_KVM_ATOMIC_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_SET_WRAP(c0_guest_##name, sw_gc0_##name, type) + +#ifndef CONFIG_KVM_MIPS_VZ + +/* + * T&E (trap & emulate software based virtualisation) + * We generate the common accessors operating exclusively on the saved context + * in RAM. + */ + +#define __BUILD_KVM_RW_HW __BUILD_KVM_RW_SW +#define __BUILD_KVM_SET_HW __BUILD_KVM_SET_SW +#define __BUILD_KVM_ATOMIC_HW __BUILD_KVM_ATOMIC_SW + +#else + +/* + * VZ (hardware assisted virtualisation) + * These macros use the active guest state in VZ mode (hardware registers), + */ + +/* + * __BUILD_KVM_$ops_HW(): kvm_$op_c0_guest_$reg() -> kvm_$op_vz_gc0_$reg() + * These generate accessors operating on the VZ guest context in hardware, and + * wrap them with the common guest C0 accessors (for use by common emulation + * code). + * + * Accessors operating on the saved context in RAM are also generated to allow + * convenient explicit saving and restoring of the state. + */ + +#define __BUILD_KVM_RW_HW(name, type, _reg, sel) \ + __BUILD_KVM_RW_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_RW_VZ(name, type, _reg, sel) \ + __BUILD_KVM_RW_WRAP(c0_guest_##name, vz_gc0_##name, type) \ + __BUILD_KVM_SAVE_VZ(name, _reg, sel) + +#define __BUILD_KVM_SET_HW(name, type, _reg, sel) \ + __BUILD_KVM_SET_SAVED(name, type, _reg, sel) \ + __BUILD_KVM_SET_VZ(name, type, _reg, sel) \ + __BUILD_KVM_SET_WRAP(c0_guest_##name, vz_gc0_##name, type) + +/* + * We can't do atomic modifications of COP0 state if hardware can modify it. + * Races must be handled explicitly. + */ +#define __BUILD_KVM_ATOMIC_HW __BUILD_KVM_SET_HW + +#endif + +/* + * Define accessors for CP0 registers that are accessible to the guest. These + * are primarily used by common emulation code, which may need to access the + * registers differently depending on the implementation. + * + * fns_hw/sw name type reg num select + */ +__BUILD_KVM_RW_HW(index, 32, MIPS_CP0_TLB_INDEX, 0) +__BUILD_KVM_RW_HW(entrylo0, l, MIPS_CP0_TLB_LO0, 0) +__BUILD_KVM_RW_HW(entrylo1, l, MIPS_CP0_TLB_LO1, 0) +__BUILD_KVM_RW_HW(context, l, MIPS_CP0_TLB_CONTEXT, 0) +__BUILD_KVM_RW_HW(contextconfig, 32, MIPS_CP0_TLB_CONTEXT, 1) +__BUILD_KVM_RW_HW(userlocal, l, MIPS_CP0_TLB_CONTEXT, 2) +__BUILD_KVM_RW_HW(xcontextconfig, l, MIPS_CP0_TLB_CONTEXT, 3) +__BUILD_KVM_RW_HW(pagemask, l, MIPS_CP0_TLB_PG_MASK, 0) +__BUILD_KVM_RW_HW(pagegrain, 32, MIPS_CP0_TLB_PG_MASK, 1) +__BUILD_KVM_RW_HW(segctl0, l, MIPS_CP0_TLB_PG_MASK, 2) +__BUILD_KVM_RW_HW(segctl1, l, MIPS_CP0_TLB_PG_MASK, 3) +__BUILD_KVM_RW_HW(segctl2, l, MIPS_CP0_TLB_PG_MASK, 4) +__BUILD_KVM_RW_HW(pwbase, l, MIPS_CP0_TLB_PG_MASK, 5) +__BUILD_KVM_RW_HW(pwfield, l, MIPS_CP0_TLB_PG_MASK, 6) +__BUILD_KVM_RW_HW(pwsize, l, MIPS_CP0_TLB_PG_MASK, 7) +__BUILD_KVM_RW_HW(wired, 32, MIPS_CP0_TLB_WIRED, 0) +__BUILD_KVM_RW_HW(pwctl, 32, MIPS_CP0_TLB_WIRED, 6) +__BUILD_KVM_RW_HW(hwrena, 32, MIPS_CP0_HWRENA, 0) +__BUILD_KVM_RW_HW(badvaddr, l, MIPS_CP0_BAD_VADDR, 0) +__BUILD_KVM_RW_HW(badinstr, 32, MIPS_CP0_BAD_VADDR, 1) +__BUILD_KVM_RW_HW(badinstrp, 32, MIPS_CP0_BAD_VADDR, 2) +__BUILD_KVM_RW_SW(count, 32, MIPS_CP0_COUNT, 0) +__BUILD_KVM_RW_HW(entryhi, l, MIPS_CP0_TLB_HI, 0) +__BUILD_KVM_RW_HW(compare, 32, MIPS_CP0_COMPARE, 0) +__BUILD_KVM_RW_HW(status, 32, MIPS_CP0_STATUS, 0) +__BUILD_KVM_RW_HW(intctl, 32, MIPS_CP0_STATUS, 1) +__BUILD_KVM_RW_HW(cause, 32, MIPS_CP0_CAUSE, 0) +__BUILD_KVM_RW_HW(epc, l, MIPS_CP0_EXC_PC, 0) +__BUILD_KVM_RW_SW(prid, 32, MIPS_CP0_PRID, 0) +__BUILD_KVM_RW_HW(ebase, l, MIPS_CP0_PRID, 1) +__BUILD_KVM_RW_HW(config, 32, MIPS_CP0_CONFIG, 0) +__BUILD_KVM_RW_HW(config1, 32, MIPS_CP0_CONFIG, 1) +__BUILD_KVM_RW_HW(config2, 32, MIPS_CP0_CONFIG, 2) +__BUILD_KVM_RW_HW(config3, 32, MIPS_CP0_CONFIG, 3) +__BUILD_KVM_RW_HW(config4, 32, MIPS_CP0_CONFIG, 4) +__BUILD_KVM_RW_HW(config5, 32, MIPS_CP0_CONFIG, 5) +__BUILD_KVM_RW_HW(config6, 32, MIPS_CP0_CONFIG, 6) +__BUILD_KVM_RW_HW(config7, 32, MIPS_CP0_CONFIG, 7) +__BUILD_KVM_RW_SW(maari, l, MIPS_CP0_LLADDR, 2) +__BUILD_KVM_RW_HW(xcontext, l, MIPS_CP0_TLB_XCONTEXT, 0) +__BUILD_KVM_RW_HW(errorepc, l, MIPS_CP0_ERROR_PC, 0) +__BUILD_KVM_RW_HW(kscratch1, l, MIPS_CP0_DESAVE, 2) +__BUILD_KVM_RW_HW(kscratch2, l, MIPS_CP0_DESAVE, 3) +__BUILD_KVM_RW_HW(kscratch3, l, MIPS_CP0_DESAVE, 4) +__BUILD_KVM_RW_HW(kscratch4, l, MIPS_CP0_DESAVE, 5) +__BUILD_KVM_RW_HW(kscratch5, l, MIPS_CP0_DESAVE, 6) +__BUILD_KVM_RW_HW(kscratch6, l, MIPS_CP0_DESAVE, 7) + +/* Bitwise operations (on HW state) */ +__BUILD_KVM_SET_HW(status, 32, MIPS_CP0_STATUS, 0) +/* Cause can be modified asynchronously from hardirq hrtimer callback */ +__BUILD_KVM_ATOMIC_HW(cause, 32, MIPS_CP0_CAUSE, 0) +__BUILD_KVM_SET_HW(ebase, l, MIPS_CP0_PRID, 1) + +/* Bitwise operations (on saved state) */ +__BUILD_KVM_SET_SAVED(config, 32, MIPS_CP0_CONFIG, 0) +__BUILD_KVM_SET_SAVED(config1, 32, MIPS_CP0_CONFIG, 1) +__BUILD_KVM_SET_SAVED(config2, 32, MIPS_CP0_CONFIG, 2) +__BUILD_KVM_SET_SAVED(config3, 32, MIPS_CP0_CONFIG, 3) +__BUILD_KVM_SET_SAVED(config4, 32, MIPS_CP0_CONFIG, 4) +__BUILD_KVM_SET_SAVED(config5, 32, MIPS_CP0_CONFIG, 5) + /* Helpers */ static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu) @@ -531,6 +775,10 @@ struct kvm_mips_callbacks { int (*handle_msa_fpe)(struct kvm_vcpu *vcpu); int (*handle_fpe)(struct kvm_vcpu *vcpu); int (*handle_msa_disabled)(struct kvm_vcpu *vcpu); + int (*handle_guest_exit)(struct kvm_vcpu *vcpu); + int (*hardware_enable)(void); + void (*hardware_disable)(void); + int (*check_extension)(struct kvm *kvm, long ext); int (*vcpu_init)(struct kvm_vcpu *vcpu); void (*vcpu_uninit)(struct kvm_vcpu *vcpu); int (*vcpu_setup)(struct kvm_vcpu *vcpu); @@ -599,6 +847,10 @@ u32 kvm_get_user_asid(struct kvm_vcpu *vcpu); u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu); +#ifdef CONFIG_KVM_MIPS_VZ +int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu, bool write_fault); +#endif extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr, struct kvm_vcpu *vcpu, bool write_fault); @@ -625,6 +877,18 @@ extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi, extern int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi); +#ifdef CONFIG_KVM_MIPS_VZ +int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi); +int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa); +void kvm_vz_local_flush_roottlb_all_guests(void); +void kvm_vz_local_flush_guesttlb_all(void); +void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count); +void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count); +#endif + void kvm_mips_suspend_mm(int cpu); void kvm_mips_resume_mm(int cpu); @@ -795,7 +1059,7 @@ extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, u32 kvm_mips_read_count(struct kvm_vcpu *vcpu); void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count); void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack); -void kvm_mips_init_count(struct kvm_vcpu *vcpu); +void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz); int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl); int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume); int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz); @@ -803,6 +1067,20 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu); void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu); enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu); +/* fairly internal functions requiring some care to use */ +int kvm_mips_count_disabled(struct kvm_vcpu *vcpu); +ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count); +int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before, + u32 count, int min_drift); + +#ifdef CONFIG_KVM_MIPS_VZ +void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu); +void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu); +#else +static inline void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) {} +static inline void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) {} +#endif + enum emulation_result kvm_mips_check_privilege(u32 cause, u32 *opc, struct kvm_run *run, @@ -827,11 +1105,20 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, struct kvm_run *run, struct kvm_vcpu *vcpu); +/* COP0 */ +enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu); + unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu); unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu); +/* Hypercalls (hypcall.c) */ + +enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu, + union mips_instruction inst); +int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu); + /* Dynamic binary translation */ extern int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc, struct kvm_vcpu *vcpu); @@ -846,7 +1133,6 @@ extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc, extern void kvm_mips_dump_stats(struct kvm_vcpu *vcpu); extern unsigned long kvm_mips_get_ramsize(struct kvm *kvm); -static inline void kvm_arch_hardware_disable(void) {} static inline void kvm_arch_hardware_unsetup(void) {} static inline void kvm_arch_sync_events(struct kvm *kvm) {} static inline void kvm_arch_free_memslot(struct kvm *kvm, diff --git a/arch/mips/include/asm/maar.h b/arch/mips/include/asm/maar.h index 21d9607c80d7..e10f78befbd9 100644 --- a/arch/mips/include/asm/maar.h +++ b/arch/mips/include/asm/maar.h @@ -36,7 +36,7 @@ unsigned platform_maar_init(unsigned num_pairs); * @upper: The highest address that the MAAR pair will affect. Must be * aligned to one byte before a 2^16 byte boundary. * @attrs: The accessibility attributes to program, eg. MIPS_MAAR_S. The - * MIPS_MAAR_V attribute will automatically be set. + * MIPS_MAAR_VL attribute will automatically be set. * * Program the pair of MAAR registers specified by idx to apply the attributes * specified by attrs to the range of addresses from lower to higher. @@ -49,10 +49,10 @@ static inline void write_maar_pair(unsigned idx, phys_addr_t lower, BUG_ON(((upper & 0xffff) != 0xffff) || ((upper & ~0xffffull) & ~(MIPS_MAAR_ADDR << 4))); - /* Automatically set MIPS_MAAR_V */ - attrs |= MIPS_MAAR_V; + /* Automatically set MIPS_MAAR_VL */ + attrs |= MIPS_MAAR_VL; - /* Write the upper address & attributes (only MIPS_MAAR_V matters) */ + /* Write the upper address & attributes (only MIPS_MAAR_VL matters) */ write_c0_maari(idx << 1); back_to_back_c0_hazard(); write_c0_maar(((upper >> 4) & MIPS_MAAR_ADDR) | attrs); @@ -81,7 +81,7 @@ extern void maar_init(void); * @upper: The highest address that the MAAR pair will affect. Must be * aligned to one byte before a 2^16 byte boundary. * @attrs: The accessibility attributes to program, eg. MIPS_MAAR_S. The - * MIPS_MAAR_V attribute will automatically be set. + * MIPS_MAAR_VL attribute will automatically be set. * * Describes the configuration of a pair of Memory Accessibility Attribute * Registers - applying attributes from attrs to the range of physical diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h index f8d1d2f1d80d..6875b69f59f7 100644 --- a/arch/mips/include/asm/mipsregs.h +++ b/arch/mips/include/asm/mipsregs.h @@ -34,8 +34,10 @@ */ #ifdef __ASSEMBLY__ #define _ULCAST_ +#define _U64CAST_ #else #define _ULCAST_ (unsigned long) +#define _U64CAST_ (u64) #endif /* @@ -217,8 +219,10 @@ /* * Wired register bits */ -#define MIPSR6_WIRED_LIMIT (_ULCAST_(0xffff) << 16) -#define MIPSR6_WIRED_WIRED (_ULCAST_(0xffff) << 0) +#define MIPSR6_WIRED_LIMIT_SHIFT 16 +#define MIPSR6_WIRED_LIMIT (_ULCAST_(0xffff) << MIPSR6_WIRED_LIMIT_SHIFT) +#define MIPSR6_WIRED_WIRED_SHIFT 0 +#define MIPSR6_WIRED_WIRED (_ULCAST_(0xffff) << MIPSR6_WIRED_WIRED_SHIFT) /* * Values used for computation of new tlb entries @@ -645,6 +649,7 @@ #define MIPS_CONF5_LLB (_ULCAST_(1) << 4) #define MIPS_CONF5_MVH (_ULCAST_(1) << 5) #define MIPS_CONF5_VP (_ULCAST_(1) << 7) +#define MIPS_CONF5_SBRI (_ULCAST_(1) << 6) #define MIPS_CONF5_FRE (_ULCAST_(1) << 8) #define MIPS_CONF5_UFE (_ULCAST_(1) << 9) #define MIPS_CONF5_MSAEN (_ULCAST_(1) << 27) @@ -719,10 +724,14 @@ #define XLR_PERFCTRL_ALLTHREADS (_ULCAST_(1) << 13) /* MAAR bit definitions */ +#define MIPS_MAAR_VH (_U64CAST_(1) << 63) #define MIPS_MAAR_ADDR ((BIT_ULL(BITS_PER_LONG - 12) - 1) << 12) #define MIPS_MAAR_ADDR_SHIFT 12 #define MIPS_MAAR_S (_ULCAST_(1) << 1) -#define MIPS_MAAR_V (_ULCAST_(1) << 0) +#define MIPS_MAAR_VL (_ULCAST_(1) << 0) + +/* MAARI bit definitions */ +#define MIPS_MAARI_INDEX (_ULCAST_(0x3f) << 0) /* EBase bit definitions */ #define MIPS_EBASE_CPUNUM_SHIFT 0 @@ -736,6 +745,10 @@ #define MIPS_CMGCRB_BASE 11 #define MIPS_CMGCRF_BASE (~_ULCAST_((1 << MIPS_CMGCRB_BASE) - 1)) +/* LLAddr bit definitions */ +#define MIPS_LLADDR_LLB_SHIFT 0 +#define MIPS_LLADDR_LLB (_ULCAST_(1) << MIPS_LLADDR_LLB_SHIFT) + /* * Bits in the MIPS32 Memory Segmentation registers. */ @@ -961,6 +974,22 @@ /* Flush FTLB */ #define LOONGSON_DIAG_FTLB (_ULCAST_(1) << 13) +/* CvmCtl register field definitions */ +#define CVMCTL_IPPCI_SHIFT 7 +#define CVMCTL_IPPCI (_U64CAST_(0x7) << CVMCTL_IPPCI_SHIFT) +#define CVMCTL_IPTI_SHIFT 4 +#define CVMCTL_IPTI (_U64CAST_(0x7) << CVMCTL_IPTI_SHIFT) + +/* CvmMemCtl2 register field definitions */ +#define CVMMEMCTL2_INHIBITTS (_U64CAST_(1) << 17) + +/* CvmVMConfig register field definitions */ +#define CVMVMCONF_DGHT (_U64CAST_(1) << 60) +#define CVMVMCONF_MMUSIZEM1_S 12 +#define CVMVMCONF_MMUSIZEM1 (_U64CAST_(0xff) << CVMVMCONF_MMUSIZEM1_S) +#define CVMVMCONF_RMMUSIZEM1_S 0 +#define CVMVMCONF_RMMUSIZEM1 (_U64CAST_(0xff) << CVMVMCONF_RMMUSIZEM1_S) + /* * Coprocessor 1 (FPU) register names */ @@ -1720,6 +1749,13 @@ do { \ #define read_c0_cvmmemctl() __read_64bit_c0_register($11, 7) #define write_c0_cvmmemctl(val) __write_64bit_c0_register($11, 7, val) + +#define read_c0_cvmmemctl2() __read_64bit_c0_register($16, 6) +#define write_c0_cvmmemctl2(val) __write_64bit_c0_register($16, 6, val) + +#define read_c0_cvmvmconfig() __read_64bit_c0_register($16, 7) +#define write_c0_cvmvmconfig(val) __write_64bit_c0_register($16, 7, val) + /* * The cacheerr registers are not standardized. On OCTEON, they are * 64 bits wide. @@ -1989,6 +2025,8 @@ do { \ #define read_gc0_epc() __read_ulong_gc0_register(14, 0) #define write_gc0_epc(val) __write_ulong_gc0_register(14, 0, val) +#define read_gc0_prid() __read_32bit_gc0_register(15, 0) + #define read_gc0_ebase() __read_32bit_gc0_register(15, 1) #define write_gc0_ebase(val) __write_32bit_gc0_register(15, 1, val) @@ -2012,6 +2050,9 @@ do { \ #define write_gc0_config6(val) __write_32bit_gc0_register(16, 6, val) #define write_gc0_config7(val) __write_32bit_gc0_register(16, 7, val) +#define read_gc0_lladdr() __read_ulong_gc0_register(17, 0) +#define write_gc0_lladdr(val) __write_ulong_gc0_register(17, 0, val) + #define read_gc0_watchlo0() __read_ulong_gc0_register(18, 0) #define read_gc0_watchlo1() __read_ulong_gc0_register(18, 1) #define read_gc0_watchlo2() __read_ulong_gc0_register(18, 2) @@ -2090,6 +2131,19 @@ do { \ #define write_gc0_kscratch5(val) __write_ulong_gc0_register(31, 6, val) #define write_gc0_kscratch6(val) __write_ulong_gc0_register(31, 7, val) +/* Cavium OCTEON (cnMIPS) */ +#define read_gc0_cvmcount() __read_ulong_gc0_register(9, 6) +#define write_gc0_cvmcount(val) __write_ulong_gc0_register(9, 6, val) + +#define read_gc0_cvmctl() __read_64bit_gc0_register(9, 7) +#define write_gc0_cvmctl(val) __write_64bit_gc0_register(9, 7, val) + +#define read_gc0_cvmmemctl() __read_64bit_gc0_register(11, 7) +#define write_gc0_cvmmemctl(val) __write_64bit_gc0_register(11, 7, val) + +#define read_gc0_cvmmemctl2() __read_64bit_gc0_register(16, 6) +#define write_gc0_cvmmemctl2(val) __write_64bit_gc0_register(16, 6, val) + /* * Macros to access the floating point coprocessor control registers */ @@ -2696,9 +2750,11 @@ __BUILD_SET_C0(brcm_mode) */ #define __BUILD_SET_GC0(name) __BUILD_SET_COMMON(gc0_##name) +__BUILD_SET_GC0(wired) __BUILD_SET_GC0(status) __BUILD_SET_GC0(cause) __BUILD_SET_GC0(ebase) +__BUILD_SET_GC0(config1) /* * Return low 10 bits of ebase. diff --git a/arch/mips/include/asm/tlb.h b/arch/mips/include/asm/tlb.h index dd179fd8acda..939734de4359 100644 --- a/arch/mips/include/asm/tlb.h +++ b/arch/mips/include/asm/tlb.h @@ -21,9 +21,11 @@ */ #define tlb_flush(tlb) flush_tlb_mm((tlb)->mm) -#define UNIQUE_ENTRYHI(idx) \ - ((CKSEG0 + ((idx) << (PAGE_SHIFT + 1))) | \ +#define _UNIQUE_ENTRYHI(base, idx) \ + (((base) + ((idx) << (PAGE_SHIFT + 1))) | \ (cpu_has_tlbinv ? MIPS_ENTRYHI_EHINV : 0)) +#define UNIQUE_ENTRYHI(idx) _UNIQUE_ENTRYHI(CKSEG0, idx) +#define UNIQUE_GUEST_ENTRYHI(idx) _UNIQUE_ENTRYHI(CKSEG1, idx) static inline unsigned int num_wired_entries(void) { diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h index 77429d1622b3..b5e46ae872d3 100644 --- a/arch/mips/include/uapi/asm/inst.h +++ b/arch/mips/include/uapi/asm/inst.h @@ -179,7 +179,7 @@ enum cop0_coi_func { tlbr_op = 0x01, tlbwi_op = 0x02, tlbwr_op = 0x06, tlbp_op = 0x08, rfe_op = 0x10, eret_op = 0x18, - wait_op = 0x20, + wait_op = 0x20, hypcall_op = 0x28 }; /* diff --git a/arch/mips/include/uapi/asm/kvm.h b/arch/mips/include/uapi/asm/kvm.h index a8a0199bf760..0318c6b442ab 100644 --- a/arch/mips/include/uapi/asm/kvm.h +++ b/arch/mips/include/uapi/asm/kvm.h @@ -21,6 +21,8 @@ #define __KVM_HAVE_READONLY_MEM +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + /* * for KVM_GET_REGS and KVM_SET_REGS * @@ -54,9 +56,14 @@ struct kvm_fpu { * Register set = 0: GP registers from kvm_regs (see definitions below). * * Register set = 1: CP0 registers. - * bits[15..8] - Must be zero. - * bits[7..3] - Register 'rd' index. - * bits[2..0] - Register 'sel' index. + * bits[15..8] - COP0 register set. + * + * COP0 register set = 0: Main CP0 registers. + * bits[7..3] - Register 'rd' index. + * bits[2..0] - Register 'sel' index. + * + * COP0 register set = 1: MAARs. + * bits[7..0] - MAAR index. * * Register set = 2: KVM specific registers (see definitions below). * @@ -115,6 +122,15 @@ struct kvm_fpu { /* + * KVM_REG_MIPS_CP0 - Coprocessor 0 registers. + */ + +#define KVM_REG_MIPS_MAAR (KVM_REG_MIPS_CP0 | (1 << 8)) +#define KVM_REG_MIPS_CP0_MAAR(n) (KVM_REG_MIPS_MAAR | \ + KVM_REG_SIZE_U64 | (n)) + + +/* * KVM_REG_MIPS_KVM - KVM specific control registers. */ diff --git a/arch/mips/kernel/cpu-probe.c b/arch/mips/kernel/cpu-probe.c index 12422fd4af23..3382892544f0 100644 --- a/arch/mips/kernel/cpu-probe.c +++ b/arch/mips/kernel/cpu-probe.c @@ -289,6 +289,8 @@ static void cpu_set_fpu_opts(struct cpuinfo_mips *c) MIPS_CPU_ISA_M32R6 | MIPS_CPU_ISA_M64R6)) { if (c->fpu_id & MIPS_FPIR_3D) c->ases |= MIPS_ASE_MIPS3D; + if (c->fpu_id & MIPS_FPIR_UFRP) + c->options |= MIPS_CPU_UFR; if (c->fpu_id & MIPS_FPIR_FREP) c->options |= MIPS_CPU_FRE; } @@ -1003,7 +1005,8 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c) unsigned int config3, config3_dyn; probe_gc0_config_dyn(config3, config3, config3_dyn, - MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_CTXTC); + MIPS_CONF_M | MIPS_CONF3_MSA | MIPS_CONF3_ULRI | + MIPS_CONF3_CTXTC); if (config3 & MIPS_CONF3_CTXTC) c->guest.options |= MIPS_CPU_CTXTC; @@ -1013,6 +1016,9 @@ static inline unsigned int decode_guest_config3(struct cpuinfo_mips *c) if (config3 & MIPS_CONF3_PW) c->guest.options |= MIPS_CPU_HTW; + if (config3 & MIPS_CONF3_ULRI) + c->guest.options |= MIPS_CPU_ULRI; + if (config3 & MIPS_CONF3_SC) c->guest.options |= MIPS_CPU_SEGMENTS; @@ -1051,7 +1057,7 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c) unsigned int config5, config5_dyn; probe_gc0_config_dyn(config5, config5, config5_dyn, - MIPS_CONF_M | MIPS_CONF5_MRP); + MIPS_CONF_M | MIPS_CONF5_MVH | MIPS_CONF5_MRP); if (config5 & MIPS_CONF5_MRP) c->guest.options |= MIPS_CPU_MAAR; @@ -1061,6 +1067,9 @@ static inline unsigned int decode_guest_config5(struct cpuinfo_mips *c) if (config5 & MIPS_CONF5_LLB) c->guest.options |= MIPS_CPU_RW_LLB; + if (config5 & MIPS_CONF5_MVH) + c->guest.options |= MIPS_CPU_MVH; + if (config5 & MIPS_CONF_M) c->guest.conf |= BIT(6); return config5 & MIPS_CONF_M; diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index a7f81261c781..c036157fb891 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -70,6 +70,7 @@ EXPORT_SYMBOL(perf_irq); */ unsigned int mips_hpt_frequency; +EXPORT_SYMBOL_GPL(mips_hpt_frequency); /* * This function exists in order to cause an error due to a duplicate diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig index 65067327db12..50a722dfb236 100644 --- a/arch/mips/kvm/Kconfig +++ b/arch/mips/kvm/Kconfig @@ -26,11 +26,34 @@ config KVM select SRCU ---help--- Support for hosting Guest kernels. - Currently supported on MIPS32 processors. + +choice + prompt "Virtualization mode" + depends on KVM + default KVM_MIPS_TE + +config KVM_MIPS_TE + bool "Trap & Emulate" + ---help--- + Use trap and emulate to virtualize 32-bit guests in user mode. This + does not require any special hardware Virtualization support beyond + standard MIPS32/64 r2 or later, but it does require the guest kernel + to be configured with CONFIG_KVM_GUEST=y so that it resides in the + user address segment. + +config KVM_MIPS_VZ + bool "MIPS Virtualization (VZ) ASE" + ---help--- + Use the MIPS Virtualization (VZ) ASE to virtualize guests. This + supports running unmodified guest kernels (with CONFIG_KVM_GUEST=n), + but requires hardware support. + +endchoice config KVM_MIPS_DYN_TRANS bool "KVM/MIPS: Dynamic binary translation to reduce traps" - depends on KVM + depends on KVM_MIPS_TE + default y ---help--- When running in Trap & Emulate mode patch privileged instructions to reduce the number of traps. diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile index 847429de780d..45d90f5d5177 100644 --- a/arch/mips/kvm/Makefile +++ b/arch/mips/kvm/Makefile @@ -9,8 +9,15 @@ common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \ interrupt.o stats.o commpage.o \ - dyntrans.o trap_emul.o fpu.o + fpu.o +kvm-objs += hypcall.o kvm-objs += mmu.o +ifdef CONFIG_KVM_MIPS_VZ +kvm-objs += vz.o +else +kvm-objs += dyntrans.o +kvm-objs += trap_emul.o +endif obj-$(CONFIG_KVM) += kvm.o obj-y += callback.o tlb.o diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c index d40cfaad4529..4144bfaef137 100644 --- a/arch/mips/kvm/emulate.c +++ b/arch/mips/kvm/emulate.c @@ -308,7 +308,7 @@ int kvm_get_badinstrp(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) * CP0_Cause.DC bit or the count_ctl.DC bit. * 0 otherwise (in which case CP0_Count timer is running). */ -static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) +int kvm_mips_count_disabled(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -467,7 +467,7 @@ u32 kvm_mips_read_count(struct kvm_vcpu *vcpu) * * Returns: The ktime at the point of freeze. */ -static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count) +ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count) { ktime_t now; @@ -517,6 +517,82 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu, } /** + * kvm_mips_restore_hrtimer() - Restore hrtimer after a gap, updating expiry. + * @vcpu: Virtual CPU. + * @before: Time before Count was saved, lower bound of drift calculation. + * @count: CP0_Count at point of restore. + * @min_drift: Minimum amount of drift permitted before correction. + * Must be <= 0. + * + * Restores the timer from a particular @count, accounting for drift. This can + * be used in conjunction with kvm_mips_freeze_timer() when a hardware timer is + * to be used for a period of time, but the exact ktime corresponding to the + * final Count that must be restored is not known. + * + * It is gauranteed that a timer interrupt immediately after restore will be + * handled, but not if CP0_Compare is exactly at @count. That case should + * already be handled when the hardware timer state is saved. + * + * Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is not + * stopped). + * + * Returns: Amount of correction to count_bias due to drift. + */ +int kvm_mips_restore_hrtimer(struct kvm_vcpu *vcpu, ktime_t before, + u32 count, int min_drift) +{ + ktime_t now, count_time; + u32 now_count, before_count; + u64 delta; + int drift, ret = 0; + + /* Calculate expected count at before */ + before_count = vcpu->arch.count_bias + + kvm_mips_ktime_to_count(vcpu, before); + + /* + * Detect significantly negative drift, where count is lower than + * expected. Some negative drift is expected when hardware counter is + * set after kvm_mips_freeze_timer(), and it is harmless to allow the + * time to jump forwards a little, within reason. If the drift is too + * significant, adjust the bias to avoid a big Guest.CP0_Count jump. + */ + drift = count - before_count; + if (drift < min_drift) { + count_time = before; + vcpu->arch.count_bias += drift; + ret = drift; + goto resume; + } + + /* Calculate expected count right now */ + now = ktime_get(); + now_count = vcpu->arch.count_bias + kvm_mips_ktime_to_count(vcpu, now); + + /* + * Detect positive drift, where count is higher than expected, and + * adjust the bias to avoid guest time going backwards. + */ + drift = count - now_count; + if (drift > 0) { + count_time = now; + vcpu->arch.count_bias += drift; + ret = drift; + goto resume; + } + + /* Subtract nanosecond delta to find ktime when count was read */ + delta = (u64)(u32)(now_count - count); + delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz); + count_time = ktime_sub_ns(now, delta); + +resume: + /* Resume using the calculated ktime */ + kvm_mips_resume_hrtimer(vcpu, count_time, count); + return ret; +} + +/** * kvm_mips_write_count() - Modify the count and update timer. * @vcpu: Virtual CPU. * @count: Guest CP0_Count value to set. @@ -543,16 +619,15 @@ void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count) /** * kvm_mips_init_count() - Initialise timer. * @vcpu: Virtual CPU. + * @count_hz: Frequency of timer. * - * Initialise the timer to a sensible frequency, namely 100MHz, zero it, and set - * it going if it's enabled. + * Initialise the timer to the specified frequency, zero it, and set it going if + * it's enabled. */ -void kvm_mips_init_count(struct kvm_vcpu *vcpu) +void kvm_mips_init_count(struct kvm_vcpu *vcpu, unsigned long count_hz) { - /* 100 MHz */ - vcpu->arch.count_hz = 100*1000*1000; - vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, - vcpu->arch.count_hz); + vcpu->arch.count_hz = count_hz; + vcpu->arch.count_period = div_u64((u64)NSEC_PER_SEC << 32, count_hz); vcpu->arch.count_dyn_bias = 0; /* Starting at 0 */ @@ -622,7 +697,9 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) struct mips_coproc *cop0 = vcpu->arch.cop0; int dc; u32 old_compare = kvm_read_c0_guest_compare(cop0); - ktime_t now; + s32 delta = compare - old_compare; + u32 cause; + ktime_t now = ktime_set(0, 0); /* silence bogus GCC warning */ u32 count; /* if unchanged, must just be an ack */ @@ -634,6 +711,21 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) return; } + /* + * If guest CP0_Compare moves forward, CP0_GTOffset should be adjusted + * too to prevent guest CP0_Count hitting guest CP0_Compare. + * + * The new GTOffset corresponds to the new value of CP0_Compare, and is + * set prior to it being written into the guest context. We disable + * preemption until the new value is written to prevent restore of a + * GTOffset corresponding to the old CP0_Compare value. + */ + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta > 0) { + preempt_disable(); + write_c0_gtoffset(compare - read_c0_count()); + back_to_back_c0_hazard(); + } + /* freeze_hrtimer() takes care of timer interrupts <= count */ dc = kvm_mips_count_disabled(vcpu); if (!dc) @@ -641,12 +733,36 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack) if (ack) kvm_mips_callbacks->dequeue_timer_int(vcpu); + else if (IS_ENABLED(CONFIG_KVM_MIPS_VZ)) + /* + * With VZ, writing CP0_Compare acks (clears) CP0_Cause.TI, so + * preserve guest CP0_Cause.TI if we don't want to ack it. + */ + cause = kvm_read_c0_guest_cause(cop0); kvm_write_c0_guest_compare(cop0, compare); + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + if (delta > 0) + preempt_enable(); + + back_to_back_c0_hazard(); + + if (!ack && cause & CAUSEF_TI) + kvm_write_c0_guest_cause(cop0, cause); + } + /* resume_hrtimer() takes care of timer interrupts > count */ if (!dc) kvm_mips_resume_hrtimer(vcpu, now, count); + + /* + * If guest CP0_Compare is moving backward, we delay CP0_GTOffset change + * until after the new CP0_Compare is written, otherwise new guest + * CP0_Count could hit new guest CP0_Compare. + */ + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && delta <= 0) + write_c0_gtoffset(compare - read_c0_count()); } /** @@ -857,6 +973,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) ++vcpu->stat.wait_exits; trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT); if (!vcpu->arch.pending_exceptions) { + kvm_vz_lose_htimer(vcpu); vcpu->arch.wait = 1; kvm_vcpu_block(vcpu); @@ -865,7 +982,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) * check if any I/O interrupts are pending. */ if (kvm_check_request(KVM_REQ_UNHALT, vcpu)) { - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; } } @@ -873,17 +990,62 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu) return EMULATE_DONE; } -/* - * XXXKYMA: Linux doesn't seem to use TLBR, return EMULATE_FAIL for now so that - * we can catch this, if things ever change - */ +static void kvm_mips_change_entryhi(struct kvm_vcpu *vcpu, + unsigned long entryhi) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; + int cpu, i; + u32 nasid = entryhi & KVM_ENTRYHI_ASID; + + if (((kvm_read_c0_guest_entryhi(cop0) & KVM_ENTRYHI_ASID) != nasid)) { + trace_kvm_asid_change(vcpu, kvm_read_c0_guest_entryhi(cop0) & + KVM_ENTRYHI_ASID, nasid); + + /* + * Flush entries from the GVA page tables. + * Guest user page table will get flushed lazily on re-entry to + * guest user if the guest ASID actually changes. + */ + kvm_mips_flush_gva_pt(kern_mm->pgd, KMF_KERN); + + /* + * Regenerate/invalidate kernel MMU context. + * The user MMU context will be regenerated lazily on re-entry + * to guest user if the guest ASID actually changes. + */ + preempt_disable(); + cpu = smp_processor_id(); + get_new_mmu_context(kern_mm, cpu); + for_each_possible_cpu(i) + if (i != cpu) + cpu_context(i, kern_mm) = 0; + preempt_enable(); + } + kvm_write_c0_guest_entryhi(cop0, entryhi); +} + enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; + struct kvm_mips_tlb *tlb; unsigned long pc = vcpu->arch.pc; + int index; - kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0)); - return EMULATE_FAIL; + index = kvm_read_c0_guest_index(cop0); + if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) { + /* UNDEFINED */ + kvm_debug("[%#lx] TLBR Index %#x out of range\n", pc, index); + index &= KVM_MIPS_GUEST_TLB_SIZE - 1; + } + + tlb = &vcpu->arch.guest_tlb[index]; + kvm_write_c0_guest_pagemask(cop0, tlb->tlb_mask); + kvm_write_c0_guest_entrylo0(cop0, tlb->tlb_lo[0]); + kvm_write_c0_guest_entrylo1(cop0, tlb->tlb_lo[1]); + kvm_mips_change_entryhi(vcpu, tlb->tlb_hi); + + return EMULATE_DONE; } /** @@ -1105,11 +1267,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; - struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; enum emulation_result er = EMULATE_DONE; u32 rt, rd, sel; unsigned long curr_pc; - int cpu, i; /* * Update PC and hold onto current PC in case there is @@ -1143,6 +1303,9 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, case wait_op: er = kvm_mips_emul_wait(vcpu); break; + case hypcall_op: + er = kvm_mips_emul_hypcall(vcpu, inst); + break; } } else { rt = inst.c0r_format.rt; @@ -1208,44 +1371,8 @@ enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst, kvm_change_c0_guest_ebase(cop0, 0x1ffff000, vcpu->arch.gprs[rt]); } else if (rd == MIPS_CP0_TLB_HI && sel == 0) { - u32 nasid = - vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID; - if (((kvm_read_c0_guest_entryhi(cop0) & - KVM_ENTRYHI_ASID) != nasid)) { - trace_kvm_asid_change(vcpu, - kvm_read_c0_guest_entryhi(cop0) - & KVM_ENTRYHI_ASID, - nasid); - - /* - * Flush entries from the GVA page - * tables. - * Guest user page table will get - * flushed lazily on re-entry to guest - * user if the guest ASID actually - * changes. - */ - kvm_mips_flush_gva_pt(kern_mm->pgd, - KMF_KERN); - - /* - * Regenerate/invalidate kernel MMU - * context. - * The user MMU context will be - * regenerated lazily on re-entry to - * guest user if the guest ASID actually - * changes. - */ - preempt_disable(); - cpu = smp_processor_id(); - get_new_mmu_context(kern_mm, cpu); - for_each_possible_cpu(i) - if (i != cpu) - cpu_context(i, kern_mm) = 0; - preempt_enable(); - } - kvm_write_c0_guest_entryhi(cop0, - vcpu->arch.gprs[rt]); + kvm_mips_change_entryhi(vcpu, + vcpu->arch.gprs[rt]); } /* Are we writing to COUNT */ else if ((rd == MIPS_CP0_COUNT) && (sel == 0)) { @@ -1474,9 +1601,8 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, struct kvm_run *run, struct kvm_vcpu *vcpu) { - enum emulation_result er = EMULATE_DO_MMIO; + enum emulation_result er; u32 rt; - u32 bytes; void *data = run->mmio.data; unsigned long curr_pc; @@ -1491,103 +1617,74 @@ enum emulation_result kvm_mips_emulate_store(union mips_instruction inst, rt = inst.i_format.rt; + run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa( + vcpu->arch.host_cp0_badvaddr); + if (run->mmio.phys_addr == KVM_INVALID_ADDR) + goto out_fail; + switch (inst.i_format.opcode) { - case sb_op: - bytes = 1; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - run->mmio.len = bytes; - run->mmio.is_write = 1; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 1; - *(u8 *) data = vcpu->arch.gprs[rt]; - kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n", - vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt], - *(u8 *) data); +#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ) + case sd_op: + run->mmio.len = 8; + *(u64 *)data = vcpu->arch.gprs[rt]; + kvm_debug("[%#lx] OP_SD: eaddr: %#lx, gpr: %#lx, data: %#llx\n", + vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, + vcpu->arch.gprs[rt], *(u64 *)data); break; +#endif case sw_op: - bytes = 4; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 1; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 1; - *(u32 *) data = vcpu->arch.gprs[rt]; + run->mmio.len = 4; + *(u32 *)data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(u32 *) data); + vcpu->arch.gprs[rt], *(u32 *)data); break; case sh_op: - bytes = 2; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 1; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 1; - *(u16 *) data = vcpu->arch.gprs[rt]; + run->mmio.len = 2; + *(u16 *)data = vcpu->arch.gprs[rt]; kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n", vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, - vcpu->arch.gprs[rt], *(u32 *) data); + vcpu->arch.gprs[rt], *(u16 *)data); + break; + + case sb_op: + run->mmio.len = 1; + *(u8 *)data = vcpu->arch.gprs[rt]; + + kvm_debug("[%#lx] OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n", + vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr, + vcpu->arch.gprs[rt], *(u8 *)data); break; default: kvm_err("Store not yet supported (inst=0x%08x)\n", inst.word); - er = EMULATE_FAIL; - break; + goto out_fail; } - /* Rollback PC if emulation was unsuccessful */ - if (er == EMULATE_FAIL) - vcpu->arch.pc = curr_pc; + run->mmio.is_write = 1; + vcpu->mmio_needed = 1; + vcpu->mmio_is_write = 1; + return EMULATE_DO_MMIO; - return er; +out_fail: + /* Rollback PC if emulation was unsuccessful */ + vcpu->arch.pc = curr_pc; + return EMULATE_FAIL; } enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, u32 cause, struct kvm_run *run, struct kvm_vcpu *vcpu) { - enum emulation_result er = EMULATE_DO_MMIO; + enum emulation_result er; unsigned long curr_pc; u32 op, rt; - u32 bytes; rt = inst.i_format.rt; op = inst.i_format.opcode; @@ -1606,96 +1703,53 @@ enum emulation_result kvm_mips_emulate_load(union mips_instruction inst, vcpu->arch.io_gpr = rt; + run->mmio.phys_addr = kvm_mips_callbacks->gva_to_gpa( + vcpu->arch.host_cp0_badvaddr); + if (run->mmio.phys_addr == KVM_INVALID_ADDR) + return EMULATE_FAIL; + + vcpu->mmio_needed = 2; /* signed */ switch (op) { - case lw_op: - bytes = 4; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - er = EMULATE_FAIL; - break; - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } +#if defined(CONFIG_64BIT) && defined(CONFIG_KVM_MIPS_VZ) + case ld_op: + run->mmio.len = 8; + break; - run->mmio.len = bytes; - run->mmio.is_write = 0; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 0; + case lwu_op: + vcpu->mmio_needed = 1; /* unsigned */ + /* fall through */ +#endif + case lw_op: + run->mmio.len = 4; break; - case lh_op: case lhu_op: - bytes = 2; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - er = EMULATE_FAIL; - break; - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 0; - vcpu->mmio_needed = 1; - vcpu->mmio_is_write = 0; - - if (op == lh_op) - vcpu->mmio_needed = 2; - else - vcpu->mmio_needed = 1; - + vcpu->mmio_needed = 1; /* unsigned */ + /* fall through */ + case lh_op: + run->mmio.len = 2; break; case lbu_op: + vcpu->mmio_needed = 1; /* unsigned */ + /* fall through */ case lb_op: - bytes = 1; - if (bytes > sizeof(run->mmio.data)) { - kvm_err("%s: bad MMIO length: %d\n", __func__, - run->mmio.len); - er = EMULATE_FAIL; - break; - } - run->mmio.phys_addr = - kvm_mips_callbacks->gva_to_gpa(vcpu->arch. - host_cp0_badvaddr); - if (run->mmio.phys_addr == KVM_INVALID_ADDR) { - er = EMULATE_FAIL; - break; - } - - run->mmio.len = bytes; - run->mmio.is_write = 0; - vcpu->mmio_is_write = 0; - - if (op == lb_op) - vcpu->mmio_needed = 2; - else - vcpu->mmio_needed = 1; - + run->mmio.len = 1; break; default: kvm_err("Load not yet supported (inst=0x%08x)\n", inst.word); - er = EMULATE_FAIL; - break; + vcpu->mmio_needed = 0; + return EMULATE_FAIL; } - return er; + run->mmio.is_write = 0; + vcpu->mmio_is_write = 0; + return EMULATE_DO_MMIO; } +#ifndef CONFIG_KVM_MIPS_VZ static enum emulation_result kvm_mips_guest_cache_op(int (*fn)(unsigned long), unsigned long curr_pc, unsigned long addr, @@ -1786,11 +1840,35 @@ enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst, vcpu->arch.pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base], offset); - if (cache == Cache_D) + if (cache == Cache_D) { +#ifdef CONFIG_CPU_R4K_CACHE_TLB r4k_blast_dcache(); - else if (cache == Cache_I) +#else + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* locally flush icache */ + local_flush_icache_range(0, 0); + break; + default: + __flush_cache_all(); + break; + } +#endif + } else if (cache == Cache_I) { +#ifdef CONFIG_CPU_R4K_CACHE_TLB r4k_blast_icache(); - else { +#else + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* locally flush icache */ + local_flush_icache_range(0, 0); + break; + default: + flush_icache_all(); + break; + } +#endif + } else { kvm_err("%s: unsupported CACHE INDEX operation\n", __func__); return EMULATE_FAIL; @@ -1870,18 +1948,6 @@ enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc, case cop0_op: er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu); break; - case sb_op: - case sh_op: - case sw_op: - er = kvm_mips_emulate_store(inst, cause, run, vcpu); - break; - case lb_op: - case lbu_op: - case lhu_op: - case lh_op: - case lw_op: - er = kvm_mips_emulate_load(inst, cause, run, vcpu); - break; #ifndef CONFIG_CPU_MIPSR6 case cache_op: @@ -1915,6 +1981,7 @@ unknown: return er; } +#endif /* CONFIG_KVM_MIPS_VZ */ /** * kvm_mips_guest_exception_base() - Find guest exception vector base address. @@ -2524,8 +2591,15 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu, vcpu->arch.pc = vcpu->arch.io_pc; switch (run->mmio.len) { + case 8: + *gpr = *(s64 *)run->mmio.data; + break; + case 4: - *gpr = *(s32 *) run->mmio.data; + if (vcpu->mmio_needed == 2) + *gpr = *(s32 *)run->mmio.data; + else + *gpr = *(u32 *)run->mmio.data; break; case 2: diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c index c5b254c4d0da..16e1c93b484f 100644 --- a/arch/mips/kvm/entry.c +++ b/arch/mips/kvm/entry.c @@ -51,12 +51,15 @@ #define RA 31 /* Some CP0 registers */ +#define C0_PWBASE 5, 5 #define C0_HWRENA 7, 0 #define C0_BADVADDR 8, 0 #define C0_BADINSTR 8, 1 #define C0_BADINSTRP 8, 2 #define C0_ENTRYHI 10, 0 +#define C0_GUESTCTL1 10, 4 #define C0_STATUS 12, 0 +#define C0_GUESTCTL0 12, 6 #define C0_CAUSE 13, 0 #define C0_EPC 14, 0 #define C0_EBASE 15, 1 @@ -292,8 +295,8 @@ static void *kvm_mips_build_enter_guest(void *addr) unsigned int i; struct uasm_label labels[2]; struct uasm_reloc relocs[2]; - struct uasm_label *l = labels; - struct uasm_reloc *r = relocs; + struct uasm_label __maybe_unused *l = labels; + struct uasm_reloc __maybe_unused *r = relocs; memset(labels, 0, sizeof(labels)); memset(relocs, 0, sizeof(relocs)); @@ -302,7 +305,67 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1); UASM_i_MTC0(&p, T0, C0_EPC); - /* Set the ASID for the Guest Kernel */ +#ifdef CONFIG_KVM_MIPS_VZ + /* Save normal linux process pgd (VZ guarantees pgd_reg is set) */ + UASM_i_MFC0(&p, K0, c0_kscratch(), pgd_reg); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_pgd), K1); + + /* + * Set up KVM GPA pgd. + * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - write mm->pgd into CP0_PWBase + * + * We keep S0 pointing at struct kvm so we can load the ASID below. + */ + UASM_i_LW(&p, S0, (int)offsetof(struct kvm_vcpu, kvm) - + (int)offsetof(struct kvm_vcpu, arch), K1); + UASM_i_LW(&p, A0, offsetof(struct kvm, arch.gpa_mm.pgd), S0); + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + /* delay slot */ + if (cpu_has_htw) + UASM_i_MTC0(&p, A0, C0_PWBASE); + else + uasm_i_nop(&p); + + /* Set GM bit to setup eret to VZ guest context */ + uasm_i_addiu(&p, V1, ZERO, 1); + uasm_i_mfc0(&p, K0, C0_GUESTCTL0); + uasm_i_ins(&p, K0, V1, MIPS_GCTL0_GM_SHIFT, 1); + uasm_i_mtc0(&p, K0, C0_GUESTCTL0); + + if (cpu_has_guestid) { + /* + * Set root mode GuestID, so that root TLB refill handler can + * use the correct GuestID in the root TLB. + */ + + /* Get current GuestID */ + uasm_i_mfc0(&p, T0, C0_GUESTCTL1); + /* Set GuestCtl1.RID = GuestCtl1.ID */ + uasm_i_ext(&p, T1, T0, MIPS_GCTL1_ID_SHIFT, + MIPS_GCTL1_ID_WIDTH); + uasm_i_ins(&p, T0, T1, MIPS_GCTL1_RID_SHIFT, + MIPS_GCTL1_RID_WIDTH); + uasm_i_mtc0(&p, T0, C0_GUESTCTL1); + + /* GuestID handles dealiasing so we don't need to touch ASID */ + goto skip_asid_restore; + } + + /* Root ASID Dealias (RAD) */ + + /* Save host ASID */ + UASM_i_MFC0(&p, K0, C0_ENTRYHI); + UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi), + K1); + + /* Set the root ASID for the Guest */ + UASM_i_ADDIU(&p, T1, S0, + offsetof(struct kvm, arch.gpa_mm.context.asid)); +#else + /* Set the ASID for the Guest Kernel or User */ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1); UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]), T0); @@ -315,6 +378,7 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_ADDIU(&p, T1, K1, offsetof(struct kvm_vcpu_arch, guest_user_mm.context.asid)); uasm_l_kernel_asid(&l, p); +#endif /* t1: contains the base of the ASID array, need to get the cpu id */ /* smp_processor_id */ @@ -339,6 +403,7 @@ static void *kvm_mips_build_enter_guest(void *addr) uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID); #endif +#ifndef CONFIG_KVM_MIPS_VZ /* * Set up KVM T&E GVA pgd. * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): @@ -351,7 +416,11 @@ static void *kvm_mips_build_enter_guest(void *addr) UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); uasm_i_jalr(&p, RA, T9); uasm_i_mtc0(&p, K0, C0_ENTRYHI); - +#else + /* Set up KVM VZ root ASID (!guestid) */ + uasm_i_mtc0(&p, K0, C0_ENTRYHI); +skip_asid_restore: +#endif uasm_i_ehb(&p); /* Disable RDHWR access */ @@ -559,13 +628,10 @@ void *kvm_mips_build_exit(void *addr) /* Now that context has been saved, we can use other registers */ /* Restore vcpu */ - UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]); - uasm_i_move(&p, S1, A1); + UASM_i_MFC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]); /* Restore run (vcpu->run) */ - UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1); - /* Save pointer to run in s0, will be saved by the compiler */ - uasm_i_move(&p, S0, A0); + UASM_i_LW(&p, S0, offsetof(struct kvm_vcpu, run), S1); /* * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process @@ -641,6 +707,52 @@ void *kvm_mips_build_exit(void *addr) uasm_l_msa_1(&l, p); } +#ifdef CONFIG_KVM_MIPS_VZ + /* Restore host ASID */ + if (!cpu_has_guestid) { + UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, host_entryhi), + K1); + UASM_i_MTC0(&p, K0, C0_ENTRYHI); + } + + /* + * Set up normal Linux process pgd. + * This does roughly the same as TLBMISS_HANDLER_SETUP_PGD(): + * - call tlbmiss_handler_setup_pgd(mm->pgd) + * - write mm->pgd into CP0_PWBase + */ + UASM_i_LW(&p, A0, + offsetof(struct kvm_vcpu_arch, host_pgd), K1); + UASM_i_LA(&p, T9, (unsigned long)tlbmiss_handler_setup_pgd); + uasm_i_jalr(&p, RA, T9); + /* delay slot */ + if (cpu_has_htw) + UASM_i_MTC0(&p, A0, C0_PWBASE); + else + uasm_i_nop(&p); + + /* Clear GM bit so we don't enter guest mode when EXL is cleared */ + uasm_i_mfc0(&p, K0, C0_GUESTCTL0); + uasm_i_ins(&p, K0, ZERO, MIPS_GCTL0_GM_SHIFT, 1); + uasm_i_mtc0(&p, K0, C0_GUESTCTL0); + + /* Save GuestCtl0 so we can access GExcCode after CPU migration */ + uasm_i_sw(&p, K0, + offsetof(struct kvm_vcpu_arch, host_cp0_guestctl0), K1); + + if (cpu_has_guestid) { + /* + * Clear root mode GuestID, so that root TLB operations use the + * root GuestID in the root TLB. + */ + uasm_i_mfc0(&p, T0, C0_GUESTCTL1); + /* Set GuestCtl1.RID = MIPS_GCTL1_ROOT_GUESTID (i.e. 0) */ + uasm_i_ins(&p, T0, ZERO, MIPS_GCTL1_RID_SHIFT, + MIPS_GCTL1_RID_WIDTH); + uasm_i_mtc0(&p, T0, C0_GUESTCTL1); + } +#endif + /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */ uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE)); uasm_i_and(&p, V0, V0, AT); @@ -680,6 +792,8 @@ void *kvm_mips_build_exit(void *addr) * Now jump to the kvm_mips_handle_exit() to see if we can deal * with this in the kernel */ + uasm_i_move(&p, A0, S0); + uasm_i_move(&p, A1, S1); UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit); uasm_i_jalr(&p, RA, T9); UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ); diff --git a/arch/mips/kvm/hypcall.c b/arch/mips/kvm/hypcall.c new file mode 100644 index 000000000000..83063435195f --- /dev/null +++ b/arch/mips/kvm/hypcall.c @@ -0,0 +1,53 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS: Hypercall handling. + * + * Copyright (C) 2015 Imagination Technologies Ltd. + */ + +#include <linux/kernel.h> +#include <linux/kvm_host.h> +#include <linux/kvm_para.h> + +#define MAX_HYPCALL_ARGS 4 + +enum emulation_result kvm_mips_emul_hypcall(struct kvm_vcpu *vcpu, + union mips_instruction inst) +{ + unsigned int code = (inst.co_format.code >> 5) & 0x3ff; + + kvm_debug("[%#lx] HYPCALL %#03x\n", vcpu->arch.pc, code); + + switch (code) { + case 0: + return EMULATE_HYPERCALL; + default: + return EMULATE_FAIL; + }; +} + +static int kvm_mips_hypercall(struct kvm_vcpu *vcpu, unsigned long num, + const unsigned long *args, unsigned long *hret) +{ + /* Report unimplemented hypercall to guest */ + *hret = -KVM_ENOSYS; + return RESUME_GUEST; +} + +int kvm_mips_handle_hypcall(struct kvm_vcpu *vcpu) +{ + unsigned long num, args[MAX_HYPCALL_ARGS]; + + /* read hypcall number and arguments */ + num = vcpu->arch.gprs[2]; /* v0 */ + args[0] = vcpu->arch.gprs[4]; /* a0 */ + args[1] = vcpu->arch.gprs[5]; /* a1 */ + args[2] = vcpu->arch.gprs[6]; /* a2 */ + args[3] = vcpu->arch.gprs[7]; /* a3 */ + + return kvm_mips_hypercall(vcpu, num, + args, &vcpu->arch.gprs[2] /* v0 */); +} diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h index fb118a2c8379..3bf0a49725e8 100644 --- a/arch/mips/kvm/interrupt.h +++ b/arch/mips/kvm/interrupt.h @@ -30,8 +30,13 @@ #define C_TI (_ULCAST_(1) << 30) +#ifdef CONFIG_KVM_MIPS_VZ +#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (1) +#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (1) +#else #define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0) #define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (0) +#endif void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority); void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index 15a1b1716c2e..d4b2ad18eef2 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -59,6 +59,16 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "fpe", VCPU_STAT(fpe_exits), KVM_STAT_VCPU }, { "msa_disabled", VCPU_STAT(msa_disabled_exits), KVM_STAT_VCPU }, { "flush_dcache", VCPU_STAT(flush_dcache_exits), KVM_STAT_VCPU }, +#ifdef CONFIG_KVM_MIPS_VZ + { "vz_gpsi", VCPU_STAT(vz_gpsi_exits), KVM_STAT_VCPU }, + { "vz_gsfc", VCPU_STAT(vz_gsfc_exits), KVM_STAT_VCPU }, + { "vz_hc", VCPU_STAT(vz_hc_exits), KVM_STAT_VCPU }, + { "vz_grr", VCPU_STAT(vz_grr_exits), KVM_STAT_VCPU }, + { "vz_gva", VCPU_STAT(vz_gva_exits), KVM_STAT_VCPU }, + { "vz_ghfc", VCPU_STAT(vz_ghfc_exits), KVM_STAT_VCPU }, + { "vz_gpa", VCPU_STAT(vz_gpa_exits), KVM_STAT_VCPU }, + { "vz_resvd", VCPU_STAT(vz_resvd_exits), KVM_STAT_VCPU }, +#endif { "halt_successful_poll", VCPU_STAT(halt_successful_poll), KVM_STAT_VCPU }, { "halt_attempted_poll", VCPU_STAT(halt_attempted_poll), KVM_STAT_VCPU }, { "halt_poll_invalid", VCPU_STAT(halt_poll_invalid), KVM_STAT_VCPU }, @@ -66,6 +76,19 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { {NULL} }; +bool kvm_trace_guest_mode_change; + +int kvm_guest_mode_change_trace_reg(void) +{ + kvm_trace_guest_mode_change = 1; + return 0; +} + +void kvm_guest_mode_change_trace_unreg(void) +{ + kvm_trace_guest_mode_change = 0; +} + /* * XXXKYMA: We are simulatoring a processor that has the WII bit set in * Config7, so we are "runnable" if interrupts are pending @@ -82,7 +105,12 @@ int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) int kvm_arch_hardware_enable(void) { - return 0; + return kvm_mips_callbacks->hardware_enable(); +} + +void kvm_arch_hardware_disable(void) +{ + kvm_mips_callbacks->hardware_disable(); } int kvm_arch_hardware_setup(void) @@ -97,6 +125,18 @@ void kvm_arch_check_processor_compat(void *rtn) int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + switch (type) { +#ifdef CONFIG_KVM_MIPS_VZ + case KVM_VM_MIPS_VZ: +#else + case KVM_VM_MIPS_TE: +#endif + break; + default: + /* Unsupported KVM type */ + return -EINVAL; + }; + /* Allocate page table to map GPA -> RPA */ kvm->arch.gpa_mm.pgd = kvm_pgd_alloc(); if (!kvm->arch.gpa_mm.pgd) @@ -301,8 +341,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Build guest exception vectors dynamically in unmapped memory */ handler = gebase + 0x2000; - /* TLB refill */ + /* TLB refill (or XTLB refill on 64-bit VZ where KX=1) */ refill_start = gebase; + if (IS_ENABLED(CONFIG_KVM_MIPS_VZ) && IS_ENABLED(CONFIG_64BIT)) + refill_start += 0x080; refill_end = kvm_mips_build_tlb_refill_exception(refill_start, handler); /* General Exception Entry point */ @@ -353,9 +395,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id) /* Init */ vcpu->arch.last_sched_cpu = -1; - - /* Start off the timer */ - kvm_mips_init_count(vcpu); + vcpu->arch.last_exec_cpu = -1; return vcpu; @@ -1030,9 +1070,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_IMMEDIATE_EXIT: r = 1; break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_NR_VCPUS: r = num_online_cpus(); break; @@ -1059,7 +1096,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = cpu_has_msa && !(boot_cpu_data.msa_id & MSA_IR_WRPF); break; default: - r = 0; + r = kvm_mips_callbacks->check_extension(kvm, ext); break; } return r; @@ -1067,7 +1104,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) { - return kvm_mips_pending_timer(vcpu); + return kvm_mips_pending_timer(vcpu) || + kvm_read_c0_guest_cause(vcpu->arch.cop0) & C_TI; } int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) @@ -1092,7 +1130,7 @@ int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu) kvm_debug("\tlo: 0x%08lx\n", vcpu->arch.lo); cop0 = vcpu->arch.cop0; - kvm_debug("\tStatus: 0x%08lx, Cause: 0x%08lx\n", + kvm_debug("\tStatus: 0x%08x, Cause: 0x%08x\n", kvm_read_c0_guest_status(cop0), kvm_read_c0_guest_cause(cop0)); @@ -1208,7 +1246,8 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) vcpu->mode = OUTSIDE_GUEST_MODE; /* re-enable HTW before enabling interrupts */ - htw_start(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) + htw_start(); /* Set a default exit reason */ run->exit_reason = KVM_EXIT_UNKNOWN; @@ -1226,17 +1265,20 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) cause, opc, run, vcpu); trace_kvm_exit(vcpu, exccode); - /* - * Do a privilege check, if in UM most of these exit conditions end up - * causing an exception to be delivered to the Guest Kernel - */ - er = kvm_mips_check_privilege(cause, opc, run, vcpu); - if (er == EMULATE_PRIV_FAIL) { - goto skip_emul; - } else if (er == EMULATE_FAIL) { - run->exit_reason = KVM_EXIT_INTERNAL_ERROR; - ret = RESUME_HOST; - goto skip_emul; + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + /* + * Do a privilege check, if in UM most of these exit conditions + * end up causing an exception to be delivered to the Guest + * Kernel + */ + er = kvm_mips_check_privilege(cause, opc, run, vcpu); + if (er == EMULATE_PRIV_FAIL) { + goto skip_emul; + } else if (er == EMULATE_FAIL) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + goto skip_emul; + } } switch (exccode) { @@ -1267,7 +1309,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) break; case EXCCODE_TLBS: - kvm_debug("TLB ST fault: cause %#x, status %#lx, PC: %p, BadVaddr: %#lx\n", + kvm_debug("TLB ST fault: cause %#x, status %#x, PC: %p, BadVaddr: %#lx\n", cause, kvm_read_c0_guest_status(vcpu->arch.cop0), opc, badvaddr); @@ -1328,12 +1370,17 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) ret = kvm_mips_callbacks->handle_msa_disabled(vcpu); break; + case EXCCODE_GE: + /* defer exit accounting to handler */ + ret = kvm_mips_callbacks->handle_guest_exit(vcpu); + break; + default: if (cause & CAUSEF_BD) opc += 1; inst = 0; kvm_get_badinstr(opc, vcpu, &inst); - kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#lx\n", + kvm_err("Exception Code: %d, not yet handled, @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n", exccode, opc, inst, badvaddr, kvm_read_c0_guest_status(vcpu->arch.cop0)); kvm_arch_vcpu_dump_regs(vcpu); @@ -1346,6 +1393,9 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu) skip_emul: local_irq_disable(); + if (ret == RESUME_GUEST) + kvm_vz_acquire_htimer(vcpu); + if (er == EMULATE_DONE && !(ret & RESUME_HOST)) kvm_mips_deliver_interrupts(vcpu, cause); @@ -1391,7 +1441,8 @@ skip_emul: } /* Disable HTW before returning to guest or host */ - htw_stop(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) + htw_stop(); return ret; } @@ -1527,16 +1578,18 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu) void kvm_lose_fpu(struct kvm_vcpu *vcpu) { /* - * FPU & MSA get disabled in root context (hardware) when it is disabled - * in guest context (software), but the register state in the hardware - * may still be in use. This is why we explicitly re-enable the hardware - * before saving. + * With T&E, FPU & MSA get disabled in root context (hardware) when it + * is disabled in guest context (software), but the register state in + * the hardware may still be in use. + * This is why we explicitly re-enable the hardware before saving. */ preempt_disable(); if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { - set_c0_config5(MIPS_CONF5_MSAEN); - enable_fpu_hazard(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + set_c0_config5(MIPS_CONF5_MSAEN); + enable_fpu_hazard(); + } __kvm_save_msa(&vcpu->arch); trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA); @@ -1549,8 +1602,10 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu) } vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA); } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) { - set_c0_status(ST0_CU1); - enable_fpu_hazard(); + if (!IS_ENABLED(CONFIG_KVM_MIPS_VZ)) { + set_c0_status(ST0_CU1); + enable_fpu_hazard(); + } __kvm_save_fpu(&vcpu->arch); vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU; diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c index cb0faade311e..ee64db032793 100644 --- a/arch/mips/kvm/mmu.c +++ b/arch/mips/kvm/mmu.c @@ -992,6 +992,22 @@ static pte_t kvm_mips_gpa_pte_to_gva_mapped(pte_t pte, long entrylo) return kvm_mips_gpa_pte_to_gva_unmapped(pte); } +#ifdef CONFIG_KVM_MIPS_VZ +int kvm_mips_handle_vz_root_tlb_fault(unsigned long badvaddr, + struct kvm_vcpu *vcpu, + bool write_fault) +{ + int ret; + + ret = kvm_mips_map_page(vcpu, badvaddr, write_fault, NULL, NULL); + if (ret) + return ret; + + /* Invalidate this entry in the TLB */ + return kvm_vz_host_tlb_inv(vcpu, badvaddr); +} +#endif + /* XXXKYMA: Must be called with interrupts disabled */ int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr, struct kvm_vcpu *vcpu, @@ -1225,6 +1241,10 @@ int kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu, u32 *out) { int err; + if (WARN(IS_ENABLED(CONFIG_KVM_MIPS_VZ), + "Expect BadInstr/BadInstrP registers to be used with VZ\n")) + return -EINVAL; + retry: kvm_trap_emul_gva_lockless_begin(vcpu); err = get_user(*out, opc); diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c index 2819eb793345..7c6336dd2638 100644 --- a/arch/mips/kvm/tlb.c +++ b/arch/mips/kvm/tlb.c @@ -33,6 +33,25 @@ #define KVM_GUEST_PC_TLB 0 #define KVM_GUEST_SP_TLB 1 +#ifdef CONFIG_KVM_MIPS_VZ +unsigned long GUESTID_MASK; +EXPORT_SYMBOL_GPL(GUESTID_MASK); +unsigned long GUESTID_FIRST_VERSION; +EXPORT_SYMBOL_GPL(GUESTID_FIRST_VERSION); +unsigned long GUESTID_VERSION_MASK; +EXPORT_SYMBOL_GPL(GUESTID_VERSION_MASK); + +static u32 kvm_mips_get_root_asid(struct kvm_vcpu *vcpu) +{ + struct mm_struct *gpa_mm = &vcpu->kvm->arch.gpa_mm; + + if (cpu_has_guestid) + return 0; + else + return cpu_asid(smp_processor_id(), gpa_mm); +} +#endif + static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -166,6 +185,13 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, local_irq_restore(flags); + /* + * We don't want to get reserved instruction exceptions for missing tlb + * entries. + */ + if (cpu_has_vtag_icache) + flush_icache_all(); + if (user && idx_user >= 0) kvm_debug("%s: Invalidated guest user entryhi %#lx @ idx %d\n", __func__, (va & VPN2_MASK) | @@ -179,6 +205,421 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, } EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_inv); +#ifdef CONFIG_KVM_MIPS_VZ + +/* GuestID management */ + +/** + * clear_root_gid() - Set GuestCtl1.RID for normal root operation. + */ +static inline void clear_root_gid(void) +{ + if (cpu_has_guestid) { + clear_c0_guestctl1(MIPS_GCTL1_RID); + mtc0_tlbw_hazard(); + } +} + +/** + * set_root_gid_to_guest_gid() - Set GuestCtl1.RID to match GuestCtl1.ID. + * + * Sets the root GuestID to match the current guest GuestID, for TLB operation + * on the GPA->RPA mappings in the root TLB. + * + * The caller must be sure to disable HTW while the root GID is set, and + * possibly longer if TLB registers are modified. + */ +static inline void set_root_gid_to_guest_gid(void) +{ + unsigned int guestctl1; + + if (cpu_has_guestid) { + back_to_back_c0_hazard(); + guestctl1 = read_c0_guestctl1(); + guestctl1 = (guestctl1 & ~MIPS_GCTL1_RID) | + ((guestctl1 & MIPS_GCTL1_ID) >> MIPS_GCTL1_ID_SHIFT) + << MIPS_GCTL1_RID_SHIFT; + write_c0_guestctl1(guestctl1); + mtc0_tlbw_hazard(); + } +} + +int kvm_vz_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va) +{ + int idx; + unsigned long flags, old_entryhi; + + local_irq_save(flags); + htw_stop(); + + /* Set root GuestID for root probe and write of guest TLB entry */ + set_root_gid_to_guest_gid(); + + old_entryhi = read_c0_entryhi(); + + idx = _kvm_mips_host_tlb_inv((va & VPN2_MASK) | + kvm_mips_get_root_asid(vcpu)); + + write_c0_entryhi(old_entryhi); + clear_root_gid(); + mtc0_tlbw_hazard(); + + htw_start(); + local_irq_restore(flags); + + /* + * We don't want to get reserved instruction exceptions for missing tlb + * entries. + */ + if (cpu_has_vtag_icache) + flush_icache_all(); + + if (idx > 0) + kvm_debug("%s: Invalidated root entryhi %#lx @ idx %d\n", + __func__, (va & VPN2_MASK) | + kvm_mips_get_root_asid(vcpu), idx); + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_vz_host_tlb_inv); + +/** + * kvm_vz_guest_tlb_lookup() - Lookup a guest VZ TLB mapping. + * @vcpu: KVM VCPU pointer. + * @gpa: Guest virtual address in a TLB mapped guest segment. + * @gpa: Ponter to output guest physical address it maps to. + * + * Converts a guest virtual address in a guest TLB mapped segment to a guest + * physical address, by probing the guest TLB. + * + * Returns: 0 if guest TLB mapping exists for @gva. *@gpa will have been + * written. + * -EFAULT if no guest TLB mapping exists for @gva. *@gpa may not + * have been written. + */ +int kvm_vz_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa) +{ + unsigned long o_entryhi, o_entrylo[2], o_pagemask; + unsigned int o_index; + unsigned long entrylo[2], pagemask, pagemaskbit, pa; + unsigned long flags; + int index; + + /* Probe the guest TLB for a mapping */ + local_irq_save(flags); + /* Set root GuestID for root probe of guest TLB entry */ + htw_stop(); + set_root_gid_to_guest_gid(); + + o_entryhi = read_gc0_entryhi(); + o_index = read_gc0_index(); + + write_gc0_entryhi((o_entryhi & 0x3ff) | (gva & ~0xfffl)); + mtc0_tlbw_hazard(); + guest_tlb_probe(); + tlb_probe_hazard(); + + index = read_gc0_index(); + if (index < 0) { + /* No match, fail */ + write_gc0_entryhi(o_entryhi); + write_gc0_index(o_index); + + clear_root_gid(); + htw_start(); + local_irq_restore(flags); + return -EFAULT; + } + + /* Match! read the TLB entry */ + o_entrylo[0] = read_gc0_entrylo0(); + o_entrylo[1] = read_gc0_entrylo1(); + o_pagemask = read_gc0_pagemask(); + + mtc0_tlbr_hazard(); + guest_tlb_read(); + tlb_read_hazard(); + + entrylo[0] = read_gc0_entrylo0(); + entrylo[1] = read_gc0_entrylo1(); + pagemask = ~read_gc0_pagemask() & ~0x1fffl; + + write_gc0_entryhi(o_entryhi); + write_gc0_index(o_index); + write_gc0_entrylo0(o_entrylo[0]); + write_gc0_entrylo1(o_entrylo[1]); + write_gc0_pagemask(o_pagemask); + + clear_root_gid(); + htw_start(); + local_irq_restore(flags); + + /* Select one of the EntryLo values and interpret the GPA */ + pagemaskbit = (pagemask ^ (pagemask & (pagemask - 1))) >> 1; + pa = entrylo[!!(gva & pagemaskbit)]; + + /* + * TLB entry may have become invalid since TLB probe if physical FTLB + * entries are shared between threads (e.g. I6400). + */ + if (!(pa & ENTRYLO_V)) + return -EFAULT; + + /* + * Note, this doesn't take guest MIPS32 XPA into account, where PFN is + * split with XI/RI in the middle. + */ + pa = (pa << 6) & ~0xfffl; + pa |= gva & ~(pagemask | pagemaskbit); + + *gpa = pa; + return 0; +} +EXPORT_SYMBOL_GPL(kvm_vz_guest_tlb_lookup); + +/** + * kvm_vz_local_flush_roottlb_all_guests() - Flush all root TLB entries for + * guests. + * + * Invalidate all entries in root tlb which are GPA mappings. + */ +void kvm_vz_local_flush_roottlb_all_guests(void) +{ + unsigned long flags; + unsigned long old_entryhi, old_pagemask, old_guestctl1; + int entry; + + if (WARN_ON(!cpu_has_guestid)) + return; + + local_irq_save(flags); + htw_stop(); + + /* TLBR may clobber EntryHi.ASID, PageMask, and GuestCtl1.RID */ + old_entryhi = read_c0_entryhi(); + old_pagemask = read_c0_pagemask(); + old_guestctl1 = read_c0_guestctl1(); + + /* + * Invalidate guest entries in root TLB while leaving root entries + * intact when possible. + */ + for (entry = 0; entry < current_cpu_data.tlbsize; entry++) { + write_c0_index(entry); + mtc0_tlbw_hazard(); + tlb_read(); + tlb_read_hazard(); + + /* Don't invalidate non-guest (RVA) mappings in the root TLB */ + if (!(read_c0_guestctl1() & MIPS_GCTL1_RID)) + continue; + + /* Make sure all entries differ. */ + write_c0_entryhi(UNIQUE_ENTRYHI(entry)); + write_c0_entrylo0(0); + write_c0_entrylo1(0); + write_c0_guestctl1(0); + mtc0_tlbw_hazard(); + tlb_write_indexed(); + } + + write_c0_entryhi(old_entryhi); + write_c0_pagemask(old_pagemask); + write_c0_guestctl1(old_guestctl1); + tlbw_use_hazard(); + + htw_start(); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(kvm_vz_local_flush_roottlb_all_guests); + +/** + * kvm_vz_local_flush_guesttlb_all() - Flush all guest TLB entries. + * + * Invalidate all entries in guest tlb irrespective of guestid. + */ +void kvm_vz_local_flush_guesttlb_all(void) +{ + unsigned long flags; + unsigned long old_index; + unsigned long old_entryhi; + unsigned long old_entrylo[2]; + unsigned long old_pagemask; + int entry; + u64 cvmmemctl2 = 0; + + local_irq_save(flags); + + /* Preserve all clobbered guest registers */ + old_index = read_gc0_index(); + old_entryhi = read_gc0_entryhi(); + old_entrylo[0] = read_gc0_entrylo0(); + old_entrylo[1] = read_gc0_entrylo1(); + old_pagemask = read_gc0_pagemask(); + + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Inhibit machine check due to multiple matching TLB entries */ + cvmmemctl2 = read_c0_cvmmemctl2(); + cvmmemctl2 |= CVMMEMCTL2_INHIBITTS; + write_c0_cvmmemctl2(cvmmemctl2); + break; + }; + + /* Invalidate guest entries in guest TLB */ + write_gc0_entrylo0(0); + write_gc0_entrylo1(0); + write_gc0_pagemask(0); + for (entry = 0; entry < current_cpu_data.guest.tlbsize; entry++) { + /* Make sure all entries differ. */ + write_gc0_index(entry); + write_gc0_entryhi(UNIQUE_GUEST_ENTRYHI(entry)); + mtc0_tlbw_hazard(); + guest_tlb_write_indexed(); + } + + if (cvmmemctl2) { + cvmmemctl2 &= ~CVMMEMCTL2_INHIBITTS; + write_c0_cvmmemctl2(cvmmemctl2); + }; + + write_gc0_index(old_index); + write_gc0_entryhi(old_entryhi); + write_gc0_entrylo0(old_entrylo[0]); + write_gc0_entrylo1(old_entrylo[1]); + write_gc0_pagemask(old_pagemask); + tlbw_use_hazard(); + + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(kvm_vz_local_flush_guesttlb_all); + +/** + * kvm_vz_save_guesttlb() - Save a range of guest TLB entries. + * @buf: Buffer to write TLB entries into. + * @index: Start index. + * @count: Number of entries to save. + * + * Save a range of guest TLB entries. The caller must ensure interrupts are + * disabled. + */ +void kvm_vz_save_guesttlb(struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count) +{ + unsigned int end = index + count; + unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask; + unsigned int guestctl1 = 0; + int old_index, i; + + /* Save registers we're about to clobber */ + old_index = read_gc0_index(); + old_entryhi = read_gc0_entryhi(); + old_entrylo0 = read_gc0_entrylo0(); + old_entrylo1 = read_gc0_entrylo1(); + old_pagemask = read_gc0_pagemask(); + + /* Set root GuestID for root probe */ + htw_stop(); + set_root_gid_to_guest_gid(); + if (cpu_has_guestid) + guestctl1 = read_c0_guestctl1(); + + /* Read each entry from guest TLB */ + for (i = index; i < end; ++i, ++buf) { + write_gc0_index(i); + + mtc0_tlbr_hazard(); + guest_tlb_read(); + tlb_read_hazard(); + + if (cpu_has_guestid && + (read_c0_guestctl1() ^ guestctl1) & MIPS_GCTL1_RID) { + /* Entry invalid or belongs to another guest */ + buf->tlb_hi = UNIQUE_GUEST_ENTRYHI(i); + buf->tlb_lo[0] = 0; + buf->tlb_lo[1] = 0; + buf->tlb_mask = 0; + } else { + /* Entry belongs to the right guest */ + buf->tlb_hi = read_gc0_entryhi(); + buf->tlb_lo[0] = read_gc0_entrylo0(); + buf->tlb_lo[1] = read_gc0_entrylo1(); + buf->tlb_mask = read_gc0_pagemask(); + } + } + + /* Clear root GuestID again */ + clear_root_gid(); + htw_start(); + + /* Restore clobbered registers */ + write_gc0_index(old_index); + write_gc0_entryhi(old_entryhi); + write_gc0_entrylo0(old_entrylo0); + write_gc0_entrylo1(old_entrylo1); + write_gc0_pagemask(old_pagemask); + + tlbw_use_hazard(); +} +EXPORT_SYMBOL_GPL(kvm_vz_save_guesttlb); + +/** + * kvm_vz_load_guesttlb() - Save a range of guest TLB entries. + * @buf: Buffer to read TLB entries from. + * @index: Start index. + * @count: Number of entries to load. + * + * Load a range of guest TLB entries. The caller must ensure interrupts are + * disabled. + */ +void kvm_vz_load_guesttlb(const struct kvm_mips_tlb *buf, unsigned int index, + unsigned int count) +{ + unsigned int end = index + count; + unsigned long old_entryhi, old_entrylo0, old_entrylo1, old_pagemask; + int old_index, i; + + /* Save registers we're about to clobber */ + old_index = read_gc0_index(); + old_entryhi = read_gc0_entryhi(); + old_entrylo0 = read_gc0_entrylo0(); + old_entrylo1 = read_gc0_entrylo1(); + old_pagemask = read_gc0_pagemask(); + + /* Set root GuestID for root probe */ + htw_stop(); + set_root_gid_to_guest_gid(); + + /* Write each entry to guest TLB */ + for (i = index; i < end; ++i, ++buf) { + write_gc0_index(i); + write_gc0_entryhi(buf->tlb_hi); + write_gc0_entrylo0(buf->tlb_lo[0]); + write_gc0_entrylo1(buf->tlb_lo[1]); + write_gc0_pagemask(buf->tlb_mask); + + mtc0_tlbw_hazard(); + guest_tlb_write_indexed(); + } + + /* Clear root GuestID again */ + clear_root_gid(); + htw_start(); + + /* Restore clobbered registers */ + write_gc0_index(old_index); + write_gc0_entryhi(old_entryhi); + write_gc0_entrylo0(old_entrylo0); + write_gc0_entrylo1(old_entrylo1); + write_gc0_pagemask(old_pagemask); + + tlbw_use_hazard(); +} +EXPORT_SYMBOL_GPL(kvm_vz_load_guesttlb); + +#endif + /** * kvm_mips_suspend_mm() - Suspend the active mm. * @cpu The CPU we're running on. diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h index c858cf168078..a8c7fd7bf6d2 100644 --- a/arch/mips/kvm/trace.h +++ b/arch/mips/kvm/trace.h @@ -18,6 +18,13 @@ #define TRACE_INCLUDE_FILE trace /* + * arch/mips/kvm/mips.c + */ +extern bool kvm_trace_guest_mode_change; +int kvm_guest_mode_change_trace_reg(void); +void kvm_guest_mode_change_trace_unreg(void); + +/* * Tracepoints for VM enters */ DECLARE_EVENT_CLASS(kvm_transition, @@ -62,10 +69,20 @@ DEFINE_EVENT(kvm_transition, kvm_out, #define KVM_TRACE_EXIT_MSA_FPE 14 #define KVM_TRACE_EXIT_FPE 15 #define KVM_TRACE_EXIT_MSA_DISABLED 21 +#define KVM_TRACE_EXIT_GUEST_EXIT 27 /* Further exit reasons */ #define KVM_TRACE_EXIT_WAIT 32 #define KVM_TRACE_EXIT_CACHE 33 #define KVM_TRACE_EXIT_SIGNAL 34 +/* 32 exit reasons correspond to GuestCtl0.GExcCode (VZ) */ +#define KVM_TRACE_EXIT_GEXCCODE_BASE 64 +#define KVM_TRACE_EXIT_GPSI 64 /* 0 */ +#define KVM_TRACE_EXIT_GSFC 65 /* 1 */ +#define KVM_TRACE_EXIT_HC 66 /* 2 */ +#define KVM_TRACE_EXIT_GRR 67 /* 3 */ +#define KVM_TRACE_EXIT_GVA 72 /* 8 */ +#define KVM_TRACE_EXIT_GHFC 73 /* 9 */ +#define KVM_TRACE_EXIT_GPA 74 /* 10 */ /* Tracepoints for VM exits */ #define kvm_trace_symbol_exit_types \ @@ -83,9 +100,17 @@ DEFINE_EVENT(kvm_transition, kvm_out, { KVM_TRACE_EXIT_MSA_FPE, "MSA FPE" }, \ { KVM_TRACE_EXIT_FPE, "FPE" }, \ { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \ + { KVM_TRACE_EXIT_GUEST_EXIT, "Guest Exit" }, \ { KVM_TRACE_EXIT_WAIT, "WAIT" }, \ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \ - { KVM_TRACE_EXIT_SIGNAL, "Signal" } + { KVM_TRACE_EXIT_SIGNAL, "Signal" }, \ + { KVM_TRACE_EXIT_GPSI, "GPSI" }, \ + { KVM_TRACE_EXIT_GSFC, "GSFC" }, \ + { KVM_TRACE_EXIT_HC, "HC" }, \ + { KVM_TRACE_EXIT_GRR, "GRR" }, \ + { KVM_TRACE_EXIT_GVA, "GVA" }, \ + { KVM_TRACE_EXIT_GHFC, "GHFC" }, \ + { KVM_TRACE_EXIT_GPA, "GPA" } TRACE_EVENT(kvm_exit, TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason), @@ -158,6 +183,8 @@ TRACE_EVENT(kvm_exit, { KVM_TRACE_COP0(16, 4), "Config4" }, \ { KVM_TRACE_COP0(16, 5), "Config5" }, \ { KVM_TRACE_COP0(16, 7), "Config7" }, \ + { KVM_TRACE_COP0(17, 1), "MAAR" }, \ + { KVM_TRACE_COP0(17, 2), "MAARI" }, \ { KVM_TRACE_COP0(26, 0), "ECC" }, \ { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \ { KVM_TRACE_COP0(31, 2), "KScratch1" }, \ @@ -268,6 +295,51 @@ TRACE_EVENT(kvm_asid_change, __entry->new_asid) ); +TRACE_EVENT(kvm_guestid_change, + TP_PROTO(struct kvm_vcpu *vcpu, unsigned int guestid), + TP_ARGS(vcpu, guestid), + TP_STRUCT__entry( + __field(unsigned int, guestid) + ), + + TP_fast_assign( + __entry->guestid = guestid; + ), + + TP_printk("GuestID: 0x%02x", + __entry->guestid) +); + +TRACE_EVENT_FN(kvm_guest_mode_change, + TP_PROTO(struct kvm_vcpu *vcpu), + TP_ARGS(vcpu), + TP_STRUCT__entry( + __field(unsigned long, epc) + __field(unsigned long, pc) + __field(unsigned long, badvaddr) + __field(unsigned int, status) + __field(unsigned int, cause) + ), + + TP_fast_assign( + __entry->epc = kvm_read_c0_guest_epc(vcpu->arch.cop0); + __entry->pc = vcpu->arch.pc; + __entry->badvaddr = kvm_read_c0_guest_badvaddr(vcpu->arch.cop0); + __entry->status = kvm_read_c0_guest_status(vcpu->arch.cop0); + __entry->cause = kvm_read_c0_guest_cause(vcpu->arch.cop0); + ), + + TP_printk("EPC: 0x%08lx PC: 0x%08lx Status: 0x%08x Cause: 0x%08x BadVAddr: 0x%08lx", + __entry->epc, + __entry->pc, + __entry->status, + __entry->cause, + __entry->badvaddr), + + kvm_guest_mode_change_trace_reg, + kvm_guest_mode_change_trace_unreg +); + #endif /* _TRACE_KVM_H */ /* This part must be outside protection */ diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c index b1fa53b252ea..a563759fd142 100644 --- a/arch/mips/kvm/trap_emul.c +++ b/arch/mips/kvm/trap_emul.c @@ -12,6 +12,7 @@ #include <linux/errno.h> #include <linux/err.h> #include <linux/kvm_host.h> +#include <linux/log2.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <asm/mmu_context.h> @@ -40,6 +41,29 @@ static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva) return gpa; } +static int kvm_trap_emul_no_handler(struct kvm_vcpu *vcpu) +{ + u32 __user *opc = (u32 __user *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE; + unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; + u32 inst = 0; + + /* + * Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + kvm_get_badinstr(opc, vcpu, &inst); + + kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n", + exccode, opc, inst, badvaddr, + kvm_read_c0_guest_status(vcpu->arch.cop0)); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; +} + static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) { struct mips_coproc *cop0 = vcpu->arch.cop0; @@ -82,6 +106,10 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu) ret = RESUME_HOST; break; + case EMULATE_HYPERCALL: + ret = kvm_mips_handle_hypcall(vcpu); + break; + default: BUG(); } @@ -484,6 +512,31 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu) return ret; } +static int kvm_trap_emul_hardware_enable(void) +{ + return 0; +} + +static void kvm_trap_emul_hardware_disable(void) +{ +} + +static int kvm_trap_emul_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_MIPS_TE: + r = 1; + break; + default: + r = 0; + break; + } + + return r; +} + static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu) { struct mm_struct *kern_mm = &vcpu->arch.guest_kernel_mm; @@ -561,6 +614,9 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) u32 config, config1; int vcpu_id = vcpu->vcpu_id; + /* Start off the timer at 100 MHz */ + kvm_mips_init_count(vcpu, 100*1000*1000); + /* * Arch specific stuff, set up config registers properly so that the * guest will come up as expected @@ -589,6 +645,13 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu) /* Read the cache characteristics from the host Config1 Register */ config1 = (read_c0_config1() & ~0x7f); + /* DCache line size not correctly reported in Config1 on Octeon CPUs */ + if (cpu_dcache_line_size()) { + config1 &= ~MIPS_CONF1_DL; + config1 |= ((ilog2(cpu_dcache_line_size()) - 1) << + MIPS_CONF1_DL_SHF) & MIPS_CONF1_DL; + } + /* Set up MMU size */ config1 &= ~(0x3f << 25); config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25); @@ -892,10 +955,12 @@ static int kvm_trap_emul_set_one_reg(struct kvm_vcpu *vcpu, if (v & CAUSEF_DC) { /* disable timer first */ kvm_mips_count_disable_cause(vcpu); - kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v); + kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC, + v); } else { /* enable timer last */ - kvm_change_c0_guest_cause(cop0, ~CAUSEF_DC, v); + kvm_change_c0_guest_cause(cop0, (u32)~CAUSEF_DC, + v); kvm_mips_count_enable_cause(vcpu); } } else { @@ -1230,7 +1295,11 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = { .handle_msa_fpe = kvm_trap_emul_handle_msa_fpe, .handle_fpe = kvm_trap_emul_handle_fpe, .handle_msa_disabled = kvm_trap_emul_handle_msa_disabled, + .handle_guest_exit = kvm_trap_emul_no_handler, + .hardware_enable = kvm_trap_emul_hardware_enable, + .hardware_disable = kvm_trap_emul_hardware_disable, + .check_extension = kvm_trap_emul_check_extension, .vcpu_init = kvm_trap_emul_vcpu_init, .vcpu_uninit = kvm_trap_emul_vcpu_uninit, .vcpu_setup = kvm_trap_emul_vcpu_setup, diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c new file mode 100644 index 000000000000..71d8856ade64 --- /dev/null +++ b/arch/mips/kvm/vz.c @@ -0,0 +1,3223 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * KVM/MIPS: Support for hardware virtualization extensions + * + * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved. + * Authors: Yann Le Du <ledu@kymasys.com> + */ + +#include <linux/errno.h> +#include <linux/err.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/vmalloc.h> +#include <asm/cacheflush.h> +#include <asm/cacheops.h> +#include <asm/cmpxchg.h> +#include <asm/fpu.h> +#include <asm/hazards.h> +#include <asm/inst.h> +#include <asm/mmu_context.h> +#include <asm/r4kcache.h> +#include <asm/time.h> +#include <asm/tlb.h> +#include <asm/tlbex.h> + +#include <linux/kvm_host.h> + +#include "interrupt.h" + +#include "trace.h" + +/* Pointers to last VCPU loaded on each physical CPU */ +static struct kvm_vcpu *last_vcpu[NR_CPUS]; +/* Pointers to last VCPU executed on each physical CPU */ +static struct kvm_vcpu *last_exec_vcpu[NR_CPUS]; + +/* + * Number of guest VTLB entries to use, so we can catch inconsistency between + * CPUs. + */ +static unsigned int kvm_vz_guest_vtlb_size; + +static inline long kvm_vz_read_gc0_ebase(void) +{ + if (sizeof(long) == 8 && cpu_has_ebase_wg) + return read_gc0_ebase_64(); + else + return read_gc0_ebase(); +} + +static inline void kvm_vz_write_gc0_ebase(long v) +{ + /* + * First write with WG=1 to write upper bits, then write again in case + * WG should be left at 0. + * write_gc0_ebase_64() is no longer UNDEFINED since R6. + */ + if (sizeof(long) == 8 && + (cpu_has_mips64r6 || cpu_has_ebase_wg)) { + write_gc0_ebase_64(v | MIPS_EBASE_WG); + write_gc0_ebase_64(v); + } else { + write_gc0_ebase(v | MIPS_EBASE_WG); + write_gc0_ebase(v); + } +} + +/* + * These Config bits may be writable by the guest: + * Config: [K23, KU] (!TLB), K0 + * Config1: (none) + * Config2: [TU, SU] (impl) + * Config3: ISAOnExc + * Config4: FTLBPageSize + * Config5: K, CV, MSAEn, UFE, FRE, SBRI, UFR + */ + +static inline unsigned int kvm_vz_config_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return CONF_CM_CMASK; +} + +static inline unsigned int kvm_vz_config1_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline unsigned int kvm_vz_config2_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return 0; +} + +static inline unsigned int kvm_vz_config3_guest_wrmask(struct kvm_vcpu *vcpu) +{ + return MIPS_CONF3_ISA_OE; +} + +static inline unsigned int kvm_vz_config4_guest_wrmask(struct kvm_vcpu *vcpu) +{ + /* no need to be exact */ + return MIPS_CONF4_VFTLBPAGESIZE; +} + +static inline unsigned int kvm_vz_config5_guest_wrmask(struct kvm_vcpu *vcpu) +{ + unsigned int mask = MIPS_CONF5_K | MIPS_CONF5_CV | MIPS_CONF5_SBRI; + + /* Permit MSAEn changes if MSA supported and enabled */ + if (kvm_mips_guest_has_msa(&vcpu->arch)) + mask |= MIPS_CONF5_MSAEN; + + /* + * Permit guest FPU mode changes if FPU is enabled and the relevant + * feature exists according to FIR register. + */ + if (kvm_mips_guest_has_fpu(&vcpu->arch)) { + if (cpu_has_ufr) + mask |= MIPS_CONF5_UFR; + if (cpu_has_fre) + mask |= MIPS_CONF5_FRE | MIPS_CONF5_UFE; + } + + return mask; +} + +/* + * VZ optionally allows these additional Config bits to be written by root: + * Config: M, [MT] + * Config1: M, [MMUSize-1, C2, MD, PC, WR, CA], FP + * Config2: M + * Config3: M, MSAP, [BPG], ULRI, [DSP2P, DSPP], CTXTC, [ITL, LPA, VEIC, + * VInt, SP, CDMM, MT, SM, TL] + * Config4: M, [VTLBSizeExt, MMUSizeExt] + * Config5: MRP + */ + +static inline unsigned int kvm_vz_config_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config_guest_wrmask(vcpu) | MIPS_CONF_M; +} + +static inline unsigned int kvm_vz_config1_user_wrmask(struct kvm_vcpu *vcpu) +{ + unsigned int mask = kvm_vz_config1_guest_wrmask(vcpu) | MIPS_CONF_M; + + /* Permit FPU to be present if FPU is supported */ + if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) + mask |= MIPS_CONF1_FP; + + return mask; +} + +static inline unsigned int kvm_vz_config2_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config2_guest_wrmask(vcpu) | MIPS_CONF_M; +} + +static inline unsigned int kvm_vz_config3_user_wrmask(struct kvm_vcpu *vcpu) +{ + unsigned int mask = kvm_vz_config3_guest_wrmask(vcpu) | MIPS_CONF_M | + MIPS_CONF3_ULRI | MIPS_CONF3_CTXTC; + + /* Permit MSA to be present if MSA is supported */ + if (kvm_mips_guest_can_have_msa(&vcpu->arch)) + mask |= MIPS_CONF3_MSA; + + return mask; +} + +static inline unsigned int kvm_vz_config4_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config4_guest_wrmask(vcpu) | MIPS_CONF_M; +} + +static inline unsigned int kvm_vz_config5_user_wrmask(struct kvm_vcpu *vcpu) +{ + return kvm_vz_config5_guest_wrmask(vcpu) | MIPS_CONF5_MRP; +} + +static gpa_t kvm_vz_gva_to_gpa_cb(gva_t gva) +{ + /* VZ guest has already converted gva to gpa */ + return gva; +} + +static void kvm_vz_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority) +{ + set_bit(priority, &vcpu->arch.pending_exceptions); + clear_bit(priority, &vcpu->arch.pending_exceptions_clr); +} + +static void kvm_vz_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority) +{ + clear_bit(priority, &vcpu->arch.pending_exceptions); + set_bit(priority, &vcpu->arch.pending_exceptions_clr); +} + +static void kvm_vz_queue_timer_int_cb(struct kvm_vcpu *vcpu) +{ + /* + * timer expiry is asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER); +} + +static void kvm_vz_dequeue_timer_int_cb(struct kvm_vcpu *vcpu) +{ + /* + * timer expiry is asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_TIMER); +} + +static void kvm_vz_queue_io_int_cb(struct kvm_vcpu *vcpu, + struct kvm_mips_interrupt *irq) +{ + int intr = (int)irq->irq; + + /* + * interrupts are asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + switch (intr) { + case 2: + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IO); + break; + + case 3: + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_1); + break; + + case 4: + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_IPI_2); + break; + + default: + break; + } + +} + +static void kvm_vz_dequeue_io_int_cb(struct kvm_vcpu *vcpu, + struct kvm_mips_interrupt *irq) +{ + int intr = (int)irq->irq; + + /* + * interrupts are asynchronous to vcpu execution therefore defer guest + * cp0 accesses + */ + switch (intr) { + case -2: + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IO); + break; + + case -3: + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_1); + break; + + case -4: + kvm_vz_dequeue_irq(vcpu, MIPS_EXC_INT_IPI_2); + break; + + default: + break; + } + +} + +static u32 kvm_vz_priority_to_irq[MIPS_EXC_MAX] = { + [MIPS_EXC_INT_TIMER] = C_IRQ5, + [MIPS_EXC_INT_IO] = C_IRQ0, + [MIPS_EXC_INT_IPI_1] = C_IRQ1, + [MIPS_EXC_INT_IPI_2] = C_IRQ2, +}; + +static int kvm_vz_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority, + u32 cause) +{ + u32 irq = (priority < MIPS_EXC_MAX) ? + kvm_vz_priority_to_irq[priority] : 0; + + switch (priority) { + case MIPS_EXC_INT_TIMER: + set_gc0_cause(C_TI); + break; + + case MIPS_EXC_INT_IO: + case MIPS_EXC_INT_IPI_1: + case MIPS_EXC_INT_IPI_2: + if (cpu_has_guestctl2) + set_c0_guestctl2(irq); + else + set_gc0_cause(irq); + break; + + default: + break; + } + + clear_bit(priority, &vcpu->arch.pending_exceptions); + return 1; +} + +static int kvm_vz_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority, + u32 cause) +{ + u32 irq = (priority < MIPS_EXC_MAX) ? + kvm_vz_priority_to_irq[priority] : 0; + + switch (priority) { + case MIPS_EXC_INT_TIMER: + /* + * Call to kvm_write_c0_guest_compare() clears Cause.TI in + * kvm_mips_emulate_CP0(). Explicitly clear irq associated with + * Cause.IP[IPTI] if GuestCtl2 virtual interrupt register not + * supported or if not using GuestCtl2 Hardware Clear. + */ + if (cpu_has_guestctl2) { + if (!(read_c0_guestctl2() & (irq << 14))) + clear_c0_guestctl2(irq); + } else { + clear_gc0_cause(irq); + } + break; + + case MIPS_EXC_INT_IO: + case MIPS_EXC_INT_IPI_1: + case MIPS_EXC_INT_IPI_2: + /* Clear GuestCtl2.VIP irq if not using Hardware Clear */ + if (cpu_has_guestctl2) { + if (!(read_c0_guestctl2() & (irq << 14))) + clear_c0_guestctl2(irq); + } else { + clear_gc0_cause(irq); + } + break; + + default: + break; + } + + clear_bit(priority, &vcpu->arch.pending_exceptions_clr); + return 1; +} + +/* + * VZ guest timer handling. + */ + +/** + * kvm_vz_should_use_htimer() - Find whether to use the VZ hard guest timer. + * @vcpu: Virtual CPU. + * + * Returns: true if the VZ GTOffset & real guest CP0_Count should be used + * instead of software emulation of guest timer. + * false otherwise. + */ +static bool kvm_vz_should_use_htimer(struct kvm_vcpu *vcpu) +{ + if (kvm_mips_count_disabled(vcpu)) + return false; + + /* Chosen frequency must match real frequency */ + if (mips_hpt_frequency != vcpu->arch.count_hz) + return false; + + /* We don't support a CP0_GTOffset with fewer bits than CP0_Count */ + if (current_cpu_data.gtoffset_mask != 0xffffffff) + return false; + + return true; +} + +/** + * _kvm_vz_restore_stimer() - Restore soft timer state. + * @vcpu: Virtual CPU. + * @compare: CP0_Compare register value, restored by caller. + * @cause: CP0_Cause register to restore. + * + * Restore VZ state relating to the soft timer. The hard timer can be enabled + * later. + */ +static void _kvm_vz_restore_stimer(struct kvm_vcpu *vcpu, u32 compare, + u32 cause) +{ + /* + * Avoid spurious counter interrupts by setting Guest CP0_Count to just + * after Guest CP0_Compare. + */ + write_c0_gtoffset(compare - read_c0_count()); + + back_to_back_c0_hazard(); + write_gc0_cause(cause); +} + +/** + * _kvm_vz_restore_htimer() - Restore hard timer state. + * @vcpu: Virtual CPU. + * @compare: CP0_Compare register value, restored by caller. + * @cause: CP0_Cause register to restore. + * + * Restore hard timer Guest.Count & Guest.Cause taking care to preserve the + * value of Guest.CP0_Cause.TI while restoring Guest.CP0_Cause. + */ +static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, + u32 compare, u32 cause) +{ + u32 start_count, after_count; + ktime_t freeze_time; + unsigned long flags; + + /* + * Freeze the soft-timer and sync the guest CP0_Count with it. We do + * this with interrupts disabled to avoid latency. + */ + local_irq_save(flags); + freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count); + write_c0_gtoffset(start_count - read_c0_count()); + local_irq_restore(flags); + + /* restore guest CP0_Cause, as TI may already be set */ + back_to_back_c0_hazard(); + write_gc0_cause(cause); + + /* + * The above sequence isn't atomic and would result in lost timer + * interrupts if we're not careful. Detect if a timer interrupt is due + * and assert it. + */ + back_to_back_c0_hazard(); + after_count = read_gc0_count(); + if (after_count - start_count > compare - start_count - 1) + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER); +} + +/** + * kvm_vz_restore_timer() - Restore timer state. + * @vcpu: Virtual CPU. + * + * Restore soft timer state from saved context. + */ +static void kvm_vz_restore_timer(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + u32 cause, compare; + + compare = kvm_read_sw_gc0_compare(cop0); + cause = kvm_read_sw_gc0_cause(cop0); + + write_gc0_compare(compare); + _kvm_vz_restore_stimer(vcpu, compare, cause); +} + +/** + * kvm_vz_acquire_htimer() - Switch to hard timer state. + * @vcpu: Virtual CPU. + * + * Restore hard timer state on top of existing soft timer state if possible. + * + * Since hard timer won't remain active over preemption, preemption should be + * disabled by the caller. + */ +void kvm_vz_acquire_htimer(struct kvm_vcpu *vcpu) +{ + u32 gctl0; + + gctl0 = read_c0_guestctl0(); + if (!(gctl0 & MIPS_GCTL0_GT) && kvm_vz_should_use_htimer(vcpu)) { + /* enable guest access to hard timer */ + write_c0_guestctl0(gctl0 | MIPS_GCTL0_GT); + + _kvm_vz_restore_htimer(vcpu, read_gc0_compare(), + read_gc0_cause()); + } +} + +/** + * _kvm_vz_save_htimer() - Switch to software emulation of guest timer. + * @vcpu: Virtual CPU. + * @compare: Pointer to write compare value to. + * @cause: Pointer to write cause value to. + * + * Save VZ guest timer state and switch to software emulation of guest CP0 + * timer. The hard timer must already be in use, so preemption should be + * disabled. + */ +static void _kvm_vz_save_htimer(struct kvm_vcpu *vcpu, + u32 *out_compare, u32 *out_cause) +{ + u32 cause, compare, before_count, end_count; + ktime_t before_time; + + compare = read_gc0_compare(); + *out_compare = compare; + + before_time = ktime_get(); + + /* + * Record the CP0_Count *prior* to saving CP0_Cause, so we have a time + * at which no pending timer interrupt is missing. + */ + before_count = read_gc0_count(); + back_to_back_c0_hazard(); + cause = read_gc0_cause(); + *out_cause = cause; + + /* + * Record a final CP0_Count which we will transfer to the soft-timer. + * This is recorded *after* saving CP0_Cause, so we don't get any timer + * interrupts from just after the final CP0_Count point. + */ + back_to_back_c0_hazard(); + end_count = read_gc0_count(); + + /* + * The above sequence isn't atomic, so we could miss a timer interrupt + * between reading CP0_Cause and end_count. Detect and record any timer + * interrupt due between before_count and end_count. + */ + if (end_count - before_count > compare - before_count - 1) + kvm_vz_queue_irq(vcpu, MIPS_EXC_INT_TIMER); + + /* + * Restore soft-timer, ignoring a small amount of negative drift due to + * delay between freeze_hrtimer and setting CP0_GTOffset. + */ + kvm_mips_restore_hrtimer(vcpu, before_time, end_count, -0x10000); +} + +/** + * kvm_vz_save_timer() - Save guest timer state. + * @vcpu: Virtual CPU. + * + * Save VZ guest timer state and switch to soft guest timer if hard timer was in + * use. + */ +static void kvm_vz_save_timer(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + u32 gctl0, compare, cause; + + gctl0 = read_c0_guestctl0(); + if (gctl0 & MIPS_GCTL0_GT) { + /* disable guest use of hard timer */ + write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT); + + /* save hard timer state */ + _kvm_vz_save_htimer(vcpu, &compare, &cause); + } else { + compare = read_gc0_compare(); + cause = read_gc0_cause(); + } + + /* save timer-related state to VCPU context */ + kvm_write_sw_gc0_cause(cop0, cause); + kvm_write_sw_gc0_compare(cop0, compare); +} + +/** + * kvm_vz_lose_htimer() - Ensure hard guest timer is not in use. + * @vcpu: Virtual CPU. + * + * Transfers the state of the hard guest timer to the soft guest timer, leaving + * guest state intact so it can continue to be used with the soft timer. + */ +void kvm_vz_lose_htimer(struct kvm_vcpu *vcpu) +{ + u32 gctl0, compare, cause; + + preempt_disable(); + gctl0 = read_c0_guestctl0(); + if (gctl0 & MIPS_GCTL0_GT) { + /* disable guest use of timer */ + write_c0_guestctl0(gctl0 & ~MIPS_GCTL0_GT); + + /* switch to soft timer */ + _kvm_vz_save_htimer(vcpu, &compare, &cause); + + /* leave soft timer in usable state */ + _kvm_vz_restore_stimer(vcpu, compare, cause); + } + preempt_enable(); +} + +/** + * is_eva_access() - Find whether an instruction is an EVA memory accessor. + * @inst: 32-bit instruction encoding. + * + * Finds whether @inst encodes an EVA memory access instruction, which would + * indicate that emulation of it should access the user mode address space + * instead of the kernel mode address space. This matters for MUSUK segments + * which are TLB mapped for user mode but unmapped for kernel mode. + * + * Returns: Whether @inst encodes an EVA accessor instruction. + */ +static bool is_eva_access(union mips_instruction inst) +{ + if (inst.spec3_format.opcode != spec3_op) + return false; + + switch (inst.spec3_format.func) { + case lwle_op: + case lwre_op: + case cachee_op: + case sbe_op: + case she_op: + case sce_op: + case swe_op: + case swle_op: + case swre_op: + case prefe_op: + case lbue_op: + case lhue_op: + case lbe_op: + case lhe_op: + case lle_op: + case lwe_op: + return true; + default: + return false; + } +} + +/** + * is_eva_am_mapped() - Find whether an access mode is mapped. + * @vcpu: KVM VCPU state. + * @am: 3-bit encoded access mode. + * @eu: Segment becomes unmapped and uncached when Status.ERL=1. + * + * Decode @am to find whether it encodes a mapped segment for the current VCPU + * state. Where necessary @eu and the actual instruction causing the fault are + * taken into account to make the decision. + * + * Returns: Whether the VCPU faulted on a TLB mapped address. + */ +static bool is_eva_am_mapped(struct kvm_vcpu *vcpu, unsigned int am, bool eu) +{ + u32 am_lookup; + int err; + + /* + * Interpret access control mode. We assume address errors will already + * have been caught by the guest, leaving us with: + * AM UM SM KM 31..24 23..16 + * UK 0 000 Unm 0 0 + * MK 1 001 TLB 1 + * MSK 2 010 TLB TLB 1 + * MUSK 3 011 TLB TLB TLB 1 + * MUSUK 4 100 TLB TLB Unm 0 1 + * USK 5 101 Unm Unm 0 0 + * - 6 110 0 0 + * UUSK 7 111 Unm Unm Unm 0 0 + * + * We shift a magic value by AM across the sign bit to find if always + * TLB mapped, and if not shift by 8 again to find if it depends on KM. + */ + am_lookup = 0x70080000 << am; + if ((s32)am_lookup < 0) { + /* + * MK, MSK, MUSK + * Always TLB mapped, unless SegCtl.EU && ERL + */ + if (!eu || !(read_gc0_status() & ST0_ERL)) + return true; + } else { + am_lookup <<= 8; + if ((s32)am_lookup < 0) { + union mips_instruction inst; + unsigned int status; + u32 *opc; + + /* + * MUSUK + * TLB mapped if not in kernel mode + */ + status = read_gc0_status(); + if (!(status & (ST0_EXL | ST0_ERL)) && + (status & ST0_KSU)) + return true; + /* + * EVA access instructions in kernel + * mode access user address space. + */ + opc = (u32 *)vcpu->arch.pc; + if (vcpu->arch.host_cp0_cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (!err && is_eva_access(inst)) + return true; + } + } + + return false; +} + +/** + * kvm_vz_gva_to_gpa() - Convert valid GVA to GPA. + * @vcpu: KVM VCPU state. + * @gva: Guest virtual address to convert. + * @gpa: Output guest physical address. + * + * Convert a guest virtual address (GVA) which is valid according to the guest + * context, to a guest physical address (GPA). + * + * Returns: 0 on success. + * -errno on failure. + */ +static int kvm_vz_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, + unsigned long *gpa) +{ + u32 gva32 = gva; + unsigned long segctl; + + if ((long)gva == (s32)gva32) { + /* Handle canonical 32-bit virtual address */ + if (cpu_guest_has_segments) { + unsigned long mask, pa; + + switch (gva32 >> 29) { + case 0: + case 1: /* CFG5 (1GB) */ + segctl = read_gc0_segctl2() >> 16; + mask = (unsigned long)0xfc0000000ull; + break; + case 2: + case 3: /* CFG4 (1GB) */ + segctl = read_gc0_segctl2(); + mask = (unsigned long)0xfc0000000ull; + break; + case 4: /* CFG3 (512MB) */ + segctl = read_gc0_segctl1() >> 16; + mask = (unsigned long)0xfe0000000ull; + break; + case 5: /* CFG2 (512MB) */ + segctl = read_gc0_segctl1(); + mask = (unsigned long)0xfe0000000ull; + break; + case 6: /* CFG1 (512MB) */ + segctl = read_gc0_segctl0() >> 16; + mask = (unsigned long)0xfe0000000ull; + break; + case 7: /* CFG0 (512MB) */ + segctl = read_gc0_segctl0(); + mask = (unsigned long)0xfe0000000ull; + break; + default: + /* + * GCC 4.9 isn't smart enough to figure out that + * segctl and mask are always initialised. + */ + unreachable(); + } + + if (is_eva_am_mapped(vcpu, (segctl >> 4) & 0x7, + segctl & 0x0008)) + goto tlb_mapped; + + /* Unmapped, find guest physical address */ + pa = (segctl << 20) & mask; + pa |= gva32 & ~mask; + *gpa = pa; + return 0; + } else if ((s32)gva32 < (s32)0xc0000000) { + /* legacy unmapped KSeg0 or KSeg1 */ + *gpa = gva32 & 0x1fffffff; + return 0; + } +#ifdef CONFIG_64BIT + } else if ((gva & 0xc000000000000000) == 0x8000000000000000) { + /* XKPHYS */ + if (cpu_guest_has_segments) { + /* + * Each of the 8 regions can be overridden by SegCtl2.XR + * to use SegCtl1.XAM. + */ + segctl = read_gc0_segctl2(); + if (segctl & (1ull << (56 + ((gva >> 59) & 0x7)))) { + segctl = read_gc0_segctl1(); + if (is_eva_am_mapped(vcpu, (segctl >> 59) & 0x7, + 0)) + goto tlb_mapped; + } + + } + /* + * Traditionally fully unmapped. + * Bits 61:59 specify the CCA, which we can just mask off here. + * Bits 58:PABITS should be zero, but we shouldn't have got here + * if it wasn't. + */ + *gpa = gva & 0x07ffffffffffffff; + return 0; +#endif + } + +tlb_mapped: + return kvm_vz_guest_tlb_lookup(vcpu, gva, gpa); +} + +/** + * kvm_vz_badvaddr_to_gpa() - Convert GVA BadVAddr from root exception to GPA. + * @vcpu: KVM VCPU state. + * @badvaddr: Root BadVAddr. + * @gpa: Output guest physical address. + * + * VZ implementations are permitted to report guest virtual addresses (GVA) in + * BadVAddr on a root exception during guest execution, instead of the more + * convenient guest physical addresses (GPA). When we get a GVA, this function + * converts it to a GPA, taking into account guest segmentation and guest TLB + * state. + * + * Returns: 0 on success. + * -errno on failure. + */ +static int kvm_vz_badvaddr_to_gpa(struct kvm_vcpu *vcpu, unsigned long badvaddr, + unsigned long *gpa) +{ + unsigned int gexccode = (vcpu->arch.host_cp0_guestctl0 & + MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT; + + /* If BadVAddr is GPA, then all is well in the world */ + if (likely(gexccode == MIPS_GCTL0_GEXC_GPA)) { + *gpa = badvaddr; + return 0; + } + + /* Otherwise we'd expect it to be GVA ... */ + if (WARN(gexccode != MIPS_GCTL0_GEXC_GVA, + "Unexpected gexccode %#x\n", gexccode)) + return -EINVAL; + + /* ... and we need to perform the GVA->GPA translation in software */ + return kvm_vz_gva_to_gpa(vcpu, badvaddr, gpa); +} + +static int kvm_trap_vz_no_handler(struct kvm_vcpu *vcpu) +{ + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + u32 exccode = (cause & CAUSEF_EXCCODE) >> CAUSEB_EXCCODE; + unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr; + u32 inst = 0; + + /* + * Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + kvm_get_badinstr(opc, vcpu, &inst); + + kvm_err("Exception Code: %d not handled @ PC: %p, inst: 0x%08x BadVaddr: %#lx Status: %#x\n", + exccode, opc, inst, badvaddr, + read_gc0_status()); + kvm_arch_vcpu_dump_regs(vcpu); + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; +} + +static unsigned long mips_process_maar(unsigned int op, unsigned long val) +{ + /* Mask off unused bits */ + unsigned long mask = 0xfffff000 | MIPS_MAAR_S | MIPS_MAAR_VL; + + if (read_gc0_pagegrain() & PG_ELPA) + mask |= 0x00ffffff00000000ull; + if (cpu_guest_has_mvh) + mask |= MIPS_MAAR_VH; + + /* Set or clear VH */ + if (op == mtc_op) { + /* clear VH */ + val &= ~MIPS_MAAR_VH; + } else if (op == dmtc_op) { + /* set VH to match VL */ + val &= ~MIPS_MAAR_VH; + if (val & MIPS_MAAR_VL) + val |= MIPS_MAAR_VH; + } + + return val & mask; +} + +static void kvm_write_maari(struct kvm_vcpu *vcpu, unsigned long val) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + val &= MIPS_MAARI_INDEX; + if (val == MIPS_MAARI_INDEX) + kvm_write_sw_gc0_maari(cop0, ARRAY_SIZE(vcpu->arch.maar) - 1); + else if (val < ARRAY_SIZE(vcpu->arch.maar)) + kvm_write_sw_gc0_maari(cop0, val); +} + +static enum emulation_result kvm_vz_gpsi_cop0(union mips_instruction inst, + u32 *opc, u32 cause, + struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + enum emulation_result er = EMULATE_DONE; + u32 rt, rd, sel; + unsigned long curr_pc; + unsigned long val; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + + if (inst.co_format.co) { + switch (inst.co_format.func) { + case wait_op: + er = kvm_mips_emul_wait(vcpu); + break; + default: + er = EMULATE_FAIL; + } + } else { + rt = inst.c0r_format.rt; + rd = inst.c0r_format.rd; + sel = inst.c0r_format.sel; + + switch (inst.c0r_format.rs) { + case dmfc_op: + case mfc_op: +#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS + cop0->stat[rd][sel]++; +#endif + if (rd == MIPS_CP0_COUNT && + sel == 0) { /* Count */ + val = kvm_mips_read_count(vcpu); + } else if (rd == MIPS_CP0_COMPARE && + sel == 0) { /* Compare */ + val = read_gc0_compare(); + } else if (rd == MIPS_CP0_LLADDR && + sel == 0) { /* LLAddr */ + if (cpu_guest_has_rw_llb) + val = read_gc0_lladdr() & + MIPS_LLADDR_LLB; + else + val = 0; + } else if (rd == MIPS_CP0_LLADDR && + sel == 1 && /* MAAR */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) { + /* MAARI must be in range */ + BUG_ON(kvm_read_sw_gc0_maari(cop0) >= + ARRAY_SIZE(vcpu->arch.maar)); + val = vcpu->arch.maar[ + kvm_read_sw_gc0_maari(cop0)]; + } else if ((rd == MIPS_CP0_PRID && + (sel == 0 || /* PRid */ + sel == 2 || /* CDMMBase */ + sel == 3)) || /* CMGCRBase */ + (rd == MIPS_CP0_STATUS && + (sel == 2 || /* SRSCtl */ + sel == 3)) || /* SRSMap */ + (rd == MIPS_CP0_CONFIG && + (sel == 7)) || /* Config7 */ + (rd == MIPS_CP0_LLADDR && + (sel == 2) && /* MAARI */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) || + (rd == MIPS_CP0_ERRCTL && + (sel == 0))) { /* ErrCtl */ + val = cop0->reg[rd][sel]; + } else { + val = 0; + er = EMULATE_FAIL; + } + + if (er != EMULATE_FAIL) { + /* Sign extend */ + if (inst.c0r_format.rs == mfc_op) + val = (int)val; + vcpu->arch.gprs[rt] = val; + } + + trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mfc_op) ? + KVM_TRACE_MFC0 : KVM_TRACE_DMFC0, + KVM_TRACE_COP0(rd, sel), val); + break; + + case dmtc_op: + case mtc_op: +#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS + cop0->stat[rd][sel]++; +#endif + val = vcpu->arch.gprs[rt]; + trace_kvm_hwr(vcpu, (inst.c0r_format.rs == mtc_op) ? + KVM_TRACE_MTC0 : KVM_TRACE_DMTC0, + KVM_TRACE_COP0(rd, sel), val); + + if (rd == MIPS_CP0_COUNT && + sel == 0) { /* Count */ + kvm_vz_lose_htimer(vcpu); + kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]); + } else if (rd == MIPS_CP0_COMPARE && + sel == 0) { /* Compare */ + kvm_mips_write_compare(vcpu, + vcpu->arch.gprs[rt], + true); + } else if (rd == MIPS_CP0_LLADDR && + sel == 0) { /* LLAddr */ + /* + * P5600 generates GPSI on guest MTC0 LLAddr. + * Only allow the guest to clear LLB. + */ + if (cpu_guest_has_rw_llb && + !(val & MIPS_LLADDR_LLB)) + write_gc0_lladdr(0); + } else if (rd == MIPS_CP0_LLADDR && + sel == 1 && /* MAAR */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) { + val = mips_process_maar(inst.c0r_format.rs, + val); + + /* MAARI must be in range */ + BUG_ON(kvm_read_sw_gc0_maari(cop0) >= + ARRAY_SIZE(vcpu->arch.maar)); + vcpu->arch.maar[kvm_read_sw_gc0_maari(cop0)] = + val; + } else if (rd == MIPS_CP0_LLADDR && + (sel == 2) && /* MAARI */ + cpu_guest_has_maar && + !cpu_guest_has_dyn_maar) { + kvm_write_maari(vcpu, val); + } else if (rd == MIPS_CP0_ERRCTL && + (sel == 0)) { /* ErrCtl */ + /* ignore the written value */ + } else { + er = EMULATE_FAIL; + } + break; + + default: + er = EMULATE_FAIL; + break; + } + } + /* Rollback PC only if emulation was unsuccessful */ + if (er == EMULATE_FAIL) { + kvm_err("[%#lx]%s: unsupported cop0 instruction 0x%08x\n", + curr_pc, __func__, inst.word); + + vcpu->arch.pc = curr_pc; + } + + return er; +} + +static enum emulation_result kvm_vz_gpsi_cache(union mips_instruction inst, + u32 *opc, u32 cause, + struct kvm_run *run, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er = EMULATE_DONE; + u32 cache, op_inst, op, base; + s16 offset; + struct kvm_vcpu_arch *arch = &vcpu->arch; + unsigned long va, curr_pc; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + + base = inst.i_format.rs; + op_inst = inst.i_format.rt; + if (cpu_has_mips_r6) + offset = inst.spec3_format.simmediate; + else + offset = inst.i_format.simmediate; + cache = op_inst & CacheOp_Cache; + op = op_inst & CacheOp_Op; + + va = arch->gprs[base] + offset; + + kvm_debug("CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", + cache, op, base, arch->gprs[base], offset); + + /* Secondary or tirtiary cache ops ignored */ + if (cache != Cache_I && cache != Cache_D) + return EMULATE_DONE; + + switch (op_inst) { + case Index_Invalidate_I: + flush_icache_line_indexed(va); + return EMULATE_DONE; + case Index_Writeback_Inv_D: + flush_dcache_line_indexed(va); + return EMULATE_DONE; + case Hit_Invalidate_I: + case Hit_Invalidate_D: + case Hit_Writeback_Inv_D: + if (boot_cpu_type() == CPU_CAVIUM_OCTEON3) { + /* We can just flush entire icache */ + local_flush_icache_range(0, 0); + return EMULATE_DONE; + } + + /* So far, other platforms support guest hit cache ops */ + break; + default: + break; + }; + + kvm_err("@ %#lx/%#lx CACHE (cache: %#x, op: %#x, base[%d]: %#lx, offset: %#x\n", + curr_pc, vcpu->arch.gprs[31], cache, op, base, arch->gprs[base], + offset); + /* Rollback PC */ + vcpu->arch.pc = curr_pc; + + return EMULATE_FAIL; +} + +static enum emulation_result kvm_trap_vz_handle_gpsi(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er = EMULATE_DONE; + struct kvm_vcpu_arch *arch = &vcpu->arch; + struct kvm_run *run = vcpu->run; + union mips_instruction inst; + int rd, rt, sel; + int err; + + /* + * Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; + + switch (inst.r_format.opcode) { + case cop0_op: + er = kvm_vz_gpsi_cop0(inst, opc, cause, run, vcpu); + break; +#ifndef CONFIG_CPU_MIPSR6 + case cache_op: + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + break; +#endif + case spec3_op: + switch (inst.spec3_format.func) { +#ifdef CONFIG_CPU_MIPSR6 + case cache6_op: + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE); + er = kvm_vz_gpsi_cache(inst, opc, cause, run, vcpu); + break; +#endif + case rdhwr_op: + if (inst.r_format.rs || (inst.r_format.re >> 3)) + goto unknown; + + rd = inst.r_format.rd; + rt = inst.r_format.rt; + sel = inst.r_format.re & 0x7; + + switch (rd) { + case MIPS_HWR_CC: /* Read count register */ + arch->gprs[rt] = + (long)(int)kvm_mips_read_count(vcpu); + break; + default: + trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, + KVM_TRACE_HWR(rd, sel), 0); + goto unknown; + }; + + trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, + KVM_TRACE_HWR(rd, sel), arch->gprs[rt]); + + er = update_pc(vcpu, cause); + break; + default: + goto unknown; + }; + break; +unknown: + + default: + kvm_err("GPSI exception not supported (%p/%#x)\n", + opc, inst.word); + kvm_arch_vcpu_dump_regs(vcpu); + er = EMULATE_FAIL; + break; + } + + return er; +} + +static enum emulation_result kvm_trap_vz_handle_gsfc(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er = EMULATE_DONE; + struct kvm_vcpu_arch *arch = &vcpu->arch; + union mips_instruction inst; + int err; + + /* + * Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; + + /* complete MTC0 on behalf of guest and advance EPC */ + if (inst.c0r_format.opcode == cop0_op && + inst.c0r_format.rs == mtc_op && + inst.c0r_format.z == 0) { + int rt = inst.c0r_format.rt; + int rd = inst.c0r_format.rd; + int sel = inst.c0r_format.sel; + unsigned int val = arch->gprs[rt]; + unsigned int old_val, change; + + trace_kvm_hwr(vcpu, KVM_TRACE_MTC0, KVM_TRACE_COP0(rd, sel), + val); + + if ((rd == MIPS_CP0_STATUS) && (sel == 0)) { + /* FR bit should read as zero if no FPU */ + if (!kvm_mips_guest_has_fpu(&vcpu->arch)) + val &= ~(ST0_CU1 | ST0_FR); + + /* + * Also don't allow FR to be set if host doesn't support + * it. + */ + if (!(boot_cpu_data.fpu_id & MIPS_FPIR_F64)) + val &= ~ST0_FR; + + old_val = read_gc0_status(); + change = val ^ old_val; + + if (change & ST0_FR) { + /* + * FPU and Vector register state is made + * UNPREDICTABLE by a change of FR, so don't + * even bother saving it. + */ + kvm_drop_fpu(vcpu); + } + + /* + * If MSA state is already live, it is undefined how it + * interacts with FR=0 FPU state, and we don't want to + * hit reserved instruction exceptions trying to save + * the MSA state later when CU=1 && FR=1, so play it + * safe and save it first. + */ + if (change & ST0_CU1 && !(val & ST0_FR) && + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) + kvm_lose_fpu(vcpu); + + write_gc0_status(val); + } else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) { + u32 old_cause = read_gc0_cause(); + u32 change = old_cause ^ val; + + /* DC bit enabling/disabling timer? */ + if (change & CAUSEF_DC) { + if (val & CAUSEF_DC) { + kvm_vz_lose_htimer(vcpu); + kvm_mips_count_disable_cause(vcpu); + } else { + kvm_mips_count_enable_cause(vcpu); + } + } + + /* Only certain bits are RW to the guest */ + change &= (CAUSEF_DC | CAUSEF_IV | CAUSEF_WP | + CAUSEF_IP0 | CAUSEF_IP1); + + /* WP can only be cleared */ + change &= ~CAUSEF_WP | old_cause; + + write_gc0_cause(old_cause ^ change); + } else if ((rd == MIPS_CP0_STATUS) && (sel == 1)) { /* IntCtl */ + write_gc0_intctl(val); + } else if ((rd == MIPS_CP0_CONFIG) && (sel == 5)) { + old_val = read_gc0_config5(); + change = val ^ old_val; + /* Handle changes in FPU/MSA modes */ + preempt_disable(); + + /* + * Propagate FRE changes immediately if the FPU + * context is already loaded. + */ + if (change & MIPS_CONF5_FRE && + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) + change_c0_config5(MIPS_CONF5_FRE, val); + + preempt_enable(); + + val = old_val ^ + (change & kvm_vz_config5_guest_wrmask(vcpu)); + write_gc0_config5(val); + } else { + kvm_err("Handle GSFC, unsupported field change @ %p: %#x\n", + opc, inst.word); + er = EMULATE_FAIL; + } + + if (er != EMULATE_FAIL) + er = update_pc(vcpu, cause); + } else { + kvm_err("Handle GSFC, unrecognized instruction @ %p: %#x\n", + opc, inst.word); + er = EMULATE_FAIL; + } + + return er; +} + +static enum emulation_result kvm_trap_vz_handle_ghfc(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + /* + * Presumably this is due to MC (guest mode change), so lets trace some + * relevant info. + */ + trace_kvm_guest_mode_change(vcpu); + + return EMULATE_DONE; +} + +static enum emulation_result kvm_trap_vz_handle_hc(u32 cause, u32 *opc, + struct kvm_vcpu *vcpu) +{ + enum emulation_result er; + union mips_instruction inst; + unsigned long curr_pc; + int err; + + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) + return EMULATE_FAIL; + + /* + * Update PC and hold onto current PC in case there is + * an error and we want to rollback the PC + */ + curr_pc = vcpu->arch.pc; + er = update_pc(vcpu, cause); + if (er == EMULATE_FAIL) + return er; + + er = kvm_mips_emul_hypcall(vcpu, inst); + if (er == EMULATE_FAIL) + vcpu->arch.pc = curr_pc; + + return er; +} + +static enum emulation_result kvm_trap_vz_no_handler_guest_exit(u32 gexccode, + u32 cause, + u32 *opc, + struct kvm_vcpu *vcpu) +{ + u32 inst; + + /* + * Fetch the instruction. + */ + if (cause & CAUSEF_BD) + opc += 1; + kvm_get_badinstr(opc, vcpu, &inst); + + kvm_err("Guest Exception Code: %d not yet handled @ PC: %p, inst: 0x%08x Status: %#x\n", + gexccode, opc, inst, read_gc0_status()); + + return EMULATE_FAIL; +} + +static int kvm_trap_vz_handle_guest_exit(struct kvm_vcpu *vcpu) +{ + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + enum emulation_result er = EMULATE_DONE; + u32 gexccode = (vcpu->arch.host_cp0_guestctl0 & + MIPS_GCTL0_GEXC) >> MIPS_GCTL0_GEXC_SHIFT; + int ret = RESUME_GUEST; + + trace_kvm_exit(vcpu, KVM_TRACE_EXIT_GEXCCODE_BASE + gexccode); + switch (gexccode) { + case MIPS_GCTL0_GEXC_GPSI: + ++vcpu->stat.vz_gpsi_exits; + er = kvm_trap_vz_handle_gpsi(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_GSFC: + ++vcpu->stat.vz_gsfc_exits; + er = kvm_trap_vz_handle_gsfc(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_HC: + ++vcpu->stat.vz_hc_exits; + er = kvm_trap_vz_handle_hc(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_GRR: + ++vcpu->stat.vz_grr_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + case MIPS_GCTL0_GEXC_GVA: + ++vcpu->stat.vz_gva_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + case MIPS_GCTL0_GEXC_GHFC: + ++vcpu->stat.vz_ghfc_exits; + er = kvm_trap_vz_handle_ghfc(cause, opc, vcpu); + break; + case MIPS_GCTL0_GEXC_GPA: + ++vcpu->stat.vz_gpa_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + default: + ++vcpu->stat.vz_resvd_exits; + er = kvm_trap_vz_no_handler_guest_exit(gexccode, cause, opc, + vcpu); + break; + + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_HYPERCALL) { + ret = kvm_mips_handle_hypcall(vcpu); + } else { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } + return ret; +} + +/** + * kvm_trap_vz_handle_cop_unusuable() - Guest used unusable coprocessor. + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use a coprocessor which hasn't been allowed + * by the root context. + */ +static int kvm_trap_vz_handle_cop_unusable(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 cause = vcpu->arch.host_cp0_cause; + enum emulation_result er = EMULATE_FAIL; + int ret = RESUME_GUEST; + + if (((cause & CAUSEF_CE) >> CAUSEB_CE) == 1) { + /* + * If guest FPU not present, the FPU operation should have been + * treated as a reserved instruction! + * If FPU already in use, we shouldn't get this at all. + */ + if (WARN_ON(!kvm_mips_guest_has_fpu(&vcpu->arch) || + vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) { + preempt_enable(); + return EMULATE_FAIL; + } + + kvm_own_fpu(vcpu); + er = EMULATE_DONE; + } + /* other coprocessors not handled */ + + switch (er) { + case EMULATE_DONE: + ret = RESUME_GUEST; + break; + + case EMULATE_FAIL: + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + break; + + default: + BUG(); + } + return ret; +} + +/** + * kvm_trap_vz_handle_msa_disabled() - Guest used MSA while disabled in root. + * @vcpu: Virtual CPU context. + * + * Handle when the guest attempts to use MSA when it is disabled in the root + * context. + */ +static int kvm_trap_vz_handle_msa_disabled(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + + /* + * If MSA not present or not exposed to guest or FR=0, the MSA operation + * should have been treated as a reserved instruction! + * Same if CU1=1, FR=0. + * If MSA already in use, we shouldn't get this at all. + */ + if (!kvm_mips_guest_has_msa(&vcpu->arch) || + (read_gc0_status() & (ST0_CU1 | ST0_FR)) == ST0_CU1 || + !(read_gc0_config5() & MIPS_CONF5_MSAEN) || + vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + kvm_own_msa(vcpu); + + return RESUME_GUEST; +} + +static int kvm_trap_vz_handle_tlb_ld_miss(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + ulong badvaddr = vcpu->arch.host_cp0_badvaddr; + union mips_instruction inst; + enum emulation_result er = EMULATE_DONE; + int err, ret = RESUME_GUEST; + + if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, false)) { + /* A code fetch fault doesn't count as an MMIO */ + if (kvm_is_ifetch_fault(&vcpu->arch)) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Fetch the instruction */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Treat as MMIO */ + er = kvm_mips_emulate_load(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Guest Emulate Load from MMIO space failed: PC: %p, BadVaddr: %#lx\n", + opc, badvaddr); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } + return ret; +} + +static int kvm_trap_vz_handle_tlb_st_miss(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + u32 *opc = (u32 *) vcpu->arch.pc; + u32 cause = vcpu->arch.host_cp0_cause; + ulong badvaddr = vcpu->arch.host_cp0_badvaddr; + union mips_instruction inst; + enum emulation_result er = EMULATE_DONE; + int err; + int ret = RESUME_GUEST; + + /* Just try the access again if we couldn't do the translation */ + if (kvm_vz_badvaddr_to_gpa(vcpu, badvaddr, &badvaddr)) + return RESUME_GUEST; + vcpu->arch.host_cp0_badvaddr = badvaddr; + + if (kvm_mips_handle_vz_root_tlb_fault(badvaddr, vcpu, true)) { + /* Fetch the instruction */ + if (cause & CAUSEF_BD) + opc += 1; + err = kvm_get_badinstr(opc, vcpu, &inst.word); + if (err) { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } + + /* Treat as MMIO */ + er = kvm_mips_emulate_store(inst, cause, run, vcpu); + if (er == EMULATE_FAIL) { + kvm_err("Guest Emulate Store to MMIO space failed: PC: %p, BadVaddr: %#lx\n", + opc, badvaddr); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + } + } + + if (er == EMULATE_DONE) { + ret = RESUME_GUEST; + } else if (er == EMULATE_DO_MMIO) { + run->exit_reason = KVM_EXIT_MMIO; + ret = RESUME_HOST; + } else { + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + ret = RESUME_HOST; + } + return ret; +} + +static u64 kvm_vz_get_one_regs[] = { + KVM_REG_MIPS_CP0_INDEX, + KVM_REG_MIPS_CP0_ENTRYLO0, + KVM_REG_MIPS_CP0_ENTRYLO1, + KVM_REG_MIPS_CP0_CONTEXT, + KVM_REG_MIPS_CP0_PAGEMASK, + KVM_REG_MIPS_CP0_PAGEGRAIN, + KVM_REG_MIPS_CP0_WIRED, + KVM_REG_MIPS_CP0_HWRENA, + KVM_REG_MIPS_CP0_BADVADDR, + KVM_REG_MIPS_CP0_COUNT, + KVM_REG_MIPS_CP0_ENTRYHI, + KVM_REG_MIPS_CP0_COMPARE, + KVM_REG_MIPS_CP0_STATUS, + KVM_REG_MIPS_CP0_INTCTL, + KVM_REG_MIPS_CP0_CAUSE, + KVM_REG_MIPS_CP0_EPC, + KVM_REG_MIPS_CP0_PRID, + KVM_REG_MIPS_CP0_EBASE, + KVM_REG_MIPS_CP0_CONFIG, + KVM_REG_MIPS_CP0_CONFIG1, + KVM_REG_MIPS_CP0_CONFIG2, + KVM_REG_MIPS_CP0_CONFIG3, + KVM_REG_MIPS_CP0_CONFIG4, + KVM_REG_MIPS_CP0_CONFIG5, +#ifdef CONFIG_64BIT + KVM_REG_MIPS_CP0_XCONTEXT, +#endif + KVM_REG_MIPS_CP0_ERROREPC, + + KVM_REG_MIPS_COUNT_CTL, + KVM_REG_MIPS_COUNT_RESUME, + KVM_REG_MIPS_COUNT_HZ, +}; + +static u64 kvm_vz_get_one_regs_contextconfig[] = { + KVM_REG_MIPS_CP0_CONTEXTCONFIG, +#ifdef CONFIG_64BIT + KVM_REG_MIPS_CP0_XCONTEXTCONFIG, +#endif +}; + +static u64 kvm_vz_get_one_regs_segments[] = { + KVM_REG_MIPS_CP0_SEGCTL0, + KVM_REG_MIPS_CP0_SEGCTL1, + KVM_REG_MIPS_CP0_SEGCTL2, +}; + +static u64 kvm_vz_get_one_regs_htw[] = { + KVM_REG_MIPS_CP0_PWBASE, + KVM_REG_MIPS_CP0_PWFIELD, + KVM_REG_MIPS_CP0_PWSIZE, + KVM_REG_MIPS_CP0_PWCTL, +}; + +static u64 kvm_vz_get_one_regs_kscratch[] = { + KVM_REG_MIPS_CP0_KSCRATCH1, + KVM_REG_MIPS_CP0_KSCRATCH2, + KVM_REG_MIPS_CP0_KSCRATCH3, + KVM_REG_MIPS_CP0_KSCRATCH4, + KVM_REG_MIPS_CP0_KSCRATCH5, + KVM_REG_MIPS_CP0_KSCRATCH6, +}; + +static unsigned long kvm_vz_num_regs(struct kvm_vcpu *vcpu) +{ + unsigned long ret; + + ret = ARRAY_SIZE(kvm_vz_get_one_regs); + if (cpu_guest_has_userlocal) + ++ret; + if (cpu_guest_has_badinstr) + ++ret; + if (cpu_guest_has_badinstrp) + ++ret; + if (cpu_guest_has_contextconfig) + ret += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); + if (cpu_guest_has_segments) + ret += ARRAY_SIZE(kvm_vz_get_one_regs_segments); + if (cpu_guest_has_htw) + ret += ARRAY_SIZE(kvm_vz_get_one_regs_htw); + if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) + ret += 1 + ARRAY_SIZE(vcpu->arch.maar); + ret += __arch_hweight8(cpu_data[0].guest.kscratch_mask); + + return ret; +} + +static int kvm_vz_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices) +{ + u64 index; + unsigned int i; + + if (copy_to_user(indices, kvm_vz_get_one_regs, + sizeof(kvm_vz_get_one_regs))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs); + + if (cpu_guest_has_userlocal) { + index = KVM_REG_MIPS_CP0_USERLOCAL; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + if (cpu_guest_has_badinstr) { + index = KVM_REG_MIPS_CP0_BADINSTR; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + if (cpu_guest_has_badinstrp) { + index = KVM_REG_MIPS_CP0_BADINSTRP; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + if (cpu_guest_has_contextconfig) { + if (copy_to_user(indices, kvm_vz_get_one_regs_contextconfig, + sizeof(kvm_vz_get_one_regs_contextconfig))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs_contextconfig); + } + if (cpu_guest_has_segments) { + if (copy_to_user(indices, kvm_vz_get_one_regs_segments, + sizeof(kvm_vz_get_one_regs_segments))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs_segments); + } + if (cpu_guest_has_htw) { + if (copy_to_user(indices, kvm_vz_get_one_regs_htw, + sizeof(kvm_vz_get_one_regs_htw))) + return -EFAULT; + indices += ARRAY_SIZE(kvm_vz_get_one_regs_htw); + } + if (cpu_guest_has_maar && !cpu_guest_has_dyn_maar) { + for (i = 0; i < ARRAY_SIZE(vcpu->arch.maar); ++i) { + index = KVM_REG_MIPS_CP0_MAAR(i); + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + + index = KVM_REG_MIPS_CP0_MAARI; + if (copy_to_user(indices, &index, sizeof(index))) + return -EFAULT; + ++indices; + } + for (i = 0; i < 6; ++i) { + if (!cpu_guest_has_kscr(i + 2)) + continue; + + if (copy_to_user(indices, &kvm_vz_get_one_regs_kscratch[i], + sizeof(kvm_vz_get_one_regs_kscratch[i]))) + return -EFAULT; + ++indices; + } + + return 0; +} + +static inline s64 entrylo_kvm_to_user(unsigned long v) +{ + s64 mask, ret = v; + + if (BITS_PER_LONG == 32) { + /* + * KVM API exposes 64-bit version of the register, so move the + * RI/XI bits up into place. + */ + mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI; + ret &= ~mask; + ret |= ((s64)v & mask) << 32; + } + return ret; +} + +static inline unsigned long entrylo_user_to_kvm(s64 v) +{ + unsigned long mask, ret = v; + + if (BITS_PER_LONG == 32) { + /* + * KVM API exposes 64-bit versiono of the register, so move the + * RI/XI bits down into place. + */ + mask = MIPS_ENTRYLO_RI | MIPS_ENTRYLO_XI; + ret &= ~mask; + ret |= (v >> 32) & mask; + } + return ret; +} + +static int kvm_vz_get_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + s64 *v) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned int idx; + + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + *v = (long)read_gc0_index(); + break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + *v = entrylo_kvm_to_user(read_gc0_entrylo0()); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + *v = entrylo_kvm_to_user(read_gc0_entrylo1()); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + *v = (long)read_gc0_context(); + break; + case KVM_REG_MIPS_CP0_CONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + *v = read_gc0_contextconfig(); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + if (!cpu_guest_has_userlocal) + return -EINVAL; + *v = read_gc0_userlocal(); + break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + *v = read_gc0_xcontextconfig(); + break; +#endif + case KVM_REG_MIPS_CP0_PAGEMASK: + *v = (long)read_gc0_pagemask(); + break; + case KVM_REG_MIPS_CP0_PAGEGRAIN: + *v = (long)read_gc0_pagegrain(); + break; + case KVM_REG_MIPS_CP0_SEGCTL0: + if (!cpu_guest_has_segments) + return -EINVAL; + *v = read_gc0_segctl0(); + break; + case KVM_REG_MIPS_CP0_SEGCTL1: + if (!cpu_guest_has_segments) + return -EINVAL; + *v = read_gc0_segctl1(); + break; + case KVM_REG_MIPS_CP0_SEGCTL2: + if (!cpu_guest_has_segments) + return -EINVAL; + *v = read_gc0_segctl2(); + break; + case KVM_REG_MIPS_CP0_PWBASE: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwbase(); + break; + case KVM_REG_MIPS_CP0_PWFIELD: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwfield(); + break; + case KVM_REG_MIPS_CP0_PWSIZE: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwsize(); + break; + case KVM_REG_MIPS_CP0_WIRED: + *v = (long)read_gc0_wired(); + break; + case KVM_REG_MIPS_CP0_PWCTL: + if (!cpu_guest_has_htw) + return -EINVAL; + *v = read_gc0_pwctl(); + break; + case KVM_REG_MIPS_CP0_HWRENA: + *v = (long)read_gc0_hwrena(); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + *v = (long)read_gc0_badvaddr(); + break; + case KVM_REG_MIPS_CP0_BADINSTR: + if (!cpu_guest_has_badinstr) + return -EINVAL; + *v = read_gc0_badinstr(); + break; + case KVM_REG_MIPS_CP0_BADINSTRP: + if (!cpu_guest_has_badinstrp) + return -EINVAL; + *v = read_gc0_badinstrp(); + break; + case KVM_REG_MIPS_CP0_COUNT: + *v = kvm_mips_read_count(vcpu); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + *v = (long)read_gc0_entryhi(); + break; + case KVM_REG_MIPS_CP0_COMPARE: + *v = (long)read_gc0_compare(); + break; + case KVM_REG_MIPS_CP0_STATUS: + *v = (long)read_gc0_status(); + break; + case KVM_REG_MIPS_CP0_INTCTL: + *v = read_gc0_intctl(); + break; + case KVM_REG_MIPS_CP0_CAUSE: + *v = (long)read_gc0_cause(); + break; + case KVM_REG_MIPS_CP0_EPC: + *v = (long)read_gc0_epc(); + break; + case KVM_REG_MIPS_CP0_PRID: + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Octeon III has a read-only guest.PRid */ + *v = read_gc0_prid(); + break; + default: + *v = (long)kvm_read_c0_guest_prid(cop0); + break; + }; + break; + case KVM_REG_MIPS_CP0_EBASE: + *v = kvm_vz_read_gc0_ebase(); + break; + case KVM_REG_MIPS_CP0_CONFIG: + *v = read_gc0_config(); + break; + case KVM_REG_MIPS_CP0_CONFIG1: + if (!cpu_guest_has_conf1) + return -EINVAL; + *v = read_gc0_config1(); + break; + case KVM_REG_MIPS_CP0_CONFIG2: + if (!cpu_guest_has_conf2) + return -EINVAL; + *v = read_gc0_config2(); + break; + case KVM_REG_MIPS_CP0_CONFIG3: + if (!cpu_guest_has_conf3) + return -EINVAL; + *v = read_gc0_config3(); + break; + case KVM_REG_MIPS_CP0_CONFIG4: + if (!cpu_guest_has_conf4) + return -EINVAL; + *v = read_gc0_config4(); + break; + case KVM_REG_MIPS_CP0_CONFIG5: + if (!cpu_guest_has_conf5) + return -EINVAL; + *v = read_gc0_config5(); + break; + case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f): + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0); + if (idx >= ARRAY_SIZE(vcpu->arch.maar)) + return -EINVAL; + *v = vcpu->arch.maar[idx]; + break; + case KVM_REG_MIPS_CP0_MAARI: + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + *v = kvm_read_sw_gc0_maari(vcpu->arch.cop0); + break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXT: + *v = read_gc0_xcontext(); + break; +#endif + case KVM_REG_MIPS_CP0_ERROREPC: + *v = (long)read_gc0_errorepc(); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!cpu_guest_has_kscr(idx)) + return -EINVAL; + switch (idx) { + case 2: + *v = (long)read_gc0_kscratch1(); + break; + case 3: + *v = (long)read_gc0_kscratch2(); + break; + case 4: + *v = (long)read_gc0_kscratch3(); + break; + case 5: + *v = (long)read_gc0_kscratch4(); + break; + case 6: + *v = (long)read_gc0_kscratch5(); + break; + case 7: + *v = (long)read_gc0_kscratch6(); + break; + } + break; + case KVM_REG_MIPS_COUNT_CTL: + *v = vcpu->arch.count_ctl; + break; + case KVM_REG_MIPS_COUNT_RESUME: + *v = ktime_to_ns(vcpu->arch.count_resume); + break; + case KVM_REG_MIPS_COUNT_HZ: + *v = vcpu->arch.count_hz; + break; + default: + return -EINVAL; + } + return 0; +} + +static int kvm_vz_set_one_reg(struct kvm_vcpu *vcpu, + const struct kvm_one_reg *reg, + s64 v) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned int idx; + int ret = 0; + unsigned int cur, change; + + switch (reg->id) { + case KVM_REG_MIPS_CP0_INDEX: + write_gc0_index(v); + break; + case KVM_REG_MIPS_CP0_ENTRYLO0: + write_gc0_entrylo0(entrylo_user_to_kvm(v)); + break; + case KVM_REG_MIPS_CP0_ENTRYLO1: + write_gc0_entrylo1(entrylo_user_to_kvm(v)); + break; + case KVM_REG_MIPS_CP0_CONTEXT: + write_gc0_context(v); + break; + case KVM_REG_MIPS_CP0_CONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + write_gc0_contextconfig(v); + break; + case KVM_REG_MIPS_CP0_USERLOCAL: + if (!cpu_guest_has_userlocal) + return -EINVAL; + write_gc0_userlocal(v); + break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXTCONFIG: + if (!cpu_guest_has_contextconfig) + return -EINVAL; + write_gc0_xcontextconfig(v); + break; +#endif + case KVM_REG_MIPS_CP0_PAGEMASK: + write_gc0_pagemask(v); + break; + case KVM_REG_MIPS_CP0_PAGEGRAIN: + write_gc0_pagegrain(v); + break; + case KVM_REG_MIPS_CP0_SEGCTL0: + if (!cpu_guest_has_segments) + return -EINVAL; + write_gc0_segctl0(v); + break; + case KVM_REG_MIPS_CP0_SEGCTL1: + if (!cpu_guest_has_segments) + return -EINVAL; + write_gc0_segctl1(v); + break; + case KVM_REG_MIPS_CP0_SEGCTL2: + if (!cpu_guest_has_segments) + return -EINVAL; + write_gc0_segctl2(v); + break; + case KVM_REG_MIPS_CP0_PWBASE: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwbase(v); + break; + case KVM_REG_MIPS_CP0_PWFIELD: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwfield(v); + break; + case KVM_REG_MIPS_CP0_PWSIZE: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwsize(v); + break; + case KVM_REG_MIPS_CP0_WIRED: + change_gc0_wired(MIPSR6_WIRED_WIRED, v); + break; + case KVM_REG_MIPS_CP0_PWCTL: + if (!cpu_guest_has_htw) + return -EINVAL; + write_gc0_pwctl(v); + break; + case KVM_REG_MIPS_CP0_HWRENA: + write_gc0_hwrena(v); + break; + case KVM_REG_MIPS_CP0_BADVADDR: + write_gc0_badvaddr(v); + break; + case KVM_REG_MIPS_CP0_BADINSTR: + if (!cpu_guest_has_badinstr) + return -EINVAL; + write_gc0_badinstr(v); + break; + case KVM_REG_MIPS_CP0_BADINSTRP: + if (!cpu_guest_has_badinstrp) + return -EINVAL; + write_gc0_badinstrp(v); + break; + case KVM_REG_MIPS_CP0_COUNT: + kvm_mips_write_count(vcpu, v); + break; + case KVM_REG_MIPS_CP0_ENTRYHI: + write_gc0_entryhi(v); + break; + case KVM_REG_MIPS_CP0_COMPARE: + kvm_mips_write_compare(vcpu, v, false); + break; + case KVM_REG_MIPS_CP0_STATUS: + write_gc0_status(v); + break; + case KVM_REG_MIPS_CP0_INTCTL: + write_gc0_intctl(v); + break; + case KVM_REG_MIPS_CP0_CAUSE: + /* + * If the timer is stopped or started (DC bit) it must look + * atomic with changes to the timer interrupt pending bit (TI). + * A timer interrupt should not happen in between. + */ + if ((read_gc0_cause() ^ v) & CAUSEF_DC) { + if (v & CAUSEF_DC) { + /* disable timer first */ + kvm_mips_count_disable_cause(vcpu); + change_gc0_cause((u32)~CAUSEF_DC, v); + } else { + /* enable timer last */ + change_gc0_cause((u32)~CAUSEF_DC, v); + kvm_mips_count_enable_cause(vcpu); + } + } else { + write_gc0_cause(v); + } + break; + case KVM_REG_MIPS_CP0_EPC: + write_gc0_epc(v); + break; + case KVM_REG_MIPS_CP0_PRID: + switch (boot_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Octeon III has a guest.PRid, but its read-only */ + break; + default: + kvm_write_c0_guest_prid(cop0, v); + break; + }; + break; + case KVM_REG_MIPS_CP0_EBASE: + kvm_vz_write_gc0_ebase(v); + break; + case KVM_REG_MIPS_CP0_CONFIG: + cur = read_gc0_config(); + change = (cur ^ v) & kvm_vz_config_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG1: + if (!cpu_guest_has_conf1) + break; + cur = read_gc0_config1(); + change = (cur ^ v) & kvm_vz_config1_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config1(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG2: + if (!cpu_guest_has_conf2) + break; + cur = read_gc0_config2(); + change = (cur ^ v) & kvm_vz_config2_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config2(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG3: + if (!cpu_guest_has_conf3) + break; + cur = read_gc0_config3(); + change = (cur ^ v) & kvm_vz_config3_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config3(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG4: + if (!cpu_guest_has_conf4) + break; + cur = read_gc0_config4(); + change = (cur ^ v) & kvm_vz_config4_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config4(v); + } + break; + case KVM_REG_MIPS_CP0_CONFIG5: + if (!cpu_guest_has_conf5) + break; + cur = read_gc0_config5(); + change = (cur ^ v) & kvm_vz_config5_user_wrmask(vcpu); + if (change) { + v = cur ^ change; + write_gc0_config5(v); + } + break; + case KVM_REG_MIPS_CP0_MAAR(0) ... KVM_REG_MIPS_CP0_MAAR(0x3f): + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + idx = reg->id - KVM_REG_MIPS_CP0_MAAR(0); + if (idx >= ARRAY_SIZE(vcpu->arch.maar)) + return -EINVAL; + vcpu->arch.maar[idx] = mips_process_maar(dmtc_op, v); + break; + case KVM_REG_MIPS_CP0_MAARI: + if (!cpu_guest_has_maar || cpu_guest_has_dyn_maar) + return -EINVAL; + kvm_write_maari(vcpu, v); + break; +#ifdef CONFIG_64BIT + case KVM_REG_MIPS_CP0_XCONTEXT: + write_gc0_xcontext(v); + break; +#endif + case KVM_REG_MIPS_CP0_ERROREPC: + write_gc0_errorepc(v); + break; + case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6: + idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2; + if (!cpu_guest_has_kscr(idx)) + return -EINVAL; + switch (idx) { + case 2: + write_gc0_kscratch1(v); + break; + case 3: + write_gc0_kscratch2(v); + break; + case 4: + write_gc0_kscratch3(v); + break; + case 5: + write_gc0_kscratch4(v); + break; + case 6: + write_gc0_kscratch5(v); + break; + case 7: + write_gc0_kscratch6(v); + break; + } + break; + case KVM_REG_MIPS_COUNT_CTL: + ret = kvm_mips_set_count_ctl(vcpu, v); + break; + case KVM_REG_MIPS_COUNT_RESUME: + ret = kvm_mips_set_count_resume(vcpu, v); + break; + case KVM_REG_MIPS_COUNT_HZ: + ret = kvm_mips_set_count_hz(vcpu, v); + break; + default: + return -EINVAL; + } + return ret; +} + +#define guestid_cache(cpu) (cpu_data[cpu].guestid_cache) +static void kvm_vz_get_new_guestid(unsigned long cpu, struct kvm_vcpu *vcpu) +{ + unsigned long guestid = guestid_cache(cpu); + + if (!(++guestid & GUESTID_MASK)) { + if (cpu_has_vtag_icache) + flush_icache_all(); + + if (!guestid) /* fix version if needed */ + guestid = GUESTID_FIRST_VERSION; + + ++guestid; /* guestid 0 reserved for root */ + + /* start new guestid cycle */ + kvm_vz_local_flush_roottlb_all_guests(); + kvm_vz_local_flush_guesttlb_all(); + } + + guestid_cache(cpu) = guestid; +} + +/* Returns 1 if the guest TLB may be clobbered */ +static int kvm_vz_check_requests(struct kvm_vcpu *vcpu, int cpu) +{ + int ret = 0; + int i; + + if (!vcpu->requests) + return 0; + + if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) { + if (cpu_has_guestid) { + /* Drop all GuestIDs for this VCPU */ + for_each_possible_cpu(i) + vcpu->arch.vzguestid[i] = 0; + /* This will clobber guest TLB contents too */ + ret = 1; + } + /* + * For Root ASID Dealias (RAD) we don't do anything here, but we + * still need the request to ensure we recheck asid_flush_mask. + * We can still return 0 as only the root TLB will be affected + * by a root ASID flush. + */ + } + + return ret; +} + +static void kvm_vz_vcpu_save_wired(struct kvm_vcpu *vcpu) +{ + unsigned int wired = read_gc0_wired(); + struct kvm_mips_tlb *tlbs; + int i; + + /* Expand the wired TLB array if necessary */ + wired &= MIPSR6_WIRED_WIRED; + if (wired > vcpu->arch.wired_tlb_limit) { + tlbs = krealloc(vcpu->arch.wired_tlb, wired * + sizeof(*vcpu->arch.wired_tlb), GFP_ATOMIC); + if (WARN_ON(!tlbs)) { + /* Save whatever we can */ + wired = vcpu->arch.wired_tlb_limit; + } else { + vcpu->arch.wired_tlb = tlbs; + vcpu->arch.wired_tlb_limit = wired; + } + } + + if (wired) + /* Save wired entries from the guest TLB */ + kvm_vz_save_guesttlb(vcpu->arch.wired_tlb, 0, wired); + /* Invalidate any dropped entries since last time */ + for (i = wired; i < vcpu->arch.wired_tlb_used; ++i) { + vcpu->arch.wired_tlb[i].tlb_hi = UNIQUE_GUEST_ENTRYHI(i); + vcpu->arch.wired_tlb[i].tlb_lo[0] = 0; + vcpu->arch.wired_tlb[i].tlb_lo[1] = 0; + vcpu->arch.wired_tlb[i].tlb_mask = 0; + } + vcpu->arch.wired_tlb_used = wired; +} + +static void kvm_vz_vcpu_load_wired(struct kvm_vcpu *vcpu) +{ + /* Load wired entries into the guest TLB */ + if (vcpu->arch.wired_tlb) + kvm_vz_load_guesttlb(vcpu->arch.wired_tlb, 0, + vcpu->arch.wired_tlb_used); +} + +static void kvm_vz_vcpu_load_tlb(struct kvm_vcpu *vcpu, int cpu) +{ + struct kvm *kvm = vcpu->kvm; + struct mm_struct *gpa_mm = &kvm->arch.gpa_mm; + bool migrated; + + /* + * Are we entering guest context on a different CPU to last time? + * If so, the VCPU's guest TLB state on this CPU may be stale. + */ + migrated = (vcpu->arch.last_exec_cpu != cpu); + vcpu->arch.last_exec_cpu = cpu; + + /* + * A vcpu's GuestID is set in GuestCtl1.ID when the vcpu is loaded and + * remains set until another vcpu is loaded in. As a rule GuestRID + * remains zeroed when in root context unless the kernel is busy + * manipulating guest tlb entries. + */ + if (cpu_has_guestid) { + /* + * Check if our GuestID is of an older version and thus invalid. + * + * We also discard the stored GuestID if we've executed on + * another CPU, as the guest mappings may have changed without + * hypervisor knowledge. + */ + if (migrated || + (vcpu->arch.vzguestid[cpu] ^ guestid_cache(cpu)) & + GUESTID_VERSION_MASK) { + kvm_vz_get_new_guestid(cpu, vcpu); + vcpu->arch.vzguestid[cpu] = guestid_cache(cpu); + trace_kvm_guestid_change(vcpu, + vcpu->arch.vzguestid[cpu]); + } + + /* Restore GuestID */ + change_c0_guestctl1(GUESTID_MASK, vcpu->arch.vzguestid[cpu]); + } else { + /* + * The Guest TLB only stores a single guest's TLB state, so + * flush it if another VCPU has executed on this CPU. + * + * We also flush if we've executed on another CPU, as the guest + * mappings may have changed without hypervisor knowledge. + */ + if (migrated || last_exec_vcpu[cpu] != vcpu) + kvm_vz_local_flush_guesttlb_all(); + last_exec_vcpu[cpu] = vcpu; + + /* + * Root ASID dealiases guest GPA mappings in the root TLB. + * Allocate new root ASID if needed. + */ + if (cpumask_test_and_clear_cpu(cpu, &kvm->arch.asid_flush_mask) + || (cpu_context(cpu, gpa_mm) ^ asid_cache(cpu)) & + asid_version_mask(cpu)) + get_new_mmu_context(gpa_mm, cpu); + } +} + +static int kvm_vz_vcpu_load(struct kvm_vcpu *vcpu, int cpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + bool migrated, all; + + /* + * Have we migrated to a different CPU? + * If so, any old guest TLB state may be stale. + */ + migrated = (vcpu->arch.last_sched_cpu != cpu); + + /* + * Was this the last VCPU to run on this CPU? + * If not, any old guest state from this VCPU will have been clobbered. + */ + all = migrated || (last_vcpu[cpu] != vcpu); + last_vcpu[cpu] = vcpu; + + /* + * Restore CP0_Wired unconditionally as we clear it after use, and + * restore wired guest TLB entries (while in guest context). + */ + kvm_restore_gc0_wired(cop0); + if (current->flags & PF_VCPU) { + tlbw_use_hazard(); + kvm_vz_vcpu_load_tlb(vcpu, cpu); + kvm_vz_vcpu_load_wired(vcpu); + } + + /* + * Restore timer state regardless, as e.g. Cause.TI can change over time + * if left unmaintained. + */ + kvm_vz_restore_timer(vcpu); + + /* Set MC bit if we want to trace guest mode changes */ + if (kvm_trace_guest_mode_change) + set_c0_guestctl0(MIPS_GCTL0_MC); + else + clear_c0_guestctl0(MIPS_GCTL0_MC); + + /* Don't bother restoring registers multiple times unless necessary */ + if (!all) + return 0; + + /* + * Restore config registers first, as some implementations restrict + * writes to other registers when the corresponding feature bits aren't + * set. For example Status.CU1 cannot be set unless Config1.FP is set. + */ + kvm_restore_gc0_config(cop0); + if (cpu_guest_has_conf1) + kvm_restore_gc0_config1(cop0); + if (cpu_guest_has_conf2) + kvm_restore_gc0_config2(cop0); + if (cpu_guest_has_conf3) + kvm_restore_gc0_config3(cop0); + if (cpu_guest_has_conf4) + kvm_restore_gc0_config4(cop0); + if (cpu_guest_has_conf5) + kvm_restore_gc0_config5(cop0); + if (cpu_guest_has_conf6) + kvm_restore_gc0_config6(cop0); + if (cpu_guest_has_conf7) + kvm_restore_gc0_config7(cop0); + + kvm_restore_gc0_index(cop0); + kvm_restore_gc0_entrylo0(cop0); + kvm_restore_gc0_entrylo1(cop0); + kvm_restore_gc0_context(cop0); + if (cpu_guest_has_contextconfig) + kvm_restore_gc0_contextconfig(cop0); +#ifdef CONFIG_64BIT + kvm_restore_gc0_xcontext(cop0); + if (cpu_guest_has_contextconfig) + kvm_restore_gc0_xcontextconfig(cop0); +#endif + kvm_restore_gc0_pagemask(cop0); + kvm_restore_gc0_pagegrain(cop0); + kvm_restore_gc0_hwrena(cop0); + kvm_restore_gc0_badvaddr(cop0); + kvm_restore_gc0_entryhi(cop0); + kvm_restore_gc0_status(cop0); + kvm_restore_gc0_intctl(cop0); + kvm_restore_gc0_epc(cop0); + kvm_vz_write_gc0_ebase(kvm_read_sw_gc0_ebase(cop0)); + if (cpu_guest_has_userlocal) + kvm_restore_gc0_userlocal(cop0); + + kvm_restore_gc0_errorepc(cop0); + + /* restore KScratch registers if enabled in guest */ + if (cpu_guest_has_conf4) { + if (cpu_guest_has_kscr(2)) + kvm_restore_gc0_kscratch1(cop0); + if (cpu_guest_has_kscr(3)) + kvm_restore_gc0_kscratch2(cop0); + if (cpu_guest_has_kscr(4)) + kvm_restore_gc0_kscratch3(cop0); + if (cpu_guest_has_kscr(5)) + kvm_restore_gc0_kscratch4(cop0); + if (cpu_guest_has_kscr(6)) + kvm_restore_gc0_kscratch5(cop0); + if (cpu_guest_has_kscr(7)) + kvm_restore_gc0_kscratch6(cop0); + } + + if (cpu_guest_has_badinstr) + kvm_restore_gc0_badinstr(cop0); + if (cpu_guest_has_badinstrp) + kvm_restore_gc0_badinstrp(cop0); + + if (cpu_guest_has_segments) { + kvm_restore_gc0_segctl0(cop0); + kvm_restore_gc0_segctl1(cop0); + kvm_restore_gc0_segctl2(cop0); + } + + /* restore HTW registers */ + if (cpu_guest_has_htw) { + kvm_restore_gc0_pwbase(cop0); + kvm_restore_gc0_pwfield(cop0); + kvm_restore_gc0_pwsize(cop0); + kvm_restore_gc0_pwctl(cop0); + } + + /* restore Root.GuestCtl2 from unused Guest guestctl2 register */ + if (cpu_has_guestctl2) + write_c0_guestctl2( + cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL]); + + /* + * We should clear linked load bit to break interrupted atomics. This + * prevents a SC on the next VCPU from succeeding by matching a LL on + * the previous VCPU. + */ + if (cpu_guest_has_rw_llb) + write_gc0_lladdr(0); + + return 0; +} + +static int kvm_vz_vcpu_put(struct kvm_vcpu *vcpu, int cpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + + if (current->flags & PF_VCPU) + kvm_vz_vcpu_save_wired(vcpu); + + kvm_lose_fpu(vcpu); + + kvm_save_gc0_index(cop0); + kvm_save_gc0_entrylo0(cop0); + kvm_save_gc0_entrylo1(cop0); + kvm_save_gc0_context(cop0); + if (cpu_guest_has_contextconfig) + kvm_save_gc0_contextconfig(cop0); +#ifdef CONFIG_64BIT + kvm_save_gc0_xcontext(cop0); + if (cpu_guest_has_contextconfig) + kvm_save_gc0_xcontextconfig(cop0); +#endif + kvm_save_gc0_pagemask(cop0); + kvm_save_gc0_pagegrain(cop0); + kvm_save_gc0_wired(cop0); + /* allow wired TLB entries to be overwritten */ + clear_gc0_wired(MIPSR6_WIRED_WIRED); + kvm_save_gc0_hwrena(cop0); + kvm_save_gc0_badvaddr(cop0); + kvm_save_gc0_entryhi(cop0); + kvm_save_gc0_status(cop0); + kvm_save_gc0_intctl(cop0); + kvm_save_gc0_epc(cop0); + kvm_write_sw_gc0_ebase(cop0, kvm_vz_read_gc0_ebase()); + if (cpu_guest_has_userlocal) + kvm_save_gc0_userlocal(cop0); + + /* only save implemented config registers */ + kvm_save_gc0_config(cop0); + if (cpu_guest_has_conf1) + kvm_save_gc0_config1(cop0); + if (cpu_guest_has_conf2) + kvm_save_gc0_config2(cop0); + if (cpu_guest_has_conf3) + kvm_save_gc0_config3(cop0); + if (cpu_guest_has_conf4) + kvm_save_gc0_config4(cop0); + if (cpu_guest_has_conf5) + kvm_save_gc0_config5(cop0); + if (cpu_guest_has_conf6) + kvm_save_gc0_config6(cop0); + if (cpu_guest_has_conf7) + kvm_save_gc0_config7(cop0); + + kvm_save_gc0_errorepc(cop0); + + /* save KScratch registers if enabled in guest */ + if (cpu_guest_has_conf4) { + if (cpu_guest_has_kscr(2)) + kvm_save_gc0_kscratch1(cop0); + if (cpu_guest_has_kscr(3)) + kvm_save_gc0_kscratch2(cop0); + if (cpu_guest_has_kscr(4)) + kvm_save_gc0_kscratch3(cop0); + if (cpu_guest_has_kscr(5)) + kvm_save_gc0_kscratch4(cop0); + if (cpu_guest_has_kscr(6)) + kvm_save_gc0_kscratch5(cop0); + if (cpu_guest_has_kscr(7)) + kvm_save_gc0_kscratch6(cop0); + } + + if (cpu_guest_has_badinstr) + kvm_save_gc0_badinstr(cop0); + if (cpu_guest_has_badinstrp) + kvm_save_gc0_badinstrp(cop0); + + if (cpu_guest_has_segments) { + kvm_save_gc0_segctl0(cop0); + kvm_save_gc0_segctl1(cop0); + kvm_save_gc0_segctl2(cop0); + } + + /* save HTW registers if enabled in guest */ + if (cpu_guest_has_htw && + kvm_read_sw_gc0_config3(cop0) & MIPS_CONF3_PW) { + kvm_save_gc0_pwbase(cop0); + kvm_save_gc0_pwfield(cop0); + kvm_save_gc0_pwsize(cop0); + kvm_save_gc0_pwctl(cop0); + } + + kvm_vz_save_timer(vcpu); + + /* save Root.GuestCtl2 in unused Guest guestctl2 register */ + if (cpu_has_guestctl2) + cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = + read_c0_guestctl2(); + + return 0; +} + +/** + * kvm_vz_resize_guest_vtlb() - Attempt to resize guest VTLB. + * @size: Number of guest VTLB entries (0 < @size <= root VTLB entries). + * + * Attempt to resize the guest VTLB by writing guest Config registers. This is + * necessary for cores with a shared root/guest TLB to avoid overlap with wired + * entries in the root VTLB. + * + * Returns: The resulting guest VTLB size. + */ +static unsigned int kvm_vz_resize_guest_vtlb(unsigned int size) +{ + unsigned int config4 = 0, ret = 0, limit; + + /* Write MMUSize - 1 into guest Config registers */ + if (cpu_guest_has_conf1) + change_gc0_config1(MIPS_CONF1_TLBS, + (size - 1) << MIPS_CONF1_TLBS_SHIFT); + if (cpu_guest_has_conf4) { + config4 = read_gc0_config4(); + if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) { + config4 &= ~MIPS_CONF4_VTLBSIZEEXT; + config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) << + MIPS_CONF4_VTLBSIZEEXT_SHIFT; + } else if ((config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) { + config4 &= ~MIPS_CONF4_MMUSIZEEXT; + config4 |= ((size - 1) >> MIPS_CONF1_TLBS_SIZE) << + MIPS_CONF4_MMUSIZEEXT_SHIFT; + } + write_gc0_config4(config4); + } + + /* + * Set Guest.Wired.Limit = 0 (no limit up to Guest.MMUSize-1), unless it + * would exceed Root.Wired.Limit (clearing Guest.Wired.Wired so write + * not dropped) + */ + if (cpu_has_mips_r6) { + limit = (read_c0_wired() & MIPSR6_WIRED_LIMIT) >> + MIPSR6_WIRED_LIMIT_SHIFT; + if (size - 1 <= limit) + limit = 0; + write_gc0_wired(limit << MIPSR6_WIRED_LIMIT_SHIFT); + } + + /* Read back MMUSize - 1 */ + back_to_back_c0_hazard(); + if (cpu_guest_has_conf1) + ret = (read_gc0_config1() & MIPS_CONF1_TLBS) >> + MIPS_CONF1_TLBS_SHIFT; + if (config4) { + if (cpu_has_mips_r6 || (config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_VTLBSIZEEXT) + ret |= ((config4 & MIPS_CONF4_VTLBSIZEEXT) >> + MIPS_CONF4_VTLBSIZEEXT_SHIFT) << + MIPS_CONF1_TLBS_SIZE; + else if ((config4 & MIPS_CONF4_MMUEXTDEF) == + MIPS_CONF4_MMUEXTDEF_MMUSIZEEXT) + ret |= ((config4 & MIPS_CONF4_MMUSIZEEXT) >> + MIPS_CONF4_MMUSIZEEXT_SHIFT) << + MIPS_CONF1_TLBS_SIZE; + } + return ret + 1; +} + +static int kvm_vz_hardware_enable(void) +{ + unsigned int mmu_size, guest_mmu_size, ftlb_size; + u64 guest_cvmctl, cvmvmconfig; + + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* Set up guest timer/perfcount IRQ lines */ + guest_cvmctl = read_gc0_cvmctl(); + guest_cvmctl &= ~CVMCTL_IPTI; + guest_cvmctl |= 7ull << CVMCTL_IPTI_SHIFT; + guest_cvmctl &= ~CVMCTL_IPPCI; + guest_cvmctl |= 6ull << CVMCTL_IPPCI_SHIFT; + write_gc0_cvmctl(guest_cvmctl); + + cvmvmconfig = read_c0_cvmvmconfig(); + /* No I/O hole translation. */ + cvmvmconfig |= CVMVMCONF_DGHT; + /* Halve the root MMU size */ + mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1) + >> CVMVMCONF_MMUSIZEM1_S) + 1; + guest_mmu_size = mmu_size / 2; + mmu_size -= guest_mmu_size; + cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1; + cvmvmconfig |= mmu_size - 1; + write_c0_cvmvmconfig(cvmvmconfig); + + /* Update our records */ + current_cpu_data.tlbsize = mmu_size; + current_cpu_data.tlbsizevtlb = mmu_size; + current_cpu_data.guest.tlbsize = guest_mmu_size; + + /* Flush moved entries in new (guest) context */ + kvm_vz_local_flush_guesttlb_all(); + break; + default: + /* + * ImgTec cores tend to use a shared root/guest TLB. To avoid + * overlap of root wired and guest entries, the guest TLB may + * need resizing. + */ + mmu_size = current_cpu_data.tlbsizevtlb; + ftlb_size = current_cpu_data.tlbsize - mmu_size; + + /* Try switching to maximum guest VTLB size for flush */ + guest_mmu_size = kvm_vz_resize_guest_vtlb(mmu_size); + current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; + kvm_vz_local_flush_guesttlb_all(); + + /* + * Reduce to make space for root wired entries and at least 2 + * root non-wired entries. This does assume that long-term wired + * entries won't be added later. + */ + guest_mmu_size = mmu_size - num_wired_entries() - 2; + guest_mmu_size = kvm_vz_resize_guest_vtlb(guest_mmu_size); + current_cpu_data.guest.tlbsize = guest_mmu_size + ftlb_size; + + /* + * Write the VTLB size, but if another CPU has already written, + * check it matches or we won't provide a consistent view to the + * guest. If this ever happens it suggests an asymmetric number + * of wired entries. + */ + if (cmpxchg(&kvm_vz_guest_vtlb_size, 0, guest_mmu_size) && + WARN(guest_mmu_size != kvm_vz_guest_vtlb_size, + "Available guest VTLB size mismatch")) + return -EINVAL; + break; + } + + /* + * Enable virtualization features granting guest direct control of + * certain features: + * CP0=1: Guest coprocessor 0 context. + * AT=Guest: Guest MMU. + * CG=1: Hit (virtual address) CACHE operations (optional). + * CF=1: Guest Config registers. + * CGI=1: Indexed flush CACHE operations (optional). + */ + write_c0_guestctl0(MIPS_GCTL0_CP0 | + (MIPS_GCTL0_AT_GUEST << MIPS_GCTL0_AT_SHIFT) | + MIPS_GCTL0_CG | MIPS_GCTL0_CF); + if (cpu_has_guestctl0ext) + set_c0_guestctl0ext(MIPS_GCTL0EXT_CGI); + + if (cpu_has_guestid) { + write_c0_guestctl1(0); + kvm_vz_local_flush_roottlb_all_guests(); + + GUESTID_MASK = current_cpu_data.guestid_mask; + GUESTID_FIRST_VERSION = GUESTID_MASK + 1; + GUESTID_VERSION_MASK = ~GUESTID_MASK; + + current_cpu_data.guestid_cache = GUESTID_FIRST_VERSION; + } + + /* clear any pending injected virtual guest interrupts */ + if (cpu_has_guestctl2) + clear_c0_guestctl2(0x3f << 10); + + return 0; +} + +static void kvm_vz_hardware_disable(void) +{ + u64 cvmvmconfig; + unsigned int mmu_size; + + /* Flush any remaining guest TLB entries */ + kvm_vz_local_flush_guesttlb_all(); + + switch (current_cpu_type()) { + case CPU_CAVIUM_OCTEON3: + /* + * Allocate whole TLB for root. Existing guest TLB entries will + * change ownership to the root TLB. We should be safe though as + * they've already been flushed above while in guest TLB. + */ + cvmvmconfig = read_c0_cvmvmconfig(); + mmu_size = ((cvmvmconfig & CVMVMCONF_MMUSIZEM1) + >> CVMVMCONF_MMUSIZEM1_S) + 1; + cvmvmconfig &= ~CVMVMCONF_RMMUSIZEM1; + cvmvmconfig |= mmu_size - 1; + write_c0_cvmvmconfig(cvmvmconfig); + + /* Update our records */ + current_cpu_data.tlbsize = mmu_size; + current_cpu_data.tlbsizevtlb = mmu_size; + current_cpu_data.guest.tlbsize = 0; + + /* Flush moved entries in new (root) context */ + local_flush_tlb_all(); + break; + } + + if (cpu_has_guestid) { + write_c0_guestctl1(0); + kvm_vz_local_flush_roottlb_all_guests(); + } +} + +static int kvm_vz_check_extension(struct kvm *kvm, long ext) +{ + int r; + + switch (ext) { + case KVM_CAP_MIPS_VZ: + /* we wouldn't be here unless cpu_has_vz */ + r = 1; + break; +#ifdef CONFIG_64BIT + case KVM_CAP_MIPS_64BIT: + /* We support 64-bit registers/operations and addresses */ + r = 2; + break; +#endif + default: + r = 0; + break; + } + + return r; +} + +static int kvm_vz_vcpu_init(struct kvm_vcpu *vcpu) +{ + int i; + + for_each_possible_cpu(i) + vcpu->arch.vzguestid[i] = 0; + + return 0; +} + +static void kvm_vz_vcpu_uninit(struct kvm_vcpu *vcpu) +{ + int cpu; + + /* + * If the VCPU is freed and reused as another VCPU, we don't want the + * matching pointer wrongly hanging around in last_vcpu[] or + * last_exec_vcpu[]. + */ + for_each_possible_cpu(cpu) { + if (last_vcpu[cpu] == vcpu) + last_vcpu[cpu] = NULL; + if (last_exec_vcpu[cpu] == vcpu) + last_exec_vcpu[cpu] = NULL; + } +} + +static int kvm_vz_vcpu_setup(struct kvm_vcpu *vcpu) +{ + struct mips_coproc *cop0 = vcpu->arch.cop0; + unsigned long count_hz = 100*1000*1000; /* default to 100 MHz */ + + /* + * Start off the timer at the same frequency as the host timer, but the + * soft timer doesn't handle frequencies greater than 1GHz yet. + */ + if (mips_hpt_frequency && mips_hpt_frequency <= NSEC_PER_SEC) + count_hz = mips_hpt_frequency; + kvm_mips_init_count(vcpu, count_hz); + + /* + * Initialize guest register state to valid architectural reset state. + */ + + /* PageGrain */ + if (cpu_has_mips_r6) + kvm_write_sw_gc0_pagegrain(cop0, PG_RIE | PG_XIE | PG_IEC); + /* Wired */ + if (cpu_has_mips_r6) + kvm_write_sw_gc0_wired(cop0, + read_gc0_wired() & MIPSR6_WIRED_LIMIT); + /* Status */ + kvm_write_sw_gc0_status(cop0, ST0_BEV | ST0_ERL); + if (cpu_has_mips_r6) + kvm_change_sw_gc0_status(cop0, ST0_FR, read_gc0_status()); + /* IntCtl */ + kvm_write_sw_gc0_intctl(cop0, read_gc0_intctl() & + (INTCTLF_IPFDC | INTCTLF_IPPCI | INTCTLF_IPTI)); + /* PRId */ + kvm_write_sw_gc0_prid(cop0, boot_cpu_data.processor_id); + /* EBase */ + kvm_write_sw_gc0_ebase(cop0, (s32)0x80000000 | vcpu->vcpu_id); + /* Config */ + kvm_save_gc0_config(cop0); + /* architecturally writable (e.g. from guest) */ + kvm_change_sw_gc0_config(cop0, CONF_CM_CMASK, + _page_cachable_default >> _CACHE_SHIFT); + /* architecturally read only, but maybe writable from root */ + kvm_change_sw_gc0_config(cop0, MIPS_CONF_MT, read_c0_config()); + if (cpu_guest_has_conf1) { + kvm_set_sw_gc0_config(cop0, MIPS_CONF_M); + /* Config1 */ + kvm_save_gc0_config1(cop0); + /* architecturally read only, but maybe writable from root */ + kvm_clear_sw_gc0_config1(cop0, MIPS_CONF1_C2 | + MIPS_CONF1_MD | + MIPS_CONF1_PC | + MIPS_CONF1_WR | + MIPS_CONF1_CA | + MIPS_CONF1_FP); + } + if (cpu_guest_has_conf2) { + kvm_set_sw_gc0_config1(cop0, MIPS_CONF_M); + /* Config2 */ + kvm_save_gc0_config2(cop0); + } + if (cpu_guest_has_conf3) { + kvm_set_sw_gc0_config2(cop0, MIPS_CONF_M); + /* Config3 */ + kvm_save_gc0_config3(cop0); + /* architecturally writable (e.g. from guest) */ + kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_ISA_OE); + /* architecturally read only, but maybe writable from root */ + kvm_clear_sw_gc0_config3(cop0, MIPS_CONF3_MSA | + MIPS_CONF3_BPG | + MIPS_CONF3_ULRI | + MIPS_CONF3_DSP | + MIPS_CONF3_CTXTC | + MIPS_CONF3_ITL | + MIPS_CONF3_LPA | + MIPS_CONF3_VEIC | + MIPS_CONF3_VINT | + MIPS_CONF3_SP | + MIPS_CONF3_CDMM | + MIPS_CONF3_MT | + MIPS_CONF3_SM | + MIPS_CONF3_TL); + } + if (cpu_guest_has_conf4) { + kvm_set_sw_gc0_config3(cop0, MIPS_CONF_M); + /* Config4 */ + kvm_save_gc0_config4(cop0); + } + if (cpu_guest_has_conf5) { + kvm_set_sw_gc0_config4(cop0, MIPS_CONF_M); + /* Config5 */ + kvm_save_gc0_config5(cop0); + /* architecturally writable (e.g. from guest) */ + kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_K | + MIPS_CONF5_CV | + MIPS_CONF5_MSAEN | + MIPS_CONF5_UFE | + MIPS_CONF5_FRE | + MIPS_CONF5_SBRI | + MIPS_CONF5_UFR); + /* architecturally read only, but maybe writable from root */ + kvm_clear_sw_gc0_config5(cop0, MIPS_CONF5_MRP); + } + + if (cpu_guest_has_contextconfig) { + /* ContextConfig */ + kvm_write_sw_gc0_contextconfig(cop0, 0x007ffff0); +#ifdef CONFIG_64BIT + /* XContextConfig */ + /* bits SEGBITS-13+3:4 set */ + kvm_write_sw_gc0_xcontextconfig(cop0, + ((1ull << (cpu_vmbits - 13)) - 1) << 4); +#endif + } + + /* Implementation dependent, use the legacy layout */ + if (cpu_guest_has_segments) { + /* SegCtl0, SegCtl1, SegCtl2 */ + kvm_write_sw_gc0_segctl0(cop0, 0x00200010); + kvm_write_sw_gc0_segctl1(cop0, 0x00000002 | + (_page_cachable_default >> _CACHE_SHIFT) << + (16 + MIPS_SEGCFG_C_SHIFT)); + kvm_write_sw_gc0_segctl2(cop0, 0x00380438); + } + + /* reset HTW registers */ + if (cpu_guest_has_htw && cpu_has_mips_r6) { + /* PWField */ + kvm_write_sw_gc0_pwfield(cop0, 0x0c30c302); + /* PWSize */ + kvm_write_sw_gc0_pwsize(cop0, 1 << MIPS_PWSIZE_PTW_SHIFT); + } + + /* start with no pending virtual guest interrupts */ + if (cpu_has_guestctl2) + cop0->reg[MIPS_CP0_GUESTCTL2][MIPS_CP0_GUESTCTL2_SEL] = 0; + + /* Put PC at reset vector */ + vcpu->arch.pc = CKSEG1ADDR(0x1fc00000); + + return 0; +} + +static void kvm_vz_flush_shadow_all(struct kvm *kvm) +{ + if (cpu_has_guestid) { + /* Flush GuestID for each VCPU individually */ + kvm_flush_remote_tlbs(kvm); + } else { + /* + * For each CPU there is a single GPA ASID used by all VCPUs in + * the VM, so it doesn't make sense for the VCPUs to handle + * invalidation of these ASIDs individually. + * + * Instead mark all CPUs as needing ASID invalidation in + * asid_flush_mask, and just use kvm_flush_remote_tlbs(kvm) to + * kick any running VCPUs so they check asid_flush_mask. + */ + cpumask_setall(&kvm->arch.asid_flush_mask); + kvm_flush_remote_tlbs(kvm); + } +} + +static void kvm_vz_flush_shadow_memslot(struct kvm *kvm, + const struct kvm_memory_slot *slot) +{ + kvm_vz_flush_shadow_all(kvm); +} + +static void kvm_vz_vcpu_reenter(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + int preserve_guest_tlb; + + preserve_guest_tlb = kvm_vz_check_requests(vcpu, cpu); + + if (preserve_guest_tlb) + kvm_vz_vcpu_save_wired(vcpu); + + kvm_vz_vcpu_load_tlb(vcpu, cpu); + + if (preserve_guest_tlb) + kvm_vz_vcpu_load_wired(vcpu); +} + +static int kvm_vz_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu) +{ + int cpu = smp_processor_id(); + int r; + + kvm_vz_acquire_htimer(vcpu); + /* Check if we have any exceptions/interrupts pending */ + kvm_mips_deliver_interrupts(vcpu, read_gc0_cause()); + + kvm_vz_check_requests(vcpu, cpu); + kvm_vz_vcpu_load_tlb(vcpu, cpu); + kvm_vz_vcpu_load_wired(vcpu); + + r = vcpu->arch.vcpu_run(run, vcpu); + + kvm_vz_vcpu_save_wired(vcpu); + + return r; +} + +static struct kvm_mips_callbacks kvm_vz_callbacks = { + .handle_cop_unusable = kvm_trap_vz_handle_cop_unusable, + .handle_tlb_mod = kvm_trap_vz_handle_tlb_st_miss, + .handle_tlb_ld_miss = kvm_trap_vz_handle_tlb_ld_miss, + .handle_tlb_st_miss = kvm_trap_vz_handle_tlb_st_miss, + .handle_addr_err_st = kvm_trap_vz_no_handler, + .handle_addr_err_ld = kvm_trap_vz_no_handler, + .handle_syscall = kvm_trap_vz_no_handler, + .handle_res_inst = kvm_trap_vz_no_handler, + .handle_break = kvm_trap_vz_no_handler, + .handle_msa_disabled = kvm_trap_vz_handle_msa_disabled, + .handle_guest_exit = kvm_trap_vz_handle_guest_exit, + + .hardware_enable = kvm_vz_hardware_enable, + .hardware_disable = kvm_vz_hardware_disable, + .check_extension = kvm_vz_check_extension, + .vcpu_init = kvm_vz_vcpu_init, + .vcpu_uninit = kvm_vz_vcpu_uninit, + .vcpu_setup = kvm_vz_vcpu_setup, + .flush_shadow_all = kvm_vz_flush_shadow_all, + .flush_shadow_memslot = kvm_vz_flush_shadow_memslot, + .gva_to_gpa = kvm_vz_gva_to_gpa_cb, + .queue_timer_int = kvm_vz_queue_timer_int_cb, + .dequeue_timer_int = kvm_vz_dequeue_timer_int_cb, + .queue_io_int = kvm_vz_queue_io_int_cb, + .dequeue_io_int = kvm_vz_dequeue_io_int_cb, + .irq_deliver = kvm_vz_irq_deliver_cb, + .irq_clear = kvm_vz_irq_clear_cb, + .num_regs = kvm_vz_num_regs, + .copy_reg_indices = kvm_vz_copy_reg_indices, + .get_one_reg = kvm_vz_get_one_reg, + .set_one_reg = kvm_vz_set_one_reg, + .vcpu_load = kvm_vz_vcpu_load, + .vcpu_put = kvm_vz_vcpu_put, + .vcpu_run = kvm_vz_vcpu_run, + .vcpu_reenter = kvm_vz_vcpu_reenter, +}; + +int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks) +{ + if (!cpu_has_vz) + return -ENODEV; + + /* + * VZ requires at least 2 KScratch registers, so it should have been + * possible to allocate pgd_reg. + */ + if (WARN(pgd_reg == -1, + "pgd_reg not allocated even though cpu_has_vz\n")) + return -ENODEV; + + pr_info("Starting KVM with MIPS VZ extensions\n"); + + *install_callbacks = &kvm_vz_callbacks; + return 0; +} diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 6db341347202..899e46279902 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -24,6 +24,7 @@ /* Cache operations. */ void (*flush_cache_all)(void); void (*__flush_cache_all)(void); +EXPORT_SYMBOL_GPL(__flush_cache_all); void (*flush_cache_mm)(struct mm_struct *mm); void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, unsigned long end); diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index aa75849c36bc..3ca20283b31e 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -348,7 +348,7 @@ void maar_init(void) upper = ((upper & MIPS_MAAR_ADDR) << 4) | 0xffff; pr_info(" [%d]: ", i / 2); - if (!(attr & MIPS_MAAR_V)) { + if (!(attr & MIPS_MAAR_VL)) { pr_cont("disabled\n"); continue; } diff --git a/arch/powerpc/include/asm/disassemble.h b/arch/powerpc/include/asm/disassemble.h index 4852e849128b..c0a55050f70f 100644 --- a/arch/powerpc/include/asm/disassemble.h +++ b/arch/powerpc/include/asm/disassemble.h @@ -87,6 +87,11 @@ static inline unsigned int get_oc(u32 inst) return (inst >> 11) & 0x7fff; } +static inline unsigned int get_tx_or_sx(u32 inst) +{ + return (inst) & 0x1; +} + #define IS_XFORM(inst) (get_op(inst) == 31) #define IS_DSFORM(inst) (get_op(inst) >= 56) diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h index d96142572e6d..8a8ce220d7d0 100644 --- a/arch/powerpc/include/asm/iommu.h +++ b/arch/powerpc/include/asm/iommu.h @@ -296,11 +296,21 @@ static inline void iommu_restore(void) #endif /* The API to support IOMMU operations for VFIO */ -extern int iommu_tce_clear_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce_value, - unsigned long npages); -extern int iommu_tce_put_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce); +extern int iommu_tce_check_ioba(unsigned long page_shift, + unsigned long offset, unsigned long size, + unsigned long ioba, unsigned long npages); +extern int iommu_tce_check_gpa(unsigned long page_shift, + unsigned long gpa); + +#define iommu_tce_clear_param_check(tbl, ioba, tce_value, npages) \ + (iommu_tce_check_ioba((tbl)->it_page_shift, \ + (tbl)->it_offset, (tbl)->it_size, \ + (ioba), (npages)) || (tce_value)) +#define iommu_tce_put_param_check(tbl, ioba, gpa) \ + (iommu_tce_check_ioba((tbl)->it_page_shift, \ + (tbl)->it_offset, (tbl)->it_size, \ + (ioba), 1) || \ + iommu_tce_check_gpa((tbl)->it_page_shift, (gpa))) extern void iommu_flush_tce(struct iommu_table *tbl); extern int iommu_take_ownership(struct iommu_table *tbl); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 7bba8f415627..77c60826d145 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -45,9 +45,6 @@ #define __KVM_HAVE_ARCH_INTC_INITIALIZED -#ifdef CONFIG_KVM_MMIO -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 -#endif #define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */ /* These values are internal and can be increased later */ @@ -191,6 +188,13 @@ struct kvmppc_pginfo { atomic_t refcnt; }; +struct kvmppc_spapr_tce_iommu_table { + struct rcu_head rcu; + struct list_head next; + struct iommu_table *tbl; + struct kref kref; +}; + struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; @@ -199,6 +203,7 @@ struct kvmppc_spapr_tce_table { u32 page_shift; u64 offset; /* in pages */ u64 size; /* window size in pages */ + struct list_head iommu_tables; struct page *pages[0]; }; @@ -345,6 +350,7 @@ struct kvmppc_pte { bool may_read : 1; bool may_write : 1; bool may_execute : 1; + unsigned long wimg; u8 page_size; /* MMU_PAGE_xxx */ }; @@ -441,6 +447,11 @@ struct mmio_hpte_cache { unsigned int index; }; +#define KVMPPC_VSX_COPY_NONE 0 +#define KVMPPC_VSX_COPY_WORD 1 +#define KVMPPC_VSX_COPY_DWORD 2 +#define KVMPPC_VSX_COPY_DWORD_LOAD_DUMP 3 + struct openpic; struct kvm_vcpu_arch { @@ -644,6 +655,21 @@ struct kvm_vcpu_arch { u8 io_gpr; /* GPR used as IO source/target */ u8 mmio_host_swabbed; u8 mmio_sign_extend; + /* conversion between single and double precision */ + u8 mmio_sp64_extend; + /* + * Number of simulations for vsx. + * If we use 2*8bytes to simulate 1*16bytes, + * then the number should be 2 and + * mmio_vsx_copy_type=KVMPPC_VSX_COPY_DWORD. + * If we use 4*4bytes to simulate 1*16bytes, + * the number should be 4 and + * mmio_vsx_copy_type=KVMPPC_VSX_COPY_WORD. + */ + u8 mmio_vsx_copy_nums; + u8 mmio_vsx_offset; + u8 mmio_vsx_copy_type; + u8 mmio_vsx_tx_sx_enabled; u8 osi_needed; u8 osi_enabled; u8 papr_enabled; @@ -732,6 +758,8 @@ struct kvm_vcpu_arch { }; #define VCPU_FPR(vcpu, i) (vcpu)->arch.fp.fpr[i][TS_FPROFFSET] +#define VCPU_VSX_FPR(vcpu, i, j) ((vcpu)->arch.fp.fpr[i][j]) +#define VCPU_VSX_VR(vcpu, i) ((vcpu)->arch.vr.vr[i]) /* Values for vcpu->arch.state */ #define KVMPPC_VCPU_NOTREADY 0 @@ -745,6 +773,7 @@ struct kvm_vcpu_arch { #define KVM_MMIO_REG_FPR 0x0020 #define KVM_MMIO_REG_QPR 0x0040 #define KVM_MMIO_REG_FQPR 0x0060 +#define KVM_MMIO_REG_VSX 0x0080 #define __KVM_HAVE_ARCH_WQP #define __KVM_HAVE_CREATE_DEVICE diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index c3877992eff9..76e940a3c145 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -78,9 +78,15 @@ extern int kvmppc_handle_load(struct kvm_run *run, struct kvm_vcpu *vcpu, extern int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int rt, unsigned int bytes, int is_default_endian); +extern int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int mmio_sign_extend); extern int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian); +extern int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, unsigned int bytes, + int is_default_endian); extern int kvmppc_load_last_inst(struct kvm_vcpu *vcpu, enum instruction_type type, u32 *inst); @@ -132,6 +138,9 @@ extern void kvmppc_core_vcpu_put(struct kvm_vcpu *vcpu); extern int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu); extern int kvmppc_core_pending_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags); +extern void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu); +extern void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_dequeue_dec(struct kvm_vcpu *vcpu); extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, @@ -164,13 +173,19 @@ extern long kvmppc_prepare_vrma(struct kvm *kvm, extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp); +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( - struct kvm_vcpu *vcpu, unsigned long liobn); -extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages); + struct kvm *kvm, unsigned long liobn); +#define kvmppc_ioba_validate(stt, ioba, npages) \ + (iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \ + (stt)->size, (ioba), (npages)) ? \ + H_PARAMETER : H_SUCCESS) extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, unsigned long tce); extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, @@ -240,6 +255,7 @@ union kvmppc_one_reg { u64 dval; vector128 vval; u64 vsxval[2]; + u32 vsx32val[4]; struct { u64 addr; u64 length; diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 142d78d645f4..3a8d278e7421 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -86,32 +86,79 @@ #define OP_TRAP_64 2 #define OP_31_XOP_TRAP 4 +#define OP_31_XOP_LDX 21 #define OP_31_XOP_LWZX 23 +#define OP_31_XOP_LDUX 53 #define OP_31_XOP_DCBST 54 #define OP_31_XOP_LWZUX 55 #define OP_31_XOP_TRAP_64 68 #define OP_31_XOP_DCBF 86 #define OP_31_XOP_LBZX 87 +#define OP_31_XOP_STDX 149 #define OP_31_XOP_STWX 151 +#define OP_31_XOP_STDUX 181 +#define OP_31_XOP_STWUX 183 #define OP_31_XOP_STBX 215 #define OP_31_XOP_LBZUX 119 #define OP_31_XOP_STBUX 247 #define OP_31_XOP_LHZX 279 #define OP_31_XOP_LHZUX 311 #define OP_31_XOP_MFSPR 339 +#define OP_31_XOP_LWAX 341 #define OP_31_XOP_LHAX 343 +#define OP_31_XOP_LWAUX 373 #define OP_31_XOP_LHAUX 375 #define OP_31_XOP_STHX 407 #define OP_31_XOP_STHUX 439 #define OP_31_XOP_MTSPR 467 #define OP_31_XOP_DCBI 470 +#define OP_31_XOP_LDBRX 532 #define OP_31_XOP_LWBRX 534 #define OP_31_XOP_TLBSYNC 566 +#define OP_31_XOP_STDBRX 660 #define OP_31_XOP_STWBRX 662 +#define OP_31_XOP_STFSX 663 +#define OP_31_XOP_STFSUX 695 +#define OP_31_XOP_STFDX 727 +#define OP_31_XOP_STFDUX 759 #define OP_31_XOP_LHBRX 790 +#define OP_31_XOP_LFIWAX 855 +#define OP_31_XOP_LFIWZX 887 #define OP_31_XOP_STHBRX 918 +#define OP_31_XOP_STFIWX 983 + +/* VSX Scalar Load Instructions */ +#define OP_31_XOP_LXSDX 588 +#define OP_31_XOP_LXSSPX 524 +#define OP_31_XOP_LXSIWAX 76 +#define OP_31_XOP_LXSIWZX 12 + +/* VSX Scalar Store Instructions */ +#define OP_31_XOP_STXSDX 716 +#define OP_31_XOP_STXSSPX 652 +#define OP_31_XOP_STXSIWX 140 + +/* VSX Vector Load Instructions */ +#define OP_31_XOP_LXVD2X 844 +#define OP_31_XOP_LXVW4X 780 + +/* VSX Vector Load and Splat Instruction */ +#define OP_31_XOP_LXVDSX 332 + +/* VSX Vector Store Instructions */ +#define OP_31_XOP_STXVD2X 972 +#define OP_31_XOP_STXVW4X 908 + +#define OP_31_XOP_LFSX 535 +#define OP_31_XOP_LFSUX 567 +#define OP_31_XOP_LFDX 599 +#define OP_31_XOP_LFDUX 631 #define OP_LWZ 32 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 #define OP_LD 58 #define OP_LWZU 33 #define OP_LBZ 34 @@ -127,6 +174,17 @@ #define OP_LHAU 43 #define OP_STH 44 #define OP_STHU 45 +#define OP_LMW 46 +#define OP_STMW 47 +#define OP_LFS 48 +#define OP_LFSU 49 +#define OP_LFD 50 +#define OP_LFDU 51 +#define OP_STFS 52 +#define OP_STFSU 53 +#define OP_STFD 54 +#define OP_STFDU 55 +#define OP_LQ 56 /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 4edbe4bb0e8b..07fbeb927834 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -29,6 +29,9 @@ #define __KVM_HAVE_IRQ_LINE #define __KVM_HAVE_GUEST_DEBUG +/* Not always available, but if it is, this is the correct offset. */ +#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 + struct kvm_regs { __u64 pc; __u64 cr; diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 5a3231fedf08..f2b724cd9e64 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -963,47 +963,36 @@ void iommu_flush_tce(struct iommu_table *tbl) } EXPORT_SYMBOL_GPL(iommu_flush_tce); -int iommu_tce_clear_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce_value, - unsigned long npages) +int iommu_tce_check_ioba(unsigned long page_shift, + unsigned long offset, unsigned long size, + unsigned long ioba, unsigned long npages) { - /* tbl->it_ops->clear() does not support any value but 0 */ - if (tce_value) - return -EINVAL; + unsigned long mask = (1UL << page_shift) - 1; - if (ioba & ~IOMMU_PAGE_MASK(tbl)) + if (ioba & mask) return -EINVAL; - ioba >>= tbl->it_page_shift; - if (ioba < tbl->it_offset) + ioba >>= page_shift; + if (ioba < offset) return -EINVAL; - if ((ioba + npages) > (tbl->it_offset + tbl->it_size)) + if ((ioba + 1) > (offset + size)) return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(iommu_tce_clear_param_check); +EXPORT_SYMBOL_GPL(iommu_tce_check_ioba); -int iommu_tce_put_param_check(struct iommu_table *tbl, - unsigned long ioba, unsigned long tce) +int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) { - if (tce & ~IOMMU_PAGE_MASK(tbl)) - return -EINVAL; - - if (ioba & ~IOMMU_PAGE_MASK(tbl)) - return -EINVAL; - - ioba >>= tbl->it_page_shift; - if (ioba < tbl->it_offset) - return -EINVAL; + unsigned long mask = (1UL << page_shift) - 1; - if ((ioba + 1) > (tbl->it_offset + tbl->it_size)) + if (gpa & mask) return -EINVAL; return 0; } -EXPORT_SYMBOL_GPL(iommu_tce_put_param_check); +EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction) diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 029be26b5a17..65a471de96de 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -67,6 +67,7 @@ config KVM_BOOK3S_64 select KVM_BOOK3S_64_HANDLER select KVM select KVM_BOOK3S_PR_POSSIBLE if !KVM_BOOK3S_HV_POSSIBLE + select SPAPR_TCE_IOMMU if IOMMU_SUPPORT ---help--- Support running unmodified book3s_64 and book3s_32 guest kernels in virtual machines on book3s_64 host processors. diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index aedacefd961d..8c4d7e9d27d2 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -197,6 +197,24 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong flags) } EXPORT_SYMBOL_GPL(kvmppc_core_queue_program); +void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_FP_UNAVAIL, 0); +} + +void kvmppc_core_queue_vec_unavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_ALTIVEC, 0); +} + +void kvmppc_core_queue_vsx_unavail(struct kvm_vcpu *vcpu) +{ + /* might as well deliver this straight away */ + kvmppc_inject_interrupt(vcpu, BOOK3S_INTERRUPT_VSX, 0); +} + void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DECREMENTER); diff --git a/arch/powerpc/kvm/book3s_64_mmu.c b/arch/powerpc/kvm/book3s_64_mmu.c index 70153578131a..29ebe2fd5867 100644 --- a/arch/powerpc/kvm/book3s_64_mmu.c +++ b/arch/powerpc/kvm/book3s_64_mmu.c @@ -319,6 +319,7 @@ do_second: gpte->may_execute = true; gpte->may_read = false; gpte->may_write = false; + gpte->wimg = r & HPTE_R_WIMG; switch (pp) { case 0: diff --git a/arch/powerpc/kvm/book3s_64_mmu_host.c b/arch/powerpc/kvm/book3s_64_mmu_host.c index 74b0153780e3..9a4614cd0e53 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_host.c +++ b/arch/powerpc/kvm/book3s_64_mmu_host.c @@ -145,6 +145,8 @@ int kvmppc_mmu_map_page(struct kvm_vcpu *vcpu, struct kvmppc_pte *orig_pte, else kvmppc_mmu_flush_icache(pfn); + rflags = (rflags & ~HPTE_R_WIMG) | orig_pte->wimg; + /* * Use 64K pages if possible; otherwise, on 64K page kernels, * we need to transfer 4 more bits from guest real to host real addr. @@ -177,12 +179,15 @@ map_again: ret = mmu_hash_ops.hpte_insert(hpteg, vpn, hpaddr, rflags, vflags, hpsize, hpsize, MMU_SEGSIZE_256M); - if (ret < 0) { + if (ret == -1) { /* If we couldn't map a primary PTE, try a secondary */ hash = ~hash; vflags ^= HPTE_V_SECONDARY; attempt++; goto map_again; + } else if (ret < 0) { + r = -EIO; + goto out_unlock; } else { trace_kvm_book3s_64_mmu_map(rflags, hpteg, vpn, hpaddr, orig_pte); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 3e26cd4979f9..a160c14304eb 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,8 @@ #include <linux/hugetlb.h> #include <linux/list.h> #include <linux/anon_inodes.h> +#include <linux/iommu.h> +#include <linux/file.h> #include <asm/tlbflush.h> #include <asm/kvm_ppc.h> @@ -40,6 +42,7 @@ #include <asm/udbg.h> #include <asm/iommu.h> #include <asm/tce.h> +#include <asm/mmu_context.h> static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { @@ -91,6 +94,137 @@ static long kvmppc_account_memlimit(unsigned long stt_pages, bool inc) return ret; } +static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, + struct kvmppc_spapr_tce_iommu_table, rcu); + + iommu_tce_table_put(stit->tbl); + + kfree(stit); +} + +static void kvm_spapr_tce_liobn_put(struct kref *kref) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref, + struct kvmppc_spapr_tce_iommu_table, kref); + + list_del_rcu(&stit->next); + + call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); +} + +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) +{ + int i; + struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; + struct iommu_table_group *table_group = NULL; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + continue; + + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] != stit->tbl) + continue; + + kref_put(&stit->kref, kvm_spapr_tce_liobn_put); + return; + } + } + } +} + +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + bool found = false; + struct iommu_table *tbl = NULL; + struct iommu_table_group *table_group; + long i; + struct kvmppc_spapr_tce_iommu_table *stit; + struct fd f; + + f = fdget(tablefd); + if (!f.file) + return -EBADF; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt == f.file->private_data) { + found = true; + break; + } + } + + fdput(f); + + if (!found) + return -EINVAL; + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + return -EFAULT; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbltmp = table_group->tables[i]; + + if (!tbltmp) + continue; + /* + * Make sure hardware table parameters are exactly the same; + * this is used in the TCE handlers where boundary checks + * use only the first attached table. + */ + if ((tbltmp->it_page_shift == stt->page_shift) && + (tbltmp->it_offset == stt->offset) && + (tbltmp->it_size == stt->size)) { + /* + * Reference the table to avoid races with + * add/remove DMA windows. + */ + tbl = iommu_tce_table_get(tbltmp); + break; + } + } + if (!tbl) + return -EINVAL; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + if (tbl != stit->tbl) + continue; + + if (!kref_get_unless_zero(&stit->kref)) { + /* stit is being destroyed */ + iommu_tce_table_put(tbl); + return -ENOTTY; + } + /* + * The table is already known to this KVM, we just increased + * its KVM reference counter and can return. + */ + return 0; + } + + stit = kzalloc(sizeof(*stit), GFP_KERNEL); + if (!stit) { + iommu_tce_table_put(tbl); + return -ENOMEM; + } + + stit->tbl = tbl; + kref_init(&stit->kref); + + list_add_rcu(&stit->next, &stt->iommu_tables); + + return 0; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, @@ -130,9 +264,18 @@ static int kvm_spapr_tce_mmap(struct file *file, struct vm_area_struct *vma) static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; list_del_rcu(&stt->list); + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + WARN_ON(!kref_read(&stit->kref)); + while (1) { + if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put)) + break; + } + } + kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( @@ -164,7 +307,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, return -EBUSY; } - size = args->size; + size = _ALIGN_UP(args->size, PAGE_SIZE >> 3); npages = kvmppc_tce_pages(size); ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true); if (ret) { @@ -183,6 +326,7 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, stt->offset = args->offset; stt->size = size; stt->kvm = kvm; + INIT_LIST_HEAD_RCU(&stt->iommu_tables); for (i = 0; i < npages; i++) { stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -211,15 +355,106 @@ fail: return ret; } +static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg(tbl, entry, &hpa, &dir); +} + +static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) + return H_HARDWARE; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret != H_SUCCESS) + iommu_tce_xchg(tbl, entry, &hpa, &dir); + + return ret; +} + +long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + /* This only handles v2 IOMMU type, v1 is handled via ioctl() */ + return H_TOO_HARD; + + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa))) + return H_HARDWARE; + + if (mm_iommu_mapped_inc(mem)) + return H_CLOSED; + + ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); + if (WARN_ON_ONCE(ret)) { + mm_iommu_mapped_dec(mem); + return H_HARDWARE; + } + + if (dir != DMA_NONE) + kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); - long ret; + struct kvmppc_spapr_tce_table *stt; + long ret, idx; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -231,7 +466,35 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + } else { + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl, + entry, ua, dir); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -246,8 +509,9 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, unsigned long entry, ua = 0; u64 __user *tces; u64 tce; + struct kvmppc_spapr_tce_iommu_table *stit; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -284,6 +548,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -300,8 +584,9 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -313,6 +598,24 @@ long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index e4c4ea973e57..eda0a8f6fae8 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,31 @@ #include <asm/iommu.h> #include <asm/tce.h> +#ifdef CONFIG_BUG + +#define WARN_ON_ONCE_RM(condition) ({ \ + static bool __section(.data.unlikely) __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ + __stringify(condition), \ + __func__, __LINE__); \ + dump_stack(); \ + } \ + unlikely(__ret_warn_once); \ +}) + +#else + +#define WARN_ON_ONCE_RM(condition) ({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) + +#endif + #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) /* @@ -48,10 +73,9 @@ * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, unsigned long liobn) { - struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) @@ -63,27 +87,6 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, EXPORT_SYMBOL_GPL(kvmppc_find_table); /* - * Validates IO address. - * - * WARNING: This will be called in real-mode on HV KVM and virtual - * mode on PR KVM - */ -long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, - unsigned long ioba, unsigned long npages) -{ - unsigned long mask = (1ULL << stt->page_shift) - 1; - unsigned long idx = ioba >> stt->page_shift; - - if ((ioba & mask) || (idx < stt->offset) || - (idx - stt->offset + npages > stt->size) || - (idx + npages < idx)) - return H_PARAMETER; - - return H_SUCCESS; -} -EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); - -/* * Validates TCE address. * At the moment flags and page mask are validated. * As the host kernel does not access those addresses (just puts them @@ -96,10 +99,14 @@ EXPORT_SYMBOL_GPL(kvmppc_ioba_validate); */ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) { - unsigned long page_mask = ~((1ULL << stt->page_shift) - 1); - unsigned long mask = ~(page_mask | TCE_PCI_WRITE | TCE_PCI_READ); + unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); + enum dma_data_direction dir = iommu_tce_direction(tce); + + /* Allow userspace to poison TCE table */ + if (dir == DMA_NONE) + return H_SUCCESS; - if (tce & mask) + if (iommu_tce_check_gpa(stt->page_shift, gpa)) return H_PARAMETER; return H_SUCCESS; @@ -179,15 +186,122 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); +} + +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret) + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + + return ret; +} + +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + return H_TOO_HARD; + + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) + return H_HARDWARE; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) + return H_CLOSED; + + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + if (ret) { + mm_iommu_mapped_dec(mem); + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + } + + if (dir != DMA_NONE) + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -199,7 +313,32 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + else + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry, ua, dir); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -239,8 +378,10 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, long i, ret = H_SUCCESS; unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; + bool prereg = false; + struct kvmppc_spapr_tce_iommu_table *stit; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -259,23 +400,49 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) return ret; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) - return H_TOO_HARD; + if (mm_iommu_preregistered(vcpu->kvm->mm)) { + /* + * We get here if guest memory was pre-registered which + * is normally VFIO case and gpa->hpa translation does not + * depend on hpt. + */ + struct mm_iommu_table_group_mem_t *mem; - rmap = (void *) vmalloc_to_phys(rmap); + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + return H_TOO_HARD; - /* - * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). - * While we have the rmap lock, code running on other CPUs - * cannot finish unmapping the host real page that backs - * this guest real page, so we are OK to access the host - * real page. - */ - lock_rmap(rmap); - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; + mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); + if (mem) + prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &tces) == 0; + } + + if (!prereg) { + /* + * This is usually a case of a guest with emulated devices only + * when TCE list is not in preregistered memory. + * We do not require memory to be preregistered in this case + * so lock rmap and do __find_linux_pte_or_hugepte(). + */ + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + if (WARN_ON_ONCE_RM(!rmap)) + return H_HARDWARE; + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } } for (i = 0; i < npages; ++i) { @@ -285,11 +452,33 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, if (ret != H_SUCCESS) goto unlock_exit; + ua = 0; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } unlock_exit: - unlock_rmap(rmap); + if (rmap) + unlock_rmap(rmap); return ret; } @@ -300,8 +489,9 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -313,6 +503,24 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); @@ -322,12 +530,13 @@ long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; unsigned long idx; struct page *page; u64 *tbl; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 8359752b3efc..68d68983948e 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -503,10 +503,18 @@ int kvmppc_core_emulate_mtspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) break; unprivileged: default: - printk(KERN_INFO "KVM: invalid SPR write: %d\n", sprn); -#ifndef DEBUG_SPR - emulated = EMULATE_FAIL; -#endif + pr_info_ratelimited("KVM: invalid SPR write: %d\n", sprn); + if (sprn & 0x10) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { + kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV); + emulated = EMULATE_AGAIN; + } + } else { + if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + emulated = EMULATE_AGAIN; + } + } break; } @@ -648,10 +656,20 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val break; default: unprivileged: - printk(KERN_INFO "KVM: invalid SPR read: %d\n", sprn); -#ifndef DEBUG_SPR - emulated = EMULATE_FAIL; -#endif + pr_info_ratelimited("KVM: invalid SPR read: %d\n", sprn); + if (sprn & 0x10) { + if (kvmppc_get_msr(vcpu) & MSR_PR) { + kvmppc_core_queue_program(vcpu, SRR1_PROGPRIV); + emulated = EMULATE_AGAIN; + } + } else { + if ((kvmppc_get_msr(vcpu) & MSR_PR) || sprn == 0 || + sprn == 4 || sprn == 5 || sprn == 6) { + kvmppc_core_queue_program(vcpu, SRR1_PROGILL); + emulated = EMULATE_AGAIN; + } + } + break; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index fadb75abfe37..549dd6070dee 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3624,11 +3624,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -EIO; mutex_lock(&kvm->lock); + if (!kvm->arch.pimap) + goto unlock; - if (kvm->arch.pimap == NULL) { - mutex_unlock(&kvm->lock); - return 0; - } pimap = kvm->arch.pimap; for (i = 0; i < pimap->n_mapped; i++) { @@ -3650,7 +3648,7 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) * We don't free this structure even when the count goes to * zero. The structure is freed when we destroy the VM. */ - + unlock: mutex_unlock(&kvm->lock); return 0; } diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index d4dfc0ca2a44..69a09444d46e 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -349,7 +349,7 @@ static void kvmppc_set_msr_pr(struct kvm_vcpu *vcpu, u64 msr) if (msr & MSR_POW) { if (!vcpu->arch.pending_exceptions) { kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; /* Unset POW bit after we woke up */ @@ -537,8 +537,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, int r = RESUME_GUEST; int relocated; int page_found = 0; - struct kvmppc_pte pte; - bool is_mmio = false; + struct kvmppc_pte pte = { 0 }; bool dr = (kvmppc_get_msr(vcpu) & MSR_DR) ? true : false; bool ir = (kvmppc_get_msr(vcpu) & MSR_IR) ? true : false; u64 vsid; @@ -616,8 +615,7 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, /* Page not found in guest SLB */ kvmppc_set_dar(vcpu, kvmppc_get_fault_dar(vcpu)); kvmppc_book3s_queue_irqprio(vcpu, vec + 0x80); - } else if (!is_mmio && - kvmppc_visible_gpa(vcpu, pte.raddr)) { + } else if (kvmppc_visible_gpa(vcpu, pte.raddr)) { if (data && !(vcpu->arch.fault_dsisr & DSISR_NOHPTE)) { /* * There is already a host HPTE there, presumably @@ -627,7 +625,11 @@ int kvmppc_handle_pagefault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_mmu_unmap_page(vcpu, &pte); } /* The guest's PTE is not mapped yet. Map on the host */ - kvmppc_mmu_map_page(vcpu, &pte, iswrite); + if (kvmppc_mmu_map_page(vcpu, &pte, iswrite) == -EIO) { + /* Exit KVM if mapping failed */ + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + return RESUME_HOST; + } if (data) vcpu->stat.sp_storage++; else if (vcpu->arch.mmu.is_dcbz32(vcpu) && diff --git a/arch/powerpc/kvm/book3s_pr_papr.c b/arch/powerpc/kvm/book3s_pr_papr.c index f102616febc7..bcbeeb62dd13 100644 --- a/arch/powerpc/kvm/book3s_pr_papr.c +++ b/arch/powerpc/kvm/book3s_pr_papr.c @@ -344,7 +344,7 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd) case H_CEDE: kvmppc_set_msr_fast(vcpu, kvmppc_get_msr(vcpu) | MSR_EE); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); vcpu->stat.halt_wakeup++; return EMULATE_DONE; case H_LOGICAL_CI_LOAD: diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 0514cbd4e533..3eaac3809977 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -300,6 +300,11 @@ void kvmppc_core_queue_program(struct kvm_vcpu *vcpu, ulong esr_flags) kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_PROGRAM); } +void kvmppc_core_queue_fpunavail(struct kvm_vcpu *vcpu) +{ + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_FP_UNAVAIL); +} + void kvmppc_core_queue_dec(struct kvm_vcpu *vcpu) { kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_DECREMENTER); @@ -579,7 +584,7 @@ static void arm_next_watchdog(struct kvm_vcpu *vcpu) * userspace, so clear the KVM_REQ_WATCHDOG request. */ if ((vcpu->arch.tsr & (TSR_ENW | TSR_WIS)) != (TSR_ENW | TSR_WIS)) - clear_bit(KVM_REQ_WATCHDOG, &vcpu->requests); + kvm_clear_request(KVM_REQ_WATCHDOG, vcpu); spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); nr_jiffies = watchdog_next_timeout(vcpu); @@ -690,7 +695,7 @@ int kvmppc_core_prepare_to_enter(struct kvm_vcpu *vcpu) if (vcpu->arch.shared->msr & MSR_WE) { local_irq_enable(); kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); hard_irq_disable(); kvmppc_set_exit_type(vcpu, EMULATED_MTMSRWE_EXITS); diff --git a/arch/powerpc/kvm/e500_mmu_host.c b/arch/powerpc/kvm/e500_mmu_host.c index 0fda4230f6c0..77fd043b3ecc 100644 --- a/arch/powerpc/kvm/e500_mmu_host.c +++ b/arch/powerpc/kvm/e500_mmu_host.c @@ -797,9 +797,8 @@ int e500_mmu_host_init(struct kvmppc_vcpu_e500 *vcpu_e500) host_tlb_params[0].sets = host_tlb_params[0].entries / host_tlb_params[0].ways; host_tlb_params[1].sets = 1; - - vcpu_e500->h2g_tlb1_rmap = kzalloc(sizeof(unsigned int) * - host_tlb_params[1].entries, + vcpu_e500->h2g_tlb1_rmap = kcalloc(host_tlb_params[1].entries, + sizeof(*vcpu_e500->h2g_tlb1_rmap), GFP_KERNEL); if (!vcpu_e500->h2g_tlb1_rmap) return -EINVAL; diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c index b379146de55b..c873ffe55362 100644 --- a/arch/powerpc/kvm/emulate.c +++ b/arch/powerpc/kvm/emulate.c @@ -259,10 +259,18 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu) case OP_31_XOP_MFSPR: emulated = kvmppc_emulate_mfspr(vcpu, sprn, rt); + if (emulated == EMULATE_AGAIN) { + emulated = EMULATE_DONE; + advance = 0; + } break; case OP_31_XOP_MTSPR: emulated = kvmppc_emulate_mtspr(vcpu, sprn, rs); + if (emulated == EMULATE_AGAIN) { + emulated = EMULATE_DONE; + advance = 0; + } break; case OP_31_XOP_TLBSYNC: diff --git a/arch/powerpc/kvm/emulate_loadstore.c b/arch/powerpc/kvm/emulate_loadstore.c index 6d3c0ee1d744..af833531af31 100644 --- a/arch/powerpc/kvm/emulate_loadstore.c +++ b/arch/powerpc/kvm/emulate_loadstore.c @@ -34,18 +34,38 @@ #include "timing.h" #include "trace.h" -/* XXX to do: - * lhax - * lhaux - * lswx - * lswi - * stswx - * stswi - * lha - * lhau - * lmw - * stmw +#ifdef CONFIG_PPC_FPU +static bool kvmppc_check_fp_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_FP)) { + kvmppc_core_queue_fpunavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_PPC_FPU */ + +#ifdef CONFIG_VSX +static bool kvmppc_check_vsx_disabled(struct kvm_vcpu *vcpu) +{ + if (!(kvmppc_get_msr(vcpu) & MSR_VSX)) { + kvmppc_core_queue_vsx_unavail(vcpu); + return true; + } + + return false; +} +#endif /* CONFIG_VSX */ + +/* + * XXX to do: + * lfiwax, lfiwzx + * vector loads and stores * + * Instructions that trap when used on cache-inhibited mappings + * are not emulated here: multiple and string instructions, + * lq/stq, and the load-reserve/store-conditional instructions. */ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) { @@ -66,6 +86,19 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) rs = get_rs(inst); rt = get_rt(inst); + /* + * if mmio_vsx_tx_sx_enabled == 0, copy data between + * VSR[0..31] and memory + * if mmio_vsx_tx_sx_enabled == 1, copy data between + * VSR[32..63] and memory + */ + vcpu->arch.mmio_vsx_tx_sx_enabled = get_tx_or_sx(inst); + vcpu->arch.mmio_vsx_copy_nums = 0; + vcpu->arch.mmio_vsx_offset = 0; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_NONE; + vcpu->arch.mmio_sp64_extend = 0; + vcpu->arch.mmio_sign_extend = 0; + switch (get_op(inst)) { case 31: switch (get_xop(inst)) { @@ -73,6 +106,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; + case OP_31_XOP_LWZUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_LBZX: emulated = kvmppc_handle_load(run, vcpu, rt, 1, 1); break; @@ -82,22 +120,36 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; + case OP_31_XOP_STDX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + break; + + case OP_31_XOP_STDUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_STWX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 1); + kvmppc_get_gpr(vcpu, rs), 4, 1); + break; + + case OP_31_XOP_STWUX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_31_XOP_STBX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_31_XOP_STBUX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -105,6 +157,11 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); break; + case OP_31_XOP_LHAUX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case OP_31_XOP_LHZX: emulated = kvmppc_handle_load(run, vcpu, rt, 2, 1); break; @@ -116,14 +173,12 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STHX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_31_XOP_STHUX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -143,8 +198,7 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STWBRX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 0); + kvmppc_get_gpr(vcpu, rs), 4, 0); break; case OP_31_XOP_LHBRX: @@ -153,10 +207,258 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_31_XOP_STHBRX: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 0); + kvmppc_get_gpr(vcpu, rs), 2, 0); + break; + + case OP_31_XOP_LDBRX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 0); + break; + + case OP_31_XOP_STDBRX: + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 0); + break; + + case OP_31_XOP_LDX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + + case OP_31_XOP_LDUX: + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LWAX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + break; + + case OP_31_XOP_LWAUX: + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + +#ifdef CONFIG_PPC_FPU + case OP_31_XOP_LFSX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_LFSUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LFDX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + break; + + case OP_31_XOP_LFDUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_LFIWAX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_loads(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_LFIWZX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_31_XOP_STFSX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + break; + + case OP_31_XOP_STFSUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFDX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 8, 1); + break; + + case OP_31_XOP_STFDUX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_31_XOP_STFIWX: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), 4, 1); + break; +#endif + +#ifdef CONFIG_VSX + case OP_31_XOP_LXSDX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_LXSSPX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXSIWAX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 1); + break; + + case OP_31_XOP_LXSIWZX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXVD2X: + /* + * In this case, the official load/store process is like this: + * Step1, exit from vm by page fault isr, then kvm save vsr. + * Please see guest_exit_cont->store_fp_state->SAVE_32VSRS + * as reference. + * + * Step2, copy data between memory and VCPU + * Notice: for LXVD2X/STXVD2X/LXVW4X/STXVW4X, we use + * 2copies*8bytes or 4copies*4bytes + * to simulate one copy of 16bytes. + * Also there is an endian issue here, we should notice the + * layout of memory. + * Please see MARCO of LXVD2X_ROT/STXVD2X_ROT as more reference. + * If host is little-endian, kvm will call XXSWAPD for + * LXVD2X_ROT/STXVD2X_ROT. + * So, if host is little-endian, + * the postion of memeory should be swapped. + * + * Step3, return to guest, kvm reset register. + * Please see kvmppc_hv_entry->load_fp_state->REST_32VSRS + * as reference. + */ + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 2; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_LXVW4X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 4; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 4, 1, 0); + break; + + case OP_31_XOP_LXVDSX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = + KVMPPC_VSX_COPY_DWORD_LOAD_DUMP; + emulated = kvmppc_handle_vsx_load(run, vcpu, + KVM_MMIO_REG_VSX|rt, 8, 1, 0); + break; + + case OP_31_XOP_STXSDX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 8, 1); break; + case OP_31_XOP_STXSSPX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; + + case OP_31_XOP_STXSIWX: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_offset = 1; + vcpu->arch.mmio_vsx_copy_nums = 1; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; + + case OP_31_XOP_STXVD2X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 2; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_DWORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 8, 1); + break; + + case OP_31_XOP_STXVW4X: + if (kvmppc_check_vsx_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_vsx_copy_nums = 4; + vcpu->arch.mmio_vsx_copy_type = KVMPPC_VSX_COPY_WORD; + emulated = kvmppc_handle_vsx_store(run, vcpu, + rs, 4, 1); + break; +#endif /* CONFIG_VSX */ default: emulated = EMULATE_FAIL; break; @@ -167,10 +469,60 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) emulated = kvmppc_handle_load(run, vcpu, rt, 4, 1); break; - /* TBD: Add support for other 64 bit load variants like ldu, ldux, ldx etc. */ +#ifdef CONFIG_PPC_FPU + case OP_STFS: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + break; + + case OP_STFSU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_STFD: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + break; + + case OP_STFDU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_store(run, vcpu, + VCPU_FPR(vcpu, rs), + 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; +#endif + case OP_LD: rt = get_rt(inst); - emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + switch (inst & 3) { + case 0: /* ld */ + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + break; + case 1: /* ldu */ + emulated = kvmppc_handle_load(run, vcpu, rt, 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + case 2: /* lwa */ + emulated = kvmppc_handle_loads(run, vcpu, rt, 4, 1); + break; + default: + emulated = EMULATE_FAIL; + } break; case OP_LWZU: @@ -193,31 +545,37 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) 4, 1); break; - /* TBD: Add support for other 64 bit store variants like stdu, stdux, stdx etc. */ case OP_STD: rs = get_rs(inst); - emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 8, 1); + switch (inst & 3) { + case 0: /* std */ + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + break; + case 1: /* stdu */ + emulated = kvmppc_handle_store(run, vcpu, + kvmppc_get_gpr(vcpu, rs), 8, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + default: + emulated = EMULATE_FAIL; + } break; case OP_STWU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 4, 1); + kvmppc_get_gpr(vcpu, rs), 4, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; case OP_STB: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); break; case OP_STBU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 1, 1); + kvmppc_get_gpr(vcpu, rs), 1, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; @@ -241,16 +599,48 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) case OP_STH: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); break; case OP_STHU: emulated = kvmppc_handle_store(run, vcpu, - kvmppc_get_gpr(vcpu, rs), - 2, 1); + kvmppc_get_gpr(vcpu, rs), 2, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + +#ifdef CONFIG_PPC_FPU + case OP_LFS: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + break; + + case OP_LFSU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + vcpu->arch.mmio_sp64_extend = 1; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 4, 1); + kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); + break; + + case OP_LFD: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); + break; + + case OP_LFDU: + if (kvmppc_check_fp_disabled(vcpu)) + return EMULATE_DONE; + emulated = kvmppc_handle_load(run, vcpu, + KVM_MMIO_REG_FPR|rt, 8, 1); kvmppc_set_gpr(vcpu, ra, vcpu->arch.vaddr_accessed); break; +#endif default: emulated = EMULATE_FAIL; diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 95c91a9de351..1ee22a910074 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -37,6 +37,7 @@ #include <asm/cputhreads.h> #include <asm/irqflags.h> #include <asm/iommu.h> +#include <asm/switch_to.h> #include "timing.h" #include "irq.h" #include "../mm/mmu_decl.h" @@ -232,7 +233,7 @@ int kvmppc_kvm_pv(struct kvm_vcpu *vcpu) case EV_HCALL_TOKEN(EV_IDLE): r = EV_SUCCESS; kvm_vcpu_block(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); break; default: r = EV_UNIMPLEMENTED; @@ -524,11 +525,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) /* We support this only for PR */ r = !hv_enabled; break; -#ifdef CONFIG_KVM_MMIO - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; -#endif #ifdef CONFIG_KVM_MPIC case KVM_CAP_IRQ_MPIC: r = 1; @@ -538,6 +534,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: + /* fallthrough */ + case KVM_CAP_SPAPR_TCE_VFIO: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: @@ -806,6 +804,129 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod); } +#ifdef CONFIG_VSX +static inline int kvmppc_get_vsr_dword_offset(int index) +{ + int offset; + + if ((index != 0) && (index != 1)) + return -1; + +#ifdef __BIG_ENDIAN + offset = index; +#else + offset = 1 - index; +#endif + + return offset; +} + +static inline int kvmppc_get_vsr_word_offset(int index) +{ + int offset; + + if ((index > 3) || (index < 0)) + return -1; + +#ifdef __BIG_ENDIAN + offset = index; +#else + offset = 3 - index; +#endif + return offset; +} + +static inline void kvmppc_set_vsr_dword(struct kvm_vcpu *vcpu, + u64 gpr) +{ + union kvmppc_one_reg val; + int offset = kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset); + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + + if (offset == -1) + return; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsxval[offset] = gpr; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + VCPU_VSX_FPR(vcpu, index, offset) = gpr; + } +} + +static inline void kvmppc_set_vsr_dword_dump(struct kvm_vcpu *vcpu, + u64 gpr) +{ + union kvmppc_one_reg val; + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsxval[0] = gpr; + val.vsxval[1] = gpr; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + VCPU_VSX_FPR(vcpu, index, 0) = gpr; + VCPU_VSX_FPR(vcpu, index, 1) = gpr; + } +} + +static inline void kvmppc_set_vsr_word(struct kvm_vcpu *vcpu, + u32 gpr32) +{ + union kvmppc_one_reg val; + int offset = kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset); + int index = vcpu->arch.io_gpr & KVM_MMIO_REG_MASK; + int dword_offset, word_offset; + + if (offset == -1) + return; + + if (vcpu->arch.mmio_vsx_tx_sx_enabled) { + val.vval = VCPU_VSX_VR(vcpu, index); + val.vsx32val[offset] = gpr32; + VCPU_VSX_VR(vcpu, index) = val.vval; + } else { + dword_offset = offset / 2; + word_offset = offset % 2; + val.vsxval[0] = VCPU_VSX_FPR(vcpu, index, dword_offset); + val.vsx32val[word_offset] = gpr32; + VCPU_VSX_FPR(vcpu, index, dword_offset) = val.vsxval[0]; + } +} +#endif /* CONFIG_VSX */ + +#ifdef CONFIG_PPC_FPU +static inline u64 sp_to_dp(u32 fprs) +{ + u64 fprd; + + preempt_disable(); + enable_kernel_fp(); + asm ("lfs%U1%X1 0,%1; stfd%U0%X0 0,%0" : "=m" (fprd) : "m" (fprs) + : "fr0"); + preempt_enable(); + return fprd; +} + +static inline u32 dp_to_sp(u64 fprd) +{ + u32 fprs; + + preempt_disable(); + enable_kernel_fp(); + asm ("lfd%U1%X1 0,%1; stfs%U0%X0 0,%0" : "=m" (fprs) : "m" (fprd) + : "fr0"); + preempt_enable(); + return fprs; +} + +#else +#define sp_to_dp(x) (x) +#define dp_to_sp(x) (x) +#endif /* CONFIG_PPC_FPU */ + static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { @@ -832,6 +953,10 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } + /* conversion between single and double precision */ + if ((vcpu->arch.mmio_sp64_extend) && (run->mmio.len == 4)) + gpr = sp_to_dp(gpr); + if (vcpu->arch.mmio_sign_extend) { switch (run->mmio.len) { #ifdef CONFIG_PPC64 @@ -848,8 +973,6 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, } } - kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); - switch (vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) { case KVM_MMIO_REG_GPR: kvmppc_set_gpr(vcpu, vcpu->arch.io_gpr, gpr); @@ -866,6 +989,17 @@ static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, vcpu->arch.qpr[vcpu->arch.io_gpr & KVM_MMIO_REG_MASK] = gpr; break; #endif +#ifdef CONFIG_VSX + case KVM_MMIO_REG_VSX: + if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_DWORD) + kvmppc_set_vsr_dword(vcpu, gpr); + else if (vcpu->arch.mmio_vsx_copy_type == KVMPPC_VSX_COPY_WORD) + kvmppc_set_vsr_word(vcpu, gpr); + else if (vcpu->arch.mmio_vsx_copy_type == + KVMPPC_VSX_COPY_DWORD_LOAD_DUMP) + kvmppc_set_vsr_dword_dump(vcpu, gpr); + break; +#endif default: BUG(); } @@ -932,6 +1066,35 @@ int kvmppc_handle_loads(struct kvm_run *run, struct kvm_vcpu *vcpu, return __kvmppc_handle_load(run, vcpu, rt, bytes, is_default_endian, 1); } +#ifdef CONFIG_VSX +int kvmppc_handle_vsx_load(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned int rt, unsigned int bytes, + int is_default_endian, int mmio_sign_extend) +{ + enum emulation_result emulated = EMULATE_DONE; + + /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ + if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || + (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + return EMULATE_FAIL; + } + + while (vcpu->arch.mmio_vsx_copy_nums) { + emulated = __kvmppc_handle_load(run, vcpu, rt, bytes, + is_default_endian, mmio_sign_extend); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + return emulated; +} +#endif /* CONFIG_VSX */ + int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, u64 val, unsigned int bytes, int is_default_endian) { @@ -957,6 +1120,9 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, vcpu->mmio_needed = 1; vcpu->mmio_is_write = 1; + if ((vcpu->arch.mmio_sp64_extend) && (bytes == 4)) + val = dp_to_sp(val); + /* Store the value at the lowest bytes in 'data'. */ if (!host_swabbed) { switch (bytes) { @@ -990,6 +1156,129 @@ int kvmppc_handle_store(struct kvm_run *run, struct kvm_vcpu *vcpu, } EXPORT_SYMBOL_GPL(kvmppc_handle_store); +#ifdef CONFIG_VSX +static inline int kvmppc_get_vsr_data(struct kvm_vcpu *vcpu, int rs, u64 *val) +{ + u32 dword_offset, word_offset; + union kvmppc_one_reg reg; + int vsx_offset = 0; + int copy_type = vcpu->arch.mmio_vsx_copy_type; + int result = 0; + + switch (copy_type) { + case KVMPPC_VSX_COPY_DWORD: + vsx_offset = + kvmppc_get_vsr_dword_offset(vcpu->arch.mmio_vsx_offset); + + if (vsx_offset == -1) { + result = -1; + break; + } + + if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + *val = VCPU_VSX_FPR(vcpu, rs, vsx_offset); + } else { + reg.vval = VCPU_VSX_VR(vcpu, rs); + *val = reg.vsxval[vsx_offset]; + } + break; + + case KVMPPC_VSX_COPY_WORD: + vsx_offset = + kvmppc_get_vsr_word_offset(vcpu->arch.mmio_vsx_offset); + + if (vsx_offset == -1) { + result = -1; + break; + } + + if (!vcpu->arch.mmio_vsx_tx_sx_enabled) { + dword_offset = vsx_offset / 2; + word_offset = vsx_offset % 2; + reg.vsxval[0] = VCPU_VSX_FPR(vcpu, rs, dword_offset); + *val = reg.vsx32val[word_offset]; + } else { + reg.vval = VCPU_VSX_VR(vcpu, rs); + *val = reg.vsx32val[vsx_offset]; + } + break; + + default: + result = -1; + break; + } + + return result; +} + +int kvmppc_handle_vsx_store(struct kvm_run *run, struct kvm_vcpu *vcpu, + int rs, unsigned int bytes, int is_default_endian) +{ + u64 val; + enum emulation_result emulated = EMULATE_DONE; + + vcpu->arch.io_gpr = rs; + + /* Currently, mmio_vsx_copy_nums only allowed to be less than 4 */ + if ( (vcpu->arch.mmio_vsx_copy_nums > 4) || + (vcpu->arch.mmio_vsx_copy_nums < 0) ) { + return EMULATE_FAIL; + } + + while (vcpu->arch.mmio_vsx_copy_nums) { + if (kvmppc_get_vsr_data(vcpu, rs, &val) == -1) + return EMULATE_FAIL; + + emulated = kvmppc_handle_store(run, vcpu, + val, bytes, is_default_endian); + + if (emulated != EMULATE_DONE) + break; + + vcpu->arch.paddr_accessed += run->mmio.len; + + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + + return emulated; +} + +static int kvmppc_emulate_mmio_vsx_loadstore(struct kvm_vcpu *vcpu, + struct kvm_run *run) +{ + enum emulation_result emulated = EMULATE_FAIL; + int r; + + vcpu->arch.paddr_accessed += run->mmio.len; + + if (!vcpu->mmio_is_write) { + emulated = kvmppc_handle_vsx_load(run, vcpu, vcpu->arch.io_gpr, + run->mmio.len, 1, vcpu->arch.mmio_sign_extend); + } else { + emulated = kvmppc_handle_vsx_store(run, vcpu, + vcpu->arch.io_gpr, run->mmio.len, 1); + } + + switch (emulated) { + case EMULATE_DO_MMIO: + run->exit_reason = KVM_EXIT_MMIO; + r = RESUME_HOST; + break; + case EMULATE_FAIL: + pr_info("KVM: MMIO emulation failed (VSX repeat)\n"); + run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + r = RESUME_HOST; + break; + default: + r = RESUME_GUEST; + break; + } + return r; +} +#endif /* CONFIG_VSX */ + int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) { int r = 0; @@ -1092,13 +1381,24 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) int r; sigset_t sigsaved; - if (vcpu->sigset_active) - sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); - if (vcpu->mmio_needed) { + vcpu->mmio_needed = 0; if (!vcpu->mmio_is_write) kvmppc_complete_mmio_load(vcpu, run); - vcpu->mmio_needed = 0; +#ifdef CONFIG_VSX + if (vcpu->arch.mmio_vsx_copy_nums > 0) { + vcpu->arch.mmio_vsx_copy_nums--; + vcpu->arch.mmio_vsx_offset++; + } + + if (vcpu->arch.mmio_vsx_copy_nums > 0) { + r = kvmppc_emulate_mmio_vsx_loadstore(vcpu, run); + if (r == RESUME_HOST) { + vcpu->mmio_needed = 1; + return r; + } + } +#endif } else if (vcpu->arch.osi_needed) { u64 *gprs = run->osi.gprs; int i; @@ -1120,6 +1420,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run) #endif } + if (vcpu->sigset_active) + sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); + if (run->immediate_exit) r = -EINTR; else diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index a41faf34b034..426614a882a9 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -25,6 +25,7 @@ #include <asm/cpu.h> #include <asm/fpu/api.h> #include <asm/isc.h> +#include <asm/guarded_storage.h> #define KVM_S390_BSCA_CPU_SLOTS 64 #define KVM_S390_ESCA_CPU_SLOTS 248 @@ -121,6 +122,7 @@ struct esca_block { #define CPUSTAT_SLSR 0x00002000 #define CPUSTAT_ZARCH 0x00000800 #define CPUSTAT_MCDS 0x00000100 +#define CPUSTAT_KSS 0x00000200 #define CPUSTAT_SM 0x00000080 #define CPUSTAT_IBS 0x00000040 #define CPUSTAT_GED2 0x00000010 @@ -164,16 +166,27 @@ struct kvm_s390_sie_block { #define ICTL_RRBE 0x00001000 #define ICTL_TPROT 0x00000200 __u32 ictl; /* 0x0048 */ +#define ECA_CEI 0x80000000 +#define ECA_IB 0x40000000 +#define ECA_SIGPI 0x10000000 +#define ECA_MVPGI 0x01000000 +#define ECA_VX 0x00020000 +#define ECA_PROTEXCI 0x00002000 +#define ECA_SII 0x00000001 __u32 eca; /* 0x004c */ #define ICPT_INST 0x04 #define ICPT_PROGI 0x08 #define ICPT_INSTPROGI 0x0C +#define ICPT_EXTREQ 0x10 #define ICPT_EXTINT 0x14 +#define ICPT_IOREQ 0x18 +#define ICPT_WAIT 0x1c #define ICPT_VALIDITY 0x20 #define ICPT_STOP 0x28 #define ICPT_OPEREXC 0x2C #define ICPT_PARTEXEC 0x38 #define ICPT_IOINST 0x40 +#define ICPT_KSS 0x5c __u8 icptcode; /* 0x0050 */ __u8 icptstatus; /* 0x0051 */ __u16 ihcpu; /* 0x0052 */ @@ -182,10 +195,19 @@ struct kvm_s390_sie_block { __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ __u8 reserved60; /* 0x0060 */ +#define ECB_GS 0x40 +#define ECB_TE 0x10 +#define ECB_SRSI 0x04 +#define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ +#define ECB2_CMMA 0x80 +#define ECB2_IEP 0x20 +#define ECB2_PFMFI 0x08 +#define ECB2_ESCA 0x04 __u8 ecb2; /* 0x0062 */ -#define ECB3_AES 0x04 #define ECB3_DEA 0x08 +#define ECB3_AES 0x04 +#define ECB3_RI 0x01 __u8 ecb3; /* 0x0063 */ __u32 scaol; /* 0x0064 */ __u8 reserved68[4]; /* 0x0068 */ @@ -219,11 +241,14 @@ struct kvm_s390_sie_block { __u32 crycbd; /* 0x00fc */ __u64 gcr[16]; /* 0x0100 */ __u64 gbea; /* 0x0180 */ - __u8 reserved188[24]; /* 0x0188 */ + __u8 reserved188[8]; /* 0x0188 */ + __u64 sdnxo; /* 0x0190 */ + __u8 reserved198[8]; /* 0x0198 */ __u32 fac; /* 0x01a0 */ __u8 reserved1a4[20]; /* 0x01a4 */ __u64 cbrlo; /* 0x01b8 */ __u8 reserved1c0[8]; /* 0x01c0 */ +#define ECD_HOSTREGMGMT 0x20000000 __u32 ecd; /* 0x01c8 */ __u8 reserved1cc[18]; /* 0x01cc */ __u64 pp; /* 0x01de */ @@ -498,6 +523,12 @@ struct kvm_s390_local_interrupt { #define FIRQ_CNTR_PFAULT 3 #define FIRQ_MAX_COUNT 4 +/* mask the AIS mode for a given ISC */ +#define AIS_MODE_MASK(isc) (0x80 >> isc) + +#define KVM_S390_AIS_MODE_ALL 0 +#define KVM_S390_AIS_MODE_SINGLE 1 + struct kvm_s390_float_interrupt { unsigned long pending_irqs; spinlock_t lock; @@ -507,6 +538,10 @@ struct kvm_s390_float_interrupt { struct kvm_s390_ext_info srv_signal; int next_rr_cpu; unsigned long idle_mask[BITS_TO_LONGS(KVM_MAX_VCPUS)]; + struct mutex ais_lock; + u8 simm; + u8 nimm; + int ais_enabled; }; struct kvm_hw_wp_info_arch { @@ -554,6 +589,7 @@ struct kvm_vcpu_arch { /* if vsie is active, currently executed shadow sie control block */ struct kvm_s390_sie_block *vsie_block; unsigned int host_acrs[NUM_ACRS]; + struct gs_cb *host_gscb; struct fpu host_fpregs; struct kvm_s390_local_interrupt local_int; struct hrtimer ckc_timer; @@ -574,6 +610,7 @@ struct kvm_vcpu_arch { */ seqcount_t cputm_seqcount; __u64 cputm_start; + bool gs_enabled; }; struct kvm_vm_stat { @@ -596,6 +633,7 @@ struct s390_io_adapter { bool maskable; bool masked; bool swap; + bool suppressible; struct rw_semaphore maps_lock; struct list_head maps; atomic_t nr_maps; diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index ace3bd315438..6f5167bc1928 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -75,6 +75,7 @@ struct sclp_info { unsigned char has_pfmfi : 1; unsigned char has_ibs : 1; unsigned char has_skey : 1; + unsigned char has_kss : 1; unsigned int ibc; unsigned int mtid; unsigned int mtid_cp; diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h index a2ffec4139ad..3dd2a1d308dd 100644 --- a/arch/s390/include/uapi/asm/kvm.h +++ b/arch/s390/include/uapi/asm/kvm.h @@ -26,6 +26,8 @@ #define KVM_DEV_FLIC_ADAPTER_REGISTER 6 #define KVM_DEV_FLIC_ADAPTER_MODIFY 7 #define KVM_DEV_FLIC_CLEAR_IO_IRQ 8 +#define KVM_DEV_FLIC_AISM 9 +#define KVM_DEV_FLIC_AIRQ_INJECT 10 /* * We can have up to 4*64k pending subchannels + 8 adapter interrupts, * as well as up to ASYNC_PF_PER_VCPU*KVM_MAX_VCPUS pfault done interrupts. @@ -41,7 +43,14 @@ struct kvm_s390_io_adapter { __u8 isc; __u8 maskable; __u8 swap; - __u8 pad; + __u8 flags; +}; + +#define KVM_S390_ADAPTER_SUPPRESSIBLE 0x01 + +struct kvm_s390_ais_req { + __u8 isc; + __u16 mode; }; #define KVM_S390_IO_ADAPTER_MASK 1 @@ -110,6 +119,7 @@ struct kvm_s390_vm_cpu_machine { #define KVM_S390_VM_CPU_FEAT_CMMA 10 #define KVM_S390_VM_CPU_FEAT_PFMFI 11 #define KVM_S390_VM_CPU_FEAT_SIGPIF 12 +#define KVM_S390_VM_CPU_FEAT_KSS 13 struct kvm_s390_vm_cpu_feat { __u64 feat[16]; }; @@ -131,7 +141,8 @@ struct kvm_s390_vm_cpu_subfunc { __u8 kmo[16]; /* with MSA4 */ __u8 pcc[16]; /* with MSA4 */ __u8 ppno[16]; /* with MSA5 */ - __u8 reserved[1824]; + __u8 kma[16]; /* with MSA8 */ + __u8 reserved[1808]; }; /* kvm attributes for crypto */ @@ -197,6 +208,10 @@ struct kvm_guest_debug_arch { #define KVM_SYNC_VRS (1UL << 6) #define KVM_SYNC_RICCB (1UL << 7) #define KVM_SYNC_FPRS (1UL << 8) +#define KVM_SYNC_GSCB (1UL << 9) +/* length and alignment of the sdnx as a power of two */ +#define SDNXC 8 +#define SDNXL (1UL << SDNXC) /* definition of registers in kvm_run */ struct kvm_sync_regs { __u64 prefix; /* prefix register */ @@ -217,8 +232,16 @@ struct kvm_sync_regs { }; __u8 reserved[512]; /* for future vector expansion */ __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */ - __u8 padding[52]; /* riccb needs to be 64byte aligned */ + __u8 padding1[52]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ + __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ + union { + __u8 sdnx[SDNXL]; /* state description annex */ + struct { + __u64 reserved1[2]; + __u64 gscb[4]; + }; + }; }; #define KVM_REG_S390_TODPR (KVM_REG_S390 | KVM_REG_SIZE_U32 | 0x1) diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c index ddbffb715b40..9da243d94cc3 100644 --- a/arch/s390/kvm/gaccess.c +++ b/arch/s390/kvm/gaccess.c @@ -261,7 +261,7 @@ struct aste { int ipte_lock_held(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->eca & 1) { + if (vcpu->arch.sie_block->eca & ECA_SII) { int rc; read_lock(&vcpu->kvm->arch.sca_lock); @@ -360,7 +360,7 @@ static void ipte_unlock_siif(struct kvm_vcpu *vcpu) void ipte_lock(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->eca & 1) + if (vcpu->arch.sie_block->eca & ECA_SII) ipte_lock_siif(vcpu); else ipte_lock_simple(vcpu); @@ -368,7 +368,7 @@ void ipte_lock(struct kvm_vcpu *vcpu) void ipte_unlock(struct kvm_vcpu *vcpu) { - if (vcpu->arch.sie_block->eca & 1) + if (vcpu->arch.sie_block->eca & ECA_SII) ipte_unlock_siif(vcpu); else ipte_unlock_simple(vcpu); diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c index 59920f96ebc0..a4752bf6b526 100644 --- a/arch/s390/kvm/intercept.c +++ b/arch/s390/kvm/intercept.c @@ -35,6 +35,7 @@ static const intercept_handler_t instruction_handlers[256] = { [0xb6] = kvm_s390_handle_stctl, [0xb7] = kvm_s390_handle_lctl, [0xb9] = kvm_s390_handle_b9, + [0xe3] = kvm_s390_handle_e3, [0xe5] = kvm_s390_handle_e5, [0xeb] = kvm_s390_handle_eb, }; @@ -368,8 +369,7 @@ static int handle_operexc(struct kvm_vcpu *vcpu) trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa, vcpu->arch.sie_block->ipb); - if (vcpu->arch.sie_block->ipa == 0xb256 && - test_kvm_facility(vcpu->kvm, 74)) + if (vcpu->arch.sie_block->ipa == 0xb256) return handle_sthyi(vcpu); if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0) @@ -404,28 +404,31 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; switch (vcpu->arch.sie_block->icptcode) { - case 0x10: - case 0x18: + case ICPT_EXTREQ: + case ICPT_IOREQ: return handle_noop(vcpu); - case 0x04: + case ICPT_INST: rc = handle_instruction(vcpu); break; - case 0x08: + case ICPT_PROGI: return handle_prog(vcpu); - case 0x14: + case ICPT_EXTINT: return handle_external_interrupt(vcpu); - case 0x1c: + case ICPT_WAIT: return kvm_s390_handle_wait(vcpu); - case 0x20: + case ICPT_VALIDITY: return handle_validity(vcpu); - case 0x28: + case ICPT_STOP: return handle_stop(vcpu); - case 0x2c: + case ICPT_OPEREXC: rc = handle_operexc(vcpu); break; - case 0x38: + case ICPT_PARTEXEC: rc = handle_partial_execution(vcpu); break; + case ICPT_KSS: + rc = kvm_s390_skey_check_enable(vcpu); + break; default: return -EOPNOTSUPP; } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index 169558dc7daf..caf15c8a8948 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -410,6 +410,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, struct kvm_s390_mchk_info *mchk) { unsigned long ext_sa_addr; + unsigned long lc; freg_t fprs[NUM_FPRS]; union mci mci; int rc; @@ -418,12 +419,34 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, /* take care of lazy register loading */ save_fpu_regs(); save_access_regs(vcpu->run->s.regs.acrs); + if (MACHINE_HAS_GS && vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); /* Extended save area */ rc = read_guest_lc(vcpu, __LC_MCESAD, &ext_sa_addr, sizeof(unsigned long)); - /* Only bits 0-53 are used for address formation */ - ext_sa_addr &= ~0x3ffUL; + /* Only bits 0 through 63-LC are used for address formation */ + lc = ext_sa_addr & MCESA_LC_MASK; + if (test_kvm_facility(vcpu->kvm, 133)) { + switch (lc) { + case 0: + case 10: + ext_sa_addr &= ~0x3ffUL; + break; + case 11: + ext_sa_addr &= ~0x7ffUL; + break; + case 12: + ext_sa_addr &= ~0xfffUL; + break; + default: + ext_sa_addr = 0; + break; + } + } else { + ext_sa_addr &= ~0x3ffUL; + } + if (!rc && mci.vr && ext_sa_addr && test_kvm_facility(vcpu->kvm, 129)) { if (write_guest_abs(vcpu, ext_sa_addr, vcpu->run->s.regs.vrs, 512)) @@ -431,6 +454,14 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, } else { mci.vr = 0; } + if (!rc && mci.gs && ext_sa_addr && test_kvm_facility(vcpu->kvm, 133) + && (lc == 11 || lc == 12)) { + if (write_guest_abs(vcpu, ext_sa_addr + 1024, + &vcpu->run->s.regs.gscb, 32)) + mci.gs = 0; + } else { + mci.gs = 0; + } /* General interruption information */ rc |= put_guest_lc(vcpu, 1, (u8 __user *) __LC_AR_MODE_ID); @@ -1968,6 +1999,8 @@ static int register_io_adapter(struct kvm_device *dev, adapter->maskable = adapter_info.maskable; adapter->masked = false; adapter->swap = adapter_info.swap; + adapter->suppressible = (adapter_info.flags) & + KVM_S390_ADAPTER_SUPPRESSIBLE; dev->kvm->arch.adapters[adapter->id] = adapter; return 0; @@ -2121,6 +2154,87 @@ static int clear_io_irq(struct kvm *kvm, struct kvm_device_attr *attr) return 0; } +static int modify_ais_mode(struct kvm *kvm, struct kvm_device_attr *attr) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_ais_req req; + int ret = 0; + + if (!fi->ais_enabled) + return -ENOTSUPP; + + if (copy_from_user(&req, (void __user *)attr->addr, sizeof(req))) + return -EFAULT; + + if (req.isc > MAX_ISC) + return -EINVAL; + + trace_kvm_s390_modify_ais_mode(req.isc, + (fi->simm & AIS_MODE_MASK(req.isc)) ? + (fi->nimm & AIS_MODE_MASK(req.isc)) ? + 2 : KVM_S390_AIS_MODE_SINGLE : + KVM_S390_AIS_MODE_ALL, req.mode); + + mutex_lock(&fi->ais_lock); + switch (req.mode) { + case KVM_S390_AIS_MODE_ALL: + fi->simm &= ~AIS_MODE_MASK(req.isc); + fi->nimm &= ~AIS_MODE_MASK(req.isc); + break; + case KVM_S390_AIS_MODE_SINGLE: + fi->simm |= AIS_MODE_MASK(req.isc); + fi->nimm &= ~AIS_MODE_MASK(req.isc); + break; + default: + ret = -EINVAL; + } + mutex_unlock(&fi->ais_lock); + + return ret; +} + +static int kvm_s390_inject_airq(struct kvm *kvm, + struct s390_io_adapter *adapter) +{ + struct kvm_s390_float_interrupt *fi = &kvm->arch.float_int; + struct kvm_s390_interrupt s390int = { + .type = KVM_S390_INT_IO(1, 0, 0, 0), + .parm = 0, + .parm64 = (adapter->isc << 27) | 0x80000000, + }; + int ret = 0; + + if (!fi->ais_enabled || !adapter->suppressible) + return kvm_s390_inject_vm(kvm, &s390int); + + mutex_lock(&fi->ais_lock); + if (fi->nimm & AIS_MODE_MASK(adapter->isc)) { + trace_kvm_s390_airq_suppressed(adapter->id, adapter->isc); + goto out; + } + + ret = kvm_s390_inject_vm(kvm, &s390int); + if (!ret && (fi->simm & AIS_MODE_MASK(adapter->isc))) { + fi->nimm |= AIS_MODE_MASK(adapter->isc); + trace_kvm_s390_modify_ais_mode(adapter->isc, + KVM_S390_AIS_MODE_SINGLE, 2); + } +out: + mutex_unlock(&fi->ais_lock); + return ret; +} + +static int flic_inject_airq(struct kvm *kvm, struct kvm_device_attr *attr) +{ + unsigned int id = attr->attr; + struct s390_io_adapter *adapter = get_io_adapter(kvm, id); + + if (!adapter) + return -EINVAL; + + return kvm_s390_inject_airq(kvm, adapter); +} + static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) { int r = 0; @@ -2157,6 +2271,12 @@ static int flic_set_attr(struct kvm_device *dev, struct kvm_device_attr *attr) case KVM_DEV_FLIC_CLEAR_IO_IRQ: r = clear_io_irq(dev->kvm, attr); break; + case KVM_DEV_FLIC_AISM: + r = modify_ais_mode(dev->kvm, attr); + break; + case KVM_DEV_FLIC_AIRQ_INJECT: + r = flic_inject_airq(dev->kvm, attr); + break; default: r = -EINVAL; } @@ -2176,6 +2296,8 @@ static int flic_has_attr(struct kvm_device *dev, case KVM_DEV_FLIC_ADAPTER_REGISTER: case KVM_DEV_FLIC_ADAPTER_MODIFY: case KVM_DEV_FLIC_CLEAR_IO_IRQ: + case KVM_DEV_FLIC_AISM: + case KVM_DEV_FLIC_AIRQ_INJECT: return 0; } return -ENXIO; @@ -2286,12 +2408,7 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e, ret = adapter_indicators_set(kvm, adapter, &e->adapter); up_read(&adapter->maps_lock); if ((ret > 0) && !adapter->masked) { - struct kvm_s390_interrupt s390int = { - .type = KVM_S390_INT_IO(1, 0, 0, 0), - .parm = 0, - .parm64 = (adapter->isc << 27) | 0x80000000, - }; - ret = kvm_s390_inject_vm(kvm, &s390int); + ret = kvm_s390_inject_airq(kvm, adapter); if (ret == 0) ret = 1; } diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 323297e55e80..689ac48361c6 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -276,6 +276,10 @@ static void kvm_s390_cpu_feat_init(void) __cpacf_query(CPACF_PRNO, (cpacf_mask_t *) kvm_s390_available_subfunc.ppno); + if (test_facility(146)) /* MSA8 */ + __cpacf_query(CPACF_KMA, (cpacf_mask_t *) + kvm_s390_available_subfunc.kma); + if (MACHINE_HAS_ESOP) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP); /* @@ -300,6 +304,8 @@ static void kvm_s390_cpu_feat_init(void) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI); if (sclp.has_ibs) allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS); + if (sclp.has_kss) + allow_cpu_feat(KVM_S390_VM_CPU_FEAT_KSS); /* * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make * all skey handling functions read/set the skey from the PGSTE @@ -380,6 +386,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_SKEYS: case KVM_CAP_S390_IRQ_STATE: case KVM_CAP_S390_USER_INSTR0: + case KVM_CAP_S390_AIS: r = 1; break; case KVM_CAP_S390_MEM_OP: @@ -405,6 +412,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_S390_RI: r = test_facility(64); break; + case KVM_CAP_S390_GS: + r = test_facility(133); + break; default: r = 0; } @@ -541,6 +551,34 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap) VM_EVENT(kvm, 3, "ENABLE: CAP_S390_RI %s", r ? "(not available)" : "(success)"); break; + case KVM_CAP_S390_AIS: + mutex_lock(&kvm->lock); + if (kvm->created_vcpus) { + r = -EBUSY; + } else { + set_kvm_facility(kvm->arch.model.fac_mask, 72); + set_kvm_facility(kvm->arch.model.fac_list, 72); + kvm->arch.float_int.ais_enabled = 1; + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: AIS %s", + r ? "(not available)" : "(success)"); + break; + case KVM_CAP_S390_GS: + r = -EINVAL; + mutex_lock(&kvm->lock); + if (atomic_read(&kvm->online_vcpus)) { + r = -EBUSY; + } else if (test_facility(133)) { + set_kvm_facility(kvm->arch.model.fac_mask, 133); + set_kvm_facility(kvm->arch.model.fac_list, 133); + r = 0; + } + mutex_unlock(&kvm->lock); + VM_EVENT(kvm, 3, "ENABLE: CAP_S390_GS %s", + r ? "(not available)" : "(success)"); + break; case KVM_CAP_S390_USER_STSI: VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_STSI"); kvm->arch.user_stsi = 1; @@ -1492,6 +1530,10 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_s390_crypto_init(kvm); + mutex_init(&kvm->arch.float_int.ais_lock); + kvm->arch.float_int.simm = 0; + kvm->arch.float_int.nimm = 0; + kvm->arch.float_int.ais_enabled = 0; spin_lock_init(&kvm->arch.float_int.lock); for (i = 0; i < FIRQ_LIST_COUNT; i++) INIT_LIST_HEAD(&kvm->arch.float_int.lists[i]); @@ -1640,7 +1682,7 @@ static void sca_add_vcpu(struct kvm_vcpu *vcpu) sca->cpu[vcpu->vcpu_id].sda = (__u64) vcpu->arch.sie_block; vcpu->arch.sie_block->scaoh = (__u32)(((__u64)sca) >> 32); vcpu->arch.sie_block->scaol = (__u32)(__u64)sca & ~0x3fU; - vcpu->arch.sie_block->ecb2 |= 0x04U; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; set_bit_inv(vcpu->vcpu_id, (unsigned long *) sca->mcn); } else { struct bsca_block *sca = vcpu->kvm->arch.sca; @@ -1694,7 +1736,7 @@ static int sca_switch_to_extended(struct kvm *kvm) kvm_for_each_vcpu(vcpu_idx, vcpu, kvm) { vcpu->arch.sie_block->scaoh = scaoh; vcpu->arch.sie_block->scaol = scaol; - vcpu->arch.sie_block->ecb2 |= 0x04U; + vcpu->arch.sie_block->ecb2 |= ECB2_ESCA; } kvm->arch.sca = new_sca; kvm->arch.use_esca = 1; @@ -1743,6 +1785,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 133)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; /* fprs can be synchronized via vrs, even if the guest has no vx. With * MACHINE_HAS_VX, (load|store)_fpu_regs() will work with vrs format. */ @@ -1933,8 +1977,8 @@ int kvm_s390_vcpu_setup_cmma(struct kvm_vcpu *vcpu) if (!vcpu->arch.sie_block->cbrlo) return -ENOMEM; - vcpu->arch.sie_block->ecb2 |= 0x80; - vcpu->arch.sie_block->ecb2 &= ~0x08; + vcpu->arch.sie_block->ecb2 |= ECB2_CMMA; + vcpu->arch.sie_block->ecb2 &= ~ECB2_PFMFI; return 0; } @@ -1964,31 +2008,37 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */ if (MACHINE_HAS_ESOP) - vcpu->arch.sie_block->ecb |= 0x02; + vcpu->arch.sie_block->ecb |= ECB_HOSTPROTINT; if (test_kvm_facility(vcpu->kvm, 9)) - vcpu->arch.sie_block->ecb |= 0x04; + vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) - vcpu->arch.sie_block->ecb |= 0x10; + vcpu->arch.sie_block->ecb |= ECB_TE; if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi) - vcpu->arch.sie_block->ecb2 |= 0x08; + vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; if (test_kvm_facility(vcpu->kvm, 130)) - vcpu->arch.sie_block->ecb2 |= 0x20; - vcpu->arch.sie_block->eca = 0x1002000U; + vcpu->arch.sie_block->ecb2 |= ECB2_IEP; + vcpu->arch.sie_block->eca = ECA_MVPGI | ECA_PROTEXCI; if (sclp.has_cei) - vcpu->arch.sie_block->eca |= 0x80000000U; + vcpu->arch.sie_block->eca |= ECA_CEI; if (sclp.has_ib) - vcpu->arch.sie_block->eca |= 0x40000000U; + vcpu->arch.sie_block->eca |= ECA_IB; if (sclp.has_siif) - vcpu->arch.sie_block->eca |= 1; + vcpu->arch.sie_block->eca |= ECA_SII; if (sclp.has_sigpif) - vcpu->arch.sie_block->eca |= 0x10000000U; + vcpu->arch.sie_block->eca |= ECA_SIGPI; if (test_kvm_facility(vcpu->kvm, 129)) { - vcpu->arch.sie_block->eca |= 0x00020000; - vcpu->arch.sie_block->ecd |= 0x20000000; + vcpu->arch.sie_block->eca |= ECA_VX; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; } + vcpu->arch.sie_block->sdnxo = ((unsigned long) &vcpu->run->s.regs.sdnx) + | SDNXC; vcpu->arch.sie_block->riccbd = (unsigned long) &vcpu->run->s.regs.riccb; - vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + + if (sclp.has_kss) + atomic_or(CPUSTAT_KSS, &vcpu->arch.sie_block->cpuflags); + else + vcpu->arch.sie_block->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; if (vcpu->kvm->arch.use_cmma) { rc = kvm_s390_vcpu_setup_cmma(vcpu); @@ -2440,7 +2490,7 @@ retry: } /* nothing to do, just clear the request */ - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); return 0; } @@ -2713,6 +2763,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { + struct runtime_instr_cb *riccb; + struct gs_cb *gscb; + + riccb = (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; + gscb = (struct gs_cb *) &kvm_run->s.regs.gscb; vcpu->arch.sie_block->gpsw.mask = kvm_run->psw_mask; vcpu->arch.sie_block->gpsw.addr = kvm_run->psw_addr; if (kvm_run->kvm_dirty_regs & KVM_SYNC_PREFIX) @@ -2741,12 +2796,24 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) * we should enable RI here instead of doing the lazy enablement. */ if ((kvm_run->kvm_dirty_regs & KVM_SYNC_RICCB) && - test_kvm_facility(vcpu->kvm, 64)) { - struct runtime_instr_cb *riccb = - (struct runtime_instr_cb *) &kvm_run->s.regs.riccb; - - if (riccb->valid) - vcpu->arch.sie_block->ecb3 |= 0x01; + test_kvm_facility(vcpu->kvm, 64) && + riccb->valid && + !(vcpu->arch.sie_block->ecb3 & ECB3_RI)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (sync_regs)"); + vcpu->arch.sie_block->ecb3 |= ECB3_RI; + } + /* + * If userspace sets the gscb (e.g. after migration) to non-zero, + * we should enable GS here instead of doing the lazy enablement. + */ + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_GSCB) && + test_kvm_facility(vcpu->kvm, 133) && + gscb->gssm && + !vcpu->arch.gs_enabled) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (sync_regs)"); + vcpu->arch.sie_block->ecb |= ECB_GS; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + vcpu->arch.gs_enabled = 1; } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); @@ -2762,6 +2829,20 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (test_fp_ctl(current->thread.fpu.fpc)) /* User space provided an invalid FPC, let's clear it */ current->thread.fpu.fpc = 0; + if (MACHINE_HAS_GS) { + preempt_disable(); + __ctl_set_bit(2, 4); + if (current->thread.gs_cb) { + vcpu->arch.host_gscb = current->thread.gs_cb; + save_gs_cb(vcpu->arch.host_gscb); + } + if (vcpu->arch.gs_enabled) { + current->thread.gs_cb = (struct gs_cb *) + &vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + } + preempt_enable(); + } kvm_run->kvm_dirty_regs = 0; } @@ -2788,6 +2869,18 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) /* Restore will be done lazily at return */ current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; + if (MACHINE_HAS_GS) { + __ctl_set_bit(2, 4); + if (vcpu->arch.gs_enabled) + save_gs_cb(current->thread.gs_cb); + preempt_disable(); + current->thread.gs_cb = vcpu->arch.host_gscb; + restore_gs_cb(vcpu->arch.host_gscb); + preempt_enable(); + if (!vcpu->arch.host_gscb) + __ctl_clear_bit(2, 4); + vcpu->arch.host_gscb = NULL; + } } diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index af9fa91a0c91..55f5c8457d6d 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -25,7 +25,7 @@ typedef int (*intercept_handler_t)(struct kvm_vcpu *vcpu); /* Transactional Memory Execution related macros */ -#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & 0x10)) +#define IS_TE_ENABLED(vcpu) ((vcpu->arch.sie_block->ecb & ECB_TE)) #define TDB_FORMAT1 1 #define IS_ITDB_VALID(vcpu) ((*(char *)vcpu->arch.sie_block->itdba == TDB_FORMAT1)) @@ -246,6 +246,7 @@ static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu) int is_valid_psw(psw_t *psw); int kvm_s390_handle_aa(struct kvm_vcpu *vcpu); int kvm_s390_handle_b2(struct kvm_vcpu *vcpu); +int kvm_s390_handle_e3(struct kvm_vcpu *vcpu); int kvm_s390_handle_e5(struct kvm_vcpu *vcpu); int kvm_s390_handle_01(struct kvm_vcpu *vcpu); int kvm_s390_handle_b9(struct kvm_vcpu *vcpu); @@ -253,6 +254,7 @@ int kvm_s390_handle_lpsw(struct kvm_vcpu *vcpu); int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu); int kvm_s390_handle_eb(struct kvm_vcpu *vcpu); +int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu); /* implemented in vsie.c */ int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu); diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c index 64b6a309f2c4..c03106c428cf 100644 --- a/arch/s390/kvm/priv.c +++ b/arch/s390/kvm/priv.c @@ -37,7 +37,8 @@ static int handle_ri(struct kvm_vcpu *vcpu) { if (test_kvm_facility(vcpu->kvm, 64)) { - vcpu->arch.sie_block->ecb3 |= 0x01; + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: RI (lazy)"); + vcpu->arch.sie_block->ecb3 |= ECB3_RI; kvm_s390_retry_instr(vcpu); return 0; } else @@ -52,6 +53,33 @@ int kvm_s390_handle_aa(struct kvm_vcpu *vcpu) return -EOPNOTSUPP; } +static int handle_gs(struct kvm_vcpu *vcpu) +{ + if (test_kvm_facility(vcpu->kvm, 133)) { + VCPU_EVENT(vcpu, 3, "%s", "ENABLE: GS (lazy)"); + preempt_disable(); + __ctl_set_bit(2, 4); + current->thread.gs_cb = (struct gs_cb *)&vcpu->run->s.regs.gscb; + restore_gs_cb(current->thread.gs_cb); + preempt_enable(); + vcpu->arch.sie_block->ecb |= ECB_GS; + vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; + vcpu->arch.gs_enabled = 1; + kvm_s390_retry_instr(vcpu); + return 0; + } else + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); +} + +int kvm_s390_handle_e3(struct kvm_vcpu *vcpu) +{ + int code = vcpu->arch.sie_block->ipb & 0xff; + + if (code == 0x49 || code == 0x4d) + return handle_gs(vcpu); + else + return -EOPNOTSUPP; +} /* Handle SCK (SET CLOCK) interception */ static int handle_set_clock(struct kvm_vcpu *vcpu) { @@ -170,18 +198,25 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu) return 0; } -static int __skey_check_enable(struct kvm_vcpu *vcpu) +int kvm_s390_skey_check_enable(struct kvm_vcpu *vcpu) { int rc = 0; + struct kvm_s390_sie_block *sie_block = vcpu->arch.sie_block; trace_kvm_s390_skey_related_inst(vcpu); - if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE))) + if (!(sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)) && + !(atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS)) return rc; rc = s390_enable_skey(); VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc); - if (!rc) - vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE); + if (!rc) { + if (atomic_read(&sie_block->cpuflags) & CPUSTAT_KSS) + atomic_andnot(CPUSTAT_KSS, &sie_block->cpuflags); + else + sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | + ICTL_RRBE); + } return rc; } @@ -190,7 +225,7 @@ static int try_handle_skey(struct kvm_vcpu *vcpu) int rc; vcpu->stat.instruction_storage_key++; - rc = __skey_check_enable(vcpu); + rc = kvm_s390_skey_check_enable(vcpu); if (rc) return rc; if (sclp.has_skey) { @@ -759,6 +794,7 @@ static const intercept_handler_t b2_handlers[256] = { [0x3b] = handle_io_inst, [0x3c] = handle_io_inst, [0x50] = handle_ipte_interlock, + [0x56] = handle_sthyi, [0x5f] = handle_io_inst, [0x74] = handle_io_inst, [0x76] = handle_io_inst, @@ -887,7 +923,7 @@ static int handle_pfmf(struct kvm_vcpu *vcpu) } if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK) { - int rc = __skey_check_enable(vcpu); + int rc = kvm_s390_skey_check_enable(vcpu); if (rc) return rc; diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c index 05c98bb853cf..926b5244263e 100644 --- a/arch/s390/kvm/sthyi.c +++ b/arch/s390/kvm/sthyi.c @@ -404,6 +404,9 @@ int handle_sthyi(struct kvm_vcpu *vcpu) u64 code, addr, cc = 0; struct sthyi_sctns *sctns = NULL; + if (!test_kvm_facility(vcpu->kvm, 74)) + return kvm_s390_inject_program_int(vcpu, PGM_OPERATION); + /* * STHYI requires extensive locking in the higher hypervisors * and is very computational/memory expensive. Therefore we diff --git a/arch/s390/kvm/trace-s390.h b/arch/s390/kvm/trace-s390.h index 396485bca191..78b7e847984a 100644 --- a/arch/s390/kvm/trace-s390.h +++ b/arch/s390/kvm/trace-s390.h @@ -280,6 +280,58 @@ TRACE_EVENT(kvm_s390_enable_disable_ibs, __entry->state ? "enabling" : "disabling", __entry->id) ); +/* + * Trace point for modifying ais mode for a given isc. + */ +TRACE_EVENT(kvm_s390_modify_ais_mode, + TP_PROTO(__u8 isc, __u16 from, __u16 to), + TP_ARGS(isc, from, to), + + TP_STRUCT__entry( + __field(__u8, isc) + __field(__u16, from) + __field(__u16, to) + ), + + TP_fast_assign( + __entry->isc = isc; + __entry->from = from; + __entry->to = to; + ), + + TP_printk("for isc %x, modifying interruption mode from %s to %s", + __entry->isc, + (__entry->from == KVM_S390_AIS_MODE_ALL) ? + "ALL-Interruptions Mode" : + (__entry->from == KVM_S390_AIS_MODE_SINGLE) ? + "Single-Interruption Mode" : "No-Interruptions Mode", + (__entry->to == KVM_S390_AIS_MODE_ALL) ? + "ALL-Interruptions Mode" : + (__entry->to == KVM_S390_AIS_MODE_SINGLE) ? + "Single-Interruption Mode" : "No-Interruptions Mode") + ); + +/* + * Trace point for suppressed adapter I/O interrupt. + */ +TRACE_EVENT(kvm_s390_airq_suppressed, + TP_PROTO(__u32 id, __u8 isc), + TP_ARGS(id, isc), + + TP_STRUCT__entry( + __field(__u32, id) + __field(__u8, isc) + ), + + TP_fast_assign( + __entry->id = id; + __entry->isc = isc; + ), + + TP_printk("adapter I/O interrupt suppressed (id:%x isc:%x)", + __entry->id, __entry->isc) + ); + #endif /* _TRACE_KVMS390_H */ diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 5491be39776b..4719ecb9ab42 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -117,6 +117,8 @@ static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) newflags |= cpuflags & CPUSTAT_SM; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS)) newflags |= cpuflags & CPUSTAT_IBS; + if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_KSS)) + newflags |= cpuflags & CPUSTAT_KSS; atomic_set(&scb_s->cpuflags, newflags); return 0; @@ -249,7 +251,7 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) { struct kvm_s390_sie_block *scb_o = vsie_page->scb_o; struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s; - bool had_tx = scb_s->ecb & 0x10U; + bool had_tx = scb_s->ecb & ECB_TE; unsigned long new_mso = 0; int rc; @@ -289,7 +291,9 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) * bits. Therefore we cannot provide interpretation and would later * have to provide own emulation handlers. */ - scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_KSS)) + scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE; + scb_s->icpua = scb_o->icpua; if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM)) @@ -307,34 +311,39 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) scb_s->ihcpu = scb_o->ihcpu; /* MVPG and Protection Exception Interpretation are always available */ - scb_s->eca |= scb_o->eca & 0x01002000U; + scb_s->eca |= scb_o->eca & (ECA_MVPGI | ECA_PROTEXCI); /* Host-protection-interruption introduced with ESOP */ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP)) - scb_s->ecb |= scb_o->ecb & 0x02U; + scb_s->ecb |= scb_o->ecb & ECB_HOSTPROTINT; /* transactional execution */ if (test_kvm_facility(vcpu->kvm, 73)) { /* remap the prefix is tx is toggled on */ - if ((scb_o->ecb & 0x10U) && !had_tx) + if ((scb_o->ecb & ECB_TE) && !had_tx) prefix_unmapped(vsie_page); - scb_s->ecb |= scb_o->ecb & 0x10U; + scb_s->ecb |= scb_o->ecb & ECB_TE; } /* SIMD */ if (test_kvm_facility(vcpu->kvm, 129)) { - scb_s->eca |= scb_o->eca & 0x00020000U; - scb_s->ecd |= scb_o->ecd & 0x20000000U; + scb_s->eca |= scb_o->eca & ECA_VX; + scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT; } /* Run-time-Instrumentation */ if (test_kvm_facility(vcpu->kvm, 64)) - scb_s->ecb3 |= scb_o->ecb3 & 0x01U; + scb_s->ecb3 |= scb_o->ecb3 & ECB3_RI; /* Instruction Execution Prevention */ if (test_kvm_facility(vcpu->kvm, 130)) - scb_s->ecb2 |= scb_o->ecb2 & 0x20U; + scb_s->ecb2 |= scb_o->ecb2 & ECB2_IEP; + /* Guarded Storage */ + if (test_kvm_facility(vcpu->kvm, 133)) { + scb_s->ecb |= scb_o->ecb & ECB_GS; + scb_s->ecd |= scb_o->ecd & ECD_HOSTREGMGMT; + } if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF)) - scb_s->eca |= scb_o->eca & 0x00000001U; + scb_s->eca |= scb_o->eca & ECA_SII; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB)) - scb_s->eca |= scb_o->eca & 0x40000000U; + scb_s->eca |= scb_o->eca & ECA_IB; if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI)) - scb_s->eca |= scb_o->eca & 0x80000000U; + scb_s->eca |= scb_o->eca & ECA_CEI; prepare_ibc(vcpu, vsie_page); rc = shadow_crycb(vcpu, vsie_page); @@ -406,7 +415,7 @@ static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix += scb_s->mso; rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix); - if (!rc && (scb_s->ecb & 0x10U)) + if (!rc && (scb_s->ecb & ECB_TE)) rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix + PAGE_SIZE); /* @@ -496,6 +505,13 @@ static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) unpin_guest_page(vcpu->kvm, gpa, hpa); scb_s->riccbd = 0; } + + hpa = scb_s->sdnxo; + if (hpa) { + gpa = scb_o->sdnxo; + unpin_guest_page(vcpu->kvm, gpa, hpa); + scb_s->sdnxo = 0; + } } /* @@ -543,7 +559,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } gpa = scb_o->itdba & ~0xffUL; - if (gpa && (scb_s->ecb & 0x10U)) { + if (gpa && (scb_s->ecb & ECB_TE)) { if (!(gpa & ~0x1fffU)) { rc = set_validity_icpt(scb_s, 0x0080U); goto unpin; @@ -558,8 +574,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } gpa = scb_o->gvrd & ~0x1ffUL; - if (gpa && (scb_s->eca & 0x00020000U) && - !(scb_s->ecd & 0x20000000U)) { + if (gpa && (scb_s->eca & ECA_VX) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x1310U); goto unpin; @@ -577,7 +592,7 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) } gpa = scb_o->riccbd & ~0x3fUL; - if (gpa && (scb_s->ecb3 & 0x01U)) { + if (gpa && (scb_s->ecb3 & ECB3_RI)) { if (!(gpa & ~0x1fffUL)) { rc = set_validity_icpt(scb_s, 0x0043U); goto unpin; @@ -591,6 +606,33 @@ static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) goto unpin; scb_s->riccbd = hpa; } + if ((scb_s->ecb & ECB_GS) && !(scb_s->ecd & ECD_HOSTREGMGMT)) { + unsigned long sdnxc; + + gpa = scb_o->sdnxo & ~0xfUL; + sdnxc = scb_o->sdnxo & 0xfUL; + if (!gpa || !(gpa & ~0x1fffUL)) { + rc = set_validity_icpt(scb_s, 0x10b0U); + goto unpin; + } + if (sdnxc < 6 || sdnxc > 12) { + rc = set_validity_icpt(scb_s, 0x10b1U); + goto unpin; + } + if (gpa & ((1 << sdnxc) - 1)) { + rc = set_validity_icpt(scb_s, 0x10b2U); + goto unpin; + } + /* Due to alignment rules (checked above) this cannot + * cross page boundaries + */ + rc = pin_guest_page(vcpu->kvm, gpa, &hpa); + if (rc == -EINVAL) + rc = set_validity_icpt(scb_s, 0x10b0U); + if (rc) + goto unpin; + scb_s->sdnxo = hpa | sdnxc; + } return 0; unpin: unpin_blocks(vcpu, vsie_page); diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index 0cf802de52a1..be63fbd699fd 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -82,6 +82,7 @@ static struct facility_def facility_defs[] = { 78, /* enhanced-DAT 2 */ 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ + 146, /* msa extension 8 */ -1 /* END */ } }, diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 3e8c287090e4..055962615779 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ struct x86_emulate_ops { void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,7 +293,6 @@ struct x86_emulate_ctxt { /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 74ef58c8ff53..f5bddf92faba 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -43,8 +43,6 @@ #define KVM_PRIVATE_MEM_SLOTS 3 #define KVM_MEM_SLOTS_NUM (KVM_USER_MEM_SLOTS + KVM_PRIVATE_MEM_SLOTS) -#define KVM_PIO_PAGE_OFFSET 1 -#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 #define KVM_HALT_POLL_NS_DEFAULT 400000 #define KVM_IRQCHIP_NUM_PINS KVM_IOAPIC_NUM_PINS @@ -63,10 +61,10 @@ #define KVM_REQ_PMI 19 #define KVM_REQ_SMI 20 #define KVM_REQ_MASTERCLOCK_UPDATE 21 -#define KVM_REQ_MCLOCK_INPROGRESS 22 -#define KVM_REQ_SCAN_IOAPIC 23 +#define KVM_REQ_MCLOCK_INPROGRESS (22 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_SCAN_IOAPIC (23 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_GLOBAL_CLOCK_UPDATE 24 -#define KVM_REQ_APIC_PAGE_RELOAD 25 +#define KVM_REQ_APIC_PAGE_RELOAD (25 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_HV_CRASH 26 #define KVM_REQ_IOAPIC_EOI_EXIT 27 #define KVM_REQ_HV_RESET 28 @@ -343,9 +341,10 @@ struct kvm_mmu { void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, const void *pte); hpa_t root_hpa; - int root_level; - int shadow_root_level; union kvm_mmu_page_role base_role; + u8 root_level; + u8 shadow_root_level; + u8 ept_ad; bool direct_map; /* @@ -612,6 +611,8 @@ struct kvm_vcpu_arch { unsigned long dr7; unsigned long eff_db[KVM_NR_DB_REGS]; unsigned long guest_debug_dr7; + u64 msr_platform_info; + u64 msr_misc_features_enables; u64 mcg_cap; u64 mcg_status; diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index cc54b7026567..35cd06f636ab 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -70,8 +70,10 @@ #define SECONDARY_EXEC_APIC_REGISTER_VIRT 0x00000100 #define SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY 0x00000200 #define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400 +#define SECONDARY_EXEC_RDRAND 0x00000800 #define SECONDARY_EXEC_ENABLE_INVPCID 0x00001000 #define SECONDARY_EXEC_SHADOW_VMCS 0x00004000 +#define SECONDARY_EXEC_RDSEED 0x00010000 #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_TSC_SCALING 0x02000000 @@ -516,12 +518,14 @@ struct vmx_msr_entry { #define EPT_VIOLATION_READABLE_BIT 3 #define EPT_VIOLATION_WRITABLE_BIT 4 #define EPT_VIOLATION_EXECUTABLE_BIT 5 +#define EPT_VIOLATION_GVA_TRANSLATED_BIT 8 #define EPT_VIOLATION_ACC_READ (1 << EPT_VIOLATION_ACC_READ_BIT) #define EPT_VIOLATION_ACC_WRITE (1 << EPT_VIOLATION_ACC_WRITE_BIT) #define EPT_VIOLATION_ACC_INSTR (1 << EPT_VIOLATION_ACC_INSTR_BIT) #define EPT_VIOLATION_READABLE (1 << EPT_VIOLATION_READABLE_BIT) #define EPT_VIOLATION_WRITABLE (1 << EPT_VIOLATION_WRITABLE_BIT) #define EPT_VIOLATION_EXECUTABLE (1 << EPT_VIOLATION_EXECUTABLE_BIT) +#define EPT_VIOLATION_GVA_TRANSLATED (1 << EPT_VIOLATION_GVA_TRANSLATED_BIT) /* * VM-instruction error numbers diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index 739c0c594022..c2824d02ba37 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -9,6 +9,9 @@ #include <linux/types.h> #include <linux/ioctl.h> +#define KVM_PIO_PAGE_OFFSET 1 +#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 + #define DE_VECTOR 0 #define DB_VECTOR 1 #define BP_VECTOR 3 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 14458658e988..690a2dcf4078 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -76,7 +76,11 @@ #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 #define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 #define EXIT_REASON_PML_FULL 62 #define EXIT_REASON_XSAVES 63 #define EXIT_REASON_XRSTORS 64 @@ -90,6 +94,7 @@ { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ { EXIT_REASON_CPUID, "CPUID" }, \ { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVD, "INVD" }, \ { EXIT_REASON_INVLPG, "INVLPG" }, \ { EXIT_REASON_RDPMC, "RDPMC" }, \ { EXIT_REASON_RDTSC, "RDTSC" }, \ @@ -108,6 +113,8 @@ { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ { EXIT_REASON_MSR_READ, "MSR_READ" }, \ { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ + { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ { EXIT_REASON_MONITOR_TRAP_FLAG, "MONITOR_TRAP_FLAG" }, \ { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ @@ -115,20 +122,24 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ - { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ - { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ + { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ + { EXIT_REASON_RDTSCP, "RDTSCP" }, \ { EXIT_REASON_PREEMPTION_TIMER, "PREEMPTION_TIMER" }, \ + { EXIT_REASON_INVVPID, "INVVPID" }, \ { EXIT_REASON_WBINVD, "WBINVD" }, \ + { EXIT_REASON_XSETBV, "XSETBV" }, \ { EXIT_REASON_APIC_WRITE, "APIC_WRITE" }, \ - { EXIT_REASON_EOI_INDUCED, "EOI_INDUCED" }, \ - { EXIT_REASON_INVALID_STATE, "INVALID_STATE" }, \ - { EXIT_REASON_MSR_LOAD_FAIL, "MSR_LOAD_FAIL" }, \ - { EXIT_REASON_INVD, "INVD" }, \ - { EXIT_REASON_INVVPID, "INVVPID" }, \ + { EXIT_REASON_RDRAND, "RDRAND" }, \ { EXIT_REASON_INVPCID, "INVPCID" }, \ + { EXIT_REASON_VMFUNC, "VMFUNC" }, \ + { EXIT_REASON_ENCLS, "ENCLS" }, \ + { EXIT_REASON_RDSEED, "RDSEED" }, \ + { EXIT_REASON_PML_FULL, "PML_FULL" }, \ { EXIT_REASON_XSAVES, "XSAVES" }, \ { EXIT_REASON_XRSTORS, "XRSTORS" } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 14f65a5f938e..da5c09789984 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -396,9 +396,9 @@ static u64 kvm_steal_clock(int cpu) src = &per_cpu(steal_time, cpu); do { version = src->version; - rmb(); + virt_rmb(); steal = src->steal; - rmb(); + virt_rmb(); } while ((version & 1) || (version != src->version)); return steal; diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index ab8e32f7b9a8..760433b2574a 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -86,18 +86,6 @@ config KVM_MMU_AUDIT This option adds a R/W kVM module parameter 'mmu_audit', which allows auditing of KVM MMU events at runtime. -config KVM_DEVICE_ASSIGNMENT - bool "KVM legacy PCI device assignment support (DEPRECATED)" - depends on KVM && PCI && IOMMU_API - default n - ---help--- - Provide support for legacy PCI device assignment through KVM. The - kernel now also supports a full featured userspace device driver - framework through VFIO, which supersedes this support and provides - better security. - - If unsure, say N. - # OK, it's a little counter-intuitive to do this, but it puts it neatly under # the virtualization menu. source drivers/vhost/Kconfig diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 3bff20710471..09d4b17be022 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -15,8 +15,6 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ i8254.o ioapic.o irq_comm.o cpuid.o pmu.o mtrr.o \ hyperv.o page_track.o debugfs.o -kvm-$(CONFIG_KVM_DEVICE_ASSIGNMENT) += assigned-dev.o iommu.o - kvm-intel-y += vmx.o pmu_intel.o kvm-amd-y += svm.o pmu_amd.o diff --git a/arch/x86/kvm/assigned-dev.c b/arch/x86/kvm/assigned-dev.c deleted file mode 100644 index 308b8597c691..000000000000 --- a/arch/x86/kvm/assigned-dev.c +++ /dev/null @@ -1,1058 +0,0 @@ -/* - * Kernel-based Virtual Machine - device assignment support - * - * Copyright (C) 2010 Red Hat, Inc. and/or its affiliates. - * - * This work is licensed under the terms of the GNU GPL, version 2. See - * the COPYING file in the top-level directory. - * - */ - -#include <linux/kvm_host.h> -#include <linux/kvm.h> -#include <linux/uaccess.h> -#include <linux/vmalloc.h> -#include <linux/errno.h> -#include <linux/spinlock.h> -#include <linux/pci.h> -#include <linux/interrupt.h> -#include <linux/slab.h> -#include <linux/namei.h> -#include <linux/fs.h> -#include "irq.h" -#include "assigned-dev.h" -#include "trace/events/kvm.h" - -struct kvm_assigned_dev_kernel { - struct kvm_irq_ack_notifier ack_notifier; - struct list_head list; - int assigned_dev_id; - int host_segnr; - int host_busnr; - int host_devfn; - unsigned int entries_nr; - int host_irq; - bool host_irq_disabled; - bool pci_2_3; - struct msix_entry *host_msix_entries; - int guest_irq; - struct msix_entry *guest_msix_entries; - unsigned long irq_requested_type; - int irq_source_id; - int flags; - struct pci_dev *dev; - struct kvm *kvm; - spinlock_t intx_lock; - spinlock_t intx_mask_lock; - char irq_name[32]; - struct pci_saved_state *pci_saved_state; -}; - -static struct kvm_assigned_dev_kernel *kvm_find_assigned_dev(struct list_head *head, - int assigned_dev_id) -{ - struct kvm_assigned_dev_kernel *match; - - list_for_each_entry(match, head, list) { - if (match->assigned_dev_id == assigned_dev_id) - return match; - } - return NULL; -} - -static int find_index_from_host_irq(struct kvm_assigned_dev_kernel - *assigned_dev, int irq) -{ - int i, index; - struct msix_entry *host_msix_entries; - - host_msix_entries = assigned_dev->host_msix_entries; - - index = -1; - for (i = 0; i < assigned_dev->entries_nr; i++) - if (irq == host_msix_entries[i].vector) { - index = i; - break; - } - if (index < 0) - printk(KERN_WARNING "Fail to find correlated MSI-X entry!\n"); - - return index; -} - -static irqreturn_t kvm_assigned_dev_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret; - - spin_lock(&assigned_dev->intx_lock); - if (pci_check_and_mask_intx(assigned_dev->dev)) { - assigned_dev->host_irq_disabled = true; - ret = IRQ_WAKE_THREAD; - } else - ret = IRQ_NONE; - spin_unlock(&assigned_dev->intx_lock); - - return ret; -} - -static void -kvm_assigned_dev_raise_guest_irq(struct kvm_assigned_dev_kernel *assigned_dev, - int vector) -{ - if (unlikely(assigned_dev->irq_requested_type & - KVM_DEV_IRQ_GUEST_INTX)) { - spin_lock(&assigned_dev->intx_mask_lock); - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) - kvm_set_irq(assigned_dev->kvm, - assigned_dev->irq_source_id, vector, 1, - false); - spin_unlock(&assigned_dev->intx_mask_lock); - } else - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - vector, 1, false); -} - -static irqreturn_t kvm_assigned_dev_thread_intx(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - disable_irq_nosync(irq); - assigned_dev->host_irq_disabled = true; - spin_unlock_irq(&assigned_dev->intx_lock); - } - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -/* - * Deliver an IRQ in an atomic context if we can, or return a failure, - * user can retry in a process context. - * Return value: - * -EWOULDBLOCK - Can't deliver in atomic context: retry in a process context. - * Other values - No need to retry. - */ -static int kvm_set_irq_inatomic(struct kvm *kvm, int irq_source_id, u32 irq, - int level) -{ - struct kvm_kernel_irq_routing_entry entries[KVM_NR_IRQCHIPS]; - struct kvm_kernel_irq_routing_entry *e; - int ret = -EINVAL; - int idx; - - trace_kvm_set_irq(irq, level, irq_source_id); - - /* - * Injection into either PIC or IOAPIC might need to scan all CPUs, - * which would need to be retried from thread context; when same GSI - * is connected to both PIC and IOAPIC, we'd have to report a - * partial failure here. - * Since there's no easy way to do this, we only support injecting MSI - * which is limited to 1:1 GSI mapping. - */ - idx = srcu_read_lock(&kvm->irq_srcu); - if (kvm_irq_map_gsi(kvm, entries, irq) > 0) { - e = &entries[0]; - ret = kvm_arch_set_irq_inatomic(e, kvm, irq_source_id, - irq, level); - } - srcu_read_unlock(&kvm->irq_srcu, idx); - return ret; -} - - -static irqreturn_t kvm_assigned_dev_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - assigned_dev->guest_irq, 1); - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msi(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - - kvm_assigned_dev_raise_guest_irq(assigned_dev, - assigned_dev->guest_irq); - - return IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - int ret = 0; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - ret = kvm_set_irq_inatomic(assigned_dev->kvm, - assigned_dev->irq_source_id, - vector, 1); - } - - return unlikely(ret == -EWOULDBLOCK) ? IRQ_WAKE_THREAD : IRQ_HANDLED; -} - -static irqreturn_t kvm_assigned_dev_thread_msix(int irq, void *dev_id) -{ - struct kvm_assigned_dev_kernel *assigned_dev = dev_id; - int index = find_index_from_host_irq(assigned_dev, irq); - u32 vector; - - if (index >= 0) { - vector = assigned_dev->guest_msix_entries[index].vector; - kvm_assigned_dev_raise_guest_irq(assigned_dev, vector); - } - - return IRQ_HANDLED; -} - -/* Ack the irq line for an assigned device */ -static void kvm_assigned_dev_ack_irq(struct kvm_irq_ack_notifier *kian) -{ - struct kvm_assigned_dev_kernel *dev = - container_of(kian, struct kvm_assigned_dev_kernel, - ack_notifier); - - kvm_set_irq(dev->kvm, dev->irq_source_id, dev->guest_irq, 0, false); - - spin_lock(&dev->intx_mask_lock); - - if (!(dev->flags & KVM_DEV_ASSIGN_MASK_INTX)) { - bool reassert = false; - - spin_lock_irq(&dev->intx_lock); - /* - * The guest IRQ may be shared so this ack can come from an - * IRQ for another guest device. - */ - if (dev->host_irq_disabled) { - if (!(dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) - enable_irq(dev->host_irq); - else if (!pci_check_and_unmask_intx(dev->dev)) - reassert = true; - dev->host_irq_disabled = reassert; - } - spin_unlock_irq(&dev->intx_lock); - - if (reassert) - kvm_set_irq(dev->kvm, dev->irq_source_id, - dev->guest_irq, 1, false); - } - - spin_unlock(&dev->intx_mask_lock); -} - -static void deassign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - if (assigned_dev->ack_notifier.gsi != -1) - kvm_unregister_irq_ack_notifier(kvm, - &assigned_dev->ack_notifier); - - kvm_set_irq(assigned_dev->kvm, assigned_dev->irq_source_id, - assigned_dev->guest_irq, 0, false); - - if (assigned_dev->irq_source_id != -1) - kvm_free_irq_source_id(kvm, assigned_dev->irq_source_id); - assigned_dev->irq_source_id = -1; - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_GUEST_MASK); -} - -/* The function implicit hold kvm->lock mutex due to cancel_work_sync() */ -static void deassign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - /* - * We disable irq here to prevent further events. - * - * Notice this maybe result in nested disable if the interrupt type is - * INTx, but it's OK for we are going to free it. - * - * If this function is a part of VM destroy, please ensure that till - * now, the kvm state is still legal for probably we also have to wait - * on a currently running IRQ handler. - */ - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSIX) { - int i; - for (i = 0; i < assigned_dev->entries_nr; i++) - disable_irq(assigned_dev->host_msix_entries[i].vector); - - for (i = 0; i < assigned_dev->entries_nr; i++) - free_irq(assigned_dev->host_msix_entries[i].vector, - assigned_dev); - - assigned_dev->entries_nr = 0; - kfree(assigned_dev->host_msix_entries); - kfree(assigned_dev->guest_msix_entries); - pci_disable_msix(assigned_dev->dev); - } else { - /* Deal with MSI and INTx */ - if ((assigned_dev->irq_requested_type & - KVM_DEV_IRQ_HOST_INTX) && - (assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - spin_lock_irq(&assigned_dev->intx_lock); - pci_intx(assigned_dev->dev, false); - spin_unlock_irq(&assigned_dev->intx_lock); - synchronize_irq(assigned_dev->host_irq); - } else - disable_irq(assigned_dev->host_irq); - - free_irq(assigned_dev->host_irq, assigned_dev); - - if (assigned_dev->irq_requested_type & KVM_DEV_IRQ_HOST_MSI) - pci_disable_msi(assigned_dev->dev); - } - - assigned_dev->irq_requested_type &= ~(KVM_DEV_IRQ_HOST_MASK); -} - -static int kvm_deassign_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev, - unsigned long irq_requested_type) -{ - unsigned long guest_irq_type, host_irq_type; - - if (!irqchip_in_kernel(kvm)) - return -EINVAL; - /* no irq assignment to deassign */ - if (!assigned_dev->irq_requested_type) - return -ENXIO; - - host_irq_type = irq_requested_type & KVM_DEV_IRQ_HOST_MASK; - guest_irq_type = irq_requested_type & KVM_DEV_IRQ_GUEST_MASK; - - if (host_irq_type) - deassign_host_irq(kvm, assigned_dev); - if (guest_irq_type) - deassign_guest_irq(kvm, assigned_dev); - - return 0; -} - -static void kvm_free_assigned_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *assigned_dev) -{ - kvm_deassign_irq(kvm, assigned_dev, assigned_dev->irq_requested_type); -} - -static void kvm_free_assigned_device(struct kvm *kvm, - struct kvm_assigned_dev_kernel - *assigned_dev) -{ - kvm_free_assigned_irq(kvm, assigned_dev); - - pci_reset_function(assigned_dev->dev); - if (pci_load_and_free_saved_state(assigned_dev->dev, - &assigned_dev->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&assigned_dev->dev->dev)); - else - pci_restore_state(assigned_dev->dev); - - pci_clear_dev_assigned(assigned_dev->dev); - - pci_release_regions(assigned_dev->dev); - pci_disable_device(assigned_dev->dev); - pci_dev_put(assigned_dev->dev); - - list_del(&assigned_dev->list); - kfree(assigned_dev); -} - -void kvm_free_all_assigned_devices(struct kvm *kvm) -{ - struct kvm_assigned_dev_kernel *assigned_dev, *tmp; - - list_for_each_entry_safe(assigned_dev, tmp, - &kvm->arch.assigned_dev_head, list) { - kvm_free_assigned_device(kvm, assigned_dev); - } -} - -static int assigned_device_enable_host_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - irq_handler_t irq_handler; - unsigned long flags; - - dev->host_irq = dev->dev->irq; - - /* - * We can only share the IRQ line with other host devices if we are - * able to disable the IRQ source at device-level - independently of - * the guest driver. Otherwise host devices may suffer from unbounded - * IRQ latencies when the guest keeps the line asserted. - */ - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - irq_handler = kvm_assigned_dev_intx; - flags = IRQF_SHARED; - } else { - irq_handler = NULL; - flags = IRQF_ONESHOT; - } - if (request_threaded_irq(dev->host_irq, irq_handler, - kvm_assigned_dev_thread_intx, flags, - dev->irq_name, dev)) - return -EIO; - - if (dev->flags & KVM_DEV_ASSIGN_PCI_2_3) { - spin_lock_irq(&dev->intx_lock); - pci_intx(dev->dev, true); - spin_unlock_irq(&dev->intx_lock); - } - return 0; -} - -static int assigned_device_enable_host_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int r; - - if (!dev->dev->msi_enabled) { - r = pci_enable_msi(dev->dev); - if (r) - return r; - } - - dev->host_irq = dev->dev->irq; - if (request_threaded_irq(dev->host_irq, kvm_assigned_dev_msi, - kvm_assigned_dev_thread_msi, 0, - dev->irq_name, dev)) { - pci_disable_msi(dev->dev); - return -EIO; - } - - return 0; -} - -static int assigned_device_enable_host_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev) -{ - int i, r = -EINVAL; - - /* host_msix_entries and guest_msix_entries should have been - * initialized */ - if (dev->entries_nr == 0) - return r; - - r = pci_enable_msix_exact(dev->dev, - dev->host_msix_entries, dev->entries_nr); - if (r) - return r; - - for (i = 0; i < dev->entries_nr; i++) { - r = request_threaded_irq(dev->host_msix_entries[i].vector, - kvm_assigned_dev_msix, - kvm_assigned_dev_thread_msix, - 0, dev->irq_name, dev); - if (r) - goto err; - } - - return 0; -err: - for (i -= 1; i >= 0; i--) - free_irq(dev->host_msix_entries[i].vector, dev); - pci_disable_msix(dev->dev); - return r; -} - -static int assigned_device_enable_guest_intx(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = irq->guest_irq; - return 0; -} - -static int assigned_device_enable_guest_msi(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assigned_device_enable_guest_msix(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq) -{ - dev->guest_irq = irq->guest_irq; - dev->ack_notifier.gsi = -1; - return 0; -} - -static int assign_host_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - __u32 host_irq_type) -{ - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_HOST_MASK) - return r; - - snprintf(dev->irq_name, sizeof(dev->irq_name), "kvm:%s", - pci_name(dev->dev)); - - switch (host_irq_type) { - case KVM_DEV_IRQ_HOST_INTX: - r = assigned_device_enable_host_intx(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSI: - r = assigned_device_enable_host_msi(kvm, dev); - break; - case KVM_DEV_IRQ_HOST_MSIX: - r = assigned_device_enable_host_msix(kvm, dev); - break; - default: - r = -EINVAL; - } - dev->host_irq_disabled = false; - - if (!r) - dev->irq_requested_type |= host_irq_type; - - return r; -} - -static int assign_guest_irq(struct kvm *kvm, - struct kvm_assigned_dev_kernel *dev, - struct kvm_assigned_irq *irq, - unsigned long guest_irq_type) -{ - int id; - int r = -EEXIST; - - if (dev->irq_requested_type & KVM_DEV_IRQ_GUEST_MASK) - return r; - - id = kvm_request_irq_source_id(kvm); - if (id < 0) - return id; - - dev->irq_source_id = id; - - switch (guest_irq_type) { - case KVM_DEV_IRQ_GUEST_INTX: - r = assigned_device_enable_guest_intx(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSI: - r = assigned_device_enable_guest_msi(kvm, dev, irq); - break; - case KVM_DEV_IRQ_GUEST_MSIX: - r = assigned_device_enable_guest_msix(kvm, dev, irq); - break; - default: - r = -EINVAL; - } - - if (!r) { - dev->irq_requested_type |= guest_irq_type; - if (dev->ack_notifier.gsi != -1) - kvm_register_irq_ack_notifier(kvm, &dev->ack_notifier); - } else { - kvm_free_irq_source_id(kvm, dev->irq_source_id); - dev->irq_source_id = -1; - } - - return r; -} - -/* TODO Deal with KVM_DEV_IRQ_ASSIGNED_MASK_MSIX */ -static int kvm_vm_ioctl_assign_irq(struct kvm *kvm, - struct kvm_assigned_irq *assigned_irq) -{ - int r = -EINVAL; - struct kvm_assigned_dev_kernel *match; - unsigned long host_irq_type, guest_irq_type; - - if (!irqchip_in_kernel(kvm)) - return r; - - mutex_lock(&kvm->lock); - r = -ENODEV; - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - host_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_HOST_MASK); - guest_irq_type = (assigned_irq->flags & KVM_DEV_IRQ_GUEST_MASK); - - r = -EINVAL; - /* can only assign one type at a time */ - if (hweight_long(host_irq_type) > 1) - goto out; - if (hweight_long(guest_irq_type) > 1) - goto out; - if (host_irq_type == 0 && guest_irq_type == 0) - goto out; - - r = 0; - if (host_irq_type) - r = assign_host_irq(kvm, match, host_irq_type); - if (r) - goto out; - - if (guest_irq_type) - r = assign_guest_irq(kvm, match, assigned_irq, guest_irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_dev_irq(struct kvm *kvm, - struct kvm_assigned_irq - *assigned_irq) -{ - int r = -ENODEV; - struct kvm_assigned_dev_kernel *match; - unsigned long irq_type; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_irq->assigned_dev_id); - if (!match) - goto out; - - irq_type = assigned_irq->flags & (KVM_DEV_IRQ_HOST_MASK | - KVM_DEV_IRQ_GUEST_MASK); - r = kvm_deassign_irq(kvm, match, irq_type); -out: - mutex_unlock(&kvm->lock); - return r; -} - -/* - * We want to test whether the caller has been granted permissions to - * use this device. To be able to configure and control the device, - * the user needs access to PCI configuration space and BAR resources. - * These are accessed through PCI sysfs. PCI config space is often - * passed to the process calling this ioctl via file descriptor, so we - * can't rely on access to that file. We can check for permissions - * on each of the BAR resource files, which is a pretty clear - * indicator that the user has been granted access to the device. - */ -static int probe_sysfs_permissions(struct pci_dev *dev) -{ -#ifdef CONFIG_SYSFS - int i; - bool bar_found = false; - - for (i = PCI_STD_RESOURCES; i <= PCI_STD_RESOURCE_END; i++) { - char *kpath, *syspath; - struct path path; - struct inode *inode; - int r; - - if (!pci_resource_len(dev, i)) - continue; - - kpath = kobject_get_path(&dev->dev.kobj, GFP_KERNEL); - if (!kpath) - return -ENOMEM; - - /* Per sysfs-rules, sysfs is always at /sys */ - syspath = kasprintf(GFP_KERNEL, "/sys%s/resource%d", kpath, i); - kfree(kpath); - if (!syspath) - return -ENOMEM; - - r = kern_path(syspath, LOOKUP_FOLLOW, &path); - kfree(syspath); - if (r) - return r; - - inode = d_backing_inode(path.dentry); - - r = inode_permission(inode, MAY_READ | MAY_WRITE | MAY_ACCESS); - path_put(&path); - if (r) - return r; - - bar_found = true; - } - - /* If no resources, probably something special */ - if (!bar_found) - return -EPERM; - - return 0; -#else - return -EINVAL; /* No way to control the device without sysfs */ -#endif -} - -static int kvm_vm_ioctl_assign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0, idx; - struct kvm_assigned_dev_kernel *match; - struct pci_dev *dev; - - if (!(assigned_dev->flags & KVM_DEV_ASSIGN_ENABLE_IOMMU)) - return -EINVAL; - - mutex_lock(&kvm->lock); - idx = srcu_read_lock(&kvm->srcu); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (match) { - /* device already assigned */ - r = -EEXIST; - goto out; - } - - match = kzalloc(sizeof(struct kvm_assigned_dev_kernel), GFP_KERNEL); - if (match == NULL) { - printk(KERN_INFO "%s: Couldn't allocate memory\n", - __func__); - r = -ENOMEM; - goto out; - } - dev = pci_get_domain_bus_and_slot(assigned_dev->segnr, - assigned_dev->busnr, - assigned_dev->devfn); - if (!dev) { - printk(KERN_INFO "%s: host device not found\n", __func__); - r = -EINVAL; - goto out_free; - } - - /* Don't allow bridges to be assigned */ - if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL) { - r = -EPERM; - goto out_put; - } - - r = probe_sysfs_permissions(dev); - if (r) - goto out_put; - - if (pci_enable_device(dev)) { - printk(KERN_INFO "%s: Could not enable PCI device\n", __func__); - r = -EBUSY; - goto out_put; - } - r = pci_request_regions(dev, "kvm_assigned_device"); - if (r) { - printk(KERN_INFO "%s: Could not get access to device regions\n", - __func__); - goto out_disable; - } - - pci_reset_function(dev); - pci_save_state(dev); - match->pci_saved_state = pci_store_saved_state(dev); - if (!match->pci_saved_state) - printk(KERN_DEBUG "%s: Couldn't store %s saved state\n", - __func__, dev_name(&dev->dev)); - - if (!pci_intx_mask_supported(dev)) - assigned_dev->flags &= ~KVM_DEV_ASSIGN_PCI_2_3; - - match->assigned_dev_id = assigned_dev->assigned_dev_id; - match->host_segnr = assigned_dev->segnr; - match->host_busnr = assigned_dev->busnr; - match->host_devfn = assigned_dev->devfn; - match->flags = assigned_dev->flags; - match->dev = dev; - spin_lock_init(&match->intx_lock); - spin_lock_init(&match->intx_mask_lock); - match->irq_source_id = -1; - match->kvm = kvm; - match->ack_notifier.irq_acked = kvm_assigned_dev_ack_irq; - - list_add(&match->list, &kvm->arch.assigned_dev_head); - - if (!kvm->arch.iommu_domain) { - r = kvm_iommu_map_guest(kvm); - if (r) - goto out_list_del; - } - r = kvm_assign_device(kvm, match->dev); - if (r) - goto out_list_del; - -out: - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -out_list_del: - if (pci_load_and_free_saved_state(dev, &match->pci_saved_state)) - printk(KERN_INFO "%s: Couldn't reload %s saved state\n", - __func__, dev_name(&dev->dev)); - list_del(&match->list); - pci_release_regions(dev); -out_disable: - pci_disable_device(dev); -out_put: - pci_dev_put(dev); -out_free: - kfree(match); - srcu_read_unlock(&kvm->srcu, idx); - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_deassign_device(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - printk(KERN_INFO "%s: device hasn't been assigned before, " - "so cannot be deassigned\n", __func__); - r = -EINVAL; - goto out; - } - - kvm_deassign_device(kvm, match->dev); - - kvm_free_assigned_device(kvm, match); - -out: - mutex_unlock(&kvm->lock); - return r; -} - - -static int kvm_vm_ioctl_set_msix_nr(struct kvm *kvm, - struct kvm_assigned_msix_nr *entry_nr) -{ - int r = 0; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry_nr->assigned_dev_id); - if (!adev) { - r = -EINVAL; - goto msix_nr_out; - } - - if (adev->entries_nr == 0) { - adev->entries_nr = entry_nr->entry_nr; - if (adev->entries_nr == 0 || - adev->entries_nr > KVM_MAX_MSIX_PER_DEV) { - r = -EINVAL; - goto msix_nr_out; - } - - adev->host_msix_entries = kzalloc(sizeof(struct msix_entry) * - entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->host_msix_entries) { - r = -ENOMEM; - goto msix_nr_out; - } - adev->guest_msix_entries = - kzalloc(sizeof(struct msix_entry) * entry_nr->entry_nr, - GFP_KERNEL); - if (!adev->guest_msix_entries) { - kfree(adev->host_msix_entries); - r = -ENOMEM; - goto msix_nr_out; - } - } else /* Not allowed set MSI-X number twice */ - r = -EINVAL; -msix_nr_out: - mutex_unlock(&kvm->lock); - return r; -} - -static int kvm_vm_ioctl_set_msix_entry(struct kvm *kvm, - struct kvm_assigned_msix_entry *entry) -{ - int r = 0, i; - struct kvm_assigned_dev_kernel *adev; - - mutex_lock(&kvm->lock); - - adev = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - entry->assigned_dev_id); - - if (!adev) { - r = -EINVAL; - goto msix_entry_out; - } - - for (i = 0; i < adev->entries_nr; i++) - if (adev->guest_msix_entries[i].vector == 0 || - adev->guest_msix_entries[i].entry == entry->entry) { - adev->guest_msix_entries[i].entry = entry->entry; - adev->guest_msix_entries[i].vector = entry->gsi; - adev->host_msix_entries[i].entry = entry->entry; - break; - } - if (i == adev->entries_nr) { - r = -ENOSPC; - goto msix_entry_out; - } - -msix_entry_out: - mutex_unlock(&kvm->lock); - - return r; -} - -static int kvm_vm_ioctl_set_pci_irq_mask(struct kvm *kvm, - struct kvm_assigned_pci_dev *assigned_dev) -{ - int r = 0; - struct kvm_assigned_dev_kernel *match; - - mutex_lock(&kvm->lock); - - match = kvm_find_assigned_dev(&kvm->arch.assigned_dev_head, - assigned_dev->assigned_dev_id); - if (!match) { - r = -ENODEV; - goto out; - } - - spin_lock(&match->intx_mask_lock); - - match->flags &= ~KVM_DEV_ASSIGN_MASK_INTX; - match->flags |= assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX; - - if (match->irq_requested_type & KVM_DEV_IRQ_GUEST_INTX) { - if (assigned_dev->flags & KVM_DEV_ASSIGN_MASK_INTX) { - kvm_set_irq(match->kvm, match->irq_source_id, - match->guest_irq, 0, false); - /* - * Masking at hardware-level is performed on demand, - * i.e. when an IRQ actually arrives at the host. - */ - } else if (!(assigned_dev->flags & KVM_DEV_ASSIGN_PCI_2_3)) { - /* - * Unmask the IRQ line if required. Unmasking at - * device level will be performed by user space. - */ - spin_lock_irq(&match->intx_lock); - if (match->host_irq_disabled) { - enable_irq(match->host_irq); - match->host_irq_disabled = false; - } - spin_unlock_irq(&match->intx_lock); - } - } - - spin_unlock(&match->intx_mask_lock); - -out: - mutex_unlock(&kvm->lock); - return r; -} - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - void __user *argp = (void __user *)arg; - int r; - - switch (ioctl) { - case KVM_ASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_assign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_IRQ: { - r = -EOPNOTSUPP; - break; - } - case KVM_ASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_assign_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_DEV_IRQ: { - struct kvm_assigned_irq assigned_irq; - - r = -EFAULT; - if (copy_from_user(&assigned_irq, argp, sizeof assigned_irq)) - goto out; - r = kvm_vm_ioctl_deassign_dev_irq(kvm, &assigned_irq); - if (r) - goto out; - break; - } - case KVM_DEASSIGN_PCI_DEVICE: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_deassign_device(kvm, &assigned_dev); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_NR: { - struct kvm_assigned_msix_nr entry_nr; - r = -EFAULT; - if (copy_from_user(&entry_nr, argp, sizeof entry_nr)) - goto out; - r = kvm_vm_ioctl_set_msix_nr(kvm, &entry_nr); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_MSIX_ENTRY: { - struct kvm_assigned_msix_entry entry; - r = -EFAULT; - if (copy_from_user(&entry, argp, sizeof entry)) - goto out; - r = kvm_vm_ioctl_set_msix_entry(kvm, &entry); - if (r) - goto out; - break; - } - case KVM_ASSIGN_SET_INTX_MASK: { - struct kvm_assigned_pci_dev assigned_dev; - - r = -EFAULT; - if (copy_from_user(&assigned_dev, argp, sizeof assigned_dev)) - goto out; - r = kvm_vm_ioctl_set_pci_irq_mask(kvm, &assigned_dev); - break; - } - default: - r = -ENOTTY; - break; - } -out: - return r; -} diff --git a/arch/x86/kvm/assigned-dev.h b/arch/x86/kvm/assigned-dev.h deleted file mode 100644 index a428c1a211b2..000000000000 --- a/arch/x86/kvm/assigned-dev.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef ARCH_X86_KVM_ASSIGNED_DEV_H -#define ARCH_X86_KVM_ASSIGNED_DEV_H - -#include <linux/kvm_host.h> - -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev); -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev); - -int kvm_iommu_map_guest(struct kvm *kvm); -int kvm_iommu_unmap_guest(struct kvm *kvm); - -long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg); - -void kvm_free_all_assigned_devices(struct kvm *kvm); -#else -static inline int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - return 0; -} - -static inline long kvm_vm_ioctl_assigned_device(struct kvm *kvm, unsigned ioctl, - unsigned long arg) -{ - return -ENOTTY; -} - -static inline void kvm_free_all_assigned_devices(struct kvm *kvm) {} -#endif /* CONFIG_KVM_DEVICE_ASSIGNMENT */ - -#endif /* ARCH_X86_KVM_ASSIGNED_DEV_H */ diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index efde6cc50875..a181ae76c71c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -876,6 +876,9 @@ int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { u32 eax, ebx, ecx, edx; + if (cpuid_fault_enabled(vcpu) && !kvm_require_cpl(vcpu, 0)) + return 1; + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 35058c2c0eea..a6fd40aade7c 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -205,4 +205,15 @@ static inline int guest_cpuid_stepping(struct kvm_vcpu *vcpu) return x86_stepping(best->eax); } +static inline bool supports_cpuid_fault(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_platform_info & MSR_PLATFORM_INFO_CPUID_FAULT; +} + +static inline bool cpuid_fault_enabled(struct kvm_vcpu *vcpu) +{ + return vcpu->arch.msr_misc_features_enables & + MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 45c7306c8780..c25cfaf584e7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2547,7 +2547,7 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) u64 smbase; int ret; - if ((ctxt->emul_flags & X86EMUL_SMM_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_MASK) == 0) return emulate_ud(ctxt); /* @@ -2596,11 +2596,11 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) return X86EMUL_UNHANDLEABLE; } - if ((ctxt->emul_flags & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) + if ((ctxt->ops->get_hflags(ctxt) & X86EMUL_SMM_INSIDE_NMI_MASK) == 0) ctxt->ops->set_nmi_mask(ctxt, false); - ctxt->emul_flags &= ~X86EMUL_SMM_INSIDE_NMI_MASK; - ctxt->emul_flags &= ~X86EMUL_SMM_MASK; + ctxt->ops->set_hflags(ctxt, ctxt->ops->get_hflags(ctxt) & + ~(X86EMUL_SMM_INSIDE_NMI_MASK | X86EMUL_SMM_MASK)); return X86EMUL_CONTINUE; } @@ -3854,6 +3854,13 @@ static int em_sti(struct x86_emulate_ctxt *ctxt) static int em_cpuid(struct x86_emulate_ctxt *ctxt) { u32 eax, ebx, ecx, edx; + u64 msr = 0; + + ctxt->ops->get_msr(ctxt, MSR_MISC_FEATURES_ENABLES, &msr); + if (msr & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + ctxt->ops->cpl(ctxt)) { + return emulate_gp(ctxt, 0); + } eax = reg_read(ctxt, VCPU_REGS_RAX); ecx = reg_read(ctxt, VCPU_REGS_RCX); @@ -5316,6 +5323,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) const struct x86_emulate_ops *ops = ctxt->ops; int rc = X86EMUL_CONTINUE; int saved_dst_type = ctxt->dst.type; + unsigned emul_flags; ctxt->mem_read.pos = 0; @@ -5330,6 +5338,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + emul_flags = ctxt->ops->get_hflags(ctxt); if (unlikely(ctxt->d & (No64|Undefined|Sse|Mmx|Intercept|CheckPerm|Priv|Prot|String))) { if ((ctxt->mode == X86EMUL_MODE_PROT64 && (ctxt->d & No64)) || @@ -5363,7 +5372,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) fetch_possible_mmx_operand(ctxt, &ctxt->dst); } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && ctxt->intercept) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_PRE_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5392,7 +5401,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_EXCEPT); if (rc != X86EMUL_CONTINUE) @@ -5446,7 +5455,7 @@ int x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: - if (unlikely(ctxt->emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { + if (unlikely(emul_flags & X86EMUL_GUEST_MASK) && (ctxt->d & Intercept)) { rc = emulator_check_intercept(ctxt, ctxt->intercept, X86_ICPT_POST_MEMACCESS); if (rc != X86EMUL_CONTINUE) diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 047b17a26269..bdcd4139eca9 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -49,7 +49,7 @@ static void pic_unlock(struct kvm_pic *s) __releases(&s->lock) { bool wakeup = s->wakeup_needed; - struct kvm_vcpu *vcpu, *found = NULL; + struct kvm_vcpu *vcpu; int i; s->wakeup_needed = false; @@ -59,16 +59,11 @@ static void pic_unlock(struct kvm_pic *s) if (wakeup) { kvm_for_each_vcpu(i, vcpu, s->kvm) { if (kvm_apic_accept_pic_intr(vcpu)) { - found = vcpu; - break; + kvm_make_request(KVM_REQ_EVENT, vcpu); + kvm_vcpu_kick(vcpu); + return; } } - - if (!found) - return; - - kvm_make_request(KVM_REQ_EVENT, found); - kvm_vcpu_kick(found); } } @@ -239,7 +234,7 @@ static inline void pic_intack(struct kvm_kpic_state *s, int irq) int kvm_pic_read_irq(struct kvm *kvm) { int irq, irq2, intno; - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; s->output = 0; @@ -273,7 +268,7 @@ int kvm_pic_read_irq(struct kvm *kvm) return intno; } -void kvm_pic_reset(struct kvm_kpic_state *s) +static void kvm_pic_reset(struct kvm_kpic_state *s) { int irq, i; struct kvm_vcpu *vcpu; @@ -422,19 +417,16 @@ static u32 pic_poll_read(struct kvm_kpic_state *s, u32 addr1) return ret; } -static u32 pic_ioport_read(void *opaque, u32 addr1) +static u32 pic_ioport_read(void *opaque, u32 addr) { struct kvm_kpic_state *s = opaque; - unsigned int addr; int ret; - addr = addr1; - addr &= 1; if (s->poll) { - ret = pic_poll_read(s, addr1); + ret = pic_poll_read(s, addr); s->poll = 0; } else - if (addr == 0) + if ((addr & 1) == 0) if (s->read_reg_select) ret = s->isr; else @@ -456,76 +448,64 @@ static u32 elcr_ioport_read(void *opaque, u32 addr1) return s->elcr; } -static int picdev_in_range(gpa_t addr) -{ - switch (addr) { - case 0x20: - case 0x21: - case 0xa0: - case 0xa1: - case 0x4d0: - case 0x4d1: - return 1; - default: - return 0; - } -} - static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { unsigned char data = *(unsigned char *)val; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; if (len != 1) { pr_pic_unimpl("non byte write\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: + pic_lock(s); pic_ioport_write(&s->pics[addr >> 7], addr, data); + pic_unlock(s); break; case 0x4d0: case 0x4d1: + pic_lock(s); elcr_ioport_write(&s->pics[addr & 1], addr, data); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - pic_unlock(s); return 0; } static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - unsigned char data = 0; - if (!picdev_in_range(addr)) - return -EOPNOTSUPP; + unsigned char *data = (unsigned char *)val; if (len != 1) { memset(val, 0, len); pr_pic_unimpl("non byte read\n"); return 0; } - pic_lock(s); switch (addr) { case 0x20: case 0x21: case 0xa0: case 0xa1: - data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_lock(s); + *data = pic_ioport_read(&s->pics[addr >> 7], addr); + pic_unlock(s); break; case 0x4d0: case 0x4d1: - data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_lock(s); + *data = elcr_ioport_read(&s->pics[addr & 1], addr); + pic_unlock(s); break; + default: + return -EOPNOTSUPP; } - *(unsigned char *)val = data; - pic_unlock(s); return 0; } @@ -576,7 +556,7 @@ static int picdev_eclr_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev, */ static void pic_irq_request(struct kvm *kvm, int level) { - struct kvm_pic *s = pic_irqchip(kvm); + struct kvm_pic *s = kvm->arch.vpic; if (!s->output) s->wakeup_needed = true; @@ -660,9 +640,11 @@ void kvm_pic_destroy(struct kvm *kvm) if (!vpic) return; + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_master); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_slave); kvm_io_bus_unregister_dev(vpic->kvm, KVM_PIO_BUS, &vpic->dev_eclr); + mutex_unlock(&kvm->slots_lock); kvm->arch.vpic = NULL; kfree(vpic); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 289270a6aecb..bdff437acbcb 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -266,11 +266,9 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors) spin_unlock(&ioapic->lock); } -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { - struct kvm_ioapic *ioapic = kvm->arch.vioapic; - - if (!ioapic) + if (!ioapic_in_kernel(kvm)) return; kvm_make_scan_ioapic_request(kvm); } @@ -315,7 +313,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) if (e->fields.trig_mode == IOAPIC_LEVEL_TRIG && ioapic->irr & (1 << index)) ioapic_service(ioapic, index, false); - kvm_vcpu_request_scan_ioapic(ioapic->kvm); + kvm_make_scan_ioapic_request(ioapic->kvm); break; } } @@ -624,10 +622,8 @@ int kvm_ioapic_init(struct kvm *kvm) if (ret < 0) { kvm->arch.vioapic = NULL; kfree(ioapic); - return ret; } - kvm_vcpu_request_scan_ioapic(kvm); return ret; } @@ -639,36 +635,32 @@ void kvm_ioapic_destroy(struct kvm *kvm) return; cancel_delayed_work_sync(&ioapic->eoi_inject); + mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &ioapic->dev); + mutex_unlock(&kvm->slots_lock); kvm->arch.vioapic = NULL; kfree(ioapic); } -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(state, ioapic, sizeof(struct kvm_ioapic_state)); state->irr &= ~ioapic->irr_delivered; spin_unlock(&ioapic->lock); - return 0; } -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state) { - struct kvm_ioapic *ioapic = ioapic_irqchip(kvm); - if (!ioapic) - return -EINVAL; + struct kvm_ioapic *ioapic = kvm->arch.vioapic; spin_lock(&ioapic->lock); memcpy(ioapic, state, sizeof(struct kvm_ioapic_state)); ioapic->irr = 0; ioapic->irr_delivered = 0; - kvm_vcpu_request_scan_ioapic(kvm); + kvm_make_scan_ioapic_request(kvm); kvm_ioapic_inject_all(ioapic, state->irr); spin_unlock(&ioapic->lock); - return 0; } diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 1cc6e54436db..29ce19732ccf 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -105,17 +105,13 @@ do { \ #define ASSERT(x) do { } while (0) #endif -static inline struct kvm_ioapic *ioapic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vioapic; -} - static inline int ioapic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (ioapic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } void kvm_rtc_eoi_tracking_restore_one(struct kvm_vcpu *vcpu); @@ -132,8 +128,8 @@ void kvm_ioapic_clear_all(struct kvm_ioapic *ioapic, int irq_source_id); int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src, struct kvm_lapic_irq *irq, struct dest_map *dest_map); -int kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); -int kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_get_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); +void kvm_set_ioapic(struct kvm *kvm, struct kvm_ioapic_state *state); void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, ulong *ioapic_handled_vectors); void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c deleted file mode 100644 index b181426f67b4..000000000000 --- a/arch/x86/kvm/iommu.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2006, Intel Corporation. - * - * This program is free software; you can redistribute it and/or modify it - * under the terms and conditions of the GNU General Public License, - * version 2, as published by the Free Software Foundation. - * - * This program is distributed in the hope it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for - * more details. - * - * You should have received a copy of the GNU General Public License along with - * this program; if not, write to the Free Software Foundation, Inc., 59 Temple - * Place - Suite 330, Boston, MA 02111-1307 USA. - * - * Copyright (C) 2006-2008 Intel Corporation - * Copyright IBM Corporation, 2008 - * Copyright 2010 Red Hat, Inc. and/or its affiliates. - * - * Author: Allen M. Kay <allen.m.kay@intel.com> - * Author: Weidong Han <weidong.han@intel.com> - * Author: Ben-Ami Yassour <benami@il.ibm.com> - */ - -#include <linux/list.h> -#include <linux/kvm_host.h> -#include <linux/moduleparam.h> -#include <linux/pci.h> -#include <linux/stat.h> -#include <linux/iommu.h> -#include "assigned-dev.h" - -static bool allow_unsafe_assigned_interrupts; -module_param_named(allow_unsafe_assigned_interrupts, - allow_unsafe_assigned_interrupts, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(allow_unsafe_assigned_interrupts, - "Enable device assignment on platforms without interrupt remapping support."); - -static int kvm_iommu_unmap_memslots(struct kvm *kvm); -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages); - -static kvm_pfn_t kvm_pin_pages(struct kvm_memory_slot *slot, gfn_t gfn, - unsigned long npages) -{ - gfn_t end_gfn; - kvm_pfn_t pfn; - - pfn = gfn_to_pfn_memslot(slot, gfn); - end_gfn = gfn + npages; - gfn += 1; - - if (is_error_noslot_pfn(pfn)) - return pfn; - - while (gfn < end_gfn) - gfn_to_pfn_memslot(slot, gfn++); - - return pfn; -} - -static void kvm_unpin_pages(struct kvm *kvm, kvm_pfn_t pfn, - unsigned long npages) -{ - unsigned long i; - - for (i = 0; i < npages; ++i) - kvm_release_pfn_clean(pfn + i); -} - -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - gfn_t gfn, end_gfn; - kvm_pfn_t pfn; - int r = 0; - struct iommu_domain *domain = kvm->arch.iommu_domain; - int flags; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - gfn = slot->base_gfn; - end_gfn = gfn + slot->npages; - - flags = IOMMU_READ; - if (!(slot->flags & KVM_MEM_READONLY)) - flags |= IOMMU_WRITE; - if (!kvm->arch.iommu_noncoherent) - flags |= IOMMU_CACHE; - - - while (gfn < end_gfn) { - unsigned long page_size; - - /* Check if already mapped */ - if (iommu_iova_to_phys(domain, gfn_to_gpa(gfn))) { - gfn += 1; - continue; - } - - /* Get the page size we could use to map */ - page_size = kvm_host_page_size(kvm, gfn); - - /* Make sure the page_size does not exceed the memslot */ - while ((gfn + (page_size >> PAGE_SHIFT)) > end_gfn) - page_size >>= 1; - - /* Make sure gfn is aligned to the page size we want to map */ - while ((gfn << PAGE_SHIFT) & (page_size - 1)) - page_size >>= 1; - - /* Make sure hva is aligned to the page size we want to map */ - while (__gfn_to_hva_memslot(slot, gfn) & (page_size - 1)) - page_size >>= 1; - - /* - * Pin all pages we are about to map in memory. This is - * important because we unmap and unpin in 4kb steps later. - */ - pfn = kvm_pin_pages(slot, gfn, page_size >> PAGE_SHIFT); - if (is_error_noslot_pfn(pfn)) { - gfn += 1; - continue; - } - - /* Map into IO address space */ - r = iommu_map(domain, gfn_to_gpa(gfn), pfn_to_hpa(pfn), - page_size, flags); - if (r) { - printk(KERN_ERR "kvm_iommu_map_address:" - "iommu failed to map pfn=%llx\n", pfn); - kvm_unpin_pages(kvm, pfn, page_size >> PAGE_SHIFT); - goto unmap_pages; - } - - gfn += page_size >> PAGE_SHIFT; - - cond_resched(); - } - - return 0; - -unmap_pages: - kvm_iommu_put_pages(kvm, slot->base_gfn, gfn - slot->base_gfn); - return r; -} - -static int kvm_iommu_map_memslots(struct kvm *kvm) -{ - int idx, r = 0; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - if (kvm->arch.iommu_noncoherent) - kvm_arch_register_noncoherent_dma(kvm); - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) { - r = kvm_iommu_map_pages(kvm, memslot); - if (r) - break; - } - srcu_read_unlock(&kvm->srcu, idx); - - return r; -} - -int kvm_assign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - int r; - bool noncoherent; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - r = iommu_attach_device(domain, &pdev->dev); - if (r) { - dev_err(&pdev->dev, "kvm assign device failed ret %d", r); - return r; - } - - noncoherent = !iommu_capable(&pci_bus_type, IOMMU_CAP_CACHE_COHERENCY); - - /* Check if need to update IOMMU page table for guest memory */ - if (noncoherent != kvm->arch.iommu_noncoherent) { - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_noncoherent = noncoherent; - r = kvm_iommu_map_memslots(kvm); - if (r) - goto out_unmap; - } - - kvm_arch_start_assignment(kvm); - pci_set_dev_assigned(pdev); - - dev_info(&pdev->dev, "kvm assign device\n"); - - return 0; -out_unmap: - kvm_iommu_unmap_memslots(kvm); - return r; -} - -int kvm_deassign_device(struct kvm *kvm, struct pci_dev *pdev) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - if (pdev == NULL) - return -ENODEV; - - iommu_detach_device(domain, &pdev->dev); - - pci_clear_dev_assigned(pdev); - kvm_arch_end_assignment(kvm); - - dev_info(&pdev->dev, "kvm deassign device\n"); - - return 0; -} - -int kvm_iommu_map_guest(struct kvm *kvm) -{ - int r; - - if (!iommu_present(&pci_bus_type)) { - printk(KERN_ERR "%s: iommu not found\n", __func__); - return -ENODEV; - } - - mutex_lock(&kvm->slots_lock); - - kvm->arch.iommu_domain = iommu_domain_alloc(&pci_bus_type); - if (!kvm->arch.iommu_domain) { - r = -ENOMEM; - goto out_unlock; - } - - if (!allow_unsafe_assigned_interrupts && - !iommu_capable(&pci_bus_type, IOMMU_CAP_INTR_REMAP)) { - printk(KERN_WARNING "%s: No interrupt remapping support," - " disallowing device assignment." - " Re-enable with \"allow_unsafe_assigned_interrupts=1\"" - " module option.\n", __func__); - iommu_domain_free(kvm->arch.iommu_domain); - kvm->arch.iommu_domain = NULL; - r = -EPERM; - goto out_unlock; - } - - r = kvm_iommu_map_memslots(kvm); - if (r) - kvm_iommu_unmap_memslots(kvm); - -out_unlock: - mutex_unlock(&kvm->slots_lock); - return r; -} - -static void kvm_iommu_put_pages(struct kvm *kvm, - gfn_t base_gfn, unsigned long npages) -{ - struct iommu_domain *domain; - gfn_t end_gfn, gfn; - kvm_pfn_t pfn; - u64 phys; - - domain = kvm->arch.iommu_domain; - end_gfn = base_gfn + npages; - gfn = base_gfn; - - /* check if iommu exists and in use */ - if (!domain) - return; - - while (gfn < end_gfn) { - unsigned long unmap_pages; - size_t size; - - /* Get physical address */ - phys = iommu_iova_to_phys(domain, gfn_to_gpa(gfn)); - - if (!phys) { - gfn++; - continue; - } - - pfn = phys >> PAGE_SHIFT; - - /* Unmap address from IO address space */ - size = iommu_unmap(domain, gfn_to_gpa(gfn), PAGE_SIZE); - unmap_pages = 1ULL << get_order(size); - - /* Unpin all pages we just unmapped to not leak any memory */ - kvm_unpin_pages(kvm, pfn, unmap_pages); - - gfn += unmap_pages; - - cond_resched(); - } -} - -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot) -{ - kvm_iommu_put_pages(kvm, slot->base_gfn, slot->npages); -} - -static int kvm_iommu_unmap_memslots(struct kvm *kvm) -{ - int idx; - struct kvm_memslots *slots; - struct kvm_memory_slot *memslot; - - idx = srcu_read_lock(&kvm->srcu); - slots = kvm_memslots(kvm); - - kvm_for_each_memslot(memslot, slots) - kvm_iommu_unmap_pages(kvm, memslot); - - srcu_read_unlock(&kvm->srcu, idx); - - if (kvm->arch.iommu_noncoherent) - kvm_arch_unregister_noncoherent_dma(kvm); - - return 0; -} - -int kvm_iommu_unmap_guest(struct kvm *kvm) -{ - struct iommu_domain *domain = kvm->arch.iommu_domain; - - /* check if iommu exists and in use */ - if (!domain) - return 0; - - mutex_lock(&kvm->slots_lock); - kvm_iommu_unmap_memslots(kvm); - kvm->arch.iommu_domain = NULL; - kvm->arch.iommu_noncoherent = false; - mutex_unlock(&kvm->slots_lock); - - iommu_domain_free(domain); - return 0; -} diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c index 60d91c9d160c..5c24811e8b0b 100644 --- a/arch/x86/kvm/irq.c +++ b/arch/x86/kvm/irq.c @@ -60,7 +60,7 @@ static int kvm_cpu_has_extint(struct kvm_vcpu *v) if (irqchip_split(v->kvm)) return pending_userspace_extint(v); else - return pic_irqchip(v->kvm)->output; + return v->kvm->arch.vpic->output; } else return 0; } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 40d5b2cf6061..d5005cc26521 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -78,40 +78,42 @@ void kvm_pic_destroy(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) -{ - return kvm->arch.vpic; -} - static inline int pic_in_kernel(struct kvm *kvm) { - int ret; + int mode = kvm->arch.irqchip_mode; - ret = (pic_irqchip(kvm) != NULL); - return ret; + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_split(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_SPLIT; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_SPLIT; } static inline int irqchip_kernel(struct kvm *kvm) { - return kvm->arch.irqchip_mode == KVM_IRQCHIP_KERNEL; + int mode = kvm->arch.irqchip_mode; + + /* Matches smp_wmb() when setting irqchip_mode */ + smp_rmb(); + return mode == KVM_IRQCHIP_KERNEL; } static inline int irqchip_in_kernel(struct kvm *kvm) { - bool ret = kvm->arch.irqchip_mode != KVM_IRQCHIP_NONE; + int mode = kvm->arch.irqchip_mode; - /* Matches with wmb after initializing kvm->irq_routing. */ + /* Matches smp_wmb() when setting irqchip_mode */ smp_rmb(); - return ret; + return mode != KVM_IRQCHIP_NONE; } -void kvm_pic_reset(struct kvm_kpic_state *s); - void kvm_inject_pending_timer_irqs(struct kvm_vcpu *vcpu); void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu); void kvm_apic_nmi_wd_deliver(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c index 6825cd36d13b..3cc3b2d130a0 100644 --- a/arch/x86/kvm/irq_comm.c +++ b/arch/x86/kvm/irq_comm.c @@ -42,7 +42,7 @@ static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, int irq_source_id, int level, bool line_status) { - struct kvm_pic *pic = pic_irqchip(kvm); + struct kvm_pic *pic = kvm->arch.vpic; return kvm_pic_set_irq(pic, e->irqchip.pin, irq_source_id, level); } @@ -232,11 +232,11 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id) goto unlock; } clear_bit(irq_source_id, &kvm->arch.irq_sources_bitmap); - if (!ioapic_in_kernel(kvm)) + if (!irqchip_kernel(kvm)) goto unlock; kvm_ioapic_clear_all(kvm->arch.vioapic, irq_source_id); - kvm_pic_clear_all(pic_irqchip(kvm), irq_source_id); + kvm_pic_clear_all(kvm->arch.vpic, irq_source_id); unlock: mutex_unlock(&kvm->irq_lock); } @@ -274,42 +274,42 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin, srcu_read_unlock(&kvm->irq_srcu, idx); } +bool kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return irqchip_in_kernel(kvm); +} + int kvm_set_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; - int delta; - unsigned max_pin; - + /* We can't check irqchip_in_kernel() here as some callers are + * currently inititalizing the irqchip. Other callers should therefore + * check kvm_arch_can_set_irq_routing() before calling this function. + */ switch (ue->type) { case KVM_IRQ_ROUTING_IRQCHIP: - delta = 0; + if (irqchip_split(kvm)) + return -EINVAL; + e->irqchip.pin = ue->u.irqchip.pin; switch (ue->u.irqchip.irqchip) { case KVM_IRQCHIP_PIC_SLAVE: - delta = 8; + e->irqchip.pin += PIC_NUM_PINS / 2; /* fall through */ case KVM_IRQCHIP_PIC_MASTER: - if (!pic_in_kernel(kvm)) - goto out; - + if (ue->u.irqchip.pin >= PIC_NUM_PINS / 2) + return -EINVAL; e->set = kvm_set_pic_irq; - max_pin = PIC_NUM_PINS; break; case KVM_IRQCHIP_IOAPIC: - if (!ioapic_in_kernel(kvm)) - goto out; - - max_pin = KVM_IOAPIC_NUM_PINS; + if (ue->u.irqchip.pin >= KVM_IOAPIC_NUM_PINS) + return -EINVAL; e->set = kvm_set_ioapic_irq; break; default: - goto out; + return -EINVAL; } e->irqchip.irqchip = ue->u.irqchip.irqchip; - e->irqchip.pin = ue->u.irqchip.pin + delta; - if (e->irqchip.pin >= max_pin) - goto out; break; case KVM_IRQ_ROUTING_MSI: e->set = kvm_set_msi; @@ -318,7 +318,7 @@ int kvm_set_routing_entry(struct kvm *kvm, e->msi.data = ue->u.msi.data; if (kvm_msi_route_invalid(kvm, e)) - goto out; + return -EINVAL; break; case KVM_IRQ_ROUTING_HV_SINT: e->set = kvm_hv_set_sint; @@ -326,12 +326,10 @@ int kvm_set_routing_entry(struct kvm *kvm, e->hv_sint.sint = ue->u.hv_sint.sint; break; default: - goto out; + return -EINVAL; } - r = 0; -out: - return r; + return 0; } bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq, diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d2a892fc92bf..c329d2894905 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -529,14 +529,16 @@ int kvm_apic_set_irq(struct kvm_vcpu *vcpu, struct kvm_lapic_irq *irq, static int pv_eoi_put_user(struct kvm_vcpu *vcpu, u8 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, &val, + sizeof(val)); } static int pv_eoi_get_user(struct kvm_vcpu *vcpu, u8 *val) { - return kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.pv_eoi.data, val, - sizeof(*val)); + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.pv_eoi.data, val, + sizeof(*val)); } static inline bool pv_eoi_enabled(struct kvm_vcpu *vcpu) @@ -2285,8 +2287,8 @@ void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu) if (!test_bit(KVM_APIC_CHECK_VAPIC, &vcpu->arch.apic_attention)) return; - if (kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32))) + if (kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32))) return; apic_set_tpr(vcpu->arch.apic, data & 0xff); @@ -2338,14 +2340,14 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu) max_isr = 0; data = (tpr & 0xff) | ((max_isr & 0xf0) << 8) | (max_irr << 24); - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apic->vapic_cache, &data, - sizeof(u32)); + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apic->vapic_cache, &data, + sizeof(u32)); } int kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr) { if (vapic_addr) { - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apic->vapic_cache, vapic_addr, sizeof(u32))) return -EINVAL; @@ -2439,7 +2441,7 @@ int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data) vcpu->arch.pv_eoi.msr_val = data; if (!pv_eoi_enabled(vcpu)) return 0; - return kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.pv_eoi.data, + return kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_eoi.data, addr, sizeof(u8)); } diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index ac7810513d0e..558676538fca 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4340,7 +4340,8 @@ void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty) { struct kvm_mmu *context = &vcpu->arch.mmu; @@ -4349,6 +4350,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->shadow_root_level = kvm_x86_ops->get_tdp_level(); context->nx = true; + context->ept_ad = accessed_dirty; context->page_fault = ept_page_fault; context->gva_to_gpa = ept_gva_to_gpa; context->sync_page = ept_sync_page; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index ddc56e91f2e4..d8ccb32f7308 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -74,7 +74,8 @@ enum { int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); -void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); +void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly, + bool accessed_dirty); static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm) { diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index a01105485315..314d2071b337 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -23,13 +23,6 @@ * so the code in this file is compiled twice, once per pte size. */ -/* - * This is used to catch non optimized PT_GUEST_(DIRTY|ACCESS)_SHIFT macro - * uses for EPT without A/D paging type. - */ -extern u64 __pure __using_nonexistent_pte_bit(void) - __compiletime_error("wrong use of PT_GUEST_(DIRTY|ACCESS)_SHIFT"); - #if PTTYPE == 64 #define pt_element_t u64 #define guest_walker guest_walker64 @@ -39,10 +32,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #ifdef CONFIG_X86_64 #define PT_MAX_FULL_LEVELS 4 #define CMPXCHG cmpxchg @@ -60,10 +52,9 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_INDEX(addr, level) PT32_INDEX(addr, level) #define PT_LEVEL_BITS PT32_LEVEL_BITS #define PT_MAX_FULL_LEVELS 2 - #define PT_GUEST_ACCESSED_MASK PT_ACCESSED_MASK - #define PT_GUEST_DIRTY_MASK PT_DIRTY_MASK #define PT_GUEST_DIRTY_SHIFT PT_DIRTY_SHIFT #define PT_GUEST_ACCESSED_SHIFT PT_ACCESSED_SHIFT + #define PT_HAVE_ACCESSED_DIRTY(mmu) true #define CMPXCHG cmpxchg #elif PTTYPE == PTTYPE_EPT #define pt_element_t u64 @@ -74,16 +65,18 @@ extern u64 __pure __using_nonexistent_pte_bit(void) #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) #define PT_INDEX(addr, level) PT64_INDEX(addr, level) #define PT_LEVEL_BITS PT64_LEVEL_BITS - #define PT_GUEST_ACCESSED_MASK 0 - #define PT_GUEST_DIRTY_MASK 0 - #define PT_GUEST_DIRTY_SHIFT __using_nonexistent_pte_bit() - #define PT_GUEST_ACCESSED_SHIFT __using_nonexistent_pte_bit() + #define PT_GUEST_DIRTY_SHIFT 9 + #define PT_GUEST_ACCESSED_SHIFT 8 + #define PT_HAVE_ACCESSED_DIRTY(mmu) ((mmu)->ept_ad) #define CMPXCHG cmpxchg64 #define PT_MAX_FULL_LEVELS 4 #else #error Invalid PTTYPE value #endif +#define PT_GUEST_DIRTY_MASK (1 << PT_GUEST_DIRTY_SHIFT) +#define PT_GUEST_ACCESSED_MASK (1 << PT_GUEST_ACCESSED_SHIFT) + #define gpte_to_gfn_lvl FNAME(gpte_to_gfn_lvl) #define gpte_to_gfn(pte) gpte_to_gfn_lvl((pte), PT_PAGE_TABLE_LEVEL) @@ -111,12 +104,13 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte) +static inline void FNAME(protect_clean_gpte)(struct kvm_mmu *mmu, unsigned *access, + unsigned gpte) { unsigned mask; /* dirty bit is not supported, so no need to track it */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return; BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK); @@ -171,7 +165,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu, goto no_present; /* if accessed bit is not supported prefetch non accessed gpte */ - if (PT_GUEST_ACCESSED_MASK && !(gpte & PT_GUEST_ACCESSED_MASK)) + if (PT_HAVE_ACCESSED_DIRTY(&vcpu->arch.mmu) && !(gpte & PT_GUEST_ACCESSED_MASK)) goto no_present; return false; @@ -217,7 +211,7 @@ static int FNAME(update_accessed_dirty_bits)(struct kvm_vcpu *vcpu, int ret; /* dirty/accessed bits are not supported, so no need to update them */ - if (!PT_GUEST_DIRTY_MASK) + if (!PT_HAVE_ACCESSED_DIRTY(mmu)) return 0; for (level = walker->max_level; level >= walker->level; --level) { @@ -286,7 +280,9 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, pt_element_t __user *uninitialized_var(ptep_user); gfn_t table_gfn; unsigned index, pt_access, pte_access, accessed_dirty, pte_pkey; + unsigned nested_access; gpa_t pte_gpa; + bool have_ad; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -299,6 +295,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, retry_walk: walker->level = mmu->root_level; pte = mmu->get_cr3(vcpu); + have_ad = PT_HAVE_ACCESSED_DIRTY(mmu); #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { @@ -312,7 +309,15 @@ retry_walk: walker->max_level = walker->level; ASSERT(!(is_long_mode(vcpu) && !is_pae(vcpu))); - accessed_dirty = PT_GUEST_ACCESSED_MASK; + accessed_dirty = have_ad ? PT_GUEST_ACCESSED_MASK : 0; + + /* + * FIXME: on Intel processors, loads of the PDPTE registers for PAE paging + * by the MOV to CR instruction are treated as reads and do not cause the + * processor to set the dirty flag in any EPT paging-structure entry. + */ + nested_access = (have_ad ? PFERR_WRITE_MASK : 0) | PFERR_USER_MASK; + pt_access = pte_access = ACC_ALL; ++walker->level; @@ -332,7 +337,7 @@ retry_walk: walker->pte_gpa[walker->level - 1] = pte_gpa; real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), - PFERR_USER_MASK|PFERR_WRITE_MASK, + nested_access, &walker->fault); /* @@ -394,7 +399,7 @@ retry_walk: walker->gfn = real_gpa >> PAGE_SHIFT; if (!write_fault) - FNAME(protect_clean_gpte)(&pte_access, pte); + FNAME(protect_clean_gpte)(mmu, &pte_access, pte); else /* * On a write fault, fold the dirty bit into accessed_dirty. @@ -485,7 +490,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, gfn = gpte_to_gfn(gpte); pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn, no_dirty_log && (pte_access & ACC_WRITE_MASK)); if (is_error_pfn(pfn)) @@ -979,7 +984,7 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) gfn = gpte_to_gfn(gpte); pte_access = sp->role.access; pte_access &= FNAME(gpte_access)(vcpu, gpte); - FNAME(protect_clean_gpte)(&pte_access, gpte); + FNAME(protect_clean_gpte)(&vcpu->arch.mmu, &pte_access, gpte); if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access, &nr_present)) @@ -1025,3 +1030,4 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp) #undef PT_GUEST_DIRTY_MASK #undef PT_GUEST_DIRTY_SHIFT #undef PT_GUEST_ACCESSED_SHIFT +#undef PT_HAVE_ACCESSED_DIRTY diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5f48f62b8dc2..c27ac6923a18 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1196,10 +1196,13 @@ static void init_vmcb(struct vcpu_svm *svm) set_intercept(svm, INTERCEPT_CLGI); set_intercept(svm, INTERCEPT_SKINIT); set_intercept(svm, INTERCEPT_WBINVD); - set_intercept(svm, INTERCEPT_MONITOR); - set_intercept(svm, INTERCEPT_MWAIT); set_intercept(svm, INTERCEPT_XSETBV); + if (!kvm_mwait_in_guest()) { + set_intercept(svm, INTERCEPT_MONITOR); + set_intercept(svm, INTERCEPT_MWAIT); + } + control->iopm_base_pa = iopm_base; control->msrpm_base_pa = __pa(svm->msrpm); control->int_ctl = V_INTR_MASKING_MASK; @@ -5254,6 +5257,12 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } +static void svm_setup_mce(struct kvm_vcpu *vcpu) +{ + /* [63:9] are reserved. */ + vcpu->arch.mcg_cap &= 0x1ff; +} + static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -5365,6 +5374,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .pmu_ops = &amd_pmu_ops, .deliver_posted_interrupt = svm_deliver_avic_intr, .update_pi_irte = svm_update_pi_irte, + .setup_mce = svm_setup_mce, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 1a471e5f963f..c5fd459c4043 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -84,9 +84,6 @@ module_param_named(eptad, enable_ept_ad_bits, bool, S_IRUGO); static bool __read_mostly emulate_invalid_guest_state = true; module_param(emulate_invalid_guest_state, bool, S_IRUGO); -static bool __read_mostly vmm_exclusive = 1; -module_param(vmm_exclusive, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -615,10 +612,6 @@ struct vcpu_vmx { int vpid; bool emulation_required; - /* Support for vnmi-less CPUs */ - int soft_vnmi_blocked; - ktime_t entry_time; - s64 vnmi_blocked_time; u32 exit_reason; /* Posted interrupt descriptor */ @@ -914,8 +907,6 @@ static void nested_release_page_clean(struct page *page) static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu); static u64 construct_eptp(unsigned long root_hpa); -static void kvm_cpu_vmxon(u64 addr); -static void kvm_cpu_vmxoff(void); static bool vmx_xsaves_supported(void); static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr); static void vmx_set_segment(struct kvm_vcpu *vcpu, @@ -1289,11 +1280,6 @@ static inline bool cpu_has_vmx_invpcid(void) SECONDARY_EXEC_ENABLE_INVPCID; } -static inline bool cpu_has_virtual_nmis(void) -{ - return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; -} - static inline bool cpu_has_vmx_wbinvd_exit(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -2238,15 +2224,10 @@ static void decache_tsc_multiplier(struct vcpu_vmx *vmx) static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); bool already_loaded = vmx->loaded_vmcs->cpu == cpu; - if (!vmm_exclusive) - kvm_cpu_vmxon(phys_addr); - else if (!already_loaded) - loaded_vmcs_clear(vmx->loaded_vmcs); - if (!already_loaded) { + loaded_vmcs_clear(vmx->loaded_vmcs); local_irq_disable(); crash_disable_local_vmclear(cpu); @@ -2324,11 +2305,6 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu) vmx_vcpu_pi_put(vcpu); __vmx_load_host_state(to_vmx(vcpu)); - if (!vmm_exclusive) { - __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs); - vcpu->cpu = -1; - kvm_cpu_vmxoff(); - } } static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu); @@ -2752,6 +2728,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high); vmx->nested.nested_vmx_secondary_ctls_low = 0; vmx->nested.nested_vmx_secondary_ctls_high &= + SECONDARY_EXEC_RDRAND | SECONDARY_EXEC_RDSEED | SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | SECONDARY_EXEC_DESC | @@ -2766,14 +2743,16 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high |= SECONDARY_EXEC_ENABLE_EPT; vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT | - VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT | - VMX_EPT_INVEPT_BIT; + VMX_EPTP_WB_BIT | VMX_EPT_INVEPT_BIT; if (cpu_has_vmx_ept_execute_only()) vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXECUTE_ONLY_BIT; vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept; vmx->nested.nested_vmx_ept_caps |= VMX_EPT_EXTENT_GLOBAL_BIT | - VMX_EPT_EXTENT_CONTEXT_BIT; + VMX_EPT_EXTENT_CONTEXT_BIT | VMX_EPT_2MB_PAGE_BIT | + VMX_EPT_1GB_PAGE_BIT; + if (enable_ept_ad_bits) + vmx->nested.nested_vmx_ept_caps |= VMX_EPT_AD_BIT; } else vmx->nested.nested_vmx_ept_caps = 0; @@ -3420,6 +3399,7 @@ static __init int vmx_disabled_by_bios(void) static void kvm_cpu_vmxon(u64 addr) { + cr4_set_bits(X86_CR4_VMXE); intel_pt_handle_vmx(1); asm volatile (ASM_VMX_VMXON_RAX @@ -3462,12 +3442,8 @@ static int hardware_enable(void) /* enable and lock */ wrmsrl(MSR_IA32_FEATURE_CONTROL, old | test_bits); } - cr4_set_bits(X86_CR4_VMXE); - - if (vmm_exclusive) { - kvm_cpu_vmxon(phys_addr); - ept_sync_global(); - } + kvm_cpu_vmxon(phys_addr); + ept_sync_global(); return 0; } @@ -3491,15 +3467,13 @@ static void kvm_cpu_vmxoff(void) asm volatile (__ex(ASM_VMX_VMXOFF) : : : "cc"); intel_pt_handle_vmx(0); + cr4_clear_bits(X86_CR4_VMXE); } static void hardware_disable(void) { - if (vmm_exclusive) { - vmclear_local_loaded_vmcss(); - kvm_cpu_vmxoff(); - } - cr4_clear_bits(X86_CR4_VMXE); + vmclear_local_loaded_vmcss(); + kvm_cpu_vmxoff(); } static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, @@ -3549,11 +3523,13 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_USE_IO_BITMAPS | CPU_BASED_MOV_DR_EXITING | CPU_BASED_USE_TSC_OFFSETING | - CPU_BASED_MWAIT_EXITING | - CPU_BASED_MONITOR_EXITING | CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; + if (!kvm_mwait_in_guest()) + min |= CPU_BASED_MWAIT_EXITING | + CPU_BASED_MONITOR_EXITING; + opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -3619,9 +3595,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_vmexit_control) < 0) return -EIO; - min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING; - opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR | - PIN_BASED_VMX_PREEMPTION_TIMER; + min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING | + PIN_BASED_VIRTUAL_NMIS; + opt = PIN_BASED_POSTED_INTR | PIN_BASED_VMX_PREEMPTION_TIMER; if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS, &_pin_based_exec_control) < 0) return -EIO; @@ -4013,11 +3989,12 @@ static void exit_lmode(struct kvm_vcpu *vcpu) static inline void __vmx_flush_tlb(struct kvm_vcpu *vcpu, int vpid) { - vpid_sync_context(vpid); if (enable_ept) { if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) return; ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); + } else { + vpid_sync_context(vpid); } } @@ -5293,8 +5270,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmx->rmode.vm86_active = 0; - vmx->soft_vnmi_blocked = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); kvm_set_cr8(vcpu, 0); @@ -5414,8 +5389,7 @@ static void enable_irq_window(struct kvm_vcpu *vcpu) static void enable_nmi_window(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis() || - vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { + if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) { enable_irq_window(vcpu); return; } @@ -5456,19 +5430,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (!is_guest_mode(vcpu)) { - if (!cpu_has_virtual_nmis()) { - /* - * Tracking the NMI-blocked state in software is built upon - * finding the next open IRQ window. This, in turn, depends on - * well-behaving guests: They have to keep IRQs disabled at - * least as long as the NMI handler runs. Otherwise we may - * cause NMI nesting, maybe breaking the guest. But as this is - * highly unlikely, we can live with the residual risk. - */ - vmx->soft_vnmi_blocked = 1; - vmx->vnmi_blocked_time = 0; - } - ++vcpu->stat.nmi_injections; vmx->nmi_known_unmasked = false; } @@ -5485,8 +5446,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { - if (!cpu_has_virtual_nmis()) - return to_vmx(vcpu)->soft_vnmi_blocked; if (to_vmx(vcpu)->nmi_known_unmasked) return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; @@ -5496,20 +5455,13 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!cpu_has_virtual_nmis()) { - if (vmx->soft_vnmi_blocked != masked) { - vmx->soft_vnmi_blocked = masked; - vmx->vnmi_blocked_time = 0; - } - } else { - vmx->nmi_known_unmasked = !masked; - if (masked) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - } + vmx->nmi_known_unmasked = !masked; + if (masked) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -5517,9 +5469,6 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) if (to_vmx(vcpu)->nested.nested_run_pending) return 0; - if (!cpu_has_virtual_nmis() && to_vmx(vcpu)->soft_vnmi_blocked) - return 0; - return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI | GUEST_INTR_STATE_NMI)); @@ -6240,21 +6189,18 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) unsigned long exit_qualification; gpa_t gpa; u32 error_code; - int gla_validity; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); - gla_validity = (exit_qualification >> 7) & 0x3; - if (gla_validity == 0x2) { - printk(KERN_ERR "EPT: Handling EPT violation failed!\n"); - printk(KERN_ERR "EPT: GPA: 0x%lx, GVA: 0x%lx\n", - (long unsigned int)vmcs_read64(GUEST_PHYSICAL_ADDRESS), - vmcs_readl(GUEST_LINEAR_ADDRESS)); - printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", - (long unsigned int)exit_qualification); - vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; - vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; - return 0; + if (is_guest_mode(vcpu) + && !(exit_qualification & EPT_VIOLATION_GVA_TRANSLATED)) { + /* + * Fix up exit_qualification according to whether guest + * page table accesses are reads or writes. + */ + u64 eptp = nested_ept_get_cr3(vcpu); + if (!(eptp & VMX_EPT_AD_ENABLE_BIT)) + exit_qualification &= ~EPT_VIOLATION_ACC_WRITE; } /* @@ -6264,7 +6210,6 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu) * AAK134, BY25. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -6350,7 +6295,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (intr_window_requested && vmx_interrupt_allowed(vcpu)) return handle_interrupt_window(&vmx->vcpu); - if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_EVENT, vcpu)) return 1; err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); @@ -7103,34 +7048,24 @@ out_msr_bitmap: static int handle_vmon(struct kvm_vcpu *vcpu) { int ret; - struct kvm_segment cs; struct vcpu_vmx *vmx = to_vmx(vcpu); const u64 VMXON_NEEDED_FEATURES = FEATURE_CONTROL_LOCKED | FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; - /* The Intel VMX Instruction Reference lists a bunch of bits that - * are prerequisite to running VMXON, most notably cr4.VMXE must be - * set to 1 (see vmx_set_cr4() for when we allow the guest to set this). - * Otherwise, we should fail with #UD. We test these now: + /* + * The Intel VMX Instruction Reference lists a bunch of bits that are + * prerequisite to running VMXON, most notably cr4.VMXE must be set to + * 1 (see vmx_set_cr4() for when we allow the guest to set this). + * Otherwise, we should fail with #UD. But most faulting conditions + * have already been checked by hardware, prior to the VM-exit for + * VMXON. We do test guest cr4.VMXE because processor CR4 always has + * that bit set to 1 in non-root mode. */ - if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE) || - !kvm_read_cr0_bits(vcpu, X86_CR0_PE) || - (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) { + if (!kvm_read_cr4_bits(vcpu, X86_CR4_VMXE)) { kvm_queue_exception(vcpu, UD_VECTOR); return 1; } - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if (is_long_mode(vcpu) && !cs.l) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 1; - } - if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); return kvm_skip_emulated_instruction(vcpu); @@ -7157,29 +7092,15 @@ static int handle_vmon(struct kvm_vcpu *vcpu) * Intel's VMX Instruction Reference specifies a common set of prerequisites * for running VMX instructions (except VMXON, whose prerequisites are * slightly different). It also specifies what exception to inject otherwise. + * Note that many of these exceptions have priority over VM exits, so they + * don't have to be checked again here. */ static int nested_vmx_check_permission(struct kvm_vcpu *vcpu) { - struct kvm_segment cs; - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!vmx->nested.vmxon) { + if (!to_vmx(vcpu)->nested.vmxon) { kvm_queue_exception(vcpu, UD_VECTOR); return 0; } - - vmx_get_segment(vcpu, &cs, VCPU_SREG_CS); - if ((vmx_get_rflags(vcpu) & X86_EFLAGS_VM) || - (is_long_mode(vcpu) && !cs.l)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 0; - } - - if (vmx_get_cpl(vcpu)) { - kvm_inject_gp(vcpu, 0); - return 0; - } - return 1; } @@ -7523,7 +7444,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &gva)) return 1; - /* _system ok, as nested_vmx_check_permission verified cpl=0 */ + /* _system ok, as hardware has verified cpl=0 */ kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, gva, &field_value, (is_long_mode(vcpu) ? 8 : 4), NULL); } @@ -7656,7 +7577,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) if (get_vmx_mem_address(vcpu, exit_qualification, vmx_instruction_info, true, &vmcs_gva)) return 1; - /* ok to use *_system, as nested_vmx_check_permission verified cpl=0 */ + /* ok to use *_system, as hardware has verified cpl=0 */ if (kvm_write_guest_virt_system(&vcpu->arch.emulate_ctxt, vmcs_gva, (void *)&to_vmx(vcpu)->nested.current_vmptr, sizeof(u64), &e)) { @@ -7689,11 +7610,6 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; - if (!kvm_read_cr0_bits(vcpu, X86_CR0_PE)) { - kvm_queue_exception(vcpu, UD_VECTOR); - return 1; - } - vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); @@ -7815,7 +7731,6 @@ static int handle_pml_full(struct kvm_vcpu *vcpu) * "blocked by NMI" bit has to be set before next VM entry. */ if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) && - cpu_has_virtual_nmis() && (exit_qualification & INTR_INFO_UNBLOCK_NMI)) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -8117,6 +8032,10 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_INVLPG_EXITING); case EXIT_REASON_RDPMC: return nested_cpu_has(vmcs12, CPU_BASED_RDPMC_EXITING); + case EXIT_REASON_RDRAND: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDRAND); + case EXIT_REASON_RDSEED: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_RDSEED); case EXIT_REASON_RDTSC: case EXIT_REASON_RDTSCP: return nested_cpu_has(vmcs12, CPU_BASED_RDTSC_EXITING); case EXIT_REASON_VMCALL: case EXIT_REASON_VMCLEAR: @@ -8490,26 +8409,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) return 0; } - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked && - !(is_guest_mode(vcpu) && nested_cpu_has_virtual_nmis( - get_vmcs12(vcpu))))) { - if (vmx_interrupt_allowed(vcpu)) { - vmx->soft_vnmi_blocked = 0; - } else if (vmx->vnmi_blocked_time > 1000000000LL && - vcpu->arch.nmi_pending) { - /* - * This CPU don't support us in finding the end of an - * NMI-blocked window if the guest runs with IRQs - * disabled. So we pull the trigger after 1 s of - * futile waiting, but inform the user about this. - */ - printk(KERN_WARNING "%s: Breaking out of NMI-blocked " - "state on VCPU %d after 1 s timeout\n", - __func__, vcpu->vcpu_id); - vmx->soft_vnmi_blocked = 0; - } - } - if (exit_reason < kvm_vmx_max_exit_handlers && kvm_vmx_exit_handlers[exit_reason]) return kvm_vmx_exit_handlers[exit_reason](vcpu); @@ -8785,37 +8684,33 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; - if (cpu_has_virtual_nmis()) { - if (vmx->nmi_known_unmasked) - return; - /* - * Can't use vmx->exit_intr_info since we're not sure what - * the exit reason is. - */ - exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); - unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; - vector = exit_intr_info & INTR_INFO_VECTOR_MASK; - /* - * SDM 3: 27.7.1.2 (September 2008) - * Re-set bit "block by NMI" before VM entry if vmexit caused by - * a guest IRET fault. - * SDM 3: 23.2.2 (September 2008) - * Bit 12 is undefined in any of the following cases: - * If the VM exit sets the valid bit in the IDT-vectoring - * information field. - * If the VM exit is due to a double fault. - */ - if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && - vector != DF_VECTOR && !idtv_info_valid) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); - else - vmx->nmi_known_unmasked = - !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) - & GUEST_INTR_STATE_NMI); - } else if (unlikely(vmx->soft_vnmi_blocked)) - vmx->vnmi_blocked_time += - ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; + vector = exit_intr_info & INTR_INFO_VECTOR_MASK; + /* + * SDM 3: 27.7.1.2 (September 2008) + * Re-set bit "block by NMI" before VM entry if vmexit caused by + * a guest IRET fault. + * SDM 3: 23.2.2 (September 2008) + * Bit 12 is undefined in any of the following cases: + * If the VM exit sets the valid bit in the IDT-vectoring + * information field. + * If the VM exit is due to a double fault. + */ + if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi && + vector != DF_VECTOR && !idtv_info_valid) + vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, + GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu, @@ -8932,10 +8827,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); unsigned long debugctlmsr, cr4; - /* Record the guest's net vcpu time for enforced NMI injections. */ - if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) - vmx->entry_time = ktime_get(); - /* Don't enter VMX if guest state is invalid, let the exit handler start emulation until we arrive back to a valid state */ if (vmx->emulation_required) @@ -9143,16 +9034,16 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx_complete_interrupts(vmx); } -static void vmx_load_vmcs01(struct kvm_vcpu *vcpu) +static void vmx_switch_vmcs(struct kvm_vcpu *vcpu, struct loaded_vmcs *vmcs) { struct vcpu_vmx *vmx = to_vmx(vcpu); int cpu; - if (vmx->loaded_vmcs == &vmx->vmcs01) + if (vmx->loaded_vmcs == vmcs) return; cpu = get_cpu(); - vmx->loaded_vmcs = &vmx->vmcs01; + vmx->loaded_vmcs = vmcs; vmx_vcpu_put(vcpu); vmx_vcpu_load(vcpu, cpu); vcpu->cpu = cpu; @@ -9170,7 +9061,7 @@ static void vmx_free_vcpu_nested(struct kvm_vcpu *vcpu) r = vcpu_load(vcpu); BUG_ON(r); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); free_nested(vmx); vcpu_put(vcpu); } @@ -9231,11 +9122,7 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) vmx->loaded_vmcs->shadow_vmcs = NULL; if (!vmx->loaded_vmcs->vmcs) goto free_msrs; - if (!vmm_exclusive) - kvm_cpu_vmxon(__pa(per_cpu(vmxarea, raw_smp_processor_id()))); loaded_vmcs_init(vmx->loaded_vmcs); - if (!vmm_exclusive) - kvm_cpu_vmxoff(); cpu = get_cpu(); vmx_vcpu_load(&vmx->vcpu, cpu); @@ -9495,17 +9382,26 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu) return get_vmcs12(vcpu)->ept_pointer; } -static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) +static int nested_ept_init_mmu_context(struct kvm_vcpu *vcpu) { + u64 eptp; + WARN_ON(mmu_is_nested(vcpu)); + eptp = nested_ept_get_cr3(vcpu); + if ((eptp & VMX_EPT_AD_ENABLE_BIT) && !enable_ept_ad_bits) + return 1; + + kvm_mmu_unload(vcpu); kvm_init_shadow_ept_mmu(vcpu, to_vmx(vcpu)->nested.nested_vmx_ept_caps & - VMX_EPT_EXECUTE_ONLY_BIT); + VMX_EPT_EXECUTE_ONLY_BIT, + eptp & VMX_EPT_AD_ENABLE_BIT); vcpu->arch.mmu.set_cr3 = vmx_set_cr3; vcpu->arch.mmu.get_cr3 = nested_ept_get_cr3; vcpu->arch.mmu.inject_page_fault = nested_ept_inject_page_fault; vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; + return 0; } static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu) @@ -10279,8 +10175,10 @@ static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, } if (nested_cpu_has_ept(vmcs12)) { - kvm_mmu_unload(vcpu); - nested_ept_init_mmu_context(vcpu); + if (nested_ept_init_mmu_context(vcpu)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } } else if (nested_cpu_has2(vmcs12, SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)) { vmx_flush_tlb_ept_only(vcpu); @@ -10353,9 +10251,10 @@ static int check_vmentry_prereqs(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high) || - !vmx_control_verify(vmcs12->secondary_vm_exec_control, - vmx->nested.nested_vmx_secondary_ctls_low, - vmx->nested.nested_vmx_secondary_ctls_high) || + (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) && + !vmx_control_verify(vmcs12->secondary_vm_exec_control, + vmx->nested.nested_vmx_secondary_ctls_low, + vmx->nested.nested_vmx_secondary_ctls_high)) || !vmx_control_verify(vmcs12->pin_based_vm_exec_control, vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || @@ -10434,7 +10333,6 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); struct loaded_vmcs *vmcs02; - int cpu; u32 msr_entry_idx; u32 exit_qual; @@ -10447,18 +10345,12 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL); - cpu = get_cpu(); - vmx->loaded_vmcs = vmcs02; - vmx_vcpu_put(vcpu); - vmx_vcpu_load(vcpu, cpu); - vcpu->cpu = cpu; - put_cpu(); - + vmx_switch_vmcs(vcpu, vmcs02); vmx_segment_cache_clear(vmx); if (prepare_vmcs02(vcpu, vmcs12, from_vmentry, &exit_qual)) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, exit_qual); return 1; @@ -10471,7 +10363,7 @@ static int enter_vmx_non_root_mode(struct kvm_vcpu *vcpu, bool from_vmentry) vmcs12->vm_entry_msr_load_count); if (msr_entry_idx) { leave_guest_mode(vcpu); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_MSR_LOAD_FAIL, msr_entry_idx); return 1; @@ -11039,7 +10931,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, if (unlikely(vmx->fail)) vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); - vmx_load_vmcs01(vcpu); + vmx_switch_vmcs(vcpu, &vmx->vmcs01); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) && nested_exit_intr_ack_set(vcpu)) { diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ee22226e3807..464da936c53d 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -27,7 +27,6 @@ #include "kvm_cache_regs.h" #include "x86.h" #include "cpuid.h" -#include "assigned-dev.h" #include "pmu.h" #include "hyperv.h" @@ -1008,6 +1007,8 @@ static u32 emulated_msrs[] = { MSR_IA32_MCG_CTL, MSR_IA32_MCG_EXT_CTL, MSR_IA32_SMBASE, + MSR_PLATFORM_INFO, + MSR_MISC_FEATURES_ENABLES, }; static unsigned num_emulated_msrs; @@ -1444,10 +1445,10 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 usdiff; bool matched; bool already_matched; u64 data = msr->data; + bool synchronizing = false; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_compute_tsc_offset(vcpu, data); @@ -1455,51 +1456,34 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) elapsed = ns - kvm->arch.last_tsc_nsec; if (vcpu->arch.virtual_tsc_khz) { - int faulted = 0; - - /* n.b - signed multiplication and division required */ - usdiff = data - kvm->arch.last_tsc_write; -#ifdef CONFIG_X86_64 - usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; -#else - /* do_div() only does unsigned */ - asm("1: idivl %[divisor]\n" - "2: xor %%edx, %%edx\n" - " movl $0, %[faulted]\n" - "3:\n" - ".section .fixup,\"ax\"\n" - "4: movl $1, %[faulted]\n" - " jmp 3b\n" - ".previous\n" - - _ASM_EXTABLE(1b, 4b) - - : "=A"(usdiff), [faulted] "=r" (faulted) - : "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz)); - -#endif - do_div(elapsed, 1000); - usdiff -= elapsed; - if (usdiff < 0) - usdiff = -usdiff; - - /* idivl overflow => difference is larger than USEC_PER_SEC */ - if (faulted) - usdiff = USEC_PER_SEC; - } else - usdiff = USEC_PER_SEC; /* disable TSC match window below */ + if (data == 0 && msr->host_initiated) { + /* + * detection of vcpu initialization -- need to sync + * with other vCPUs. This particularly helps to keep + * kvm_clock stable after CPU hotplug + */ + synchronizing = true; + } else { + u64 tsc_exp = kvm->arch.last_tsc_write + + nsec_to_cycles(vcpu, elapsed); + u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; + /* + * Special case: TSC write with a small delta (1 second) + * of virtual cycle time against real time is + * interpreted as an attempt to synchronize the CPU. + */ + synchronizing = data < tsc_exp + tsc_hz && + data + tsc_hz > tsc_exp; + } + } /* - * Special case: TSC write with a small delta (1 second) of virtual - * cycle time against real time is interpreted as an attempt to - * synchronize the CPU. - * * For a reliable TSC, we can match TSC offsets, and for an unstable * TSC, we add elapsed time in this computation. We could let the * compensation code attempt to catch up if we fall behind, but * it's better to try to match offsets from the beginning. */ - if (usdiff < USEC_PER_SEC && + if (synchronizing && vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { offset = kvm->arch.cur_tsc_offset; @@ -1769,13 +1753,13 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) /* guest entries allowed */ kvm_for_each_vcpu(i, vcpu, kvm) - clear_bit(KVM_REQ_MCLOCK_INPROGRESS, &vcpu->requests); + kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); spin_unlock(&ka->pvclock_gtod_sync_lock); #endif } -static u64 __get_kvmclock_ns(struct kvm *kvm) +u64 get_kvmclock_ns(struct kvm *kvm) { struct kvm_arch *ka = &kvm->arch; struct pvclock_vcpu_time_info hv_clock; @@ -1796,24 +1780,12 @@ static u64 __get_kvmclock_ns(struct kvm *kvm) return __pvclock_read_cycles(&hv_clock, rdtsc()); } -u64 get_kvmclock_ns(struct kvm *kvm) -{ - unsigned long flags; - s64 ns; - - local_irq_save(flags); - ns = __get_kvmclock_ns(kvm); - local_irq_restore(flags); - - return ns; -} - static void kvm_setup_pvclock_page(struct kvm_vcpu *v) { struct kvm_vcpu_arch *vcpu = &v->arch; struct pvclock_vcpu_time_info guest_hv_clock; - if (unlikely(kvm_vcpu_read_guest_cached(v, &vcpu->pv_time, + if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time, &guest_hv_clock, sizeof(guest_hv_clock)))) return; @@ -1834,9 +1806,9 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0); vcpu->hv_clock.version = guest_hv_clock.version + 1; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); smp_wmb(); @@ -1850,16 +1822,16 @@ static void kvm_setup_pvclock_page(struct kvm_vcpu *v) trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock); - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock)); smp_wmb(); vcpu->hv_clock.version++; - kvm_vcpu_write_guest_cached(v, &vcpu->pv_time, - &vcpu->hv_clock, - sizeof(vcpu->hv_clock.version)); + kvm_write_guest_cached(v->kvm, &vcpu->pv_time, + &vcpu->hv_clock, + sizeof(vcpu->hv_clock.version)); } static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -2092,7 +2064,7 @@ static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) return 0; } - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.apf.data, gpa, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa, sizeof(u32))) return 1; @@ -2111,7 +2083,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - if (unlikely(kvm_vcpu_read_guest_cached(vcpu, &vcpu->arch.st.stime, + if (unlikely(kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; @@ -2122,7 +2094,7 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); @@ -2131,14 +2103,14 @@ static void record_steal_time(struct kvm_vcpu *vcpu) vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); smp_wmb(); vcpu->arch.st.steal.version += 1; - kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)); } @@ -2155,6 +2127,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_VM_HSAVE_PA: case MSR_AMD64_PATCH_LOADER: case MSR_AMD64_BU_CFG2: + case MSR_AMD64_DC_CFG: break; case MSR_EFER: @@ -2230,8 +2203,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) bool tmp = (msr == MSR_KVM_SYSTEM_TIME); if (ka->boot_vcpu_runs_old_kvmclock != tmp) - set_bit(KVM_REQ_MASTERCLOCK_UPDATE, - &vcpu->requests); + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); ka->boot_vcpu_runs_old_kvmclock = tmp; } @@ -2243,7 +2215,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) vcpu->arch.pv_time_enabled = false; @@ -2264,7 +2236,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (data & KVM_STEAL_RESERVED_MASK) return 1; - if (kvm_vcpu_gfn_to_hva_cache_init(vcpu, &vcpu->arch.st.stime, + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.st.stime, data & KVM_STEAL_VALID_BITS, sizeof(struct kvm_steal_time))) return 1; @@ -2331,6 +2303,21 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; vcpu->arch.osvw.status = data; break; + case MSR_PLATFORM_INFO: + if (!msr_info->host_initiated || + data & ~MSR_PLATFORM_INFO_CPUID_FAULT || + (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && + cpuid_fault_enabled(vcpu))) + return 1; + vcpu->arch.msr_platform_info = data; + break; + case MSR_MISC_FEATURES_ENABLES: + if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || + (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && + !supports_cpuid_fault(vcpu))) + return 1; + vcpu->arch.msr_misc_features_enables = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -2417,6 +2404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_FAM10H_MMIO_CONF_BASE: case MSR_AMD64_BU_CFG2: case MSR_IA32_PERF_CTL: + case MSR_AMD64_DC_CFG: msr_info->data = 0; break; case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: @@ -2545,6 +2533,12 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) return 1; msr_info->data = vcpu->arch.osvw.status; break; + case MSR_PLATFORM_INFO: + msr_info->data = vcpu->arch.msr_platform_info; + break; + case MSR_MISC_FEATURES_ENABLES: + msr_info->data = vcpu->arch.msr_misc_features_enables; + break; default: if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); @@ -2675,15 +2669,14 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_SET_BOOT_CPU_ID: case KVM_CAP_SPLIT_IRQCHIP: case KVM_CAP_IMMEDIATE_EXIT: -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_ASSIGN_DEV_IRQ: - case KVM_CAP_PCI_2_3: -#endif r = 1; break; case KVM_CAP_ADJUST_CLOCK: r = KVM_CLOCK_TSC_STABLE; break; + case KVM_CAP_X86_GUEST_MWAIT: + r = kvm_mwait_in_guest(); + break; case KVM_CAP_X86_SMM: /* SMBASE is usually relocated above 1M on modern chipsets, * and SMM handlers might indeed rely on 4G segment limits, @@ -2695,9 +2688,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) */ r = kvm_x86_ops->cpu_has_high_real_mode_segbase(); break; - case KVM_CAP_COALESCED_MMIO: - r = KVM_COALESCED_MMIO_PAGE_OFFSET; - break; case KVM_CAP_VAPIC: r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; @@ -2713,11 +2703,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PV_MMU: /* obsolete */ r = 0; break; -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT - case KVM_CAP_IOMMU: - r = iommu_present(&pci_bus_type); - break; -#endif case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; break; @@ -2816,11 +2801,6 @@ static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) return kvm_arch_has_noncoherent_dma(vcpu->kvm); } -static inline void kvm_migrate_timers(struct kvm_vcpu *vcpu) -{ - set_bit(KVM_REQ_MIGRATE_TIMER, &vcpu->requests); -} - void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { /* Address WBINVD may be executed by guest */ @@ -2864,7 +2844,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); if (vcpu->cpu != cpu) - kvm_migrate_timers(vcpu); + kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); vcpu->cpu = cpu; } @@ -2878,7 +2858,7 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) vcpu->arch.st.steal.preempted = 1; - kvm_vcpu_write_guest_offset_cached(vcpu, &vcpu->arch.st.stime, + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, &vcpu->arch.st.steal.preempted, offsetof(struct kvm_steal_time, preempted), sizeof(vcpu->arch.st.steal.preempted)); @@ -3124,7 +3104,14 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, return -EINVAL; if (events->exception.injected && - (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) + (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR || + is_guest_mode(vcpu))) + return -EINVAL; + + /* INITs are latched while in SMM */ + if (events->flags & KVM_VCPUEVENT_VALID_SMM && + (events->smi.smm || events->smi.pending) && + vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) return -EINVAL; process_nmi(vcpu); @@ -3721,22 +3708,21 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[0], + memcpy(&chip->chip.pic, &pic->pics[0], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_PIC_SLAVE: - memcpy(&chip->chip.pic, - &pic_irqchip(kvm)->pics[1], + memcpy(&chip->chip.pic, &pic->pics[1], sizeof(struct kvm_pic_state)); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_get_ioapic(kvm, &chip->chip.ioapic); + kvm_get_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; @@ -3747,32 +3733,31 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) { + struct kvm_pic *pic = kvm->arch.vpic; int r; r = 0; switch (chip->chip_id) { case KVM_IRQCHIP_PIC_MASTER: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[0], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[0], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_PIC_SLAVE: - spin_lock(&pic_irqchip(kvm)->lock); - memcpy(&pic_irqchip(kvm)->pics[1], - &chip->chip.pic, + spin_lock(&pic->lock); + memcpy(&pic->pics[1], &chip->chip.pic, sizeof(struct kvm_pic_state)); - spin_unlock(&pic_irqchip(kvm)->lock); + spin_unlock(&pic->lock); break; case KVM_IRQCHIP_IOAPIC: - r = kvm_set_ioapic(kvm, &chip->chip.ioapic); + kvm_set_ioapic(kvm, &chip->chip.ioapic); break; default: r = -EINVAL; break; } - kvm_pic_update_irq(pic_irqchip(kvm)); + kvm_pic_update_irq(pic); return r; } @@ -4018,20 +4003,14 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_ioapic_init(kvm); if (r) { - mutex_lock(&kvm->slots_lock); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } r = kvm_setup_default_irq_routing(kvm); if (r) { - mutex_lock(&kvm->slots_lock); - mutex_lock(&kvm->irq_lock); kvm_ioapic_destroy(kvm); kvm_pic_destroy(kvm); - mutex_unlock(&kvm->irq_lock); - mutex_unlock(&kvm->slots_lock); goto create_irqchip_unlock; } /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ @@ -4196,10 +4175,8 @@ long kvm_arch_vm_ioctl(struct file *filp, goto out; r = 0; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); kvm->arch.kvmclock_offset += user_ns.clock - now_ns; - local_irq_enable(); kvm_gen_update_masterclock(kvm); break; } @@ -4207,11 +4184,9 @@ long kvm_arch_vm_ioctl(struct file *filp, struct kvm_clock_data user_ns; u64 now_ns; - local_irq_disable(); - now_ns = __get_kvmclock_ns(kvm); + now_ns = get_kvmclock_ns(kvm); user_ns.clock = now_ns; user_ns.flags = kvm->arch.use_master_clock ? KVM_CLOCK_TSC_STABLE : 0; - local_irq_enable(); memset(&user_ns.pad, 0, sizeof(user_ns.pad)); r = -EFAULT; @@ -4230,7 +4205,7 @@ long kvm_arch_vm_ioctl(struct file *filp, break; } default: - r = kvm_vm_ioctl_assigned_device(kvm, ioctl, arg); + r = -ENOTTY; } out: return r; @@ -5223,6 +5198,16 @@ static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) kvm_x86_ops->set_nmi_mask(emul_to_vcpu(ctxt), masked); } +static unsigned emulator_get_hflags(struct x86_emulate_ctxt *ctxt) +{ + return emul_to_vcpu(ctxt)->arch.hflags; +} + +static void emulator_set_hflags(struct x86_emulate_ctxt *ctxt, unsigned emul_flags) +{ + kvm_set_hflags(emul_to_vcpu(ctxt), emul_flags); +} + static const struct x86_emulate_ops emulate_ops = { .read_gpr = emulator_read_gpr, .write_gpr = emulator_write_gpr, @@ -5262,6 +5247,8 @@ static const struct x86_emulate_ops emulate_ops = { .intercept = emulator_intercept, .get_cpuid = emulator_get_cpuid, .set_nmi_mask = emulator_set_nmi_mask, + .get_hflags = emulator_get_hflags, + .set_hflags = emulator_set_hflags, }; static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) @@ -5314,7 +5301,6 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) BUILD_BUG_ON(HF_GUEST_MASK != X86EMUL_GUEST_MASK); BUILD_BUG_ON(HF_SMM_MASK != X86EMUL_SMM_MASK); BUILD_BUG_ON(HF_SMM_INSIDE_NMI_MASK != X86EMUL_SMM_INSIDE_NMI_MASK); - ctxt->emul_flags = vcpu->arch.hflags; init_decode_cache(ctxt); vcpu->arch.emulate_regs_need_sync_from_vcpu = false; @@ -5718,8 +5704,6 @@ restart: unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); toggle_interruptibility(vcpu, ctxt->interruptibility); vcpu->arch.emulate_regs_need_sync_to_vcpu = false; - if (vcpu->arch.hflags != ctxt->emul_flags) - kvm_set_hflags(vcpu, ctxt->emul_flags); kvm_rip_write(vcpu, ctxt->eip); if (r == EMULATE_DONE) kvm_vcpu_check_singlestep(vcpu, rflags, &r); @@ -6869,7 +6853,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * 1) We should set ->mode before checking ->requests. Please see - * the comment in kvm_make_all_cpus_request. + * the comment in kvm_vcpu_exiting_guest_mode(). * * 2) For APICv, we should set ->mode before checking PIR.ON. This * pairs with the memory barrier implicit in pi_test_and_set_on @@ -7051,7 +7035,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu) if (r <= 0) break; - clear_bit(KVM_REQ_PENDING_TIMER, &vcpu->requests); + kvm_clear_request(KVM_REQ_PENDING_TIMER, vcpu); if (kvm_cpu_has_pending_timer(vcpu)) kvm_inject_pending_timer_irqs(vcpu); @@ -7179,7 +7163,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { kvm_vcpu_block(vcpu); kvm_apic_accept_events(vcpu); - clear_bit(KVM_REQ_UNHALT, &vcpu->requests); + kvm_clear_request(KVM_REQ_UNHALT, vcpu); r = -EAGAIN; goto out; } @@ -7355,6 +7339,12 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, mp_state->mp_state != KVM_MP_STATE_RUNNABLE) return -EINVAL; + /* INITs are latched while in SMM */ + if ((is_smm(vcpu) || vcpu->arch.smi_pending) && + (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || + mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) + return -EINVAL; + if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events); @@ -7724,6 +7714,9 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (!init_event) { kvm_pmu_reset(vcpu); vcpu->arch.smbase = 0x30000; + + vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; + vcpu->arch.msr_misc_features_enables = 0; } memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); @@ -8068,7 +8061,6 @@ void kvm_arch_sync_events(struct kvm *kvm) { cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work); cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work); - kvm_free_all_assigned_devices(kvm); kvm_free_pit(kvm); } @@ -8152,7 +8144,6 @@ void kvm_arch_destroy_vm(struct kvm *kvm) } if (kvm_x86_ops->vm_destroy) kvm_x86_ops->vm_destroy(kvm); - kvm_iommu_unmap_guest(kvm); kvm_pic_destroy(kvm); kvm_ioapic_destroy(kvm); kvm_free_vcpus(kvm); @@ -8385,7 +8376,7 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) if (atomic_read(&vcpu->arch.nmi_queued)) return true; - if (test_bit(KVM_REQ_SMI, &vcpu->requests)) + if (kvm_test_request(KVM_REQ_SMI, vcpu)) return true; if (kvm_arch_interrupt_allowed(vcpu) && @@ -8536,8 +8527,9 @@ static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn) static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) { - return kvm_vcpu_write_guest_cached(vcpu, &vcpu->arch.apf.data, &val, - sizeof(val)); + + return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val, + sizeof(val)); } void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index e8ff3e4ce38a..612067074905 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -1,6 +1,8 @@ #ifndef ARCH_X86_KVM_X86_H #define ARCH_X86_KVM_X86_H +#include <asm/processor.h> +#include <asm/mwait.h> #include <linux/kvm_host.h> #include <asm/pvclock.h> #include "kvm_cache_regs.h" @@ -212,4 +214,38 @@ static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) __rem; \ }) +static inline bool kvm_mwait_in_guest(void) +{ + unsigned int eax, ebx, ecx, edx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_MWAIT)) + return false; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_AMD: + /* All AMD CPUs have a working MWAIT implementation */ + return true; + case X86_VENDOR_INTEL: + /* Handle Intel below */ + break; + default: + return false; + } + + /* + * Intel CPUs without CPUID5_ECX_INTERRUPT_BREAK are problematic as + * they would allow guest to stop the CPU completely by disabling + * interrupts then invoking MWAIT. + */ + if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) + return false; + + cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); + + if (!(ecx & CPUID5_ECX_INTERRUPT_BREAK)) + return false; + + return true; +} + #endif diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index 86ffcd68e496..003eeee3fbc6 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -113,6 +113,21 @@ */ #define MAKE_PC_FROM_RA(ra,sp) (((ra) & 0x3fffffff) | ((sp) & 0xc0000000)) +/* Spill slot location for the register reg in the spill area under the stack + * pointer sp. reg must be in the range [0..4). + */ +#define SPILL_SLOT(sp, reg) (*(((unsigned long *)(sp)) - 4 + (reg))) + +/* Spill slot location for the register reg in the spill area under the stack + * pointer sp for the call8. reg must be in the range [4..8). + */ +#define SPILL_SLOT_CALL8(sp, reg) (*(((unsigned long *)(sp)) - 12 + (reg))) + +/* Spill slot location for the register reg in the spill area under the stack + * pointer sp for the call12. reg must be in the range [4..12). + */ +#define SPILL_SLOT_CALL12(sp, reg) (*(((unsigned long *)(sp)) - 16 + (reg))) + typedef struct { unsigned long seg; } mm_segment_t; diff --git a/arch/xtensa/include/asm/ptrace.h b/arch/xtensa/include/asm/ptrace.h index 598e752dcbcd..e2d9c5eb10bd 100644 --- a/arch/xtensa/include/asm/ptrace.h +++ b/arch/xtensa/include/asm/ptrace.h @@ -12,6 +12,45 @@ #include <uapi/asm/ptrace.h> +/* + * Kernel stack + * + * +-----------------------+ -------- STACK_SIZE + * | register file | | + * +-----------------------+ | + * | struct pt_regs | | + * +-----------------------+ | ------ PT_REGS_OFFSET + * double : 16 bytes spill area : | ^ + * excetion :- - - - - - - - - - - -: | | + * frame : struct pt_regs : | | + * :- - - - - - - - - - - -: | | + * | | | | + * | memory stack | | | + * | | | | + * ~ ~ ~ ~ + * ~ ~ ~ ~ + * | | | | + * | | | | + * +-----------------------+ | | --- STACK_BIAS + * | struct task_struct | | | ^ + * current --> +-----------------------+ | | | + * | struct thread_info | | | | + * +-----------------------+ -------- + */ + +#define KERNEL_STACK_SIZE (2 * PAGE_SIZE) + +/* Offsets for exception_handlers[] (3 x 64-entries x 4-byte tables). */ + +#define EXC_TABLE_KSTK 0x004 /* Kernel Stack */ +#define EXC_TABLE_DOUBLE_SAVE 0x008 /* Double exception save area for a0 */ +#define EXC_TABLE_FIXUP 0x00c /* Fixup handler */ +#define EXC_TABLE_PARAM 0x010 /* For passing a parameter to fixup */ +#define EXC_TABLE_SYSCALL_SAVE 0x014 /* For fast syscall handler */ +#define EXC_TABLE_FAST_USER 0x100 /* Fast user exception handler */ +#define EXC_TABLE_FAST_KERNEL 0x200 /* Fast kernel exception handler */ +#define EXC_TABLE_DEFAULT 0x300 /* Default C-Handler */ +#define EXC_TABLE_SIZE 0x400 #ifndef __ASSEMBLY__ diff --git a/arch/xtensa/include/uapi/asm/ptrace.h b/arch/xtensa/include/uapi/asm/ptrace.h index 6ccbd9e38e35..8853a0d544c8 100644 --- a/arch/xtensa/include/uapi/asm/ptrace.h +++ b/arch/xtensa/include/uapi/asm/ptrace.h @@ -11,46 +11,6 @@ #ifndef _UAPI_XTENSA_PTRACE_H #define _UAPI_XTENSA_PTRACE_H -/* - * Kernel stack - * - * +-----------------------+ -------- STACK_SIZE - * | register file | | - * +-----------------------+ | - * | struct pt_regs | | - * +-----------------------+ | ------ PT_REGS_OFFSET - * double : 16 bytes spill area : | ^ - * excetion :- - - - - - - - - - - -: | | - * frame : struct pt_regs : | | - * :- - - - - - - - - - - -: | | - * | | | | - * | memory stack | | | - * | | | | - * ~ ~ ~ ~ - * ~ ~ ~ ~ - * | | | | - * | | | | - * +-----------------------+ | | --- STACK_BIAS - * | struct task_struct | | | ^ - * current --> +-----------------------+ | | | - * | struct thread_info | | | | - * +-----------------------+ -------- - */ - -#define KERNEL_STACK_SIZE (2 * PAGE_SIZE) - -/* Offsets for exception_handlers[] (3 x 64-entries x 4-byte tables). */ - -#define EXC_TABLE_KSTK 0x004 /* Kernel Stack */ -#define EXC_TABLE_DOUBLE_SAVE 0x008 /* Double exception save area for a0 */ -#define EXC_TABLE_FIXUP 0x00c /* Fixup handler */ -#define EXC_TABLE_PARAM 0x010 /* For passing a parameter to fixup */ -#define EXC_TABLE_SYSCALL_SAVE 0x014 /* For fast syscall handler */ -#define EXC_TABLE_FAST_USER 0x100 /* Fast user exception handler */ -#define EXC_TABLE_FAST_KERNEL 0x200 /* Fast kernel exception handler */ -#define EXC_TABLE_DEFAULT 0x300 /* Default C-Handler */ -#define EXC_TABLE_SIZE 0x400 - /* Registers used by strace */ #define REG_A_BASE 0x0000 diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S index 6911e384f608..3a98503ad11a 100644 --- a/arch/xtensa/kernel/coprocessor.S +++ b/arch/xtensa/kernel/coprocessor.S @@ -26,30 +26,6 @@ #include <asm/signal.h> #include <asm/tlbflush.h> -/* - * Entry condition: - * - * a0: trashed, original value saved on stack (PT_AREG0) - * a1: a1 - * a2: new stack pointer, original in DEPC - * a3: a3 - * depc: a2, original value saved on stack (PT_DEPC) - * excsave_1: dispatch table - * - * PT_DEPC >= VALID_DOUBLE_EXCEPTION_ADDRESS: double exception, DEPC - * < VALID_DOUBLE_EXCEPTION_ADDRESS: regular exception - */ - -/* IO protection is currently unsupported. */ - -ENTRY(fast_io_protect) - - wsr a0, excsave1 - movi a0, unrecoverable_exception - callx0 a0 - -ENDPROC(fast_io_protect) - #if XTENSA_HAVE_COPROCESSORS /* diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S index f5ef3cc0497c..37a239556889 100644 --- a/arch/xtensa/kernel/entry.S +++ b/arch/xtensa/kernel/entry.S @@ -1899,10 +1899,11 @@ ENTRY(system_call) movi a4, do_syscall_trace_enter s32i a3, a2, PT_SYSCALL callx4 a4 + mov a3, a6 /* syscall = sys_call_table[syscall_nr] */ - movi a4, sys_call_table; + movi a4, sys_call_table movi a5, __NR_syscall_count movi a6, -ENOSYS bgeu a3, a5, 1f diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 58f96d1230d4..ff4f0ecb03dd 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -204,8 +204,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, #endif /* Create a call4 dummy-frame: a0 = 0, a1 = childregs. */ - *((int*)childregs - 3) = (unsigned long)childregs; - *((int*)childregs - 4) = 0; + SPILL_SLOT(childregs, 1) = (unsigned long)childregs; + SPILL_SLOT(childregs, 0) = 0; p->thread.sp = (unsigned long)childregs; @@ -266,8 +266,8 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, /* pass parameters to ret_from_kernel_thread: * a2 = thread_fn, a3 = thread_fn arg */ - *((int *)childregs - 1) = thread_fn_arg; - *((int *)childregs - 2) = usp_thread_fn; + SPILL_SLOT(childregs, 3) = thread_fn_arg; + SPILL_SLOT(childregs, 2) = usp_thread_fn; /* Childregs are only used when we're going to userspace * in which case start_thread will set them up. diff --git a/arch/xtensa/kernel/ptrace.c b/arch/xtensa/kernel/ptrace.c index e0f583fed06a..e2461968efb2 100644 --- a/arch/xtensa/kernel/ptrace.c +++ b/arch/xtensa/kernel/ptrace.c @@ -1,4 +1,3 @@ -// TODO some minor issues /* * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -24,13 +23,14 @@ #include <linux/security.h> #include <linux/signal.h> #include <linux/smp.h> +#include <linux/tracehook.h> +#include <linux/uaccess.h> #include <asm/coprocessor.h> #include <asm/elf.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/ptrace.h> -#include <linux/uaccess.h> void user_enable_single_step(struct task_struct *child) @@ -52,7 +52,7 @@ void ptrace_disable(struct task_struct *child) /* Nothing to do.. */ } -int ptrace_getregs(struct task_struct *child, void __user *uregs) +static int ptrace_getregs(struct task_struct *child, void __user *uregs) { struct pt_regs *regs = task_pt_regs(child); xtensa_gregset_t __user *gregset = uregs; @@ -73,12 +73,12 @@ int ptrace_getregs(struct task_struct *child, void __user *uregs) for (i = 0; i < XCHAL_NUM_AREGS; i++) __put_user(regs->areg[i], - gregset->a + ((wb * 4 + i) % XCHAL_NUM_AREGS)); + gregset->a + ((wb * 4 + i) % XCHAL_NUM_AREGS)); return 0; } -int ptrace_setregs(struct task_struct *child, void __user *uregs) +static int ptrace_setregs(struct task_struct *child, void __user *uregs) { struct pt_regs *regs = task_pt_regs(child); xtensa_gregset_t *gregset = uregs; @@ -107,7 +107,7 @@ int ptrace_setregs(struct task_struct *child, void __user *uregs) unsigned long rotws, wmask; rotws = (((ws | (ws << WSBITS)) >> wb) & - ((1 << WSBITS) - 1)) & ~1; + ((1 << WSBITS) - 1)) & ~1; wmask = ((rotws ? WSBITS + 1 - ffs(rotws) : 0) << 4) | (rotws & 0xF) | 1; regs->windowbase = wb; @@ -115,19 +115,19 @@ int ptrace_setregs(struct task_struct *child, void __user *uregs) regs->wmask = wmask; } - if (wb != 0 && __copy_from_user(regs->areg + XCHAL_NUM_AREGS - wb * 4, - gregset->a, wb * 16)) + if (wb != 0 && __copy_from_user(regs->areg + XCHAL_NUM_AREGS - wb * 4, + gregset->a, wb * 16)) return -EFAULT; if (__copy_from_user(regs->areg, gregset->a + wb * 4, - (WSBITS - wb) * 16)) + (WSBITS - wb) * 16)) return -EFAULT; return 0; } -int ptrace_getxregs(struct task_struct *child, void __user *uregs) +static int ptrace_getxregs(struct task_struct *child, void __user *uregs) { struct pt_regs *regs = task_pt_regs(child); struct thread_info *ti = task_thread_info(child); @@ -151,7 +151,7 @@ int ptrace_getxregs(struct task_struct *child, void __user *uregs) return ret ? -EFAULT : 0; } -int ptrace_setxregs(struct task_struct *child, void __user *uregs) +static int ptrace_setxregs(struct task_struct *child, void __user *uregs) { struct thread_info *ti = task_thread_info(child); struct pt_regs *regs = task_pt_regs(child); @@ -177,7 +177,8 @@ int ptrace_setxregs(struct task_struct *child, void __user *uregs) return ret ? -EFAULT : 0; } -int ptrace_peekusr(struct task_struct *child, long regno, long __user *ret) +static int ptrace_peekusr(struct task_struct *child, long regno, + long __user *ret) { struct pt_regs *regs; unsigned long tmp; @@ -186,86 +187,87 @@ int ptrace_peekusr(struct task_struct *child, long regno, long __user *ret) tmp = 0; /* Default return value. */ switch(regno) { + case REG_AR_BASE ... REG_AR_BASE + XCHAL_NUM_AREGS - 1: + tmp = regs->areg[regno - REG_AR_BASE]; + break; - case REG_AR_BASE ... REG_AR_BASE + XCHAL_NUM_AREGS - 1: - tmp = regs->areg[regno - REG_AR_BASE]; - break; - - case REG_A_BASE ... REG_A_BASE + 15: - tmp = regs->areg[regno - REG_A_BASE]; - break; + case REG_A_BASE ... REG_A_BASE + 15: + tmp = regs->areg[regno - REG_A_BASE]; + break; - case REG_PC: - tmp = regs->pc; - break; + case REG_PC: + tmp = regs->pc; + break; - case REG_PS: - /* Note: PS.EXCM is not set while user task is running; - * its being set in regs is for exception handling - * convenience. */ - tmp = (regs->ps & ~(1 << PS_EXCM_BIT)); - break; + case REG_PS: + /* Note: PS.EXCM is not set while user task is running; + * its being set in regs is for exception handling + * convenience. + */ + tmp = (regs->ps & ~(1 << PS_EXCM_BIT)); + break; - case REG_WB: - break; /* tmp = 0 */ + case REG_WB: + break; /* tmp = 0 */ - case REG_WS: + case REG_WS: { unsigned long wb = regs->windowbase; unsigned long ws = regs->windowstart; - tmp = ((ws>>wb) | (ws<<(WSBITS-wb))) & ((1<<WSBITS)-1); + tmp = ((ws >> wb) | (ws << (WSBITS - wb))) & + ((1 << WSBITS) - 1); break; } - case REG_LBEG: - tmp = regs->lbeg; - break; + case REG_LBEG: + tmp = regs->lbeg; + break; - case REG_LEND: - tmp = regs->lend; - break; + case REG_LEND: + tmp = regs->lend; + break; - case REG_LCOUNT: - tmp = regs->lcount; - break; + case REG_LCOUNT: + tmp = regs->lcount; + break; - case REG_SAR: - tmp = regs->sar; - break; + case REG_SAR: + tmp = regs->sar; + break; - case SYSCALL_NR: - tmp = regs->syscall; - break; + case SYSCALL_NR: + tmp = regs->syscall; + break; - default: - return -EIO; + default: + return -EIO; } return put_user(tmp, ret); } -int ptrace_pokeusr(struct task_struct *child, long regno, long val) +static int ptrace_pokeusr(struct task_struct *child, long regno, long val) { struct pt_regs *regs; regs = task_pt_regs(child); switch (regno) { - case REG_AR_BASE ... REG_AR_BASE + XCHAL_NUM_AREGS - 1: - regs->areg[regno - REG_AR_BASE] = val; - break; + case REG_AR_BASE ... REG_AR_BASE + XCHAL_NUM_AREGS - 1: + regs->areg[regno - REG_AR_BASE] = val; + break; - case REG_A_BASE ... REG_A_BASE + 15: - regs->areg[regno - REG_A_BASE] = val; - break; + case REG_A_BASE ... REG_A_BASE + 15: + regs->areg[regno - REG_A_BASE] = val; + break; - case REG_PC: - regs->pc = val; - break; + case REG_PC: + regs->pc = val; + break; - case SYSCALL_NR: - regs->syscall = val; - break; + case SYSCALL_NR: + regs->syscall = val; + break; - default: - return -EIO; + default: + return -EIO; } return 0; } @@ -467,39 +469,21 @@ long arch_ptrace(struct task_struct *child, long request, return ret; } -void do_syscall_trace(void) -{ - /* - * The 0x80 provides a way for the tracing parent to distinguish - * between a syscall stop and SIGTRAP delivery - */ - ptrace_notify(SIGTRAP|((current->ptrace & PT_TRACESYSGOOD) ? 0x80 : 0)); - - /* - * this isn't the same as continuing with a signal, but it will do - * for normal use. strace only continues with a signal if the - * stopping signal is not SIGTRAP. -brl - */ - if (current->exit_code) { - send_sig(current->exit_code, current, 1); - current->exit_code = 0; - } -} - -void do_syscall_trace_enter(struct pt_regs *regs) +unsigned long do_syscall_trace_enter(struct pt_regs *regs) { - if (test_thread_flag(TIF_SYSCALL_TRACE) - && (current->ptrace & PT_PTRACED)) - do_syscall_trace(); + if (test_thread_flag(TIF_SYSCALL_TRACE) && + tracehook_report_syscall_entry(regs)) + return -1; -#if 0 - audit_syscall_entry(...); -#endif + return regs->areg[2]; } void do_syscall_trace_leave(struct pt_regs *regs) { - if ((test_thread_flag(TIF_SYSCALL_TRACE)) - && (current->ptrace & PT_PTRACED)) - do_syscall_trace(); + int step; + + step = test_thread_flag(TIF_SINGLESTEP); + + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); } diff --git a/arch/xtensa/kernel/setup.c b/arch/xtensa/kernel/setup.c index 197e75b400b1..394ef08300b6 100644 --- a/arch/xtensa/kernel/setup.c +++ b/arch/xtensa/kernel/setup.c @@ -317,8 +317,9 @@ static inline int mem_reserve(unsigned long start, unsigned long end) void __init setup_arch(char **cmdline_p) { - strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); *cmdline_p = command_line; + platform_setup(cmdline_p); + strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE); /* Reserve some memory regions */ @@ -382,8 +383,6 @@ void __init setup_arch(char **cmdline_p) unflatten_and_copy_device_tree(); - platform_setup(cmdline_p); - #ifdef CONFIG_SMP smp_init_cpus(); #endif @@ -453,9 +452,9 @@ void cpu_reset(void) tmpaddr += SZ_512M; /* Invalidate mapping in the selected temporary area */ - if (itlb_probe(tmpaddr) & 0x8) + if (itlb_probe(tmpaddr) & BIT(ITLB_HIT_BIT)) invalidate_itlb_entry(itlb_probe(tmpaddr)); - if (itlb_probe(tmpaddr + PAGE_SIZE) & 0x8) + if (itlb_probe(tmpaddr + PAGE_SIZE) & BIT(ITLB_HIT_BIT)) invalidate_itlb_entry(itlb_probe(tmpaddr + PAGE_SIZE)); /* diff --git a/arch/xtensa/kernel/signal.c b/arch/xtensa/kernel/signal.c index 70a131945443..d427e784ab44 100644 --- a/arch/xtensa/kernel/signal.c +++ b/arch/xtensa/kernel/signal.c @@ -91,14 +91,14 @@ flush_window_regs_user(struct pt_regs *regs) inc = 1; } else if (m & 4) { /* call8 */ - if (copy_to_user((void*)(sp - 32), - ®s->areg[(base + 1) * 4], 16)) + if (copy_to_user(&SPILL_SLOT_CALL8(sp, 4), + ®s->areg[(base + 1) * 4], 16)) goto errout; inc = 2; } else if (m & 8) { /* call12 */ - if (copy_to_user((void*)(sp - 48), - ®s->areg[(base + 1) * 4], 32)) + if (copy_to_user(&SPILL_SLOT_CALL12(sp, 4), + ®s->areg[(base + 1) * 4], 32)) goto errout; inc = 3; } @@ -106,7 +106,7 @@ flush_window_regs_user(struct pt_regs *regs) /* Save current frame a0..a3 under next SP */ sp = regs->areg[((base + inc) * 4 + 1) % XCHAL_NUM_AREGS]; - if (copy_to_user((void*)(sp - 16), ®s->areg[base * 4], 16)) + if (copy_to_user(&SPILL_SLOT(sp, 0), ®s->areg[base * 4], 16)) goto errout; /* Get current stack pointer for next loop iteration. */ diff --git a/arch/xtensa/kernel/stacktrace.c b/arch/xtensa/kernel/stacktrace.c index e7d30ee0fd48..0df4080fa20f 100644 --- a/arch/xtensa/kernel/stacktrace.c +++ b/arch/xtensa/kernel/stacktrace.c @@ -23,14 +23,6 @@ */ extern int common_exception_return; -/* A struct that maps to the part of the frame containing the a0 and - * a1 registers. - */ -struct frame_start { - unsigned long a0; - unsigned long a1; -}; - void xtensa_backtrace_user(struct pt_regs *regs, unsigned int depth, int (*ufn)(struct stackframe *frame, void *data), void *data) @@ -96,26 +88,16 @@ void xtensa_backtrace_user(struct pt_regs *regs, unsigned int depth, /* Start from the a1 register. */ /* a1 = regs->areg[1]; */ while (a0 != 0 && depth--) { - struct frame_start frame_start; - /* Get the location for a1, a0 for the - * previous frame from the current a1. - */ - unsigned long *psp = (unsigned long *)a1; - - psp -= 4; + pc = MAKE_PC_FROM_RA(a0, pc); /* Check if the region is OK to access. */ - if (!access_ok(VERIFY_READ, psp, sizeof(frame_start))) + if (!access_ok(VERIFY_READ, &SPILL_SLOT(a1, 0), 8)) return; /* Copy a1, a0 from user space stack frame. */ - if (__copy_from_user_inatomic(&frame_start, psp, - sizeof(frame_start))) + if (__get_user(a0, &SPILL_SLOT(a1, 0)) || + __get_user(a1, &SPILL_SLOT(a1, 1))) return; - pc = MAKE_PC_FROM_RA(a0, pc); - a0 = frame_start.a0; - a1 = frame_start.a1; - frame.pc = pc; frame.sp = a1; @@ -147,7 +129,6 @@ void xtensa_backtrace_kernel(struct pt_regs *regs, unsigned int depth, */ while (a1 > sp_start && a1 < sp_end && depth--) { struct stackframe frame; - unsigned long *psp = (unsigned long *)a1; frame.pc = pc; frame.sp = a1; @@ -171,8 +152,8 @@ void xtensa_backtrace_kernel(struct pt_regs *regs, unsigned int depth, sp_start = a1; pc = MAKE_PC_FROM_RA(a0, pc); - a0 = *(psp - 4); - a1 = *(psp - 3); + a0 = SPILL_SLOT(a1, 0); + a1 = SPILL_SLOT(a1, 1); } } EXPORT_SYMBOL(xtensa_backtrace_kernel); @@ -196,8 +177,8 @@ void walk_stackframe(unsigned long *sp, sp = (unsigned long *)a1; - a0 = *(sp - 4); - a1 = *(sp - 3); + a0 = SPILL_SLOT(a1, 0); + a1 = SPILL_SLOT(a1, 1); if (a1 <= (unsigned long)sp) break; diff --git a/arch/xtensa/platforms/iss/include/platform/simcall.h b/arch/xtensa/platforms/iss/include/platform/simcall.h index 27d7a528b41a..2ba45858e50a 100644 --- a/arch/xtensa/platforms/iss/include/platform/simcall.h +++ b/arch/xtensa/platforms/iss/include/platform/simcall.h @@ -6,6 +6,7 @@ * for more details. * * Copyright (C) 2001 Tensilica Inc. + * Copyright (C) 2017 Cadence Design Systems Inc. */ #ifndef _XTENSA_PLATFORM_ISS_SIMCALL_H @@ -49,6 +50,10 @@ #define SYS_bind 30 #define SYS_ioctl 31 +#define SYS_iss_argc 1000 /* returns value of argc */ +#define SYS_iss_argv_size 1001 /* bytes needed for argv & arg strings */ +#define SYS_iss_set_argv 1002 /* saves argv & arg strings at given addr */ + /* * SYS_select_one specifiers */ @@ -118,5 +123,20 @@ static inline int simc_lseek(int fd, uint32_t off, int whence) return __simc(SYS_lseek, fd, off, whence); } +static inline int simc_argc(void) +{ + return __simc(SYS_iss_argc, 0, 0, 0); +} + +static inline int simc_argv_size(void) +{ + return __simc(SYS_iss_argv_size, 0, 0, 0); +} + +static inline void simc_argv(void *buf) +{ + __simc(SYS_iss_set_argv, (int)buf, 0, 0); +} + #endif /* _XTENSA_PLATFORM_ISS_SIMCALL_H */ diff --git a/arch/xtensa/platforms/iss/setup.c b/arch/xtensa/platforms/iss/setup.c index 379aeddcc638..f4bbb28026f8 100644 --- a/arch/xtensa/platforms/iss/setup.c +++ b/arch/xtensa/platforms/iss/setup.c @@ -8,6 +8,7 @@ * Joe Taylor <joe@tensilica.com> * * Copyright 2001 - 2005 Tensilica Inc. + * Copyright 2017 Cadence Design Systems Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -15,6 +16,7 @@ * option) any later version. * */ +#include <linux/bootmem.h> #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/init.h> @@ -31,13 +33,13 @@ #include <asm/platform.h> #include <asm/bootparam.h> +#include <asm/setup.h> #include <platform/simcall.h> void __init platform_init(bp_tag_t* bootparam) { - } void platform_halt(void) @@ -59,26 +61,10 @@ void platform_restart(void) /* control never gets here */ } -extern void iss_net_poll(void); - -const char twirl[]="|/-\\|/-\\"; - void platform_heartbeat(void) { -#if 0 - static int i = 0, j = 0; - - if (--i < 0) { - i = 99; - printk("\r%c\r", twirl[j++]); - if (j == 8) - j = 0; - } -#endif } - - static int iss_panic_event(struct notifier_block *this, unsigned long event, void *ptr) { @@ -87,12 +73,29 @@ iss_panic_event(struct notifier_block *this, unsigned long event, void *ptr) } static struct notifier_block iss_panic_block = { - iss_panic_event, - NULL, - 0 + .notifier_call = iss_panic_event, }; void __init platform_setup(char **p_cmdline) { + int argc = simc_argc(); + int argv_size = simc_argv_size(); + + if (argc > 1) { + void **argv = alloc_bootmem(argv_size); + char *cmdline = alloc_bootmem(argv_size); + int i; + + cmdline[0] = 0; + simc_argv((void *)argv); + + for (i = 1; i < argc; ++i) { + if (i > 1) + strcat(cmdline, " "); + strcat(cmdline, argv[i]); + } + *p_cmdline = cmdline; + } + atomic_notifier_chain_register(&panic_notifier_list, &iss_panic_block); } diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c index 519ec1787117..efd84d1d178b 100644 --- a/drivers/s390/char/sclp_early.c +++ b/drivers/s390/char/sclp_early.c @@ -40,7 +40,8 @@ struct read_info_sccb { u8 fac85; /* 85 */ u8 _pad_86[91 - 86]; /* 86-90 */ u8 flags; /* 91 */ - u8 _pad_92[99 - 92]; /* 92-98 */ + u8 _pad_92[98 - 92]; /* 92-97 */ + u8 fac98; /* 98 */ u8 hamaxpow; /* 99 */ u32 rnsize2; /* 100-103 */ u64 rnmax2; /* 104-111 */ @@ -99,6 +100,7 @@ static void __init sclp_early_facilities_detect(struct read_info_sccb *sccb) sclp.has_pfmfi = !!(sccb->fac117 & 0x40); sclp.has_ibs = !!(sccb->fac117 & 0x20); sclp.has_hvs = !!(sccb->fac119 & 0x80); + sclp.has_kss = !!(sccb->fac98 & 0x01); if (sccb->fac85 & 0x02) S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP; sclp.rnmax = sccb->rnmax ? sccb->rnmax : sccb->rnmax2; diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 37b49894c762..d1bb02b1ee58 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -230,11 +232,14 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -336,10 +349,35 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -373,25 +411,37 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index e39696e64494..1e1f8a361b75 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -13,8 +13,6 @@ #include <linux/fscrypt_supp.h> -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - /* Encryption parameters */ #define FS_XTS_TWEAK_SIZE 16 #define FS_AES_128_ECB_KEY_SIZE 16 @@ -22,10 +20,6 @@ #define FS_AES_256_CBC_KEY_SIZE 32 #define FS_AES_256_CTS_KEY_SIZE 32 #define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 #define FS_KEY_DERIVATION_NONCE_SIZE 16 @@ -51,13 +45,6 @@ struct fscrypt_context { #define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - /* * A pointer to this structure is stored in the file system's in-core * representation of an inode. diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 8cdfddce2b34..179e578b875b 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -183,9 +183,6 @@ int fscrypt_get_encryption_info(struct inode *inode) if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { if (!fscrypt_dummy_context_enabled(inode) || diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 4908906d54d5..210976e7a269 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -34,9 +34,6 @@ static int create_encryption_context_from_policy(struct inode *inode, { struct fscrypt_context ctx; - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); @@ -87,8 +84,6 @@ int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) ret = -ENOTDIR; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else @@ -118,8 +113,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); @@ -143,27 +137,61 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) } EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. + */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - /* No restrictions on file types which are never encrypted */ if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && !S_ISLNK(child->i_mode)) return 1; - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -172,17 +200,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -202,9 +245,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index 354103f3490c..d9beca1653c5 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -4,11 +4,11 @@ obj-$(CONFIG_EXT4_FS) += ext4.o -ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \ - mmp.o indirect.o extents_status.o xattr.o xattr_user.o \ - xattr_trusted.o inline.o readpage.o sysfs.o +ext4-y := balloc.o bitmap.o block_validity.o dir.o ext4_jbd2.o extents.o \ + extents_status.o file.o fsmap.o fsync.o hash.o ialloc.o \ + indirect.o inline.o inode.o ioctl.o mballoc.o migrate.o \ + mmp.o move_extent.o namei.o page-io.o readpage.o resize.o \ + super.o symlink.o sysfs.o xattr.o xattr_trusted.o xattr_user.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f7b465b4fb69..8e8046104f4d 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2356,17 +2356,16 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname); +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!ext4_has_feature_dir_index(inode->i_sb)) ext4_clear_inode_flag(inode, EXT4_INODE_INDEX); } -static unsigned char ext4_filetype_table[] = { +static const unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK }; @@ -3050,7 +3049,7 @@ extern int ext4_handle_dirty_dirent_node(handle_t *handle, struct inode *inode, struct buffer_head *bh); #define S_SHIFT 12 -static unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char ext4_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = EXT4_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = EXT4_FT_DIR, [S_IFCHR >> S_SHIFT] = EXT4_FT_CHRDEV, diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c new file mode 100644 index 000000000000..b19436098837 --- /dev/null +++ b/fs/ext4/fsmap.c @@ -0,0 +1,722 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include "mballoc.h" +#include <linux/sort.h> +#include <linux/list_sort.h> +#include <trace/events/ext4.h> + +/* Convert an ext4_fsmap to an fsmap. */ +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical << sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_offset = 0; + dest->fmr_length = src->fmr_length << sb->s_blocksize_bits; + dest->fmr_reserved[0] = 0; + dest->fmr_reserved[1] = 0; + dest->fmr_reserved[2] = 0; +} + +/* Convert an fsmap to an ext4_fsmap. */ +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src) +{ + dest->fmr_device = src->fmr_device; + dest->fmr_flags = src->fmr_flags; + dest->fmr_physical = src->fmr_physical >> sb->s_blocksize_bits; + dest->fmr_owner = src->fmr_owner; + dest->fmr_length = src->fmr_length >> sb->s_blocksize_bits; +} + +/* getfsmap query state */ +struct ext4_getfsmap_info { + struct ext4_fsmap_head *gfi_head; + ext4_fsmap_format_t gfi_formatter; /* formatting fn */ + void *gfi_format_arg;/* format buffer */ + ext4_fsblk_t gfi_next_fsblk; /* next fsblock we expect */ + u32 gfi_dev; /* device id */ + ext4_group_t gfi_agno; /* bg number, if applicable */ + struct ext4_fsmap gfi_low; /* low rmap key */ + struct ext4_fsmap gfi_high; /* high rmap key */ + struct ext4_fsmap gfi_lastfree; /* free ext at end of last bg */ + struct list_head gfi_meta_list; /* fixed metadata list */ + bool gfi_last; /* last extent? */ +}; + +/* Associate a device with a getfsmap handler. */ +struct ext4_getfsmap_dev { + int (*gfd_fn)(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info); + u32 gfd_dev; +}; + +/* Compare two getfsmap device handlers. */ +static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) +{ + const struct ext4_getfsmap_dev *d1 = p1; + const struct ext4_getfsmap_dev *d2 = p2; + + return d1->gfd_dev - d2->gfd_dev; +} + +/* Compare a record against our starting point */ +static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + return rec->fmr_physical < info->gfi_low.fmr_physical; +} + +/* + * Format a reverse mapping for getfsmap, having translated rm_startblock + * into the appropriate daddr units. + */ +static int ext4_getfsmap_helper(struct super_block *sb, + struct ext4_getfsmap_info *info, + struct ext4_fsmap *rec) +{ + struct ext4_fsmap fmr; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t rec_fsblk = rec->fmr_physical; + ext4_group_t agno; + ext4_grpblk_t cno; + int error; + + if (fatal_signal_pending(current)) + return -EINTR; + + /* + * Filter out records that start before our startpoint, if the + * caller requested that. + */ + if (ext4_getfsmap_rec_before_low_key(info, rec)) { + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* Are we just counting mappings? */ + if (info->gfi_head->fmh_count == 0) { + if (rec_fsblk > info->gfi_next_fsblk) + info->gfi_head->fmh_entries++; + + if (info->gfi_last) + return EXT4_QUERY_RANGE_CONTINUE; + + info->gfi_head->fmh_entries++; + + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; + } + + /* + * If the record starts past the last physical block we saw, + * then we've found a gap. Report the gap as being owned by + * whatever the caller specified is the missing owner. + */ + if (rec_fsblk > info->gfi_next_fsblk) { + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, info->gfi_next_fsblk, + &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, + EXT4_C2B(sbi, cno), + rec_fsblk - info->gfi_next_fsblk, + EXT4_FMR_OWN_UNKNOWN); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = info->gfi_next_fsblk; + fmr.fmr_owner = EXT4_FMR_OWN_UNKNOWN; + fmr.fmr_length = rec_fsblk - info->gfi_next_fsblk; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + } + + if (info->gfi_last) + goto out; + + /* Fill out the extent we found */ + if (info->gfi_head->fmh_entries >= info->gfi_head->fmh_count) + return EXT4_QUERY_RANGE_ABORT; + + ext4_get_group_no_and_offset(sb, rec_fsblk, &agno, &cno); + trace_ext4_fsmap_mapping(sb, info->gfi_dev, agno, EXT4_C2B(sbi, cno), + rec->fmr_length, rec->fmr_owner); + + fmr.fmr_device = info->gfi_dev; + fmr.fmr_physical = rec_fsblk; + fmr.fmr_owner = rec->fmr_owner; + fmr.fmr_flags = FMR_OF_SPECIAL_OWNER; + fmr.fmr_length = rec->fmr_length; + error = info->gfi_formatter(&fmr, info->gfi_format_arg); + if (error) + return error; + info->gfi_head->fmh_entries++; + +out: + rec_fsblk += rec->fmr_length; + if (info->gfi_next_fsblk < rec_fsblk) + info->gfi_next_fsblk = rec_fsblk; + return EXT4_QUERY_RANGE_CONTINUE; +} + +static inline ext4_fsblk_t ext4_fsmap_next_pblk(struct ext4_fsmap *fmr) +{ + return fmr->fmr_physical + fmr->fmr_length; +} + +/* Transform a blockgroup's free record into a fsmap */ +static int ext4_getfsmap_datadev_helper(struct super_block *sb, + ext4_group_t agno, ext4_grpblk_t start, + ext4_grpblk_t len, void *priv) +{ + struct ext4_fsmap irec; + struct ext4_getfsmap_info *info = priv; + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb; + ext4_fsblk_t fslen; + int error; + + fsb = (EXT4_C2B(sbi, start) + ext4_group_first_block_no(sb, agno)); + fslen = EXT4_C2B(sbi, len); + + /* If the retained free extent record is set... */ + if (info->gfi_lastfree.fmr_owner) { + /* ...and abuts this one, lengthen it and return. */ + if (ext4_fsmap_next_pblk(&info->gfi_lastfree) == fsb) { + info->gfi_lastfree.fmr_length += fslen; + return 0; + } + + /* + * There's a gap between the two free extents; emit the + * retained extent prior to merging the meta_list. + */ + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + return error; + info->gfi_lastfree.fmr_owner = 0; + } + + /* Merge in any relevant extents from the meta_list */ + list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { + list_del(&p->fmr_list); + kfree(p); + } else if (p->fmr_physical < fsb) { + error = ext4_getfsmap_helper(sb, info, p); + if (error) + return error; + + list_del(&p->fmr_list); + kfree(p); + } + } + + irec.fmr_device = 0; + irec.fmr_physical = fsb; + irec.fmr_length = fslen; + irec.fmr_owner = EXT4_FMR_OWN_FREE; + irec.fmr_flags = 0; + + /* If this is a free extent at the end of a bg, buffer it. */ + if (ext4_fsmap_next_pblk(&irec) == + ext4_group_first_block_no(sb, agno + 1)) { + info->gfi_lastfree = irec; + return 0; + } + + /* Otherwise, emit it */ + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Execute a getfsmap query against the log device. */ +static int ext4_getfsmap_logdev(struct super_block *sb, struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + struct ext4_fsmap irec; + + /* Set up search keys */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, 0, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, 0, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + if (keys[0].fmr_physical > 0) + return 0; + + /* Fabricate an rmap entry for the external log device. */ + irec.fmr_physical = journal->j_blk_offset; + irec.fmr_length = journal->j_maxlen; + irec.fmr_owner = EXT4_FMR_OWN_LOG; + irec.fmr_flags = 0; + + return ext4_getfsmap_helper(sb, info, &irec); +} + +/* Helper to fill out an ext4_fsmap. */ +static inline int ext4_getfsmap_fill(struct list_head *meta_list, + ext4_fsblk_t fsb, ext4_fsblk_t len, + uint64_t owner) +{ + struct ext4_fsmap *fsm; + + fsm = kmalloc(sizeof(*fsm), GFP_NOFS); + if (!fsm) + return -ENOMEM; + fsm->fmr_device = 0; + fsm->fmr_flags = 0; + fsm->fmr_physical = fsb; + fsm->fmr_owner = owner; + fsm->fmr_length = len; + list_add_tail(&fsm->fmr_list, meta_list); + + return 0; +} + +/* + * This function returns the number of file system metadata blocks at + * the beginning of a block group, including the reserved gdt blocks. + */ +static unsigned int ext4_getfsmap_find_sb(struct super_block *sb, + ext4_group_t agno, + struct list_head *meta_list) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t fsb = ext4_group_first_block_no(sb, agno); + ext4_fsblk_t len; + unsigned long first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); + unsigned long metagroup = agno / EXT4_DESC_PER_BLOCK(sb); + int error; + + /* Record the superblock. */ + if (ext4_bg_has_super(sb, agno)) { + error = ext4_getfsmap_fill(meta_list, fsb, 1, EXT4_FMR_OWN_FS); + if (error) + return error; + fsb++; + } + + /* Record the group descriptors. */ + len = ext4_bg_num_gdb(sb, agno); + if (!len) + return 0; + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_GDT); + if (error) + return error; + fsb += len; + + /* Reserved GDT blocks */ + if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) { + len = le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); + error = ext4_getfsmap_fill(meta_list, fsb, len, + EXT4_FMR_OWN_RESV_GDT); + if (error) + return error; + } + + return 0; +} + +/* Compare two fsmap items. */ +static int ext4_getfsmap_compare(void *priv, + struct list_head *a, + struct list_head *b) +{ + struct ext4_fsmap *fa; + struct ext4_fsmap *fb; + + fa = container_of(a, struct ext4_fsmap, fmr_list); + fb = container_of(b, struct ext4_fsmap, fmr_list); + if (fa->fmr_physical < fb->fmr_physical) + return -1; + else if (fa->fmr_physical > fb->fmr_physical) + return 1; + return 0; +} + +/* Merge adjacent extents of fixed metadata. */ +static void ext4_getfsmap_merge_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *prev = NULL; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + if (!prev) { + prev = p; + continue; + } + + if (prev->fmr_owner == p->fmr_owner && + prev->fmr_physical + prev->fmr_length == p->fmr_physical) { + prev->fmr_length += p->fmr_length; + list_del(&p->fmr_list); + kfree(p); + } else + prev = p; + } +} + +/* Free a list of fixed metadata. */ +static void ext4_getfsmap_free_fixed_metadata(struct list_head *meta_list) +{ + struct ext4_fsmap *p; + struct ext4_fsmap *tmp; + + list_for_each_entry_safe(p, tmp, meta_list, fmr_list) { + list_del(&p->fmr_list); + kfree(p); + } +} + +/* Find all the fixed metadata in the filesystem. */ +int ext4_getfsmap_find_fixed_metadata(struct super_block *sb, + struct list_head *meta_list) +{ + struct ext4_group_desc *gdp; + ext4_group_t agno; + int error; + + INIT_LIST_HEAD(meta_list); + + /* Collect everything. */ + for (agno = 0; agno < EXT4_SB(sb)->s_groups_count; agno++) { + gdp = ext4_get_group_desc(sb, agno, NULL); + if (!gdp) { + error = -EFSCORRUPTED; + goto err; + } + + /* Superblock & GDT */ + error = ext4_getfsmap_find_sb(sb, agno, meta_list); + if (error) + goto err; + + /* Block bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_block_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_BLKBM); + if (error) + goto err; + + /* Inode bitmap */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_bitmap(sb, gdp), 1, + EXT4_FMR_OWN_INOBM); + if (error) + goto err; + + /* Inodes */ + error = ext4_getfsmap_fill(meta_list, + ext4_inode_table(sb, gdp), + EXT4_SB(sb)->s_itb_per_group, + EXT4_FMR_OWN_INODES); + if (error) + goto err; + } + + /* Sort the list */ + list_sort(NULL, meta_list, ext4_getfsmap_compare); + + /* Merge adjacent extents */ + ext4_getfsmap_merge_fixed_metadata(meta_list); + + return 0; +err: + ext4_getfsmap_free_fixed_metadata(meta_list); + return error; +} + +/* Execute a getfsmap query against the buddy bitmaps */ +static int ext4_getfsmap_datadev(struct super_block *sb, + struct ext4_fsmap *keys, + struct ext4_getfsmap_info *info) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + ext4_fsblk_t start_fsb; + ext4_fsblk_t end_fsb; + ext4_fsblk_t eofs; + ext4_group_t start_ag; + ext4_group_t end_ag; + ext4_grpblk_t first_cluster; + ext4_grpblk_t last_cluster; + int error = 0; + + eofs = ext4_blocks_count(sbi->s_es); + if (keys[0].fmr_physical >= eofs) + return 0; + if (keys[1].fmr_physical >= eofs) + keys[1].fmr_physical = eofs - 1; + start_fsb = keys[0].fmr_physical; + end_fsb = keys[1].fmr_physical; + + /* Determine first and last group to examine based on start and end */ + ext4_get_group_no_and_offset(sb, start_fsb, &start_ag, &first_cluster); + ext4_get_group_no_and_offset(sb, end_fsb, &end_ag, &last_cluster); + + /* + * Convert the fsmap low/high keys to bg based keys. Initialize + * low to the fsmap low key and max out the high key to the end + * of the bg. + */ + info->gfi_low = keys[0]; + info->gfi_low.fmr_physical = EXT4_C2B(sbi, first_cluster); + info->gfi_low.fmr_length = 0; + + memset(&info->gfi_high, 0xFF, sizeof(info->gfi_high)); + + /* Assemble a list of all the fixed-location metadata. */ + error = ext4_getfsmap_find_fixed_metadata(sb, &info->gfi_meta_list); + if (error) + goto err; + + /* Query each bg */ + for (info->gfi_agno = start_ag; + info->gfi_agno <= end_ag; + info->gfi_agno++) { + /* + * Set the bg high key from the fsmap high key if this + * is the last bg that we're querying. + */ + if (info->gfi_agno == end_ag) { + info->gfi_high = keys[1]; + info->gfi_high.fmr_physical = EXT4_C2B(sbi, + last_cluster); + info->gfi_high.fmr_length = 0; + } + + trace_ext4_fsmap_low_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_low.fmr_physical, + info->gfi_low.fmr_length, + info->gfi_low.fmr_owner); + + trace_ext4_fsmap_high_key(sb, info->gfi_dev, info->gfi_agno, + info->gfi_high.fmr_physical, + info->gfi_high.fmr_length, + info->gfi_high.fmr_owner); + + error = ext4_mballoc_query_range(sb, info->gfi_agno, + EXT4_B2C(sbi, info->gfi_low.fmr_physical), + EXT4_B2C(sbi, info->gfi_high.fmr_physical), + ext4_getfsmap_datadev_helper, info); + if (error) + goto err; + + /* + * Set the bg low key to the start of the bg prior to + * moving on to the next bg. + */ + if (info->gfi_agno == start_ag) + memset(&info->gfi_low, 0, sizeof(info->gfi_low)); + } + + /* Do we have a retained free extent? */ + if (info->gfi_lastfree.fmr_owner) { + error = ext4_getfsmap_helper(sb, info, &info->gfi_lastfree); + if (error) + goto err; + } + + /* Report any gaps at the end of the bg */ + info->gfi_last = true; + error = ext4_getfsmap_datadev_helper(sb, end_ag, last_cluster, 0, info); + if (error) + goto err; + +err: + ext4_getfsmap_free_fixed_metadata(&info->gfi_meta_list); + return error; +} + +/* Do we recognize the device? */ +static bool ext4_getfsmap_is_valid_device(struct super_block *sb, + struct ext4_fsmap *fm) +{ + if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || + fm->fmr_device == new_encode_dev(sb->s_bdev->bd_dev)) + return true; + if (EXT4_SB(sb)->journal_bdev && + fm->fmr_device == new_encode_dev(EXT4_SB(sb)->journal_bdev->bd_dev)) + return true; + return false; +} + +/* Ensure that the low key is less than the high key. */ +static bool ext4_getfsmap_check_keys(struct ext4_fsmap *low_key, + struct ext4_fsmap *high_key) +{ + if (low_key->fmr_device > high_key->fmr_device) + return false; + if (low_key->fmr_device < high_key->fmr_device) + return true; + + if (low_key->fmr_physical > high_key->fmr_physical) + return false; + if (low_key->fmr_physical < high_key->fmr_physical) + return true; + + if (low_key->fmr_owner > high_key->fmr_owner) + return false; + if (low_key->fmr_owner < high_key->fmr_owner) + return true; + + return false; +} + +#define EXT4_GETFSMAP_DEVS 2 +/* + * Get filesystem's extents as described in head, and format for + * output. Calls formatter to fill the user's buffer until all + * extents are mapped, until the passed-in head->fmh_count slots have + * been filled, or until the formatter short-circuits the loop, if it + * is tracking filled-in extents on its own. + * + * Key to Confusion + * ---------------- + * There are multiple levels of keys and counters at work here: + * _fsmap_head.fmh_keys -- low and high fsmap keys passed in; + * these reflect fs-wide block addrs. + * dkeys -- fmh_keys used to query each device; + * these are fmh_keys but w/ the low key + * bumped up by fmr_length. + * _getfsmap_info.gfi_next_fsblk-- next fs block we expect to see; this + * is how we detect gaps in the fsmap + * records and report them. + * _getfsmap_info.gfi_low/high -- per-bg low/high keys computed from + * dkeys; used to query the free space. + */ +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg) +{ + struct ext4_fsmap dkeys[2]; /* per-dev keys */ + struct ext4_getfsmap_dev handlers[EXT4_GETFSMAP_DEVS]; + struct ext4_getfsmap_info info = {0}; + int i; + int error = 0; + + if (head->fmh_iflags & ~FMH_IF_VALID) + return -EINVAL; + if (!ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[0]) || + !ext4_getfsmap_is_valid_device(sb, &head->fmh_keys[1])) + return -EINVAL; + + head->fmh_entries = 0; + + /* Set up our device handlers. */ + memset(handlers, 0, sizeof(handlers)); + handlers[0].gfd_dev = new_encode_dev(sb->s_bdev->bd_dev); + handlers[0].gfd_fn = ext4_getfsmap_datadev; + if (EXT4_SB(sb)->journal_bdev) { + handlers[1].gfd_dev = new_encode_dev( + EXT4_SB(sb)->journal_bdev->bd_dev); + handlers[1].gfd_fn = ext4_getfsmap_logdev; + } + + sort(handlers, EXT4_GETFSMAP_DEVS, sizeof(struct ext4_getfsmap_dev), + ext4_getfsmap_dev_compare, NULL); + + /* + * To continue where we left off, we allow userspace to use the + * last mapping from a previous call as the low key of the next. + * This is identified by a non-zero length in the low key. We + * have to increment the low key in this scenario to ensure we + * don't return the same mapping again, and instead return the + * very next mapping. + * + * Bump the physical offset as there can be no other mapping for + * the same physical block range. + */ + dkeys[0] = head->fmh_keys[0]; + dkeys[0].fmr_physical += dkeys[0].fmr_length; + dkeys[0].fmr_owner = 0; + dkeys[0].fmr_length = 0; + memset(&dkeys[1], 0xFF, sizeof(struct ext4_fsmap)); + + if (!ext4_getfsmap_check_keys(dkeys, &head->fmh_keys[1])) + return -EINVAL; + + info.gfi_next_fsblk = head->fmh_keys[0].fmr_physical + + head->fmh_keys[0].fmr_length; + info.gfi_formatter = formatter; + info.gfi_format_arg = arg; + info.gfi_head = head; + + /* For each device we support... */ + for (i = 0; i < EXT4_GETFSMAP_DEVS; i++) { + /* Is this device within the range the user asked for? */ + if (!handlers[i].gfd_fn) + continue; + if (head->fmh_keys[0].fmr_device > handlers[i].gfd_dev) + continue; + if (head->fmh_keys[1].fmr_device < handlers[i].gfd_dev) + break; + + /* + * If this device number matches the high key, we have + * to pass the high key to the handler to limit the + * query results. If the device number exceeds the + * low key, zero out the low key so that we get + * everything from the beginning. + */ + if (handlers[i].gfd_dev == head->fmh_keys[1].fmr_device) + dkeys[1] = head->fmh_keys[1]; + if (handlers[i].gfd_dev > head->fmh_keys[0].fmr_device) + memset(&dkeys[0], 0, sizeof(struct ext4_fsmap)); + + info.gfi_dev = handlers[i].gfd_dev; + info.gfi_last = false; + info.gfi_agno = -1; + error = handlers[i].gfd_fn(sb, dkeys, &info); + if (error) + break; + info.gfi_next_fsblk = 0; + } + + head->fmh_oflags = FMH_OF_DEV_T; + return error; +} diff --git a/fs/ext4/fsmap.h b/fs/ext4/fsmap.h new file mode 100644 index 000000000000..9a2cd367cc66 --- /dev/null +++ b/fs/ext4/fsmap.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@oracle.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#ifndef __EXT4_FSMAP_H__ +#define __EXT4_FSMAP_H__ + +struct fsmap; + +/* internal fsmap representation */ +struct ext4_fsmap { + struct list_head fmr_list; + dev_t fmr_device; /* device id */ + uint32_t fmr_flags; /* mapping flags */ + uint64_t fmr_physical; /* device offset of segment */ + uint64_t fmr_owner; /* owner id */ + uint64_t fmr_length; /* length of segment, blocks */ +}; + +struct ext4_fsmap_head { + uint32_t fmh_iflags; /* control flags */ + uint32_t fmh_oflags; /* output flags */ + unsigned int fmh_count; /* # of entries in array incl. input */ + unsigned int fmh_entries; /* # of entries filled in (output). */ + + struct ext4_fsmap fmh_keys[2]; /* low and high keys */ +}; + +void ext4_fsmap_from_internal(struct super_block *sb, struct fsmap *dest, + struct ext4_fsmap *src); +void ext4_fsmap_to_internal(struct super_block *sb, struct ext4_fsmap *dest, + struct fsmap *src); + +/* fsmap to userspace formatter - copy to user & advance pointer */ +typedef int (*ext4_fsmap_format_t)(struct ext4_fsmap *, void *); + +int ext4_getfsmap(struct super_block *sb, struct ext4_fsmap_head *head, + ext4_fsmap_format_t formatter, void *arg); + +#define EXT4_QUERY_RANGE_ABORT 1 +#define EXT4_QUERY_RANGE_CONTINUE 0 + +/* fmr_owner special values for FS_IOC_GETFSMAP; some share w/ XFS */ +#define EXT4_FMR_OWN_FREE FMR_OWN_FREE /* free space */ +#define EXT4_FMR_OWN_UNKNOWN FMR_OWN_UNKNOWN /* unknown owner */ +#define EXT4_FMR_OWN_FS FMR_OWNER('X', 1) /* static fs metadata */ +#define EXT4_FMR_OWN_LOG FMR_OWNER('X', 2) /* journalling log */ +#define EXT4_FMR_OWN_INODES FMR_OWNER('X', 5) /* inodes */ +#define EXT4_FMR_OWN_GDT FMR_OWNER('f', 1) /* group descriptors */ +#define EXT4_FMR_OWN_RESV_GDT FMR_OWNER('f', 2) /* reserved gdt blocks */ +#define EXT4_FMR_OWN_BLKBM FMR_OWNER('f', 3) /* inode bitmap */ +#define EXT4_FMR_OWN_INOBM FMR_OWNER('f', 4) /* block bitmap */ + +#endif /* __EXT4_FSMAP_H__ */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 17bc043308f3..98ac2f1f23b3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1098,6 +1098,17 @@ got: if (err) goto fail_drop; + /* + * Since the encryption xattr will always be unique, create it first so + * that it's less likely to end up in an external xattr block and + * prevent its deduplication. + */ + if (encrypt) { + err = fscrypt_inherit_context(dir, inode, handle, true); + if (err) + goto fail_free_drop; + } + err = ext4_init_acl(handle, inode, dir); if (err) goto fail_free_drop; @@ -1119,12 +1130,6 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } - if (encrypt) { - err = fscrypt_inherit_context(dir, inode, handle, true); - if (err) - goto fail_free_drop; - } - err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 375fb1c05d49..d5dea4c293ef 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1034,7 +1034,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, fname); + ext4_insert_dentry(inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f0729b0705c7..5834c4d76be8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1643,6 +1643,7 @@ struct mpage_da_data { */ struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ + unsigned int do_map:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -2179,6 +2180,9 @@ static bool mpage_add_bh_to_extent(struct mpage_da_data *mpd, ext4_lblk_t lblk, /* First block in the extent? */ if (map->m_len == 0) { + /* We cannot map unless handle is started... */ + if (!mpd->do_map) + return false; map->m_lblk = lblk; map->m_len = 1; map->m_flags = bh->b_state & BH_FLAGS; @@ -2231,6 +2235,9 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, /* Found extent to map? */ if (mpd->map.m_len) return 0; + /* Buffer needs mapping and handle is not started? */ + if (!mpd->do_map) + return 0; /* Everything mapped so far and we hit EOF */ break; } @@ -2747,6 +2754,29 @@ retry: tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); done = false; blk_start_plug(&plug); + + /* + * First writeback pages that don't need mapping - we can avoid + * starting a transaction unnecessarily and also avoid being blocked + * in the block layer on device congestion while having transaction + * started. + */ + mpd.do_map = 0; + mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); + if (!mpd.io_submit.io_end) { + ret = -ENOMEM; + goto unplug; + } + ret = mpage_prepare_extent_to_map(&mpd); + /* Submit prepared bio */ + ext4_io_submit(&mpd.io_submit); + ext4_put_io_end_defer(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; + /* Unlock pages we didn't use */ + mpage_release_unused_pages(&mpd, false); + if (ret < 0) + goto unplug; + while (!done && mpd.first_page <= mpd.last_page) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); @@ -2775,8 +2805,10 @@ retry: wbc->nr_to_write, inode->i_ino, ret); /* Release allocated io_end */ ext4_put_io_end(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; break; } + mpd.do_map = 1; trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); @@ -2807,6 +2839,7 @@ retry: if (!ext4_handle_valid(handle) || handle->h_sync == 0) { ext4_journal_stop(handle); handle = NULL; + mpd.do_map = 0; } /* Submit prepared bio */ ext4_io_submit(&mpd.io_submit); @@ -2824,6 +2857,7 @@ retry: ext4_journal_stop(handle); } else ext4_put_io_end(mpd.io_submit.io_end); + mpd.io_submit.io_end = NULL; if (ret == -ENOSPC && sbi->s_journal) { /* @@ -2839,6 +2873,7 @@ retry: if (ret) break; } +unplug: blk_finish_plug(&plug); if (!ret && !cycled && wbc->nr_to_write > 0) { cycled = 1; @@ -5855,6 +5890,11 @@ int ext4_page_mkwrite(struct vm_fault *vmf) file_update_time(vma->vm_file); down_read(&EXT4_I(inode)->i_mmap_sem); + + ret = ext4_convert_inline_data(inode); + if (ret) + goto out_ret; + /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 184e74eb3004..0c21e22acd74 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -19,6 +19,9 @@ #include <linux/delay.h> #include "ext4_jbd2.h" #include "ext4.h" +#include <linux/fsmap.h> +#include "fsmap.h" +#include <trace/events/ext4.h> /** * Swap memory between @a and @b for @len bytes. @@ -443,7 +446,7 @@ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) return iflags; } -int ext4_shutdown(struct super_block *sb, unsigned long arg) +static int ext4_shutdown(struct super_block *sb, unsigned long arg) { struct ext4_sb_info *sbi = EXT4_SB(sb); __u32 flags; @@ -489,6 +492,90 @@ int ext4_shutdown(struct super_block *sb, unsigned long arg) return 0; } +struct getfsmap_info { + struct super_block *gi_sb; + struct fsmap_head __user *gi_data; + unsigned int gi_idx; + __u32 gi_last_flags; +}; + +static int ext4_getfsmap_format(struct ext4_fsmap *xfm, void *priv) +{ + struct getfsmap_info *info = priv; + struct fsmap fm; + + trace_ext4_getfsmap_mapping(info->gi_sb, xfm); + + info->gi_last_flags = xfm->fmr_flags; + ext4_fsmap_from_internal(info->gi_sb, &fm, xfm); + if (copy_to_user(&info->gi_data->fmh_recs[info->gi_idx++], &fm, + sizeof(struct fsmap))) + return -EFAULT; + + return 0; +} + +static int ext4_ioc_getfsmap(struct super_block *sb, + struct fsmap_head __user *arg) +{ + struct getfsmap_info info = {0}; + struct ext4_fsmap_head xhead = {0}; + struct fsmap_head head; + bool aborted = false; + int error; + + if (copy_from_user(&head, arg, sizeof(struct fsmap_head))) + return -EFAULT; + if (memchr_inv(head.fmh_reserved, 0, sizeof(head.fmh_reserved)) || + memchr_inv(head.fmh_keys[0].fmr_reserved, 0, + sizeof(head.fmh_keys[0].fmr_reserved)) || + memchr_inv(head.fmh_keys[1].fmr_reserved, 0, + sizeof(head.fmh_keys[1].fmr_reserved))) + return -EINVAL; + /* + * ext4 doesn't report file extents at all, so the only valid + * file offsets are the magic ones (all zeroes or all ones). + */ + if (head.fmh_keys[0].fmr_offset || + (head.fmh_keys[1].fmr_offset != 0 && + head.fmh_keys[1].fmr_offset != -1ULL)) + return -EINVAL; + + xhead.fmh_iflags = head.fmh_iflags; + xhead.fmh_count = head.fmh_count; + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[0], &head.fmh_keys[0]); + ext4_fsmap_to_internal(sb, &xhead.fmh_keys[1], &head.fmh_keys[1]); + + trace_ext4_getfsmap_low_key(sb, &xhead.fmh_keys[0]); + trace_ext4_getfsmap_high_key(sb, &xhead.fmh_keys[1]); + + info.gi_sb = sb; + info.gi_data = arg; + error = ext4_getfsmap(sb, &xhead, ext4_getfsmap_format, &info); + if (error == EXT4_QUERY_RANGE_ABORT) { + error = 0; + aborted = true; + } else if (error) + return error; + + /* If we didn't abort, set the "last" flag in the last fmx */ + if (!aborted && info.gi_idx) { + info.gi_last_flags |= FMR_OF_LAST; + if (copy_to_user(&info.gi_data->fmh_recs[info.gi_idx - 1].fmr_flags, + &info.gi_last_flags, + sizeof(info.gi_last_flags))) + return -EFAULT; + } + + /* copy back header */ + head.fmh_entries = xhead.fmh_entries; + head.fmh_oflags = xhead.fmh_oflags; + if (copy_to_user(arg, &head, sizeof(struct fsmap_head))) + return -EFAULT; + + return 0; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -499,6 +586,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { + case FS_IOC_GETFSMAP: + return ext4_ioc_getfsmap(sb, (void __user *)arg); case EXT4_IOC_GETFLAGS: flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); @@ -1007,6 +1096,7 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case EXT4_IOC_GET_ENCRYPTION_PWSALT: case EXT4_IOC_GET_ENCRYPTION_POLICY: case EXT4_IOC_SHUTDOWN: + case FS_IOC_GETFSMAP: break; default: return -ENOIOCTLCMD; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b60698c104fd..5083bce20ac4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -357,7 +357,7 @@ static struct kmem_cache *ext4_free_data_cachep; #define NR_GRPINFO_CACHES 8 static struct kmem_cache *ext4_groupinfo_caches[NR_GRPINFO_CACHES]; -static const char *ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { +static const char * const ext4_groupinfo_slab_names[NR_GRPINFO_CACHES] = { "ext4_groupinfo_1k", "ext4_groupinfo_2k", "ext4_groupinfo_4k", "ext4_groupinfo_8k", "ext4_groupinfo_16k", "ext4_groupinfo_32k", "ext4_groupinfo_64k", "ext4_groupinfo_128k" @@ -5277,3 +5277,52 @@ out: range->len = EXT4_C2B(EXT4_SB(sb), trimmed) << sb->s_blocksize_bits; return ret; } + +/* Iterate all the free extents in the group. */ +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t group, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn formatter, + void *priv) +{ + void *bitmap; + ext4_grpblk_t next; + struct ext4_buddy e4b; + int error; + + error = ext4_mb_load_buddy(sb, group, &e4b); + if (error) + return error; + bitmap = e4b.bd_bitmap; + + ext4_lock_group(sb, group); + + start = (e4b.bd_info->bb_first_free > start) ? + e4b.bd_info->bb_first_free : start; + if (end >= EXT4_CLUSTERS_PER_GROUP(sb)) + end = EXT4_CLUSTERS_PER_GROUP(sb) - 1; + + while (start <= end) { + start = mb_find_next_zero_bit(bitmap, end + 1, start); + if (start > end) + break; + next = mb_find_next_bit(bitmap, end + 1, start); + + ext4_unlock_group(sb, group); + error = formatter(sb, group, start, next - start, priv); + if (error) + goto out_unload; + ext4_lock_group(sb, group); + + start = next + 1; + } + + ext4_unlock_group(sb, group); +out_unload: + ext4_mb_unload_buddy(&e4b); + + return error; +} diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 1aba469f8220..2bed62084a8c 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -199,4 +199,21 @@ static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, return ext4_group_first_block_no(sb, fex->fe_group) + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } + +typedef int (*ext4_mballoc_query_range_fn)( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t len, + void *priv); + +int +ext4_mballoc_query_range( + struct super_block *sb, + ext4_group_t agno, + ext4_grpblk_t start, + ext4_grpblk_t end, + ext4_mballoc_query_range_fn formatter, + void *priv); + #endif diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 07e5e1405771..b81f7d46f344 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1237,37 +1237,24 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) } /* - * NOTE! unlike strncmp, ext4_match returns 1 for success, 0 for failure. + * Test whether a directory entry matches the filename being searched for. * - * `len <= EXT4_NAME_LEN' is guaranteed by caller. - * `de != NULL' is guaranteed by caller. + * Return: %true if the directory entry matches, otherwise %false. */ -static inline int ext4_match(struct ext4_filename *fname, - struct ext4_dir_entry_2 *de) +static inline bool ext4_match(const struct ext4_filename *fname, + const struct ext4_dir_entry_2 *de) { - const void *name = fname_name(fname); - u32 len = fname_len(fname); + struct fscrypt_name f; if (!de->inode) - return 0; + return false; + f.usr_fname = fname->usr_fname; + f.disk_name = fname->disk_name; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (unlikely(!name)) { - if (fname->usr_fname->name[0] == '_') { - int ret; - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - fname->crypto_buf.name + 8, 16); - return (ret == 0) ? 1 : 0; - } - name = fname->crypto_buf.name; - len = fname->crypto_buf.len; - } + f.crypto_buf = fname->crypto_buf; #endif - if (de->name_len != len) - return 0; - return (memcmp(de->name, name, len) == 0) ? 1 : 0; + return fscrypt_match_name(&f, de->name, de->name_len); } /* @@ -1281,48 +1268,31 @@ int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - int res; de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ - if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(fname, de); - if (res < 0) { - res = -1; - goto return_result; - } - if (res > 0) { - /* found a match - just to be sure, do - * a full check */ - if (ext4_check_dir_entry(dir, NULL, de, bh, - bh->b_data, - bh->b_size, offset)) { - res = -1; - goto return_result; - } - *res_dir = de; - res = 1; - goto return_result; - } - + if ((char *) de + de->name_len <= dlimit && + ext4_match(fname, de)) { + /* found a match - just to be sure, do + * a full check */ + if (ext4_check_dir_entry(dir, NULL, de, bh, bh->b_data, + bh->b_size, offset)) + return -1; + *res_dir = de; + return 1; } /* prevent looping on a bad block */ de_len = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); - if (de_len <= 0) { - res = -1; - goto return_result; - } + if (de_len <= 0) + return -1; offset += de_len; de = (struct ext4_dir_entry_2 *) ((char *) de + de_len); } - - res = 0; -return_result: - return res; + return 0; } static int is_dx_internal_node(struct inode *dir, ext4_lblk_t block, @@ -1616,16 +1586,9 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - int nokey = ext4_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - if (nokey) { - iput(inode); - return ERR_PTR(-ENOKEY); - } ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu", - (unsigned long) dir->i_ino, - (unsigned long) inode->i_ino); + dir->i_ino, inode->i_ino); iput(inode); return ERR_PTR(-EPERM); } @@ -1833,24 +1796,15 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, int nlen, rlen; unsigned int offset = 0; char *top; - int res; de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { if (ext4_check_dir_entry(dir, NULL, de, bh, - buf, buf_size, offset)) { - res = -EFSCORRUPTED; - goto return_result; - } - /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(fname, de); - if (res < 0) - goto return_result; - if (res > 0) { - res = -EEXIST; - goto return_result; - } + buf, buf_size, offset)) + return -EFSCORRUPTED; + if (ext4_match(fname, de)) + return -EEXIST; nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) @@ -1858,22 +1812,17 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, de = (struct ext4_dir_entry_2 *)((char *)de + rlen); offset += rlen; } - if ((char *) de > top) - res = -ENOSPC; - else { - *dest_de = de; - res = 0; - } -return_result: - return res; + return -ENOSPC; + + *dest_de = de; + return 0; } -int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - struct ext4_filename *fname) +void ext4_insert_dentry(struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname) { int nlen, rlen; @@ -1892,7 +1841,6 @@ int ext4_insert_dentry(struct inode *dir, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); - return 0; } /* @@ -1928,11 +1876,8 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, return err; } - /* By now the buffer is marked for journaling. Due to crypto operations, - * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, fname); - if (err < 0) - return err; + /* By now the buffer is marked for journaling */ + ext4_insert_dentry(inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 208241b06662..1a82138ba739 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -297,8 +297,17 @@ static void ext4_end_bio(struct bio *bio) { ext4_io_end_t *io_end = bio->bi_private; sector_t bi_sector = bio->bi_iter.bi_sector; + char b[BDEVNAME_SIZE]; - BUG_ON(!io_end); + if (WARN_ONCE(!io_end, "io_end is NULL: %s: sector %Lu len %u err %d\n", + bdevname(bio->bi_bdev, b), + (long long) bio->bi_iter.bi_sector, + (unsigned) bio_sectors(bio), + bio->bi_error)) { + ext4_finish_bio(bio); + bio_put(bio); + return; + } bio->bi_end_io = NULL; if (bio->bi_error) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index b2c74644d5de..c90edf09b0c3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -49,6 +49,7 @@ #include "xattr.h" #include "acl.h" #include "mballoc.h" +#include "fsmap.h" #define CREATE_TRACE_POINTS #include <trace/events/ext4.h> @@ -1230,7 +1231,7 @@ static const struct fscrypt_operations ext4_cryptops = { #endif #ifdef CONFIG_QUOTA -static char *quotatypes[] = INITQFNAMES; +static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); @@ -1443,7 +1444,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" +static const char deprecated_msg[] = + "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; #ifdef CONFIG_QUOTA @@ -3898,6 +3900,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) bgl_lock_init(sbi->s_blockgroup_lock); + /* Pre-read the descriptors into the buffer cache */ + for (i = 0; i < db_count; i++) { + block = descriptor_loc(sb, logical_sb_block, i); + sb_breadahead(sb, block); + } + for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); @@ -4650,7 +4658,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) if (sync) { unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? REQ_FUA : REQ_SYNC); + REQ_SYNC | (test_opt(sb, BARRIER) ? REQ_FUA : 0)); if (error) return error; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index 42145be5c6b4..d74dc5f81a04 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -34,7 +34,7 @@ typedef enum { ptr_ext4_super_block_offset, } attr_ptr_t; -static const char *proc_dirname = "fs/ext4"; +static const char proc_dirname[] = "fs/ext4"; static struct proc_dir_entry *ext4_proc_root; struct ext4_attr { @@ -375,7 +375,7 @@ static const struct file_operations ext4_seq_##name##_fops = { \ PROC_FILE_SHOW_DEFN(es_shrinker_info); PROC_FILE_SHOW_DEFN(options); -static struct ext4_proc_files { +static const struct ext4_proc_files { const char *name; const struct file_operations *fops; } proc_files[] = { @@ -388,7 +388,7 @@ static struct ext4_proc_files { int ext4_register_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_proc_files *p; + const struct ext4_proc_files *p; int err; sbi->s_kobj.kset = &ext4_kset; @@ -412,7 +412,7 @@ int ext4_register_sysfs(struct super_block *sb) void ext4_unregister_sysfs(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_proc_files *p; + const struct ext4_proc_files *p; if (sbi->s_proc) { for (p = proc_files; p->name; p++) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 996e7900d4c8..8fb7ce14e6eb 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -78,10 +78,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); -static int ext4_xattr_list(struct dentry *dentry, char *buffer, - size_t buffer_size); -static const struct xattr_handler *ext4_xattr_handler_map[] = { +static const struct xattr_handler * const ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, #ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, @@ -163,20 +161,9 @@ ext4_xattr_handler(int name_index) return handler; } -/* - * Inode operation listxattr() - * - * d_inode(dentry)->i_mutex: don't care - */ -ssize_t -ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) -{ - return ext4_xattr_list(dentry, buffer, size); -} - static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, - void *value_start) +ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end, + void *value_start) { struct ext4_xattr_entry *e = entry; @@ -230,8 +217,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return -EFSCORRUPTED; if (!ext4_xattr_block_csum_verify(inode, bh)) return -EFSBADCRC; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, - bh->b_data); + error = ext4_xattr_check_entries(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); if (!error) set_buffer_verified(bh); return error; @@ -246,7 +233,7 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); + error = ext4_xattr_check_entries(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -257,20 +244,9 @@ errout: #define xattr_check_inode(inode, header, end) \ __xattr_check_inode((inode), (header), (end), __func__, __LINE__) -static inline int -ext4_xattr_check_entry(struct ext4_xattr_entry *entry, size_t size) -{ - size_t value_size = le32_to_cpu(entry->e_value_size); - - if (entry->e_value_block != 0 || value_size > size || - le16_to_cpu(entry->e_value_offs) + value_size > size) - return -EFSCORRUPTED; - return 0; -} - static int ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, - const char *name, size_t size, int sorted) + const char *name, int sorted) { struct ext4_xattr_entry *entry; size_t name_len; @@ -290,8 +266,6 @@ ext4_xattr_find_entry(struct ext4_xattr_entry **pentry, int name_index, break; } *pentry = entry; - if (!cmp && ext4_xattr_check_entry(entry, size)) - return -EFSCORRUPTED; return cmp ? -ENODATA : 0; } @@ -319,7 +293,6 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(inode, bh)) { -bad_block: EXT4_ERROR_INODE(inode, "bad block %llu", EXT4_I(inode)->i_file_acl); error = -EFSCORRUPTED; @@ -327,9 +300,7 @@ bad_block: } ext4_xattr_cache_insert(ext4_mb_cache, bh); entry = BFIRST(bh); - error = ext4_xattr_find_entry(&entry, name_index, name, bh->b_size, 1); - if (error == -EFSCORRUPTED) - goto bad_block; + error = ext4_xattr_find_entry(&entry, name_index, name, 1); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -366,13 +337,12 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; error = xattr_check_inode(inode, header, end); if (error) goto cleanup; - error = ext4_xattr_find_entry(&entry, name_index, name, - end - (void *)entry, 0); + entry = IFIRST(header); + error = ext4_xattr_find_entry(&entry, name_index, name, 0); if (error) goto cleanup; size = le32_to_cpu(entry->e_value_size); @@ -519,7 +489,9 @@ cleanup: } /* - * ext4_xattr_list() + * Inode operation listxattr() + * + * d_inode(dentry)->i_rwsem: don't care * * Copy a list of attribute names into the buffer * provided, or compute the buffer size required. @@ -528,8 +500,8 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -static int -ext4_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +ssize_t +ext4_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) { int ret, ret2; @@ -804,7 +776,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, bs->s.end = bs->bh->b_data + bs->bh->b_size; bs->s.here = bs->s.first; error = ext4_xattr_find_entry(&bs->s.here, i->name_index, - i->name, bs->bh->b_size, 1); + i->name, 1); if (error && error != -ENODATA) goto cleanup; bs->s.not_found = error; @@ -1076,8 +1048,7 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, return error; /* Find the named attribute. */ error = ext4_xattr_find_entry(&is->s.here, i->name_index, - i->name, is->s.end - - (void *)is->s.base, 0); + i->name, 0); if (error && error != -ENODATA) return error; is->s.not_found = error; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 0339daf4ca02..ea9c317b5916 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -275,10 +275,11 @@ static int f2fs_write_meta_pages(struct address_space *mapping, get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); written = sync_meta_pages(sbi, META, wbc->nr_to_write); mutex_unlock(&sbi->cp_mutex); @@ -567,7 +568,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } @@ -677,7 +678,7 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; @@ -816,7 +817,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -941,6 +944,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,21 +980,26 @@ retry_flush_dents: err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; + cond_resched(); goto retry_flush_dents; } + /* + * POR: we should ensure that there are no dirty node pages + * until finishing nat/sit flush. inode->i_blocks can be updated. + */ + down_write(&sbi->node_change); + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); err = f2fs_sync_inode_meta(sbi); if (err) goto out; + cond_resched(); goto retry_flush_dents; } - /* - * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. - */ retry_flush_nodes: down_write(&sbi->node_write); @@ -986,11 +1007,20 @@ retry_flush_nodes: up_write(&sbi->node_write); err = sync_node_pages(sbi, &wbc); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -1024,16 +1054,20 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) spin_lock(&sbi->cp_lock); - if (cpc->reason == CP_UMOUNT && ckpt->cp_pack_total_block_count > + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) disable_nat_bits(sbi, false); - if (cpc->reason == CP_UMOUNT) + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1057,7 +1091,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1074,14 +1107,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1100,10 +1130,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); spin_lock(&sbi->cp_lock); @@ -1143,7 +1169,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* write nat bits */ if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); - unsigned int i; block_t blk; cp_ver |= ((__u64)crc32 << 32); @@ -1250,8 +1275,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1273,7 +1298,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if (!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1311,7 +1336,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1602b4bccae6..7c0f6bdf817d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -309,7 +309,7 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC; if (!test_opt(sbi, NOBARRIER)) io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA; } @@ -341,7 +341,7 @@ void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -362,6 +362,9 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } @@ -787,6 +790,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. @@ -829,7 +847,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -939,7 +957,7 @@ skip: f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -948,7 +966,7 @@ sync_out: f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -1151,9 +1169,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, @@ -1283,17 +1302,83 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_encrypted_page_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_bios(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = false; + goto got_it; + } + } + + if (fio->need_lock) + f2fs_lock_op(fio->sbi); + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1302,31 +1387,10 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - PAGE_SIZE, 0, - fio->page->index, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } +got_it: + err = encrypt_one_page(fio); + if (err) + goto out_writepage; set_page_writeback(page); @@ -1334,22 +1398,27 @@ retry_encrypt: * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + f2fs_put_dnode(&dn); + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock) + f2fs_unlock_op(fio->sbi); return err; } @@ -1370,9 +1439,11 @@ static int __write_data_page(struct page *page, bool *submitted, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, .submitted = false, + .need_lock = true, }; trace_f2fs_writepage(page, DATA); @@ -1408,6 +1479,7 @@ write: /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = false; err = do_write_data_page(&fio); goto done; } @@ -1416,6 +1488,8 @@ write: need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; if (f2fs_has_inline_data(inode)) { @@ -1423,12 +1497,12 @@ write: if (!err) goto out; } - f2fs_lock_op(sbi); + if (err == -EAGAIN) err = do_write_data_page(&fio); if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1441,12 +1515,14 @@ out: if (wbc->for_reclaim) { f2fs_submit_merged_bio_cond(sbi, inode, 0, page->index, DATA, WRITE); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); if (unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); @@ -1495,6 +1571,12 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1580,8 +1662,10 @@ continue_unlock: last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1637,9 +1721,18 @@ static int f2fs_write_data_pages(struct address_space *mapping, trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. @@ -1687,7 +1780,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1723,7 +1816,8 @@ restart: err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1737,7 +1831,7 @@ out: f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1951,7 +2045,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index ee2d0a485fc3..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,15 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); - if (SM_I(sbi) && SM_I(sbi)->fcc_info) - si->nr_flush = - atomic_read(&SM_I(sbi)->fcc_info->submit_flush); - if (SM_I(sbi) && SM_I(sbi)->dcc_info) - si->nr_discard = - atomic_read(&SM_I(sbi)->dcc_info->submit_discard); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -86,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -99,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -124,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -156,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -208,8 +224,11 @@ get_cache: /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); - if (SM_I(sbi)->dcc_info) + if (SM_I(sbi)->dcc_info) { si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -330,11 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: %4d, Discard: %4d)\n", + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", si->nr_wb_cp_data, si->nr_wb_data, - si->nr_flush, si->nr_discard); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -347,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -434,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 8d5c62b07b28..94756f55a97e 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -207,13 +192,9 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, f2fs_put_page(dentry_page, 0); } - /* This is to increase the speed of f2fs_create */ - if (!de && room) { - F2FS_I(dir)->task = current; - if (F2FS_I(dir)->chash != namehash) { - F2FS_I(dir)->chash = namehash; - F2FS_I(dir)->clevel = level; - } + if (!de && room && F2FS_I(dir)->chash != namehash) { + F2FS_I(dir)->chash = namehash; + F2FS_I(dir)->clevel = level; } return de; @@ -254,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -337,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -384,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -438,8 +404,11 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. @@ -542,7 +511,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -599,11 +568,9 @@ add_dentry: err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -911,7 +878,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index c6934f014e0f..2f98d7039701 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include <trace/events/f2fs.h> +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -237,17 +380,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +406,6 @@ out: return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. - */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +460,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -447,8 +510,11 @@ static void f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1fc17a1fc5d0..2185c7a040a1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -50,6 +50,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -62,7 +63,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -88,9 +89,9 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -124,22 +125,20 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 #define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) -#define MAX_DISCARD_BLOCKS(sbi) \ - ((1 << (sbi)->log_blocks_per_seg) * (sbi)->segs_per_sec) -#define DISCARD_ISSUE_RATE 8 +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -181,37 +180,63 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ }; +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + enum { D_PREP, D_SUBMIT, D_DONE, }; +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ +}; + struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* compleation */ - block_t lstart; /* logical start address */ - block_t len; /* length */ - struct bio *bio; /* bio */ - int state; /* state */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ }; struct discard_cmd_control { struct task_struct *f2fs_issue_discard; /* discard thread */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - int nr_discards; /* # of discards in the list */ - struct list_head discard_cmd_list; /* discard cmd list */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + struct list_head wait_list; /* store on-flushing entries */ wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ struct mutex cmd_lock; - int max_discards; /* max. discards to be issued */ - atomic_t submit_discard; /* # of issued discard */ + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -222,13 +247,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -270,11 +295,14 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -311,6 +339,11 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + /* * For INODE and NODE manager */ @@ -323,26 +356,24 @@ struct f2fs_dentry_ptr { int max; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_inline_dentry *t) +{ + d->inode = inode; + d->max = NR_INLINE_DENTRY; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; } /* @@ -374,16 +405,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -500,6 +545,24 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) +{ + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); +} + static inline bool __is_extent_mergeable(struct extent_info *back, struct extent_info *front) { @@ -562,7 +625,6 @@ struct f2fs_nm_info { unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; unsigned char *nat_block_bitmap; unsigned short *free_nid_count; /* free nid count of NAT block */ - spinlock_t free_nid_lock; /* protect updating of nid count */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ @@ -641,7 +703,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -672,6 +735,7 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; @@ -722,6 +786,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, @@ -737,9 +802,10 @@ struct f2fs_io_info { struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ bool submitted; /* indicate IO submission */ + bool need_lock; /* indicate we need to lock cp_rwsem */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ @@ -827,6 +893,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -879,6 +946,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -912,11 +982,12 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ /* For sysfs suppport */ @@ -971,8 +1042,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -1193,7 +1264,7 @@ static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, { bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - return (cpc) ? (cpc->reason == CP_UMOUNT) && set : set; + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1229,7 +1300,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1707,6 +1778,7 @@ enum { FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1869,12 +1941,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -2009,12 +2075,6 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) - /* * file.c */ @@ -2076,8 +2136,6 @@ ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, struct page **page); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode); -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, const struct qstr *name, f2fs_hash_t name_hash, unsigned int bit_pos); @@ -2113,7 +2171,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2164,6 +2223,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *inode, struct page *page); void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); int commit_inmem_pages(struct inode *inode); void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); @@ -2173,7 +2233,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); @@ -2185,7 +2245,7 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void write_meta_page(struct f2fs_sb_info *sbi, struct page *page); void write_node_page(unsigned int nid, struct f2fs_io_info *fio); void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); -void rewrite_data_page(struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, bool recover_curseg, bool recover_newaddr); @@ -2290,7 +2350,8 @@ int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, int start_gc_thread(struct f2fs_sb_info *sbi); void stop_gc_thread(struct f2fs_sb_info *sbi); block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); void build_gc_manager(struct f2fs_sb_info *sbi); /* @@ -2314,11 +2375,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; - int bg_gc, nr_wb_cp_data, nr_wb_data, nr_flush, nr_discard; + int bg_gc, nr_wb_cp_data, nr_wb_data; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; int inline_xattr, inline_inode, inline_dir, append, update, orphans; - int aw_cnt, max_aw_cnt; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2401,11 +2466,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2415,14 +2491,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2430,7 +2506,7 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) int f2fs_build_stats(struct f2fs_sb_info *sbi); @@ -2438,32 +2514,35 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2489,7 +2568,7 @@ extern struct kmem_cache *inode_entry_slab; bool f2fs_may_inline_data(struct inode *inode); bool f2fs_may_inline_dentry(struct inode *inode); void read_inline_data(struct page *page, struct page *ipage); -bool truncate_inline_inode(struct page *ipage, u64 from); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); int f2fs_read_inline_data(struct inode *inode, struct page *page); int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); int f2fs_convert_inline_inode(struct inode *inode); @@ -2524,6 +2603,18 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); void f2fs_drop_extent_tree(struct inode *inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 0849af78381f..61af721329fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -116,11 +116,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; @@ -528,7 +523,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); @@ -566,9 +561,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - truncate_inline_inode(ipage, from); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -617,6 +610,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -1188,8 +1187,6 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { @@ -1257,8 +1254,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; @@ -1428,6 +1426,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1474,10 +1473,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { @@ -1491,10 +1490,11 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); + + inode_unlock(inode); out: mnt_drop_write_file(filp); return ret; @@ -1515,6 +1515,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1529,20 +1532,25 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + goto out; + } + +inc_stat: stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1592,6 +1600,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1605,6 +1616,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1660,6 +1674,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1841,7 +1856,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -1879,13 +1894,12 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1943,7 +1957,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -2020,42 +2034,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) return -EINVAL; - err = mnt_want_write_file(filp); - if (err) - return err; - - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } + if (f2fs_readonly(sbi->sb)) + return -EROFS; if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } + sizeof(range))) + return -EFAULT; /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2189,6 +2201,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2198,6 +2212,69 @@ err_out: return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2235,6 +2312,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); default: return -ENOTTY; } @@ -2302,8 +2381,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GARBAGE_COLLECT: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 418fd9881646..026522107ca3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -84,7 +84,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -172,7 +172,11 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -182,7 +186,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -207,7 +211,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) continue; clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -215,8 +219,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -225,7 +229,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -248,7 +252,7 @@ static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, unsigned int segno) { unsigned int valid_blocks = - get_valid_blocks(sbi, segno, sbi->segs_per_sec); + get_valid_blocks(sbi, segno, true); return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? valid_blocks * 2 : valid_blocks; @@ -291,6 +295,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -304,10 +309,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -320,9 +333,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -339,7 +353,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; @@ -357,17 +371,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -550,8 +565,10 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); @@ -697,8 +714,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = true, }; bool is_dirty = PageDirty(page); int err; @@ -890,7 +909,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -905,7 +924,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -924,7 +942,7 @@ next: blk_finish_plug(&plug); if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + get_valid_blocks(sbi, start_segno, true) == 0) sec_freed = 1; stat_inc_call_count(sbi->stat_info); @@ -932,13 +950,14 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), @@ -959,9 +978,11 @@ gc_more: * threshold, we can make them free by checkpoint. Then, we * secure free segments which doesn't need fggc any more. */ - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; + if (prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } if (has_not_enough_free_secs(sbi, 0, 0)) gc_type = FG_GC; } @@ -981,13 +1002,17 @@ gc_more: sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -999,7 +1024,7 @@ stop: void build_gc_manager(struct f2fs_sb_info *sbi) { - u64 main_count, resv_count, ovp_count, blocks_per_sec; + u64 main_count, resv_count, ovp_count; DIRTY_I(sbi)->v_ops = &default_v_ops; @@ -1007,8 +1032,12 @@ void build_gc_manager(struct f2fs_sb_info *sbi) main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; - blocks_per_sec = sbi->blocks_per_seg * sbi->segs_per_sec; - sbi->fggc_threshold = div64_u64((main_count - ovp_count) * blocks_per_sec, - (main_count - resv_count)); + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index e32a9e527968..e4c527c4e7d0 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -63,19 +63,21 @@ void read_inline_data(struct page *page, struct page *ipage) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; if (from >= MAX_INLINE_DATA) - return false; + return; addr = inline_data_addr(ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -135,6 +137,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -146,11 +149,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -267,9 +270,8 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -296,11 +298,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -319,7 +321,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -380,7 +382,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -400,7 +402,7 @@ static int f2fs_add_inline_entries(struct inode *dir, unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(NULL, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -455,7 +457,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -527,14 +529,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + name_hash = f2fs_dentry_hash(new_name, NULL); + make_dentry_ptr_inline(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -623,7 +623,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 24bb8213d974..518f49643092 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -316,7 +316,6 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -339,7 +338,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -372,13 +372,6 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) { - f2fs_show_injection_info(FAULT_EVICT_INODE); - goto no_delete; - } -#endif - remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -389,6 +382,12 @@ retry: if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); @@ -411,7 +410,10 @@ no_delete: stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { @@ -448,6 +450,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 98f00a3a7f50..c31b40e5f9cf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -148,8 +148,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +161,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -324,9 +324,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -423,8 +424,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -487,6 +486,8 @@ err_out: } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -508,8 +509,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -524,6 +523,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -554,8 +555,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -569,6 +568,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -595,8 +596,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -622,6 +621,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -720,13 +721,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -779,8 +773,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -908,8 +900,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -917,18 +909,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -972,14 +952,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 0ea1dca8a0e2..4547c5c5cd98 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -22,7 +22,7 @@ #include "trace.h" #include <trace/events/f2fs.h> -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +63,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -177,18 +178,12 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +376,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +402,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -1463,6 +1463,9 @@ continue_unlock: f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1766,40 +1769,67 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) return false; - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return false; - } - i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return true; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return true; - } - return true; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1821,7 +1851,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, - bool set, bool build, bool locked) + bool set, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); @@ -1835,14 +1865,10 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, else __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); - if (!locked) - spin_lock(&nm_i->free_nid_lock); if (set) nm_i->free_nid_count[nat_ofs]++; else if (!build) nm_i->free_nid_count[nat_ofs]--; - if (!locked) - spin_unlock(&nm_i->free_nid_lock); } static void scan_nat_page(struct f2fs_sb_info *sbi, @@ -1871,7 +1897,9 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) freed = add_free_nid(sbi, start_nid, true); - update_free_nid_bitmap(sbi, start_nid, freed, true, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -1927,6 +1955,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -2026,7 +2057,7 @@ retry: __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; - update_free_nid_bitmap(sbi, *nid, false, false, false); + update_free_nid_bitmap(sbi, *nid, false, false); spin_unlock(&nm_i->nid_list_lock); return true; @@ -2082,7 +2113,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&nm_i->nid_list_lock); @@ -2407,16 +2438,16 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; - update_free_nid_bitmap(sbi, nid, true, false, false); + update_free_nid_bitmap(sbi, nid, true, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } else { spin_lock(&NM_I(sbi)->nid_list_lock); - update_free_nid_bitmap(sbi, nid, false, false, false); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } @@ -2428,10 +2459,11 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* @@ -2476,8 +2508,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); - - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); + /* Allow dirty nats by node block allocation in write_begin */ } static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) @@ -2541,10 +2572,10 @@ inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) nid = i * NAT_ENTRY_PER_BLOCK; last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; - spin_lock(&nm_i->free_nid_lock); + spin_lock(&NM_I(sbi)->nid_list_lock); for (; nid < last_nid; nid++) - update_free_nid_bitmap(sbi, nid, true, true, true); - spin_unlock(&nm_i->free_nid_lock); + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } for (i = 0; i < nm_i->nat_blocks; i++) { @@ -2635,9 +2666,6 @@ static int init_free_nid_cache(struct f2fs_sb_info *sbi) sizeof(unsigned short), GFP_KERNEL); if (!nm_i->free_nid_count) return -ENOMEM; - - spin_lock_init(&nm_i->free_nid_lock); - return 0; } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 2f9603fa85a5..558048e33cf9 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,13 +200,16 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d025aa83fb5b..907d6b7dde6a 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -198,7 +198,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,7 +226,8 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; @@ -569,7 +571,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 87c962705550..96845854e7ee 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -250,6 +250,36 @@ void drop_inmem_pages(struct inode *inode) stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -261,7 +291,6 @@ static int __commit_inmem_pages(struct inode *inode, .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .encrypted_page = NULL, }; pgoff_t last_idx = ULONG_MAX; int err = 0; @@ -281,6 +310,9 @@ static int __commit_inmem_pages(struct inode *inode, } fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = false, err = do_write_data_page(&fio); if (err) { unlock_page(page); @@ -358,11 +390,8 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -371,7 +400,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -390,7 +419,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -411,32 +440,34 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH; bio->bi_bdev = bdev; ret = submit_bio_wait(bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - trace_f2fs_issue_flush(FDEV(i).bdev, - test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -458,6 +489,8 @@ repeat: fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -475,25 +508,29 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) - return submit_flush_wait(sbi); - - if (!atomic_read(&fcc->submit_flush)) { - int ret; + if (!test_opt(sbi, FLUSH_MERGE)) { + ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + return ret; + } - atomic_inc(&fcc->submit_flush); + if (!atomic_read(&fcc->issing_flush)) { + atomic_inc(&fcc->issing_flush); ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); + atomic_inc(&fcc->issing_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) @@ -501,10 +538,10 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issing_flush, 0); } return cmd.ret; @@ -524,7 +561,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; @@ -597,8 +635,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -618,7 +656,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -633,162 +671,407 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static void __add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *cmd_list = &(dcc->discard_cmd_list); + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; - bio->bi_private = dc; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; - mutex_lock(&dcc->cmd_lock); - list_add_tail(&dc->list, cmd_list); - mutex_unlock(&dcc->cmd_lock); + return dc; } -static void __remove_discard_cmd(struct f2fs_sb_info *sbi, struct discard_cmd *dc) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - int err = dc->bio->bi_error; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - if (dc->state == D_DONE) - atomic_dec(&(SM_I(sbi)->dcc_info->submit_discard)); + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - if (err == -EOPNOTSUPP) - err = 0; + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); + + return dc; +} + +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); - bio_put(dc->bio); list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - struct list_head *wait_list = &(dcc->discard_cmd_list); - struct discard_cmd *dc, *tmp; - struct blk_plug plug; - mutex_lock(&dcc->cmd_lock); + if (dc->error == -EOPNOTSUPP) + dc->error = 0; - blk_start_plug(&plug); + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", dc->error); + __detach_discard_cmd(dcc, dc); +} - list_for_each_entry_safe(dc, tmp, wait_list, list) { +static void f2fs_submit_discard_endio(struct bio *bio) +{ + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - if (blkaddr == NULL_ADDR) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(dc->bio); - atomic_inc(&dcc->submit_discard); - } - continue; + dc->error = bio->bi_error; + dc->state = D_DONE; + complete(&dc->wait); + bio_put(bio); +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + list_move_tail(&dc->list, &dcc->wait_list); } + } else { + __remove_discard_cmd(sbi, dc); + } +} - if (dc->lstart <= blkaddr && blkaddr < dc->lstart + dc->len) { - if (dc->state == D_SUBMIT) - wait_for_completion_io(&dc->wait); - else - __remove_discard_cmd(sbi, dc); +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root)); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } } - blk_finish_plug(&plug); +} - /* this comes from f2fs_put_super */ - if (blkaddr == NULL_ADDR) { - list_for_each_entry_safe(dc, tmp, wait_list, list) { - wait_for_completion_io(&dc->wait); - __remove_discard_cmd(sbi, dc); +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); } + mutex_unlock(&dcc->cmd_lock); } -static void f2fs_submit_discard_endio(struct bio *bio) +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; + block_t lblkstart = blkstart; - complete(&dc->wait); - dc->state = D_DONE; + trace_f2fs_queue_discard(bdev, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; } -static int issue_discard_thread(void *data) +static void __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) { - struct f2fs_sb_info *sbi = data; struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - wait_queue_head_t *q = &dcc->discard_wait_queue; - struct list_head *cmd_list = &dcc->discard_cmd_list; + struct list_head *pend_list; struct discard_cmd *dc, *tmp; struct blk_plug plug; - int iter = 0; -repeat: - if (kthread_should_stop()) - return 0; + int i, iter = 0; + mutex_lock(&dcc->cmd_lock); blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + if (!issue_cond || is_idle(sbi)) + __submit_discard_cmd(sbi, dc); + if (issue_cond && iter++ > DISCARD_ISSUE_RATE) + goto out; + } + } +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; mutex_lock(&dcc->cmd_lock); - list_for_each_entry_safe(dc, tmp, cmd_list, list) { - if (dc->state == D_PREP) { - dc->state = D_SUBMIT; - submit_bio(dc->bio); - atomic_inc(&dcc->submit_discard); - if (iter++ > DISCARD_ISSUE_RATE) - break; - } else if (dc->state == D_DONE) { + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || dc->state == D_DONE) { + if (dc->ref) + continue; + wait_for_completion_io(&dc->wait); __remove_discard_cmd(sbi, dc); } } mutex_unlock(&dcc->cmd_lock); +} - blk_finish_plug(&plug); +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; - iter = 0; - congestion_wait(BLK_RW_SYNC, HZ/50); + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); - wait_event_interruptible(*q, - kthread_should_stop() || !list_empty(&dcc->discard_cmd_list)); - goto repeat; + if (need_wait) { + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); + } } - -/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, - struct block_device *bdev, block_t blkstart, block_t blklen) +/* This comes from f2fs_put_super */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) { - struct bio *bio = NULL; - block_t lblkstart = blkstart; - int err; + __issue_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, false); +} - trace_f2fs_issue_discard(bdev, blkstart, blklen); +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; +repeat: + if (kthread_should_stop()) + return 0; - if (sbi->s_ndevs) { - int devi = f2fs_target_device_index(sbi, blkstart); + __issue_discard_cmd(sbi, true); + __wait_discard_cmd(sbi, true); - blkstart -= FDEV(devi).start_blk; - } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - bio->bi_end_io = f2fs_submit_discard_endio; - bio->bi_opf |= REQ_SYNC; + congestion_wait(BLK_RW_SYNC, HZ/50); - __add_discard_cmd(sbi, bio, lblkstart, blklen); - wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); - } - return err; + wait_event_interruptible(*q, kthread_should_stop() || + atomic_read(&dcc->discard_cmd_cnt)); + goto repeat; } #ifdef CONFIG_BLK_DEV_ZONED @@ -796,6 +1079,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { @@ -813,7 +1097,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: sector = SECTOR_FROM_BLOCK(blkstart); @@ -845,7 +1129,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -888,32 +1172,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->dcc_info->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len && - last->len < MAX_DISCARD_BLOCKS(sbi)) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->dcc_info->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -925,7 +1183,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -957,14 +1217,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -990,13 +1260,13 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->dcc_info->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1026,10 +1296,10 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); @@ -1043,22 +1313,46 @@ next: /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (force && len < cpc->trim_minlen) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->dcc_info->nr_discards -= entry->len; + SM_I(sbi)->dcc_info->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } + + wake_up(&SM_I(sbi)->dcc_info->discard_wait_queue); } static int create_discard_cmd_control(struct f2fs_sb_info *sbi) { dev_t dev = sbi->sb->s_bdev->bd_dev; struct discard_cmd_control *dcc; - int err = 0; + int err = 0, i; if (SM_I(sbi)->dcc_info) { dcc = SM_I(sbi)->dcc_info; @@ -1069,12 +1363,18 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) if (!dcc) return -ENOMEM; - INIT_LIST_HEAD(&dcc->discard_entry_list); - INIT_LIST_HEAD(&dcc->discard_cmd_list); + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) + INIT_LIST_HEAD(&dcc->pend_list[i]); + INIT_LIST_HEAD(&dcc->wait_list); mutex_init(&dcc->cmd_lock); - atomic_set(&dcc->submit_discard, 0); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); dcc->nr_discards = 0; - dcc->max_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; init_waitqueue_head(&dcc->discard_wait_queue); SM_I(sbi)->dcc_info = dcc; @@ -1091,20 +1391,22 @@ init_thread: return err; } -static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi, bool free) +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) { struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; - if (dcc && dcc->f2fs_issue_discard) { + if (!dcc) + return; + + if (dcc->f2fs_issue_discard) { struct task_struct *discard_thread = dcc->f2fs_issue_discard; dcc->f2fs_issue_discard = NULL; kthread_stop(discard_thread); } - if (free) { - kfree(dcc); - SM_I(sbi)->dcc_info = NULL; - } + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1345,6 +1647,17 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, f2fs_put_page(page, 1); } +static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + unsigned int segno = curseg->segno + 1; + struct free_segmap_info *free_i = FREE_I(sbi); + + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) + return !test_bit(segno, free_i->free_segmap); + return 0; +} + /* * Find a new segment from the free segments bitmap to right order * This function should be returned with success, otherwise BUG @@ -1355,8 +1668,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1366,8 +1679,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1398,8 +1711,8 @@ find_other_zone: secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1443,7 +1756,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1456,6 +1769,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -1474,6 +1801,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1549,12 +1877,15 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; int i, cnt; bool reversed = false; /* need_SSR() already forces to do this */ - if (v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; return 1; + } /* For node segments, let's do SSR more intensively */ if (IS_NODESEG(type)) { @@ -1578,9 +1909,10 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) for (; cnt-- > 0; reversed ? i-- : i++) { if (i == type) continue; - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, i, SSR)) + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } } return 0; } @@ -1592,17 +1924,21 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) static void allocate_segment_by_default(struct f2fs_sb_info *sbi, int type, bool force) { + struct curseg_info *curseg = CURSEG_I(sbi, type); + if (force) new_curseg(sbi, type, true); else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); + else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) + new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) change_curseg(sbi, type, true); else new_curseg(sbi, type, false); - stat_inc_seg_type(sbi, CURSEG_I(sbi, type)); + stat_inc_seg_type(sbi, curseg); } void allocate_new_segments(struct f2fs_sb_info *sbi) @@ -1734,18 +2070,16 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) if (p_type == DATA) { struct inode *inode = page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { if (IS_DNODE(page)) return is_cold_node(page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } @@ -1788,15 +2122,14 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, stat_inc_block_count(sbi, curseg); + if (!__has_curseg_space(sbi, type)) + sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); - if (!__has_curseg_space(sbi, type)) - sit_i->s_ops->allocate_segment(sbi, type, false); - mutex_unlock(&sit_i->sentry_lock); if (page && IS_NODESEG(type)) @@ -1868,11 +2201,11 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + return f2fs_submit_page_bio(fio); } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -2437,7 +2770,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2473,7 +2806,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2672,10 +3005,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2699,10 +3039,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2746,7 +3091,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2852,6 +3197,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; @@ -2988,7 +3334,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); - destroy_discard_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 313b99040aa6..010f336a7573 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,84 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +109,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -132,7 +138,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +236,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +314,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +369,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +390,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +402,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +424,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -477,12 +490,12 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) static inline int overprovision_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)overprovision_segments(sbi)); } static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool need_SSR(struct f2fs_sb_info *sbi) @@ -495,7 +508,7 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) return false; return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + 2 * reserved_sections(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +553,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,17 +561,15 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; @@ -572,6 +584,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -720,7 +741,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, unsigned int secno) { - if (get_valid_blocks(sbi, secno, sbi->segs_per_sec) >= + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= sbi->fggc_threshold) return true; return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 96fe8ed73100..83355ec4a92c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -116,6 +118,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -293,6 +296,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); @@ -318,6 +322,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -436,6 +441,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -787,7 +795,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (!sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -913,7 +928,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -986,7 +1003,7 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) if ((i % 10) == 0) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); if ((i % 10) == 9 || i == (total_segs - 1)) seq_putc(seq, '\n'); else @@ -1012,7 +1029,7 @@ static int segment_bits_seq_show(struct seq_file *seq, void *offset) seq_printf(seq, "%-10d", i); seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); + get_valid_blocks(sbi, i, false)); for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) seq_printf(seq, " %.2x", se->cur_valid_map[j]); seq_putc(seq, '\n'); @@ -1046,6 +1063,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1307,7 +1325,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); + return __sync_dirty_buffer(bh, REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1483,6 +1501,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1555,6 +1580,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1917,6 +1944,7 @@ try_onemore: mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); @@ -2022,6 +2050,10 @@ try_onemore: f2fs_join_shrinker(sbi); + err = f2fs_build_stats(sbi); + if (err) + goto free_nm; + /* if there are nt orphan nodes free them */ err = recover_orphan_inodes(sbi); if (err) @@ -2046,10 +2078,6 @@ try_onemore: goto free_root_inode; } - err = f2fs_build_stats(sbi); - if (err) - goto free_root_inode; - if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); @@ -2143,7 +2171,6 @@ free_proc: remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } - f2fs_destroy_stats(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2161,6 +2188,7 @@ free_node_inode: truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7298a4488f7f..832c5110abab 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -250,15 +250,13 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, void *cur_addr, *txattr_addr, *last_addr = NULL; nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; - unsigned int inline_size = 0; + unsigned int inline_size = inline_xattr_size(inode); int err = 0; - inline_size = inline_xattr_size(inode); - if (!size && !inline_size) return -ENODATA; - txattr_addr = kzalloc(inline_size + size + sizeof(__u32), + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -328,13 +326,14 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -358,19 +357,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -392,14 +391,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -454,7 +451,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, MAX_XATTR_BLOCK_SIZE); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); @@ -546,7 +543,9 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, const void *value, size_t size) { void *pval = entry->e_name + entry->e_name_len; - return (entry->e_value_size == size) && !memcmp(pval, value, size); + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); } static int __f2fs_setxattr(struct inode *inode, int index, diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d5a94928c116..dbcd1d16e669 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,8 +72,8 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) -#define MAX_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) -#define VALID_XATTR_BLOCK_SIZE (MAX_XATTR_BLOCK_SIZE - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) #define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ VALID_XATTR_BLOCK_SIZE) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index c43fe83ee708..5a0245e36240 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -700,8 +700,21 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; - jbd2_might_wait_for_commit(journal); read_lock(&journal->j_state_lock); +#ifdef CONFIG_PROVE_LOCKING + /* + * Some callers make sure transaction is already committing and in that + * case we cannot block on open handles anymore. So don't warn in that + * case. + */ + if (tid_gt(tid, journal->j_commit_sequence) && + (!journal->j_committing_transaction || + journal->j_committing_transaction->t_tid != tid)) { + read_unlock(&journal->j_state_lock); + jbd2_might_wait_for_commit(journal); + read_lock(&journal->j_state_lock); + } +#endif #ifdef CONFIG_JBD2_DEBUG if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_ERR @@ -922,7 +935,8 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, + REQ_SYNC | REQ_FUA); if (ret) goto out; @@ -1323,7 +1337,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - REQ_FUA); + REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1463,7 +1477,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, REQ_FUA); + jbd2_write_superblock(journal, REQ_SYNC | REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1730,7 +1744,7 @@ int jbd2_journal_destroy(journal_t *journal) write_unlock(&journal->j_state_lock); jbd2_mark_journal_empty(journal, - REQ_PREFLUSH | REQ_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1989,7 +2003,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2035,7 +2049,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, REQ_FUA); + jbd2_mark_journal_empty(journal, REQ_SYNC | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 19fcc9a3364e..566079d9b402 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -285,6 +285,15 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, goto out_dent; } + if (ubifs_crypt_is_encrypted(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + ubifs_warn(c, "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; + goto out_inode; + } + done: kfree(dent); fscrypt_free_filename(&nm); @@ -295,6 +304,8 @@ done: d_add(dentry, inode); return NULL; +out_inode: + iput(inode); out_dent: kfree(dent); out_fname: diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h index fe797d6ef89d..295584f31a4e 100644 --- a/include/kvm/arm_arch_timer.h +++ b/include/kvm/arm_arch_timer.h @@ -63,6 +63,8 @@ int kvm_timer_vcpu_reset(struct kvm_vcpu *vcpu, void kvm_timer_vcpu_init(struct kvm_vcpu *vcpu); void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_timer_update_run(struct kvm_vcpu *vcpu); void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu); u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid); diff --git a/include/kvm/arm_pmu.h b/include/kvm/arm_pmu.h index 92e7e97ca8ff..1ab4633adf4f 100644 --- a/include/kvm/arm_pmu.h +++ b/include/kvm/arm_pmu.h @@ -50,6 +50,8 @@ void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu); void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu); +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu); +void kvm_pmu_update_run(struct kvm_vcpu *vcpu); void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val); void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, u64 data, @@ -85,6 +87,11 @@ static inline void kvm_pmu_enable_counter(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_overflow_set(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_flush_hwstate(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_sync_hwstate(struct kvm_vcpu *vcpu) {} +static inline bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + return false; +} +static inline void kvm_pmu_update_run(struct kvm_vcpu *vcpu) {} static inline void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) {} static inline void kvm_pmu_set_counter_event_type(struct kvm_vcpu *vcpu, diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h index c0b3d999c266..581a59ea7e34 100644 --- a/include/kvm/arm_vgic.h +++ b/include/kvm/arm_vgic.h @@ -225,8 +225,6 @@ struct vgic_dist { struct vgic_v2_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; - u32 vgic_misr; /* Saved only */ - u64 vgic_eisr; /* Saved only */ u64 vgic_elrsr; /* Saved only */ u32 vgic_apr; u32 vgic_lr[VGIC_V2_MAX_LRS]; @@ -236,8 +234,6 @@ struct vgic_v3_cpu_if { u32 vgic_hcr; u32 vgic_vmcr; u32 vgic_sre; /* Restored only, change ignored */ - u32 vgic_misr; /* Saved only */ - u32 vgic_eisr; /* Saved only */ u32 vgic_elrsr; /* Saved only */ u32 vgic_ap0r[4]; u32 vgic_ap1r[4]; @@ -264,8 +260,6 @@ struct vgic_cpu { */ struct list_head ap_list_head; - u64 live_lrs; - /* * Members below are used with GICv3 emulation only and represent * parts of the redistributor. @@ -307,6 +301,9 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq); int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu); +void kvm_vgic_load(struct kvm_vcpu *vcpu); +void kvm_vgic_put(struct kvm_vcpu *vcpu); + #define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel)) #define vgic_initialized(k) ((k)->arch.vgic.initialized) #define vgic_ready(k) ((k)->arch.vgic.ready) diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e2d239ed4c60..b6feed6547ce 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -32,9 +32,9 @@ /* 0, 1(node nid), 2(meta nid) are reserved node id */ #define F2FS_RESERVED_NODE_NUM 3 -#define F2FS_ROOT_INO(sbi) (sbi->root_ino_num) -#define F2FS_NODE_INO(sbi) (sbi->node_ino_num) -#define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_ROOT_INO(sbi) ((sbi)->root_ino_num) +#define F2FS_NODE_INO(sbi) ((sbi)->node_ino_num) +#define F2FS_META_INO(sbi) ((sbi)->meta_ino_num) #define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ #define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ @@ -114,6 +114,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 #define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 @@ -161,7 +162,7 @@ struct f2fs_checkpoint { */ #define F2FS_ORPHANS_PER_BLOCK 1020 -#define GET_ORPHAN_BLOCKS(n) ((n + F2FS_ORPHANS_PER_BLOCK - 1) / \ +#define GET_ORPHAN_BLOCKS(n) (((n) + F2FS_ORPHANS_PER_BLOCK - 1) / \ F2FS_ORPHANS_PER_BLOCK) struct f2fs_orphan_block { @@ -302,6 +303,12 @@ struct f2fs_nat_block { #define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) /* + * F2FS uses 4 bytes to represent block address. As a result, supported size of + * disk is 16 TB and it equals to 16 * 1024 * 1024 / 2 segments. + */ +#define F2FS_MAX_SEGMENT ((16 * 1024 * 1024) / 2) + +/* * Note that f2fs_sit_entry->vblocks has the following bit-field information. * [15:10] : allocation type such as CURSEG_XXXX_TYPE * [9:0] : valid block count @@ -449,7 +456,7 @@ typedef __le32 f2fs_hash_t; #define F2FS_SLOT_LEN 8 #define F2FS_SLOT_LEN_BITS 3 -#define GET_DENTRY_SLOTS(x) ((x + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) +#define GET_DENTRY_SLOTS(x) (((x) + F2FS_SLOT_LEN - 1) >> F2FS_SLOT_LEN_BITS) /* MAX level for dir lookup */ #define MAX_DIR_HASH_DEPTH 63 diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h index 10c1abfbac6c..0a30c106c1e5 100644 --- a/include/linux/fscrypt_common.h +++ b/include/linux/fscrypt_common.h @@ -46,17 +46,6 @@ struct fscrypt_symlink_data { char encrypted_path[1]; } __packed; -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 fscrypt_symlink_data_len(u32 l) -{ - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - struct fscrypt_str { unsigned char *name; u32 len; diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h index 3511ca798804..ec406aed2f2f 100644 --- a/include/linux/fscrypt_notsupp.h +++ b/include/linux/fscrypt_notsupp.h @@ -147,6 +147,15 @@ static inline int fscrypt_fname_usr_to_disk(struct inode *inode, return -EOPNOTSUPP; } +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h index a140f47e9b27..cd4e82c17304 100644 --- a/include/linux/fscrypt_supp.h +++ b/include/linux/fscrypt_supp.h @@ -57,6 +57,80 @@ extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, struct fscrypt_str *); +#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + /* bio.c */ extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); extern void fscrypt_pullback_bio_page(struct page **, bool); diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 5d9b2a08e553..4d629471869b 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -115,14 +115,17 @@ static inline bool is_error_page(struct page *page) return IS_ERR(page); } +#define KVM_REQUEST_MASK GENMASK(7,0) +#define KVM_REQUEST_NO_WAKEUP BIT(8) +#define KVM_REQUEST_WAIT BIT(9) /* * Architecture-independent vcpu->requests bit members * Bits 4-7 are reserved for more arch-independent bits. */ -#define KVM_REQ_TLB_FLUSH 0 -#define KVM_REQ_MMU_RELOAD 1 -#define KVM_REQ_PENDING_TIMER 2 -#define KVM_REQ_UNHALT 3 +#define KVM_REQ_TLB_FLUSH (0 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_PENDING_TIMER 2 +#define KVM_REQ_UNHALT 3 #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -268,6 +271,12 @@ struct kvm_vcpu { static inline int kvm_vcpu_exiting_guest_mode(struct kvm_vcpu *vcpu) { + /* + * The memory barrier ensures a previous write to vcpu->requests cannot + * be reordered with the read of vcpu->mode. It pairs with the general + * memory barrier following the write of vcpu->mode in VCPU RUN. + */ + smp_mb__before_atomic(); return cmpxchg(&vcpu->mode, IN_GUEST_MODE, EXITING_GUEST_MODE); } @@ -403,7 +412,7 @@ struct kvm { struct kvm_vm_stat stat; struct kvm_arch arch; refcount_t users_count; -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO struct kvm_coalesced_mmio_ring *coalesced_mmio_ring; spinlock_t ring_lock; struct list_head coalesced_zones; @@ -502,10 +511,10 @@ int __must_check vcpu_load(struct kvm_vcpu *vcpu); void vcpu_put(struct kvm_vcpu *vcpu); #ifdef __KVM_HAVE_IOAPIC -void kvm_vcpu_request_scan_ioapic(struct kvm *kvm); +void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm); void kvm_arch_post_irq_routing_update(struct kvm *kvm); #else -static inline void kvm_vcpu_request_scan_ioapic(struct kvm *kvm) +static inline void kvm_arch_post_irq_ack_notifier_list_update(struct kvm *kvm) { } static inline void kvm_arch_post_irq_routing_update(struct kvm *kvm) @@ -641,18 +650,18 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_atomic(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, int offset, int len); int kvm_write_guest(struct kvm *kvm, gpa_t gpa, const void *data, unsigned long len); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len); -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *v, struct gfn_to_hva_cache *ghc, - gpa_t gpa, unsigned long len); +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len); +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len); +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + gpa_t gpa, unsigned long len); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len); int kvm_clear_guest(struct kvm *kvm, gpa_t gpa, unsigned long len); struct kvm_memory_slot *gfn_to_memslot(struct kvm *kvm, gfn_t gfn); @@ -682,7 +691,7 @@ void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn); void kvm_vcpu_block(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu); void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu); -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu); void kvm_vcpu_kick(struct kvm_vcpu *vcpu); int kvm_vcpu_yield_to(struct kvm_vcpu *target); void kvm_vcpu_on_spin(struct kvm_vcpu *vcpu); @@ -875,22 +884,6 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, int kvm_request_irq_source_id(struct kvm *kvm); void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); -#ifdef CONFIG_KVM_DEVICE_ASSIGNMENT -int kvm_iommu_map_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -void kvm_iommu_unmap_pages(struct kvm *kvm, struct kvm_memory_slot *slot); -#else -static inline int kvm_iommu_map_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ - return 0; -} - -static inline void kvm_iommu_unmap_pages(struct kvm *kvm, - struct kvm_memory_slot *slot) -{ -} -#endif - /* * search_memslots() and __gfn_to_memslot() are here because they are * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. @@ -1023,6 +1016,7 @@ static inline int mmu_notifier_retry(struct kvm *kvm, unsigned long mmu_seq) #define KVM_MAX_IRQ_ROUTES 1024 #endif +bool kvm_arch_can_set_irq_routing(struct kvm *kvm); int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *entries, unsigned nr, @@ -1090,13 +1084,23 @@ static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu) * caller. Paired with the smp_mb__after_atomic in kvm_check_request. */ smp_wmb(); - set_bit(req, &vcpu->requests); + set_bit(req & KVM_REQUEST_MASK, &vcpu->requests); +} + +static inline bool kvm_test_request(int req, struct kvm_vcpu *vcpu) +{ + return test_bit(req & KVM_REQUEST_MASK, &vcpu->requests); +} + +static inline void kvm_clear_request(int req, struct kvm_vcpu *vcpu) +{ + clear_bit(req & KVM_REQUEST_MASK, &vcpu->requests); } static inline bool kvm_check_request(int req, struct kvm_vcpu *vcpu) { - if (test_bit(req, &vcpu->requests)) { - clear_bit(req, &vcpu->requests); + if (kvm_test_request(req, vcpu)) { + kvm_clear_request(req, vcpu); /* * Ensure the rest of the request is visible to kvm_check_request's diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 09c71e9aaebf..dfae175ddebc 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -15,6 +15,7 @@ struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct extent_status; +struct ext4_fsmap; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) @@ -2529,6 +2530,79 @@ TRACE_EVENT(ext4_es_shrink, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); +/* fsmap traces */ +DECLARE_EVENT_CLASS(ext4_fsmap_class, + TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, + u64 owner), + TP_ARGS(sb, keydev, agno, bno, len, owner), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(u32, agno) + __field(u64, bno) + __field(u64, len) + __field(u64, owner) + ), + TP_fast_assign( + __entry->dev = sb->s_bdev->bd_dev; + __entry->keydev = new_decode_dev(keydev); + __entry->agno = agno; + __entry->bno = bno; + __entry->len = len; + __entry->owner = owner; + ), + TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->agno, + __entry->bno, + __entry->len, + __entry->owner) +) +#define DEFINE_FSMAP_EVENT(name) \ +DEFINE_EVENT(ext4_fsmap_class, name, \ + TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ + u64 owner), \ + TP_ARGS(sb, keydev, agno, bno, len, owner)) +DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); +DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); +DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); + +DECLARE_EVENT_CLASS(ext4_getfsmap_class, + TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), + TP_ARGS(sb, fsmap), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(dev_t, keydev) + __field(u64, block) + __field(u64, len) + __field(u64, owner) + __field(u64, flags) + ), + TP_fast_assign( + __entry->dev = sb->s_bdev->bd_dev; + __entry->keydev = new_decode_dev(fsmap->fmr_device); + __entry->block = fsmap->fmr_physical; + __entry->len = fsmap->fmr_length; + __entry->owner = fsmap->fmr_owner; + __entry->flags = fsmap->fmr_flags; + ), + TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", + MAJOR(__entry->dev), MINOR(__entry->dev), + MAJOR(__entry->keydev), MINOR(__entry->keydev), + __entry->block, + __entry->len, + __entry->owner, + __entry->flags) +) +#define DEFINE_GETFSMAP_EVENT(name) \ +DEFINE_EVENT(ext4_getfsmap_class, name, \ + TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ + TP_ARGS(sb, fsmap)) +DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); +DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); +DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); + #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index c80fcad0a6c9..15da88c5c3a4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -15,6 +15,8 @@ TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(INMEM); TRACE_DEFINE_ENUM(INMEM_DROP); +TRACE_DEFINE_ENUM(INMEM_INVALIDATE); +TRACE_DEFINE_ENUM(INMEM_REVOKE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); @@ -42,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -51,12 +54,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) -#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_PREFLUSH | REQ_META |\ - REQ_PRIO) +#define F2FS_OP_FLAGS (REQ_RAHEAD | REQ_SYNC | REQ_META | REQ_PRIO | \ + REQ_PREFLUSH | REQ_FUA) #define F2FS_BIO_FLAG_MASK(t) (t & F2FS_OP_FLAGS) #define show_bio_type(op,op_flags) show_bio_op(op), \ @@ -75,16 +79,13 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { REQ_OP_WRITE_ZEROES, "WRITE_ZEROES" }) #define show_bio_op_flags(flags) \ - __print_symbolic(F2FS_BIO_FLAG_MASK(flags), \ - { REQ_RAHEAD, "(RA)" }, \ - { REQ_SYNC, "(S)" }, \ - { REQ_SYNC | REQ_PRIO, "(SP)" }, \ - { REQ_META, "(M)" }, \ - { REQ_META | REQ_PRIO, "(MP)" }, \ - { REQ_SYNC | REQ_PREFLUSH , "(SF)" }, \ - { REQ_SYNC | REQ_META | REQ_PRIO, "(SMP)" }, \ - { REQ_PREFLUSH | REQ_META | REQ_PRIO, "(FMP)" }, \ - { 0, " \b" }) + __print_flags(F2FS_BIO_FLAG_MASK(flags), "|", \ + { REQ_RAHEAD, "R" }, \ + { REQ_SYNC, "S" }, \ + { REQ_META, "M" }, \ + { REQ_PRIO, "P" }, \ + { REQ_PREFLUSH, "PF" }, \ + { REQ_FUA, "FUA" }) #define show_data_type(type) \ __print_symbolic(type, \ @@ -117,12 +118,14 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { GC_CB, "Cost-Benefit" }) #define show_cpreason(type) \ - __print_symbolic(type, \ + __print_flags(type, "|", \ { CP_UMOUNT, "Umount" }, \ { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_UMOUNT, "Umount" }, \ + { CP_TRIMMED, "Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; @@ -769,7 +772,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, @@ -822,7 +825,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s(%s), %s, sector = %lld, size = %u", show_dev(__entry->target), show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), @@ -1126,7 +1129,7 @@ TRACE_EVENT(f2fs_write_checkpoint, __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), @@ -1150,6 +1153,20 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, TP_PROTO(struct block_device *dev, block_t blkstart), @@ -1174,26 +1191,29 @@ TRACE_EVENT(f2fs_issue_reset_zone, TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct block_device *dev, unsigned int nobarrier, - unsigned int flush_merge), + unsigned int flush_merge, int ret), - TP_ARGS(dev, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", + TP_printk("dev = (%d,%d), %s %s, ret = %d", show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? " with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 048a85e9f017..24e61a54feaa 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -279,12 +279,25 @@ struct fscrypt_policy { __u8 filenames_encryption_mode; __u8 flags; __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; -} __packed; +}; #define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) * diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index f51d5082a377..577429a95ad8 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -702,6 +702,10 @@ struct kvm_ppc_resize_hpt { #define KVM_VM_PPC_HV 1 #define KVM_VM_PPC_PR 2 +/* on MIPS, 0 forces trap & emulate, 1 forces VZ ASE */ +#define KVM_VM_MIPS_TE 0 +#define KVM_VM_MIPS_VZ 1 + #define KVM_S390_SIE_PAGE_OFFSET 1 /* @@ -883,6 +887,14 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_PPC_MMU_RADIX 134 #define KVM_CAP_PPC_MMU_HASH_V3 135 #define KVM_CAP_IMMEDIATE_EXIT 136 +#define KVM_CAP_MIPS_VZ 137 +#define KVM_CAP_MIPS_TE 138 +#define KVM_CAP_MIPS_64BIT 139 +#define KVM_CAP_S390_GS 140 +#define KVM_CAP_S390_AIS 141 +#define KVM_CAP_SPAPR_TCE_VFIO 142 +#define KVM_CAP_X86_GUEST_MWAIT 143 +#define KVM_CAP_ARM_USER_IRQ 144 #ifdef KVM_CAP_IRQ_ROUTING @@ -1087,6 +1099,7 @@ struct kvm_device_attr { #define KVM_DEV_VFIO_GROUP 1 #define KVM_DEV_VFIO_GROUP_ADD 1 #define KVM_DEV_VFIO_GROUP_DEL 2 +#define KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE 3 enum kvm_device_type { KVM_DEV_TYPE_FSL_MPIC_20 = 1, @@ -1108,6 +1121,11 @@ enum kvm_device_type { KVM_DEV_TYPE_MAX, }; +struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; +}; + /* * ioctls for VM fds */ @@ -1354,4 +1372,11 @@ struct kvm_assigned_msix_entry { #define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0) #define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1) +/* Available with KVM_CAP_ARM_USER_IRQ */ + +/* Bits for run->s.regs.device_irq_level */ +#define KVM_ARM_DEV_EL1_VTIMER (1 << 0) +#define KVM_ARM_DEV_EL1_PTIMER (1 << 1) +#define KVM_ARM_DEV_PMU (1 << 2) + #endif /* __LINUX_KVM_H */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2359608d2568..143c1c25d680 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2352,10 +2352,16 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (wbc->nr_to_write <= 0) return 0; - if (mapping->a_ops->writepages) - ret = mapping->a_ops->writepages(mapping, wbc); - else - ret = generic_writepages(mapping, wbc); + while (1) { + if (mapping->a_ops->writepages) + ret = mapping->a_ops->writepages(mapping, wbc); + else + ret = generic_writepages(mapping, wbc); + if ((ret != -ENOMEM) || (wbc->sync_mode != WB_SYNC_ALL)) + break; + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + } return ret; } diff --git a/tools/Makefile b/tools/Makefile index 00caacd3ed92..c8a90d01dd8e 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -86,10 +86,13 @@ tmon: FORCE freefall: FORCE $(call descend,laptop/$@) +kvm_stat: FORCE + $(call descend,kvm/$@) + all: acpi cgroup cpupower gpio hv firewire lguest \ perf selftests turbostat usb \ virtio vm net x86_energy_perf_policy \ - tmon freefall objtool + tmon freefall objtool kvm_stat acpi_install: $(call descend,power/$(@:_install=),install) diff --git a/tools/arch/s390/include/uapi/asm/kvm.h b/tools/arch/s390/include/uapi/asm/kvm.h index a2ffec4139ad..7f4fd65e9208 100644 --- a/tools/arch/s390/include/uapi/asm/kvm.h +++ b/tools/arch/s390/include/uapi/asm/kvm.h @@ -131,7 +131,8 @@ struct kvm_s390_vm_cpu_subfunc { __u8 kmo[16]; /* with MSA4 */ __u8 pcc[16]; /* with MSA4 */ __u8 ppno[16]; /* with MSA5 */ - __u8 reserved[1824]; + __u8 kma[16]; /* with MSA8 */ + __u8 reserved[1808]; }; /* kvm attributes for crypto */ diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 581278c58488..8f74ed8e7237 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -30,8 +30,8 @@ import fcntl import resource import struct import re +import subprocess from collections import defaultdict -from time import sleep VMX_EXIT_REASONS = { 'EXCEPTION_NMI': 0, @@ -225,6 +225,7 @@ IOCTL_NUMBERS = { 'RESET': 0x00002403, } + class Arch(object): """Encapsulates global architecture specific data. @@ -255,12 +256,14 @@ class Arch(object): return ArchX86(SVM_EXIT_REASONS) return + class ArchX86(Arch): def __init__(self, exit_reasons): self.sc_perf_evt_open = 298 self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = exit_reasons + class ArchPPC(Arch): def __init__(self): self.sc_perf_evt_open = 319 @@ -275,12 +278,14 @@ class ArchPPC(Arch): self.ioctl_numbers['SET_FILTER'] = 0x80002406 | char_ptr_size << 16 self.exit_reasons = {} + class ArchA64(Arch): def __init__(self): self.sc_perf_evt_open = 241 self.ioctl_numbers = IOCTL_NUMBERS self.exit_reasons = AARCH64_EXIT_REASONS + class ArchS390(Arch): def __init__(self): self.sc_perf_evt_open = 331 @@ -316,6 +321,61 @@ def parse_int_list(list_string): return integers +def get_pid_from_gname(gname): + """Fuzzy function to convert guest name to QEMU process pid. + + Returns a list of potential pids, can be empty if no match found. + Throws an exception on processing errors. + + """ + pids = [] + try: + child = subprocess.Popen(['ps', '-A', '--format', 'pid,args'], + stdout=subprocess.PIPE) + except: + raise Exception + for line in child.stdout: + line = line.lstrip().split(' ', 1) + # perform a sanity check before calling the more expensive + # function to possibly extract the guest name + if ' -name ' in line[1] and gname == get_gname_from_pid(line[0]): + pids.append(int(line[0])) + child.stdout.close() + + return pids + + +def get_gname_from_pid(pid): + """Returns the guest name for a QEMU process pid. + + Extracts the guest name from the QEMU comma line by processing the '-name' + option. Will also handle names specified out of sequence. + + """ + name = '' + try: + line = open('/proc/{}/cmdline'.format(pid), 'rb').read().split('\0') + parms = line[line.index('-name') + 1].split(',') + while '' in parms: + # commas are escaped (i.e. ',,'), hence e.g. 'foo,bar' results in + # ['foo', '', 'bar'], which we revert here + idx = parms.index('') + parms[idx - 1] += ',' + parms[idx + 1] + del parms[idx:idx+2] + # the '-name' switch allows for two ways to specify the guest name, + # where the plain name overrides the name specified via 'guest=' + for arg in parms: + if '=' not in arg: + name = arg + break + if arg[:6] == 'guest=': + name = arg[6:] + except (ValueError, IOError, IndexError): + pass + + return name + + def get_online_cpus(): """Returns a list of cpu id integers.""" with open('/sys/devices/system/cpu/online') as cpu_list: @@ -342,6 +402,7 @@ def get_filters(): libc = ctypes.CDLL('libc.so.6', use_errno=True) syscall = libc.syscall + class perf_event_attr(ctypes.Structure): """Struct that holds the necessary data to set up a trace event. @@ -370,6 +431,7 @@ class perf_event_attr(ctypes.Structure): self.size = ctypes.sizeof(self) self.read_format = PERF_FORMAT_GROUP + def perf_event_open(attr, pid, cpu, group_fd, flags): """Wrapper for the sys_perf_evt_open() syscall. @@ -395,6 +457,7 @@ PERF_FORMAT_GROUP = 1 << 3 PATH_DEBUGFS_TRACING = '/sys/kernel/debug/tracing' PATH_DEBUGFS_KVM = '/sys/kernel/debug/kvm' + class Group(object): """Represents a perf event group.""" @@ -427,6 +490,7 @@ class Group(object): struct.unpack(read_format, os.read(self.events[0].fd, length)))) + class Event(object): """Represents a performance event and manages its life cycle.""" def __init__(self, name, group, trace_cpu, trace_pid, trace_point, @@ -510,6 +574,7 @@ class Event(object): """Resets the count of the trace event in the kernel.""" fcntl.ioctl(self.fd, ARCH.ioctl_numbers['RESET'], 0) + class TracepointProvider(object): """Data provider for the stats class. @@ -551,6 +616,7 @@ class TracepointProvider(object): def setup_traces(self): """Creates all event and group objects needed to be able to retrieve data.""" + fields = self.get_available_fields() if self._pid > 0: # Fetch list of all threads of the monitored pid, as qemu # starts a thread for each vcpu. @@ -561,7 +627,7 @@ class TracepointProvider(object): # The constant is needed as a buffer for python libs, std # streams and other files that the script opens. - newlim = len(groupids) * len(self._fields) + 50 + newlim = len(groupids) * len(fields) + 50 try: softlim_, hardlim = resource.getrlimit(resource.RLIMIT_NOFILE) @@ -577,7 +643,7 @@ class TracepointProvider(object): for groupid in groupids: group = Group() - for name in self._fields: + for name in fields: tracepoint = name tracefilter = None match = re.match(r'(.*)\((.*)\)', name) @@ -650,13 +716,23 @@ class TracepointProvider(object): ret[name] += val return ret + def reset(self): + """Reset all field counters""" + for group in self.group_leaders: + for event in group.events: + event.reset() + + class DebugfsProvider(object): """Provides data from the files that KVM creates in the kvm debugfs folder.""" def __init__(self): self._fields = self.get_available_fields() + self._baseline = {} self._pid = 0 self.do_read = True + self.paths = [] + self.reset() def get_available_fields(self): """"Returns a list of available fields. @@ -673,6 +749,7 @@ class DebugfsProvider(object): @fields.setter def fields(self, fields): self._fields = fields + self.reset() @property def pid(self): @@ -690,10 +767,11 @@ class DebugfsProvider(object): self.paths = filter(lambda x: "{}-".format(pid) in x, vms) else: - self.paths = [''] + self.paths = [] self.do_read = True + self.reset() - def read(self): + def read(self, reset=0): """Returns a dict with format:'file name / field -> current value'.""" results = {} @@ -701,10 +779,22 @@ class DebugfsProvider(object): if not self.do_read: return results - for path in self.paths: + paths = self.paths + if self._pid == 0: + paths = [] + for entry in os.walk(PATH_DEBUGFS_KVM): + for dir in entry[1]: + paths.append(dir) + for path in paths: for field in self._fields: - results[field] = results.get(field, 0) \ - + self.read_field(field, path) + value = self.read_field(field, path) + key = path + field + if reset: + self._baseline[key] = value + if self._baseline.get(key, -1) == -1: + self._baseline[key] = value + results[field] = (results.get(field, 0) + value - + self._baseline.get(key, 0)) return results @@ -718,6 +808,12 @@ class DebugfsProvider(object): except IOError: return 0 + def reset(self): + """Reset field counters""" + self._baseline = {} + self.read(1) + + class Stats(object): """Manages the data providers and the data they provide. @@ -753,14 +849,20 @@ class Stats(object): for provider in self.providers: provider.pid = self._pid_filter + def reset(self): + self.values = {} + for provider in self.providers: + provider.reset() + @property def fields_filter(self): return self._fields_filter @fields_filter.setter def fields_filter(self, fields_filter): - self._fields_filter = fields_filter - self.update_provider_filters() + if fields_filter != self._fields_filter: + self._fields_filter = fields_filter + self.update_provider_filters() @property def pid_filter(self): @@ -768,9 +870,10 @@ class Stats(object): @pid_filter.setter def pid_filter(self, pid): - self._pid_filter = pid - self.values = {} - self.update_provider_pid() + if pid != self._pid_filter: + self._pid_filter = pid + self.values = {} + self.update_provider_pid() def get(self): """Returns a dict with field -> (value, delta to last value) of all @@ -778,23 +881,26 @@ class Stats(object): for provider in self.providers: new = provider.read() for key in provider.fields: - oldval = self.values.get(key, (0, 0)) + oldval = self.values.get(key, (0, 0))[0] newval = new.get(key, 0) - newdelta = None - if oldval is not None: - newdelta = newval - oldval[0] + newdelta = newval - oldval self.values[key] = (newval, newdelta) return self.values LABEL_WIDTH = 40 NUMBER_WIDTH = 10 +DELAY_INITIAL = 0.25 +DELAY_REGULAR = 3.0 +MAX_GUEST_NAME_LEN = 48 +MAX_REGEX_LEN = 44 +DEFAULT_REGEX = r'^[^\(]*$' + class Tui(object): """Instruments curses to draw a nice text ui.""" def __init__(self, stats): self.stats = stats self.screen = None - self.drilldown = False self.update_drilldown() def __enter__(self): @@ -809,7 +915,14 @@ class Tui(object): # return from C start_color() is ignorable. try: curses.start_color() - except: + except curses.error: + pass + + # Hide cursor in extra statement as some monochrome terminals + # might support hiding but not colors. + try: + curses.curs_set(0) + except curses.error: pass curses.use_default_colors() @@ -827,36 +940,60 @@ class Tui(object): def update_drilldown(self): """Sets or removes a filter that only allows fields without braces.""" if not self.stats.fields_filter: - self.stats.fields_filter = r'^[^\(]*$' + self.stats.fields_filter = DEFAULT_REGEX - elif self.stats.fields_filter == r'^[^\(]*$': + elif self.stats.fields_filter == DEFAULT_REGEX: self.stats.fields_filter = None def update_pid(self, pid): """Propagates pid selection to stats object.""" self.stats.pid_filter = pid - def refresh(self, sleeptime): - """Refreshes on-screen data.""" + def refresh_header(self, pid=None): + """Refreshes the header.""" + if pid is None: + pid = self.stats.pid_filter self.screen.erase() - if self.stats.pid_filter > 0: - self.screen.addstr(0, 0, 'kvm statistics - pid {0}' - .format(self.stats.pid_filter), - curses.A_BOLD) + gname = get_gname_from_pid(pid) + if gname: + gname = ('({})'.format(gname[:MAX_GUEST_NAME_LEN] + '...' + if len(gname) > MAX_GUEST_NAME_LEN + else gname)) + if pid > 0: + self.screen.addstr(0, 0, 'kvm statistics - pid {0} {1}' + .format(pid, gname), curses.A_BOLD) else: self.screen.addstr(0, 0, 'kvm statistics - summary', curses.A_BOLD) + if self.stats.fields_filter and self.stats.fields_filter \ + != DEFAULT_REGEX: + regex = self.stats.fields_filter + if len(regex) > MAX_REGEX_LEN: + regex = regex[:MAX_REGEX_LEN] + '...' + self.screen.addstr(1, 17, 'regex filter: {0}'.format(regex)) self.screen.addstr(2, 1, 'Event') self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH - len('Total'), 'Total') - self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 8 - + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 - + len('%Total'), '%Total') + self.screen.addstr(2, 1 + LABEL_WIDTH + NUMBER_WIDTH + 7 + 8 - len('Current'), 'Current') + self.screen.addstr(4, 1, 'Collecting data...') + self.screen.refresh() + + def refresh_body(self, sleeptime): row = 3 + self.screen.move(row, 0) + self.screen.clrtobot() stats = self.stats.get() + def sortkey(x): if stats[x][1]: return (-stats[x][1], -stats[x][0]) else: return (0, -stats[x][0]) + total = 0. + for val in stats.values(): + total += val[0] for key in sorted(stats.keys(), key=sortkey): if row >= self.screen.getmaxyx()[0]: @@ -869,6 +1006,8 @@ class Tui(object): col += LABEL_WIDTH self.screen.addstr(row, col, '%10d' % (values[0],)) col += NUMBER_WIDTH + self.screen.addstr(row, col, '%7.1f' % (values[0] * 100 / total,)) + col += 7 if values[1] is not None: self.screen.addstr(row, col, '%8d' % (values[1] / sleeptime,)) row += 1 @@ -893,20 +1032,24 @@ class Tui(object): regex = self.screen.getstr() curses.noecho() if len(regex) == 0: + self.stats.fields_filter = DEFAULT_REGEX + self.refresh_header() return try: re.compile(regex) self.stats.fields_filter = regex + self.refresh_header() return except re.error: continue - def show_vm_selection(self): + def show_vm_selection_by_pid(self): """Draws PID selection mask. Asks for a pid until a valid pid or 0 has been entered. """ + msg = '' while True: self.screen.erase() self.screen.addstr(0, 0, @@ -915,6 +1058,7 @@ class Tui(object): self.screen.addstr(1, 0, 'This might limit the shown data to the trace ' 'statistics.') + self.screen.addstr(5, 0, msg) curses.echo() self.screen.addstr(3, 0, "Pid [0 or pid]: ") @@ -922,60 +1066,128 @@ class Tui(object): curses.noecho() try: - pid = int(pid) - - if pid == 0: - self.update_pid(pid) - break - else: - if not os.path.isdir(os.path.join('/proc/', str(pid))): + if len(pid) > 0: + pid = int(pid) + if pid != 0 and not os.path.isdir(os.path.join('/proc/', + str(pid))): + msg = '"' + str(pid) + '": Not a running process' continue - else: - self.update_pid(pid) - break + else: + pid = 0 + self.refresh_header(pid) + self.update_pid(pid) + break except ValueError: + msg = '"' + str(pid) + '": Not a valid pid' continue + def show_vm_selection_by_guest_name(self): + """Draws guest selection mask. + + Asks for a guest name until a valid guest name or '' is entered. + + """ + msg = '' + while True: + self.screen.erase() + self.screen.addstr(0, 0, + 'Show statistics for specific guest.', + curses.A_BOLD) + self.screen.addstr(1, 0, + 'This might limit the shown data to the trace ' + 'statistics.') + self.screen.addstr(5, 0, msg) + curses.echo() + self.screen.addstr(3, 0, "Guest [ENTER or guest]: ") + gname = self.screen.getstr() + curses.noecho() + + if not gname: + self.refresh_header(0) + self.update_pid(0) + break + else: + pids = [] + try: + pids = get_pid_from_gname(gname) + except: + msg = '"' + gname + '": Internal error while searching, ' \ + 'use pid filter instead' + continue + if len(pids) == 0: + msg = '"' + gname + '": Not an active guest' + continue + if len(pids) > 1: + msg = '"' + gname + '": Multiple matches found, use pid ' \ + 'filter instead' + continue + self.refresh_header(pids[0]) + self.update_pid(pids[0]) + break + def show_stats(self): """Refreshes the screen and processes user input.""" - sleeptime = 0.25 + sleeptime = DELAY_INITIAL + self.refresh_header() while True: - self.refresh(sleeptime) + self.refresh_body(sleeptime) curses.halfdelay(int(sleeptime * 10)) - sleeptime = 3 + sleeptime = DELAY_REGULAR try: char = self.screen.getkey() if char == 'x': - self.drilldown = not self.drilldown + self.refresh_header() self.update_drilldown() + sleeptime = DELAY_INITIAL if char == 'q': break + if char == 'c': + self.stats.fields_filter = DEFAULT_REGEX + self.refresh_header(0) + self.update_pid(0) + sleeptime = DELAY_INITIAL if char == 'f': self.show_filter_selection() + sleeptime = DELAY_INITIAL + if char == 'g': + self.show_vm_selection_by_guest_name() + sleeptime = DELAY_INITIAL if char == 'p': - self.show_vm_selection() + self.show_vm_selection_by_pid() + sleeptime = DELAY_INITIAL + if char == 'r': + self.refresh_header() + self.stats.reset() + sleeptime = DELAY_INITIAL except KeyboardInterrupt: break except curses.error: continue + def batch(stats): """Prints statistics in a key, value format.""" - s = stats.get() - time.sleep(1) - s = stats.get() - for key in sorted(s.keys()): - values = s[key] - print '%-42s%10d%10d' % (key, values[0], values[1]) + try: + s = stats.get() + time.sleep(1) + s = stats.get() + for key in sorted(s.keys()): + values = s[key] + print '%-42s%10d%10d' % (key, values[0], values[1]) + except KeyboardInterrupt: + pass + def log(stats): """Prints statistics as reiterating key block, multiple value blocks.""" keys = sorted(stats.get().iterkeys()) + def banner(): for k in keys: print '%s' % k, print + def statline(): s = stats.get() for k in keys: @@ -984,11 +1196,15 @@ def log(stats): line = 0 banner_repeat = 20 while True: - time.sleep(1) - if line % banner_repeat == 0: - banner() - statline() - line += 1 + try: + time.sleep(1) + if line % banner_repeat == 0: + banner() + statline() + line += 1 + except KeyboardInterrupt: + break + def get_options(): """Returns processed program arguments.""" @@ -1009,6 +1225,16 @@ Requirements: CAP_SYS_ADMIN and perf events are used. - CAP_SYS_RESOURCE if the hard limit is not high enough to allow the large number of files that are possibly opened. + +Interactive Commands: + c clear filter + f filter by regular expression + g filter by guest name + p filter by PID + q quit + x toggle reporting of stats for individual child trace events + r reset stats +Press any other key to refresh statistics immediately. """ class PlainHelpFormatter(optparse.IndentedHelpFormatter): @@ -1018,6 +1244,22 @@ Requirements: else: return "" + def cb_guest_to_pid(option, opt, val, parser): + try: + pids = get_pid_from_gname(val) + except: + raise optparse.OptionValueError('Error while searching for guest ' + '"{}", use "-p" to specify a pid ' + 'instead'.format(val)) + if len(pids) == 0: + raise optparse.OptionValueError('No guest by the name "{}" ' + 'found'.format(val)) + if len(pids) > 1: + raise optparse.OptionValueError('Multiple processes found (pids: ' + '{}) - use "-p" to specify a pid ' + 'instead'.format(" ".join(pids))) + parser.values.pid = pids[0] + optparser = optparse.OptionParser(description=description_text, formatter=PlainHelpFormatter()) optparser.add_option('-1', '--once', '--batch', @@ -1051,15 +1293,24 @@ Requirements: help='fields to display (regex)', ) optparser.add_option('-p', '--pid', - action='store', - default=0, - type=int, - dest='pid', - help='restrict statistics to pid', - ) + action='store', + default=0, + type='int', + dest='pid', + help='restrict statistics to pid', + ) + optparser.add_option('-g', '--guest', + action='callback', + type='string', + dest='pid', + metavar='GUEST', + help='restrict statistics to guest by name', + callback=cb_guest_to_pid, + ) (options, _) = optparser.parse_args(sys.argv) return options + def get_providers(options): """Returns a list of data providers depending on the passed options.""" providers = [] @@ -1073,6 +1324,7 @@ def get_providers(options): return providers + def check_access(options): """Exits if the current user can't access all needed directories.""" if not os.path.exists('/sys/kernel/debug'): @@ -1086,8 +1338,8 @@ def check_access(options): "Also ensure, that the kvm modules are loaded.\n") sys.exit(1) - if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints - or not options.debugfs): + if not os.path.exists(PATH_DEBUGFS_TRACING) and (options.tracepoints or + not options.debugfs): sys.stderr.write("Please enable CONFIG_TRACING in your kernel " "when using the option -t (default).\n" "If it is enabled, make {0} readable by the " @@ -1098,10 +1350,11 @@ def check_access(options): sys.stderr.write("Falling back to debugfs statistics!\n") options.debugfs = True - sleep(5) + time.sleep(5) return options + def main(): options = get_options() options = check_access(options) diff --git a/tools/kvm/kvm_stat/kvm_stat.txt b/tools/kvm/kvm_stat/kvm_stat.txt index b92a153d7115..109431bdc63c 100644 --- a/tools/kvm/kvm_stat/kvm_stat.txt +++ b/tools/kvm/kvm_stat/kvm_stat.txt @@ -18,11 +18,33 @@ state transitions such as guest mode entry and exit. This tool is useful for observing guest behavior from the host perspective. Often conclusions about performance or buggy behavior can be drawn from the output. +While running in regular mode, use any of the keys listed in section +'Interactive Commands' below. +Use batch and logging modes for scripting purposes. The set of KVM kernel module trace events may be specific to the kernel version or architecture. It is best to check the KVM kernel module source code for the meaning of events. +INTERACTIVE COMMANDS +-------------------- +[horizontal] +*c*:: clear filter + +*f*:: filter by regular expression + +*g*:: filter by guest name + +*p*:: filter by PID + +*q*:: quit + +*r*:: reset stats + +*x*:: toggle reporting of stats for child trace events + +Press any other key to refresh statistics immediately. + OPTIONS ------- -1:: @@ -46,6 +68,10 @@ OPTIONS --pid=<pid>:: limit statistics to one virtual machine (pid) +-g<guest>:: +--guest=<guest_name>:: + limit statistics to one virtual machine (guest name) + -f<fields>:: --fields=<fields>:: fields to display (regex) diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c index 35d7100e0815..5976609ef27c 100644 --- a/virt/kvm/arm/arch_timer.c +++ b/virt/kvm/arm/arch_timer.c @@ -184,28 +184,47 @@ bool kvm_timer_should_fire(struct arch_timer_context *timer_ctx) return cval <= now; } +/* + * Reflect the timer output level into the kvm_run structure + */ +void kvm_timer_update_run(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the device bitmap with the timer states */ + regs->device_irq_level &= ~(KVM_ARM_DEV_EL1_VTIMER | + KVM_ARM_DEV_EL1_PTIMER); + if (vtimer->irq.level) + regs->device_irq_level |= KVM_ARM_DEV_EL1_VTIMER; + if (ptimer->irq.level) + regs->device_irq_level |= KVM_ARM_DEV_EL1_PTIMER; +} + static void kvm_timer_update_irq(struct kvm_vcpu *vcpu, bool new_level, struct arch_timer_context *timer_ctx) { int ret; - BUG_ON(!vgic_initialized(vcpu->kvm)); - timer_ctx->active_cleared_last = false; timer_ctx->irq.level = new_level; trace_kvm_timer_update_irq(vcpu->vcpu_id, timer_ctx->irq.irq, timer_ctx->irq.level); - ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, timer_ctx->irq.irq, - timer_ctx->irq.level); - WARN_ON(ret); + if (likely(irqchip_in_kernel(vcpu->kvm))) { + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + timer_ctx->irq.irq, + timer_ctx->irq.level); + WARN_ON(ret); + } } /* * Check if there was a change in the timer state (should we raise or lower * the line level to the GIC). */ -static int kvm_timer_update_state(struct kvm_vcpu *vcpu) +static void kvm_timer_update_state(struct kvm_vcpu *vcpu) { struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); @@ -217,16 +236,14 @@ static int kvm_timer_update_state(struct kvm_vcpu *vcpu) * because the guest would never see the interrupt. Instead wait * until we call this function from kvm_timer_flush_hwstate. */ - if (!vgic_initialized(vcpu->kvm) || !timer->enabled) - return -ENODEV; + if (unlikely(!timer->enabled)) + return; if (kvm_timer_should_fire(vtimer) != vtimer->irq.level) kvm_timer_update_irq(vcpu, !vtimer->irq.level, vtimer); if (kvm_timer_should_fire(ptimer) != ptimer->irq.level) kvm_timer_update_irq(vcpu, !ptimer->irq.level, ptimer); - - return 0; } /* Schedule the background timer for the emulated timer. */ @@ -286,25 +303,12 @@ void kvm_timer_unschedule(struct kvm_vcpu *vcpu) timer_disarm(timer); } -/** - * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu - * @vcpu: The vcpu pointer - * - * Check if the virtual timer has expired while we were running in the host, - * and inject an interrupt if that was the case. - */ -void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +static void kvm_timer_flush_hwstate_vgic(struct kvm_vcpu *vcpu) { struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); bool phys_active; int ret; - if (kvm_timer_update_state(vcpu)) - return; - - /* Set the background timer for the physical timer emulation. */ - kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); - /* * If we enter the guest with the virtual input level to the VGIC * asserted, then we have already told the VGIC what we need to, and @@ -356,11 +360,72 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) vtimer->active_cleared_last = !phys_active; } +bool kvm_timer_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + struct arch_timer_context *ptimer = vcpu_ptimer(vcpu); + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool vlevel, plevel; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + vlevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_VTIMER; + plevel = sregs->device_irq_level & KVM_ARM_DEV_EL1_PTIMER; + + return vtimer->irq.level != vlevel || + ptimer->irq.level != plevel; +} + +static void kvm_timer_flush_hwstate_user(struct kvm_vcpu *vcpu) +{ + struct arch_timer_context *vtimer = vcpu_vtimer(vcpu); + + /* + * To prevent continuously exiting from the guest, we mask the + * physical interrupt such that the guest can make forward progress. + * Once we detect the output level being deasserted, we unmask the + * interrupt again so that we exit from the guest when the timer + * fires. + */ + if (vtimer->irq.level) + disable_percpu_irq(host_vtimer_irq); + else + enable_percpu_irq(host_vtimer_irq, 0); +} + +/** + * kvm_timer_flush_hwstate - prepare timers before running the vcpu + * @vcpu: The vcpu pointer + * + * Check if the virtual timer has expired while we were running in the host, + * and inject an interrupt if that was the case, making sure the timer is + * masked or disabled on the host so that we keep executing. Also schedule a + * software timer for the physical timer if it is enabled. + */ +void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu) +{ + struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu; + + if (unlikely(!timer->enabled)) + return; + + kvm_timer_update_state(vcpu); + + /* Set the background timer for the physical timer emulation. */ + kvm_timer_emulate(vcpu, vcpu_ptimer(vcpu)); + + if (unlikely(!irqchip_in_kernel(vcpu->kvm))) + kvm_timer_flush_hwstate_user(vcpu); + else + kvm_timer_flush_hwstate_vgic(vcpu); +} + /** * kvm_timer_sync_hwstate - sync timer state from cpu * @vcpu: The vcpu pointer * - * Check if the virtual timer has expired while we were running in the guest, + * Check if any of the timers have expired while we were running in the guest, * and inject an interrupt if that was the case. */ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu) @@ -560,6 +625,13 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (timer->enabled) return 0; + /* Without a VGIC we do not map virtual IRQs to physical IRQs */ + if (!irqchip_in_kernel(vcpu->kvm)) + goto no_vgic; + + if (!vgic_initialized(vcpu->kvm)) + return -ENODEV; + /* * Find the physical IRQ number corresponding to the host_vtimer_irq */ @@ -583,8 +655,8 @@ int kvm_timer_enable(struct kvm_vcpu *vcpu) if (ret) return ret; +no_vgic: timer->enabled = 1; - return 0; } diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c index c8aeb7b91ec8..a3f18d362366 100644 --- a/virt/kvm/arm/hyp/vgic-v2-sr.c +++ b/virt/kvm/arm/hyp/vgic-v2-sr.c @@ -22,49 +22,6 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, - void __iomem *base) -{ - struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; - u32 eisr0, eisr1; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & GICH_HCR_UIE); - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - expect_mi |= (!(cpu_if->vgic_lr[i] & GICH_LR_HW) && - (cpu_if->vgic_lr[i] & GICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = readl_relaxed(base + GICH_MISR); - - if (cpu_if->vgic_misr & GICH_MISR_EOI) { - eisr0 = readl_relaxed(base + GICH_EISR0); - if (unlikely(nr_lr > 32)) - eisr1 = readl_relaxed(base + GICH_EISR1); - else - eisr1 = 0; - } else { - eisr0 = eisr1 = 0; - } - } else { - cpu_if->vgic_misr = 0; - eisr0 = eisr1 = 0; - } - -#ifdef CONFIG_CPU_BIG_ENDIAN - cpu_if->vgic_eisr = ((u64)eisr0 << 32) | eisr1; -#else - cpu_if->vgic_eisr = ((u64)eisr1 << 32) | eisr0; -#endif -} - static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; @@ -87,13 +44,10 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base) static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base) { struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { if (cpu_if->vgic_elrsr & (1UL << i)) cpu_if->vgic_lr[i] &= ~GICH_LR_STATE; else @@ -110,26 +64,20 @@ void __hyp_text __vgic_v2_save_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; if (!base) return; - cpu_if->vgic_vmcr = readl_relaxed(base + GICH_VMCR); - - if (vcpu->arch.vgic_cpu.live_lrs) { + if (used_lrs) { cpu_if->vgic_apr = readl_relaxed(base + GICH_APR); - save_maint_int_state(vcpu, base); save_elrsr(vcpu, base); save_lrs(vcpu, base); writel_relaxed(0, base + GICH_HCR); - - vcpu->arch.vgic_cpu.live_lrs = 0; } else { - cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = ~0UL; - cpu_if->vgic_misr = 0; cpu_if->vgic_apr = 0; } } @@ -141,32 +89,20 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu) struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; struct vgic_dist *vgic = &kvm->arch.vgic; void __iomem *base = kern_hyp_va(vgic->vctrl_base); - int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr; int i; - u64 live_lrs = 0; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; if (!base) return; - - for (i = 0; i < nr_lr; i++) - if (cpu_if->vgic_lr[i] & GICH_LR_STATE) - live_lrs |= 1UL << i; - - if (live_lrs) { + if (used_lrs) { writel_relaxed(cpu_if->vgic_hcr, base + GICH_HCR); writel_relaxed(cpu_if->vgic_apr, base + GICH_APR); - for (i = 0; i < nr_lr; i++) { - if (!(live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { writel_relaxed(cpu_if->vgic_lr[i], base + GICH_LR0 + (i * 4)); } } - - writel_relaxed(cpu_if->vgic_vmcr, base + GICH_VMCR); - vcpu->arch.vgic_cpu.live_lrs = live_lrs; } #ifdef CONFIG_ARM64 diff --git a/virt/kvm/arm/hyp/vgic-v3-sr.c b/virt/kvm/arm/hyp/vgic-v3-sr.c index 3947095cc0a1..bce6037cf01d 100644 --- a/virt/kvm/arm/hyp/vgic-v3-sr.c +++ b/virt/kvm/arm/hyp/vgic-v3-sr.c @@ -118,66 +118,32 @@ static void __hyp_text __gic_v3_set_lr(u64 val, int lr) } } -static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu, int nr_lr) -{ - struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; - int i; - bool expect_mi; - - expect_mi = !!(cpu_if->vgic_hcr & ICH_HCR_UIE); - - for (i = 0; i < nr_lr; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - - expect_mi |= (!(cpu_if->vgic_lr[i] & ICH_LR_HW) && - (cpu_if->vgic_lr[i] & ICH_LR_EOI)); - } - - if (expect_mi) { - cpu_if->vgic_misr = read_gicreg(ICH_MISR_EL2); - - if (cpu_if->vgic_misr & ICH_MISR_EOI) - cpu_if->vgic_eisr = read_gicreg(ICH_EISR_EL2); - else - cpu_if->vgic_eisr = 0; - } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; - } -} - void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; /* * Make sure stores to the GIC via the memory mapped interface * are now visible to the system register interface. */ - if (!cpu_if->vgic_sre) + if (!cpu_if->vgic_sre) { dsb(st); + cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); + } - cpu_if->vgic_vmcr = read_gicreg(ICH_VMCR_EL2); - - if (vcpu->arch.vgic_cpu.live_lrs) { + if (used_lrs) { int i; - u32 max_lr_idx, nr_pri_bits; + u32 nr_pri_bits; cpu_if->vgic_elrsr = read_gicreg(ICH_ELSR_EL2); write_gicreg(0, ICH_HCR_EL2); val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); nr_pri_bits = vtr_to_nr_pri_bits(val); - save_maint_int_state(vcpu, max_lr_idx + 1); - - for (i = 0; i <= max_lr_idx; i++) { - if (!(vcpu->arch.vgic_cpu.live_lrs & (1UL << i))) - continue; - + for (i = 0; i < used_lrs; i++) { if (cpu_if->vgic_elrsr & (1 << i)) cpu_if->vgic_lr[i] &= ~ICH_LR_STATE; else @@ -205,11 +171,7 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) default: cpu_if->vgic_ap1r[0] = read_gicreg(ICH_AP1R0_EL2); } - - vcpu->arch.vgic_cpu.live_lrs = 0; } else { - cpu_if->vgic_misr = 0; - cpu_if->vgic_eisr = 0; cpu_if->vgic_elrsr = 0xffff; cpu_if->vgic_ap0r[0] = 0; cpu_if->vgic_ap0r[1] = 0; @@ -234,9 +196,9 @@ void __hyp_text __vgic_v3_save_state(struct kvm_vcpu *vcpu) void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u64 used_lrs = vcpu->arch.vgic_cpu.used_lrs; u64 val; - u32 max_lr_idx, nr_pri_bits; - u16 live_lrs = 0; + u32 nr_pri_bits; int i; /* @@ -245,25 +207,19 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) * delivered as a FIQ to the guest, with potentially fatal * consequences. So we must make sure that ICC_SRE_EL1 has * been actually programmed with the value we want before - * starting to mess with the rest of the GIC. + * starting to mess with the rest of the GIC, and VMCR_EL2 in + * particular. */ if (!cpu_if->vgic_sre) { write_gicreg(0, ICC_SRE_EL1); isb(); + write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); } val = read_gicreg(ICH_VTR_EL2); - max_lr_idx = vtr_to_max_lr_idx(val); nr_pri_bits = vtr_to_nr_pri_bits(val); - for (i = 0; i <= max_lr_idx; i++) { - if (cpu_if->vgic_lr[i] & ICH_LR_STATE) - live_lrs |= (1 << i); - } - - write_gicreg(cpu_if->vgic_vmcr, ICH_VMCR_EL2); - - if (live_lrs) { + if (used_lrs) { write_gicreg(cpu_if->vgic_hcr, ICH_HCR_EL2); switch (nr_pri_bits) { @@ -286,12 +242,8 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) write_gicreg(cpu_if->vgic_ap1r[0], ICH_AP1R0_EL2); } - for (i = 0; i <= max_lr_idx; i++) { - if (!(live_lrs & (1 << i))) - continue; - + for (i = 0; i < used_lrs; i++) __gic_v3_set_lr(cpu_if->vgic_lr[i], i); - } } /* @@ -303,7 +255,6 @@ void __hyp_text __vgic_v3_restore_state(struct kvm_vcpu *vcpu) isb(); dsb(sy); } - vcpu->arch.vgic_cpu.live_lrs = live_lrs; /* * Prevent the guest from touching the GIC system registers if @@ -326,3 +277,13 @@ u64 __hyp_text __vgic_v3_get_ich_vtr_el2(void) { return read_gicreg(ICH_VTR_EL2); } + +u64 __hyp_text __vgic_v3_read_vmcr(void) +{ + return read_gicreg(ICH_VMCR_EL2); +} + +void __hyp_text __vgic_v3_write_vmcr(u32 vmcr) +{ + write_gicreg(vmcr, ICH_VMCR_EL2); +} diff --git a/virt/kvm/arm/pmu.c b/virt/kvm/arm/pmu.c index 69ccce308458..4b43e7f3b158 100644 --- a/virt/kvm/arm/pmu.c +++ b/virt/kvm/arm/pmu.c @@ -230,13 +230,44 @@ static void kvm_pmu_update_state(struct kvm_vcpu *vcpu) return; overflow = !!kvm_pmu_overflow_status(vcpu); - if (pmu->irq_level != overflow) { - pmu->irq_level = overflow; - kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, - pmu->irq_num, overflow); + if (pmu->irq_level == overflow) + return; + + pmu->irq_level = overflow; + + if (likely(irqchip_in_kernel(vcpu->kvm))) { + int ret; + ret = kvm_vgic_inject_irq(vcpu->kvm, vcpu->vcpu_id, + pmu->irq_num, overflow); + WARN_ON(ret); } } +bool kvm_pmu_should_notify_user(struct kvm_vcpu *vcpu) +{ + struct kvm_pmu *pmu = &vcpu->arch.pmu; + struct kvm_sync_regs *sregs = &vcpu->run->s.regs; + bool run_level = sregs->device_irq_level & KVM_ARM_DEV_PMU; + + if (likely(irqchip_in_kernel(vcpu->kvm))) + return false; + + return pmu->irq_level != run_level; +} + +/* + * Reflect the PMU overflow interrupt output level into the kvm_run structure + */ +void kvm_pmu_update_run(struct kvm_vcpu *vcpu) +{ + struct kvm_sync_regs *regs = &vcpu->run->s.regs; + + /* Populate the timer bitmap for user space */ + regs->device_irq_level &= ~KVM_ARM_DEV_PMU; + if (vcpu->arch.pmu.irq_level) + regs->device_irq_level |= KVM_ARM_DEV_PMU; +} + /** * kvm_pmu_flush_hwstate - flush pmu state to cpu * @vcpu: The vcpu pointer diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c index 702f8108608d..25fd1b942c11 100644 --- a/virt/kvm/arm/vgic/vgic-init.c +++ b/virt/kvm/arm/vgic/vgic-init.c @@ -24,7 +24,12 @@ /* * Initialization rules: there are multiple stages to the vgic - * initialization, both for the distributor and the CPU interfaces. + * initialization, both for the distributor and the CPU interfaces. The basic + * idea is that even though the VGIC is not functional or not requested from + * user space, the critical path of the run loop can still call VGIC functions + * that just won't do anything, without them having to check additional + * initialization flags to ensure they don't look at uninitialized data + * structures. * * Distributor: * @@ -39,23 +44,67 @@ * * CPU Interface: * - * - kvm_vgic_cpu_early_init(): initialization of static data that + * - kvm_vgic_vcpu_early_init(): initialization of static data that * doesn't depend on any sizing information or emulation type. No * allocation is allowed there. */ /* EARLY INIT */ -/* - * Those 2 functions should not be needed anymore but they - * still are called from arm.c +/** + * kvm_vgic_early_init() - Initialize static VGIC VCPU data structures + * @kvm: The VM whose VGIC districutor should be initialized + * + * Only do initialization of static structures that don't require any + * allocation or sizing information from userspace. vgic_init() called + * kvm_vgic_dist_init() which takes care of the rest. */ void kvm_vgic_early_init(struct kvm *kvm) { + struct vgic_dist *dist = &kvm->arch.vgic; + + INIT_LIST_HEAD(&dist->lpi_list_head); + spin_lock_init(&dist->lpi_list_lock); } +/** + * kvm_vgic_vcpu_early_init() - Initialize static VGIC VCPU data structures + * @vcpu: The VCPU whose VGIC data structures whould be initialized + * + * Only do initialization, but do not actually enable the VGIC CPU interface + * yet. + */ void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu) { + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + int i; + + INIT_LIST_HEAD(&vgic_cpu->ap_list_head); + spin_lock_init(&vgic_cpu->ap_list_lock); + + /* + * Enable and configure all SGIs to be edge-triggered and + * configure all PPIs as level-triggered. + */ + for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { + struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; + + INIT_LIST_HEAD(&irq->ap_list); + spin_lock_init(&irq->irq_lock); + irq->intid = i; + irq->vcpu = NULL; + irq->target_vcpu = vcpu; + irq->targets = 1U << vcpu->vcpu_id; + kref_init(&irq->refcount); + if (vgic_irq_is_sgi(i)) { + /* SGIs */ + irq->enabled = 1; + irq->config = VGIC_CONFIG_EDGE; + } else { + /* PPIs */ + irq->config = VGIC_CONFIG_LEVEL; + } + } } /* CREATION */ @@ -148,9 +197,6 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0); int i; - INIT_LIST_HEAD(&dist->lpi_list_head); - spin_lock_init(&dist->lpi_list_lock); - dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL); if (!dist->spis) return -ENOMEM; @@ -181,41 +227,11 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis) } /** - * kvm_vgic_vcpu_init: initialize the vcpu data structures and - * enable the VCPU interface - * @vcpu: the VCPU which's VGIC should be initialized + * kvm_vgic_vcpu_init() - Enable the VCPU interface + * @vcpu: the VCPU which's VGIC should be enabled */ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu) { - struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; - int i; - - INIT_LIST_HEAD(&vgic_cpu->ap_list_head); - spin_lock_init(&vgic_cpu->ap_list_lock); - - /* - * Enable and configure all SGIs to be edge-triggered and - * configure all PPIs as level-triggered. - */ - for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) { - struct vgic_irq *irq = &vgic_cpu->private_irqs[i]; - - INIT_LIST_HEAD(&irq->ap_list); - spin_lock_init(&irq->irq_lock); - irq->intid = i; - irq->vcpu = NULL; - irq->target_vcpu = vcpu; - irq->targets = 1U << vcpu->vcpu_id; - kref_init(&irq->refcount); - if (vgic_irq_is_sgi(i)) { - /* SGIs */ - irq->enabled = 1; - irq->config = VGIC_CONFIG_EDGE; - } else { - /* PPIs */ - irq->config = VGIC_CONFIG_LEVEL; - } - } if (kvm_vgic_global_state.type == VGIC_V2) vgic_v2_enable(vcpu); else @@ -262,6 +278,18 @@ int vgic_init(struct kvm *kvm) vgic_debug_init(kvm); dist->initialized = true; + + /* + * If we're initializing GICv2 on-demand when first running the VCPU + * then we need to load the VGIC state onto the CPU. We can detect + * this easily by checking if we are in between vcpu_load and vcpu_put + * when we just initialized the VGIC. + */ + preempt_disable(); + vcpu = kvm_arm_get_running_vcpu(); + if (vcpu) + kvm_vgic_load(vcpu); + preempt_enable(); out: return ret; } diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c index b637d9c7afe3..a65757aab6d3 100644 --- a/virt/kvm/arm/vgic/vgic-v2.c +++ b/virt/kvm/arm/vgic/vgic-v2.c @@ -22,20 +22,6 @@ #include "vgic.h" -/* - * Call this function to convert a u64 value to an unsigned long * bitmask - * in a way that works on both 32-bit and 64-bit LE and BE platforms. - * - * Warning: Calling this function may modify *val. - */ -static unsigned long *u64_to_bitmask(u64 *val) -{ -#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32 - *val = (*val >> 32) | (*val << 32); -#endif - return (unsigned long *)val; -} - static inline void vgic_v2_write_lr(int lr, u32 val) { void __iomem *base = kvm_vgic_global_state.vctrl_base; @@ -51,45 +37,17 @@ void vgic_v2_init_lrs(void) vgic_v2_write_lr(i, 0); } -void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu) +void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - if (cpuif->vgic_misr & GICH_MISR_EOI) { - u64 eisr = cpuif->vgic_eisr; - unsigned long *eisr_bmap = u64_to_bitmask(&eisr); - int lr; - - for_each_set_bit(lr, eisr_bmap, kvm_vgic_global_state.nr_lr) { - u32 intid = cpuif->vgic_lr[lr] & GICH_LR_VIRTUALID; - - WARN_ON(cpuif->vgic_lr[lr] & GICH_LR_STATE); - - /* Only SPIs require notification */ - if (vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - } - } - - /* check and disable underflow maintenance IRQ */ - cpuif->vgic_hcr &= ~GICH_HCR_UIE; - - /* - * In the next iterations of the vcpu loop, if we sync the - * vgic state after flushing it, but before entering the guest - * (this happens for pending signals and vmid rollovers), then - * make sure we don't pick up any old maintenance interrupts - * here. - */ - cpuif->vgic_eisr = 0; + cpuif->vgic_hcr |= GICH_HCR_UIE; } -void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) +static bool lr_signals_eoi_mi(u32 lr_val) { - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; - - cpuif->vgic_hcr |= GICH_HCR_UIE; + return !(lr_val & GICH_LR_STATE) && (lr_val & GICH_LR_EOI) && + !(lr_val & GICH_LR_HW); } /* @@ -101,14 +59,22 @@ void vgic_v2_set_underflow(struct kvm_vcpu *vcpu) */ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) { - struct vgic_v2_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v2_cpu_if *cpuif = &vgic_cpu->vgic_v2; int lr; - for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + cpuif->vgic_hcr &= ~GICH_HCR_UIE; + + for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u32 val = cpuif->vgic_lr[lr]; u32 intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; + /* Notify fds when the guest EOI'ed a level-triggered SPI */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); spin_lock(&irq->irq_lock); @@ -141,6 +107,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } + + vgic_cpu->used_lrs = 0; } /* @@ -199,6 +167,7 @@ void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr) void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; u32 vmcr; vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK; @@ -209,12 +178,15 @@ void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr |= ((vmcrp->pmr >> GICV_PMR_PRIORITY_SHIFT) << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK; - vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr; + cpu_if->vgic_vmcr = vmcr; } void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr; + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT; @@ -390,3 +362,19 @@ out: return ret; } + +void vgic_v2_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + + writel_relaxed(cpu_if->vgic_vmcr, vgic->vctrl_base + GICH_VMCR); +} + +void vgic_v2_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2; + struct vgic_dist *vgic = &vcpu->kvm->arch.vgic; + + cpu_if->vgic_vmcr = readl_relaxed(vgic->vctrl_base + GICH_VMCR); +} diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c index be0f4c3e0142..df1503650300 100644 --- a/virt/kvm/arm/vgic/vgic-v3.c +++ b/virt/kvm/arm/vgic/vgic-v3.c @@ -21,59 +21,29 @@ #include "vgic.h" -void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu) +void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) { struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - u32 model = vcpu->kvm->arch.vgic.vgic_model; - - if (cpuif->vgic_misr & ICH_MISR_EOI) { - unsigned long eisr_bmap = cpuif->vgic_eisr; - int lr; - - for_each_set_bit(lr, &eisr_bmap, kvm_vgic_global_state.nr_lr) { - u32 intid; - u64 val = cpuif->vgic_lr[lr]; - - if (model == KVM_DEV_TYPE_ARM_VGIC_V3) - intid = val & ICH_LR_VIRTUAL_ID_MASK; - else - intid = val & GICH_LR_VIRTUALID; - - WARN_ON(cpuif->vgic_lr[lr] & ICH_LR_STATE); - - /* Only SPIs require notification */ - if (vgic_valid_spi(vcpu->kvm, intid)) - kvm_notify_acked_irq(vcpu->kvm, 0, - intid - VGIC_NR_PRIVATE_IRQS); - } - - /* - * In the next iterations of the vcpu loop, if we sync - * the vgic state after flushing it, but before - * entering the guest (this happens for pending - * signals and vmid rollovers), then make sure we - * don't pick up any old maintenance interrupts here. - */ - cpuif->vgic_eisr = 0; - } - cpuif->vgic_hcr &= ~ICH_HCR_UIE; + cpuif->vgic_hcr |= ICH_HCR_UIE; } -void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) +static bool lr_signals_eoi_mi(u64 lr_val) { - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; - - cpuif->vgic_hcr |= ICH_HCR_UIE; + return !(lr_val & ICH_LR_STATE) && (lr_val & ICH_LR_EOI) && + !(lr_val & ICH_LR_HW); } void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) { - struct vgic_v3_cpu_if *cpuif = &vcpu->arch.vgic_cpu.vgic_v3; + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + struct vgic_v3_cpu_if *cpuif = &vgic_cpu->vgic_v3; u32 model = vcpu->kvm->arch.vgic.vgic_model; int lr; - for (lr = 0; lr < vcpu->arch.vgic_cpu.used_lrs; lr++) { + cpuif->vgic_hcr &= ~ICH_HCR_UIE; + + for (lr = 0; lr < vgic_cpu->used_lrs; lr++) { u64 val = cpuif->vgic_lr[lr]; u32 intid; struct vgic_irq *irq; @@ -82,6 +52,12 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) intid = val & ICH_LR_VIRTUAL_ID_MASK; else intid = val & GICH_LR_VIRTUALID; + + /* Notify fds when the guest EOI'ed a level-triggered IRQ */ + if (lr_signals_eoi_mi(val) && vgic_valid_spi(vcpu->kvm, intid)) + kvm_notify_acked_irq(vcpu->kvm, 0, + intid - VGIC_NR_PRIVATE_IRQS); + irq = vgic_get_irq(vcpu->kvm, vcpu, intid); if (!irq) /* An LPI could have been unmapped. */ continue; @@ -117,6 +93,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); } + + vgic_cpu->used_lrs = 0; } /* Requires the irq to be locked already */ @@ -173,6 +151,7 @@ void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr) void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; u32 vmcr; /* @@ -188,12 +167,15 @@ void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) vmcr |= (vmcrp->grpen0 << ICH_VMCR_ENG0_SHIFT) & ICH_VMCR_ENG0_MASK; vmcr |= (vmcrp->grpen1 << ICH_VMCR_ENG1_SHIFT) & ICH_VMCR_ENG1_MASK; - vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr; + cpu_if->vgic_vmcr = vmcr; } void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp) { - u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr; + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + u32 vmcr; + + vmcr = cpu_if->vgic_vmcr; /* * Ignore the FIQen bit, because GIC emulation always implies @@ -386,3 +368,24 @@ int vgic_v3_probe(const struct gic_kvm_info *info) return 0; } + +void vgic_v3_load(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + /* + * If dealing with a GICv2 emulation on GICv3, VMCR_EL2.VFIQen + * is dependent on ICC_SRE_EL1.SRE, and we have to perform the + * VMCR_EL2 save/restore in the world switch. + */ + if (likely(cpu_if->vgic_sre)) + kvm_call_hyp(__vgic_v3_write_vmcr, cpu_if->vgic_vmcr); +} + +void vgic_v3_put(struct kvm_vcpu *vcpu) +{ + struct vgic_v3_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v3; + + if (likely(cpu_if->vgic_sre)) + cpu_if->vgic_vmcr = kvm_call_hyp(__vgic_v3_read_vmcr); +} diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c index 7713d96e85b7..4346bc7d08dc 100644 --- a/virt/kvm/arm/vgic/vgic.c +++ b/virt/kvm/arm/vgic/vgic.c @@ -529,14 +529,6 @@ retry: spin_unlock(&vgic_cpu->ap_list_lock); } -static inline void vgic_process_maintenance_interrupt(struct kvm_vcpu *vcpu) -{ - if (kvm_vgic_global_state.type == VGIC_V2) - vgic_v2_process_maintenance(vcpu); - else - vgic_v3_process_maintenance(vcpu); -} - static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu) { if (kvm_vgic_global_state.type == VGIC_V2) @@ -603,10 +595,8 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) DEBUG_SPINLOCK_BUG_ON(!spin_is_locked(&vgic_cpu->ap_list_lock)); - if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) { - vgic_set_underflow(vcpu); + if (compute_ap_list_depth(vcpu) > kvm_vgic_global_state.nr_lr) vgic_sort_ap_list(vcpu); - } list_for_each_entry(irq, &vgic_cpu->ap_list_head, ap_list) { spin_lock(&irq->irq_lock); @@ -625,8 +615,12 @@ static void vgic_flush_lr_state(struct kvm_vcpu *vcpu) next: spin_unlock(&irq->irq_lock); - if (count == kvm_vgic_global_state.nr_lr) + if (count == kvm_vgic_global_state.nr_lr) { + if (!list_is_last(&irq->ap_list, + &vgic_cpu->ap_list_head)) + vgic_set_underflow(vcpu); break; + } } vcpu->arch.vgic_cpu.used_lrs = count; @@ -639,18 +633,30 @@ next: /* Sync back the hardware VGIC state into our emulation after a guest's run. */ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; + + /* An empty ap_list_head implies used_lrs == 0 */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; - vgic_process_maintenance_interrupt(vcpu); - vgic_fold_lr_state(vcpu); + if (vgic_cpu->used_lrs) + vgic_fold_lr_state(vcpu); vgic_prune_ap_list(vcpu); } /* Flush our emulation state into the GIC hardware before entering the guest. */ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) { - if (unlikely(!vgic_initialized(vcpu->kvm))) + /* + * If there are no virtual interrupts active or pending for this + * VCPU, then there is no work to do and we can bail out without + * taking any lock. There is a potential race with someone injecting + * interrupts to the VCPU, but it is a benign race as the VCPU will + * either observe the new interrupt before or after doing this check, + * and introducing additional synchronization mechanism doesn't change + * this. + */ + if (list_empty(&vcpu->arch.vgic_cpu.ap_list_head)) return; spin_lock(&vcpu->arch.vgic_cpu.ap_list_lock); @@ -658,6 +664,28 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu) spin_unlock(&vcpu->arch.vgic_cpu.ap_list_lock); } +void kvm_vgic_load(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_load(vcpu); + else + vgic_v3_load(vcpu); +} + +void kvm_vgic_put(struct kvm_vcpu *vcpu) +{ + if (unlikely(!vgic_initialized(vcpu->kvm))) + return; + + if (kvm_vgic_global_state.type == VGIC_V2) + vgic_v2_put(vcpu); + else + vgic_v3_put(vcpu); +} + int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu) { struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu; diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h index 6cf557e9f718..799fd651b260 100644 --- a/virt/kvm/arm/vgic/vgic.h +++ b/virt/kvm/arm/vgic/vgic.h @@ -119,7 +119,6 @@ void vgic_kick_vcpus(struct kvm *kvm); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment); -void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v2_clear_lr(struct kvm_vcpu *vcpu, int lr); @@ -138,6 +137,8 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address, enum vgic_type); void vgic_v2_init_lrs(void); +void vgic_v2_load(struct kvm_vcpu *vcpu); +void vgic_v2_put(struct kvm_vcpu *vcpu); static inline void vgic_get_irq_kref(struct vgic_irq *irq) { @@ -147,7 +148,6 @@ static inline void vgic_get_irq_kref(struct vgic_irq *irq) kref_get(&irq->refcount); } -void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu); void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu); void vgic_v3_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr); void vgic_v3_clear_lr(struct kvm_vcpu *vcpu, int lr); @@ -159,6 +159,9 @@ int vgic_v3_probe(const struct gic_kvm_info *info); int vgic_v3_map_resources(struct kvm *kvm); int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address); +void vgic_v3_load(struct kvm_vcpu *vcpu); +void vgic_v3_put(struct kvm_vcpu *vcpu); + int vgic_register_its_iodevs(struct kvm *kvm); bool vgic_has_its(struct kvm *kvm); int kvm_vgic_register_its_device(void); diff --git a/virt/kvm/eventfd.c b/virt/kvm/eventfd.c index 4d28a9ddbee0..a8d540398bbd 100644 --- a/virt/kvm/eventfd.c +++ b/virt/kvm/eventfd.c @@ -490,7 +490,7 @@ void kvm_register_irq_ack_notifier(struct kvm *kvm, mutex_lock(&kvm->irq_lock); hlist_add_head_rcu(&kian->link, &kvm->irq_ack_notifier_list); mutex_unlock(&kvm->irq_lock); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } void kvm_unregister_irq_ack_notifier(struct kvm *kvm, @@ -500,7 +500,7 @@ void kvm_unregister_irq_ack_notifier(struct kvm *kvm, hlist_del_init_rcu(&kian->link); mutex_unlock(&kvm->irq_lock); synchronize_srcu(&kvm->irq_srcu); - kvm_vcpu_request_scan_ioapic(kvm); + kvm_arch_post_irq_ack_notifier_list_update(kvm); } #endif diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c index 3bcc9990adf7..31e40c9e81df 100644 --- a/virt/kvm/irqchip.c +++ b/virt/kvm/irqchip.c @@ -142,8 +142,8 @@ static int setup_routing_entry(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, const struct kvm_irq_routing_entry *ue) { - int r = -EINVAL; struct kvm_kernel_irq_routing_entry *ei; + int r; /* * Do not allow GSI to be mapped to the same irqchip more than once. @@ -153,26 +153,30 @@ static int setup_routing_entry(struct kvm *kvm, if (ei->type != KVM_IRQ_ROUTING_IRQCHIP || ue->type != KVM_IRQ_ROUTING_IRQCHIP || ue->u.irqchip.irqchip == ei->irqchip.irqchip) - return r; + return -EINVAL; e->gsi = ue->gsi; e->type = ue->type; r = kvm_set_routing_entry(kvm, e, ue); if (r) - goto out; + return r; if (e->type == KVM_IRQ_ROUTING_IRQCHIP) rt->chip[e->irqchip.irqchip][e->irqchip.pin] = e->gsi; hlist_add_head(&e->link, &rt->map[e->gsi]); - r = 0; -out: - return r; + + return 0; } void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm) { } +bool __weak kvm_arch_can_set_irq_routing(struct kvm *kvm) +{ + return true; +} + int kvm_set_irq_routing(struct kvm *kvm, const struct kvm_irq_routing_entry *ue, unsigned nr, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index aca22d36be9c..b3d151ee2a67 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -165,6 +165,24 @@ void vcpu_put(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(vcpu_put); +/* TODO: merge with kvm_arch_vcpu_should_kick */ +static bool kvm_request_needs_ipi(struct kvm_vcpu *vcpu, unsigned req) +{ + int mode = kvm_vcpu_exiting_guest_mode(vcpu); + + /* + * We need to wait for the VCPU to reenable interrupts and get out of + * READING_SHADOW_PAGE_TABLES mode. + */ + if (req & KVM_REQUEST_WAIT) + return mode != OUTSIDE_GUEST_MODE; + + /* + * Need to kick a running VCPU, but otherwise there is nothing to do. + */ + return mode == IN_GUEST_MODE; +} + static void ack_flush(void *_completed) { } @@ -174,6 +192,7 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) int i, cpu, me; cpumask_var_t cpus; bool called = true; + bool wait = req & KVM_REQUEST_WAIT; struct kvm_vcpu *vcpu; zalloc_cpumask_var(&cpus, GFP_ATOMIC); @@ -183,17 +202,17 @@ bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req) kvm_make_request(req, vcpu); cpu = vcpu->cpu; - /* Set ->requests bit before we read ->mode. */ - smp_mb__after_atomic(); + if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu)) + continue; if (cpus != NULL && cpu != -1 && cpu != me && - kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE) + kvm_request_needs_ipi(vcpu, req)) cpumask_set_cpu(cpu, cpus); } if (unlikely(cpus == NULL)) - smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1); + smp_call_function_many(cpu_online_mask, ack_flush, NULL, wait); else if (!cpumask_empty(cpus)) - smp_call_function_many(cpus, ack_flush, NULL, 1); + smp_call_function_many(cpus, ack_flush, NULL, wait); else called = false; put_cpu(); @@ -1007,8 +1026,6 @@ int __kvm_set_memory_region(struct kvm *kvm, old_memslots = install_new_memslots(kvm, as_id, slots); - /* slot was deleted or moved, clear iommu mapping */ - kvm_iommu_unmap_pages(kvm, &old); /* From this point no new shadow pages pointing to a deleted, * or moved, memslot will be created. * @@ -1043,21 +1060,6 @@ int __kvm_set_memory_region(struct kvm *kvm, kvm_free_memslot(kvm, &old, &new); kvfree(old_memslots); - - /* - * IOMMU mapping: New slots need to be mapped. Old slots need to be - * un-mapped and re-mapped if their base changes. Since base change - * unmapping is handled above with slot deletion, mapping alone is - * needed here. Anything else the iommu might care about for existing - * slots (size changes, userspace addr changes and read-only flag - * changes) is disallowed above, so any other attribute changes getting - * here can be skipped. - */ - if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) { - r = kvm_iommu_map_pages(kvm, &new); - return r; - } - return 0; out_slots: @@ -1961,18 +1963,18 @@ static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots, return 0; } -int kvm_vcpu_gfn_to_hva_cache_init(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, +int kvm_gfn_to_hva_cache_init(struct kvm *kvm, struct gfn_to_hva_cache *ghc, gpa_t gpa, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva_cache_init); +EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init); -int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, int offset, unsigned long len) +int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, int offset, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; gpa_t gpa = ghc->gpa + offset; @@ -1982,7 +1984,7 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_write_guest(vcpu, gpa, data, len); + return kvm_write_guest(kvm, gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -1994,19 +1996,19 @@ int kvm_vcpu_write_guest_offset_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_ return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_offset_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached); -int kvm_vcpu_write_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - return kvm_vcpu_write_guest_offset_cached(vcpu, ghc, data, 0, len); + return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len); } -EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_cached); +EXPORT_SYMBOL_GPL(kvm_write_guest_cached); -int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *ghc, - void *data, unsigned long len) +int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc, + void *data, unsigned long len) { - struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memslots *slots = kvm_memslots(kvm); int r; BUG_ON(len > ghc->len); @@ -2015,7 +2017,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g __kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len); if (unlikely(!ghc->memslot)) - return kvm_vcpu_read_guest(vcpu, ghc->gpa, data, len); + return kvm_read_guest(kvm, ghc->gpa, data, len); if (kvm_is_error_hva(ghc->hva)) return -EFAULT; @@ -2026,7 +2028,7 @@ int kvm_vcpu_read_guest_cached(struct kvm_vcpu *vcpu, struct gfn_to_hva_cache *g return 0; } -EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_cached); +EXPORT_SYMBOL_GPL(kvm_read_guest_cached); int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) { @@ -2200,8 +2202,7 @@ out: } EXPORT_SYMBOL_GPL(kvm_vcpu_block); -#ifndef CONFIG_S390 -void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) +bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) { struct swait_queue_head *wqp; @@ -2209,11 +2210,14 @@ void kvm_vcpu_wake_up(struct kvm_vcpu *vcpu) if (swait_active(wqp)) { swake_up(wqp); ++vcpu->stat.halt_wakeup; + return true; } + return false; } EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up); +#ifndef CONFIG_S390 /* * Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode. */ @@ -2222,7 +2226,9 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu) int me; int cpu = vcpu->cpu; - kvm_vcpu_wake_up(vcpu); + if (kvm_vcpu_wake_up(vcpu)) + return; + me = get_cpu(); if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) if (kvm_arch_vcpu_should_kick(vcpu)) @@ -2354,7 +2360,7 @@ static int kvm_vcpu_fault(struct vm_fault *vmf) else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET) page = virt_to_page(vcpu->arch.pio_data); #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET) page = virt_to_page(vcpu->kvm->coalesced_mmio_ring); #endif @@ -2923,6 +2929,10 @@ static long kvm_vm_ioctl_check_extension_generic(struct kvm *kvm, long arg) case KVM_CAP_IOEVENTFD_ANY_LENGTH: case KVM_CAP_CHECK_EXTENSION_VM: return 1; +#ifdef CONFIG_KVM_MMIO + case KVM_CAP_COALESCED_MMIO: + return KVM_COALESCED_MMIO_PAGE_OFFSET; +#endif #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING case KVM_CAP_IRQ_ROUTING: return KVM_MAX_IRQ_ROUTES; @@ -2972,7 +2982,7 @@ static long kvm_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_get_dirty_log(kvm, &log); break; } -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO case KVM_REGISTER_COALESCED_MMIO: { struct kvm_coalesced_mmio_zone zone; @@ -3055,6 +3065,8 @@ static long kvm_vm_ioctl(struct file *filp, if (copy_from_user(&routing, argp, sizeof(routing))) goto out; r = -EINVAL; + if (!kvm_arch_can_set_irq_routing(kvm)) + goto out; if (routing.nr > KVM_MAX_IRQ_ROUTES) goto out; if (routing.flags) @@ -3164,7 +3176,7 @@ static int kvm_dev_ioctl_create_vm(unsigned long type) kvm = kvm_create_vm(type); if (IS_ERR(kvm)) return PTR_ERR(kvm); -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r = kvm_coalesced_mmio_init(kvm); if (r < 0) { kvm_put_kvm(kvm); @@ -3217,7 +3229,7 @@ static long kvm_dev_ioctl(struct file *filp, #ifdef CONFIG_X86 r += PAGE_SIZE; /* pio data page */ #endif -#ifdef KVM_COALESCED_MMIO_PAGE_OFFSET +#ifdef CONFIG_KVM_MMIO r += PAGE_SIZE; /* coalesced mmio ring page */ #endif break; diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index d32f239eb471..37d9118fd84b 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -20,6 +20,10 @@ #include <linux/vfio.h> #include "vfio.h" +#ifdef CONFIG_SPAPR_TCE_IOMMU +#include <asm/kvm_ppc.h> +#endif + struct kvm_vfio_group { struct list_head node; struct vfio_group *vfio_group; @@ -89,6 +93,47 @@ static bool kvm_vfio_group_is_coherent(struct vfio_group *vfio_group) return ret > 0; } +#ifdef CONFIG_SPAPR_TCE_IOMMU +static int kvm_vfio_external_user_iommu_id(struct vfio_group *vfio_group) +{ + int (*fn)(struct vfio_group *); + int ret = -EINVAL; + + fn = symbol_get(vfio_external_user_iommu_id); + if (!fn) + return ret; + + ret = fn(vfio_group); + + symbol_put(vfio_external_user_iommu_id); + + return ret; +} + +static struct iommu_group *kvm_vfio_group_get_iommu_group( + struct vfio_group *group) +{ + int group_id = kvm_vfio_external_user_iommu_id(group); + + if (group_id < 0) + return NULL; + + return iommu_group_get_by_id(group_id); +} + +static void kvm_spapr_tce_release_vfio_group(struct kvm *kvm, + struct vfio_group *vfio_group) +{ + struct iommu_group *grp = kvm_vfio_group_get_iommu_group(vfio_group); + + if (WARN_ON_ONCE(!grp)) + return; + + kvm_spapr_tce_release_iommu_group(kvm, grp); + iommu_group_put(grp); +} +#endif + /* * Groups can use the same or different IOMMU domains. If the same then * adding a new group may change the coherency of groups we've previously @@ -211,6 +256,9 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) mutex_unlock(&kv->lock); +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, vfio_group); +#endif kvm_vfio_group_set_kvm(vfio_group, NULL); kvm_vfio_group_put_external_user(vfio_group); @@ -218,6 +266,57 @@ static int kvm_vfio_set_group(struct kvm_device *dev, long attr, u64 arg) kvm_vfio_update_coherency(dev); return ret; + +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: { + struct kvm_vfio_spapr_tce param; + struct kvm_vfio *kv = dev->private; + struct vfio_group *vfio_group; + struct kvm_vfio_group *kvg; + struct fd f; + struct iommu_group *grp; + + if (copy_from_user(¶m, (void __user *)arg, + sizeof(struct kvm_vfio_spapr_tce))) + return -EFAULT; + + f = fdget(param.groupfd); + if (!f.file) + return -EBADF; + + vfio_group = kvm_vfio_group_get_external_user(f.file); + fdput(f); + + if (IS_ERR(vfio_group)) + return PTR_ERR(vfio_group); + + grp = kvm_vfio_group_get_iommu_group(vfio_group); + if (WARN_ON_ONCE(!grp)) { + kvm_vfio_group_put_external_user(vfio_group); + return -EIO; + } + + ret = -ENOENT; + + mutex_lock(&kv->lock); + + list_for_each_entry(kvg, &kv->group_list, node) { + if (kvg->vfio_group != vfio_group) + continue; + + ret = kvm_spapr_tce_attach_iommu_group(dev->kvm, + param.tablefd, grp); + break; + } + + mutex_unlock(&kv->lock); + + iommu_group_put(grp); + kvm_vfio_group_put_external_user(vfio_group); + + return ret; + } +#endif /* CONFIG_SPAPR_TCE_IOMMU */ } return -ENXIO; @@ -242,6 +341,9 @@ static int kvm_vfio_has_attr(struct kvm_device *dev, switch (attr->attr) { case KVM_DEV_VFIO_GROUP_ADD: case KVM_DEV_VFIO_GROUP_DEL: +#ifdef CONFIG_SPAPR_TCE_IOMMU + case KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: +#endif return 0; } @@ -257,6 +359,9 @@ static void kvm_vfio_destroy(struct kvm_device *dev) struct kvm_vfio_group *kvg, *tmp; list_for_each_entry_safe(kvg, tmp, &kv->group_list, node) { +#ifdef CONFIG_SPAPR_TCE_IOMMU + kvm_spapr_tce_release_vfio_group(dev->kvm, kvg->vfio_group); +#endif kvm_vfio_group_set_kvm(kvg->vfio_group, NULL); kvm_vfio_group_put_external_user(kvg->vfio_group); list_del(&kvg->node); |