summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--.cocciconfig3
-rw-r--r--.gitignore2
-rw-r--r--Documentation/PCI/MSI-HOWTO.txt469
-rw-r--r--Documentation/coccinelle.txt148
-rw-r--r--Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt7
-rw-r--r--Documentation/devicetree/bindings/mtd/atmel-quadspi.txt32
-rw-r--r--Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt1
-rw-r--r--Documentation/devicetree/bindings/mtd/cadence-quadspi.txt56
-rw-r--r--Documentation/devicetree/bindings/mtd/gpmc-nand.txt2
-rw-r--r--Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt24
-rw-r--r--Documentation/devicetree/bindings/mtd/mtk-nand.txt160
-rw-r--r--Documentation/devicetree/bindings/mtd/sunxi-nand.txt6
-rw-r--r--Documentation/devicetree/bindings/pci/aardvark-pci.txt56
-rw-r--r--Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt46
-rw-r--r--Documentation/dontdiff1
-rw-r--r--Documentation/gcc-plugins.txt87
-rw-r--r--Documentation/kernel-parameters.txt9
-rw-r--r--Documentation/virtual/kvm/api.txt82
-rw-r--r--Documentation/virtual/kvm/devices/arm-vgic.txt25
-rw-r--r--Documentation/virtual/kvm/devices/vm.txt87
-rw-r--r--Documentation/virtual/kvm/locking.txt4
-rw-r--r--MAINTAINERS26
-rw-r--r--Makefile45
-rw-r--r--arch/Kconfig37
-rw-r--r--arch/alpha/boot/Makefile2
-rw-r--r--arch/arm/Kconfig3
-rw-r--r--arch/arm/include/asm/kvm_asm.h2
-rw-r--r--arch/arm/include/asm/kvm_host.h27
-rw-r--r--arch/arm/include/asm/kvm_hyp.h3
-rw-r--r--arch/arm/include/asm/kvm_mmu.h15
-rw-r--r--arch/arm/include/asm/mach/pci.h1
-rw-r--r--arch/arm/include/asm/pgtable.h4
-rw-r--r--arch/arm/include/asm/virt.h4
-rw-r--r--arch/arm/kernel/bios32.c45
-rw-r--r--arch/arm/kvm/Kconfig7
-rw-r--r--arch/arm/kvm/Makefile6
-rw-r--r--arch/arm/kvm/arm.c46
-rw-r--r--arch/arm/kvm/emulate.c2
-rw-r--r--arch/arm/kvm/guest.c2
-rw-r--r--arch/arm/kvm/init.S56
-rw-r--r--arch/arm/kvm/mmu.c142
-rw-r--r--arch/arm/kvm/reset.c2
-rw-r--r--arch/arm64/Kconfig7
-rw-r--r--arch/arm64/boot/dts/marvell/armada-3720-db.dts5
-rw-r--r--arch/arm64/boot/dts/marvell/armada-37xx.dtsi25
-rw-r--r--arch/arm64/include/asm/cpufeature.h3
-rw-r--r--arch/arm64/include/asm/kvm_arm.h2
-rw-r--r--arch/arm64/include/asm/kvm_host.h19
-rw-r--r--arch/arm64/include/asm/kvm_hyp.h23
-rw-r--r--arch/arm64/include/asm/kvm_mmu.h96
-rw-r--r--arch/arm64/include/asm/pgtable-hwdef.h1
-rw-r--r--arch/arm64/include/asm/pgtable-prot.h4
-rw-r--r--arch/arm64/include/asm/virt.h4
-rw-r--r--arch/arm64/include/uapi/asm/kvm.h2
-rw-r--r--arch/arm64/kernel/cpufeature.c19
-rw-r--r--arch/arm64/kernel/pci.c159
-rw-r--r--arch/arm64/kvm/Kconfig8
-rw-r--r--arch/arm64/kvm/Makefile9
-rw-r--r--arch/arm64/kvm/guest.c2
-rw-r--r--arch/arm64/kvm/hyp-init.S61
-rw-r--r--arch/arm64/kvm/hyp/entry.S19
-rw-r--r--arch/arm64/kvm/hyp/hyp-entry.S15
-rw-r--r--arch/arm64/kvm/hyp/switch.c11
-rw-r--r--arch/arm64/kvm/reset.c38
-rw-r--r--arch/arm64/kvm/sys_regs.c4
-rw-r--r--arch/cris/arch-v10/drivers/axisflashmap.c2
-rw-r--r--arch/cris/arch-v32/drivers/axisflashmap.c2
-rw-r--r--arch/microblaze/include/asm/pci.h3
-rw-r--r--arch/microblaze/pci/pci-common.c73
-rw-r--r--arch/mips/Kconfig2
-rw-r--r--arch/mips/include/asm/addrspace.h2
-rw-r--r--arch/mips/include/asm/kvm_host.h315
-rw-r--r--arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h2
-rw-r--r--arch/mips/include/asm/mipsregs.h21
-rw-r--r--arch/mips/include/asm/pci.h10
-rw-r--r--arch/mips/include/asm/setup.h1
-rw-r--r--arch/mips/include/asm/uasm.h7
-rw-r--r--arch/mips/include/uapi/asm/inst.h114
-rw-r--r--arch/mips/kernel/asm-offsets.c70
-rw-r--r--arch/mips/kernel/branch.c8
-rw-r--r--arch/mips/kernel/traps.c23
-rw-r--r--arch/mips/kvm/Kconfig1
-rw-r--r--arch/mips/kvm/Makefile3
-rw-r--r--arch/mips/kvm/commpage.c2
-rw-r--r--arch/mips/kvm/dyntrans.c182
-rw-r--r--arch/mips/kvm/emulate.c485
-rw-r--r--arch/mips/kvm/entry.c701
-rw-r--r--arch/mips/kvm/fpu.S7
-rw-r--r--arch/mips/kvm/interrupt.c12
-rw-r--r--arch/mips/kvm/interrupt.h14
-rw-r--r--arch/mips/kvm/locore.S605
-rw-r--r--arch/mips/kvm/mips.c367
-rw-r--r--arch/mips/kvm/mmu.c375
-rw-r--r--arch/mips/kvm/stats.c21
-rw-r--r--arch/mips/kvm/tlb.c498
-rw-r--r--arch/mips/kvm/trace.h236
-rw-r--r--arch/mips/kvm/trap_emul.c178
-rw-r--r--arch/mips/math-emu/cp1emu.c8
-rw-r--r--arch/mips/mm/c-r4k.c2
-rw-r--r--arch/mips/mm/uasm-micromips.c13
-rw-r--r--arch/mips/mm/uasm-mips.c11
-rw-r--r--arch/mips/mm/uasm.c24
-rw-r--r--arch/mips/pci/pci.c19
-rw-r--r--arch/powerpc/boot/Makefile2
-rw-r--r--arch/powerpc/include/asm/hmi.h45
-rw-r--r--arch/powerpc/include/asm/paca.h6
-rw-r--r--arch/powerpc/include/asm/pci.h3
-rw-r--r--arch/powerpc/kernel/Makefile2
-rw-r--r--arch/powerpc/kernel/exceptions-64s.S4
-rw-r--r--arch/powerpc/kernel/hmi.c56
-rw-r--r--arch/powerpc/kernel/idle_book3s.S4
-rw-r--r--arch/powerpc/kernel/pci-common.c79
-rw-r--r--arch/powerpc/kernel/traps.c5
-rw-r--r--arch/powerpc/kvm/Makefile2
-rw-r--r--arch/powerpc/kvm/book3s_hv.c41
-rw-r--r--arch/powerpc/kvm/book3s_hv_ras.c176
-rw-r--r--arch/powerpc/kvm/book3s_hv_rmhandlers.S527
-rw-r--r--arch/powerpc/kvm/book3s_pr.c16
-rw-r--r--arch/powerpc/kvm/booke.c4
-rw-r--r--arch/powerpc/kvm/emulate.c1
-rw-r--r--arch/powerpc/kvm/mpic.c3
-rw-r--r--arch/powerpc/kvm/powerpc.c6
-rw-r--r--arch/powerpc/platforms/powernv/opal-wrappers.S2
-rw-r--r--arch/s390/boot/compressed/Makefile4
-rw-r--r--arch/s390/hypfs/hypfs_diag.c377
-rw-r--r--arch/s390/include/asm/cpacf.h10
-rw-r--r--arch/s390/include/asm/diag.h149
-rw-r--r--arch/s390/include/asm/gmap.h82
-rw-r--r--arch/s390/include/asm/kvm_host.h33
-rw-r--r--arch/s390/include/asm/mmu.h11
-rw-r--r--arch/s390/include/asm/mmu_context.h3
-rw-r--r--arch/s390/include/asm/page.h9
-rw-r--r--arch/s390/include/asm/pgalloc.h2
-rw-r--r--arch/s390/include/asm/pgtable.h17
-rw-r--r--arch/s390/include/asm/processor.h2
-rw-r--r--arch/s390/include/asm/sclp.h23
-rw-r--r--arch/s390/include/uapi/asm/kvm.h41
-rw-r--r--arch/s390/include/uapi/asm/sie.h1
-rw-r--r--arch/s390/kernel/diag.c39
-rw-r--r--arch/s390/kvm/Makefile2
-rw-r--r--arch/s390/kvm/diag.c5
-rw-r--r--arch/s390/kvm/gaccess.c387
-rw-r--r--arch/s390/kvm/gaccess.h3
-rw-r--r--arch/s390/kvm/guestdbg.c19
-rw-r--r--arch/s390/kvm/intercept.c33
-rw-r--r--arch/s390/kvm/interrupt.c34
-rw-r--r--arch/s390/kvm/kvm-s390.c404
-rw-r--r--arch/s390/kvm/kvm-s390.h22
-rw-r--r--arch/s390/kvm/priv.c226
-rw-r--r--arch/s390/kvm/sigp.c10
-rw-r--r--arch/s390/kvm/sthyi.c471
-rw-r--r--arch/s390/kvm/trace.h49
-rw-r--r--arch/s390/kvm/vsie.c1091
-rw-r--r--arch/s390/mm/fault.c2
-rw-r--r--arch/s390/mm/gmap.c1574
-rw-r--r--arch/s390/mm/pgalloc.c39
-rw-r--r--arch/s390/mm/pgtable.c209
-rw-r--r--arch/sparc/include/asm/pci_64.h3
-rw-r--r--arch/sparc/kernel/pci.c20
-rw-r--r--arch/um/Kconfig.common1
-rw-r--r--arch/um/Makefile4
-rw-r--r--arch/unicore32/kernel/pci.c9
-rw-r--r--arch/x86/Kconfig1
-rw-r--r--arch/x86/boot/Makefile2
-rw-r--r--arch/x86/entry/vdso/Makefile3
-rw-r--r--arch/x86/include/asm/kvm_host.h31
-rw-r--r--arch/x86/include/asm/svm.h1
-rw-r--r--arch/x86/include/asm/virtext.h8
-rw-r--r--arch/x86/kvm/Kconfig1
-rw-r--r--arch/x86/kvm/i8254.c4
-rw-r--r--arch/x86/kvm/iommu.c2
-rw-r--r--arch/x86/kvm/irq_comm.c49
-rw-r--r--arch/x86/kvm/lapic.c539
-rw-r--r--arch/x86/kvm/lapic.h19
-rw-r--r--arch/x86/kvm/mmu.c29
-rw-r--r--arch/x86/kvm/mmu.h5
-rw-r--r--arch/x86/kvm/paging_tmpl.h10
-rw-r--r--arch/x86/kvm/pmu_intel.c2
-rw-r--r--arch/x86/kvm/svm.c8
-rw-r--r--arch/x86/kvm/trace.h15
-rw-r--r--arch/x86/kvm/vmx.c386
-rw-r--r--arch/x86/kvm/x86.c175
-rw-r--r--arch/x86/pci/common.c2
-rw-r--r--arch/x86/pci/vmd.c41
-rw-r--r--arch/x86/purgatory/Makefile2
-rw-r--r--arch/x86/realmode/rm/Makefile2
-rw-r--r--drivers/acpi/Kconfig3
-rw-r--r--drivers/acpi/Makefile1
-rw-r--r--drivers/acpi/pci_mcfg.c92
-rw-r--r--drivers/acpi/pci_root.c35
-rw-r--r--drivers/block/rbd.c15
-rw-r--r--drivers/gpu/drm/drm_dp_helper.c32
-rw-r--r--drivers/gpu/drm/i915/intel_drv.h2
-rw-r--r--drivers/gpu/drm/i915/intel_psr.c19
-rw-r--r--drivers/gpu/drm/i915/intel_sprite.c6
-rw-r--r--drivers/irqchip/Kconfig18
-rw-r--r--drivers/memory/Kconfig2
-rw-r--r--drivers/memory/fsl_ifc.c4
-rw-r--r--drivers/misc/genwqe/card_base.c18
-rw-r--r--drivers/mtd/chips/cfi_cmdset_0020.c2
-rw-r--r--drivers/mtd/devices/Kconfig16
-rw-r--r--drivers/mtd/devices/m25p80.c37
-rw-r--r--drivers/mtd/maps/physmap_of.c2
-rw-r--r--drivers/mtd/maps/pmcmsp-flash.c6
-rw-r--r--drivers/mtd/maps/sa1100-flash.c4
-rw-r--r--drivers/mtd/nand/Kconfig10
-rw-r--r--drivers/mtd/nand/Makefile1
-rw-r--r--drivers/mtd/nand/brcmnand/brcmnand.c173
-rw-r--r--drivers/mtd/nand/jz4780_bch.c2
-rw-r--r--drivers/mtd/nand/jz4780_nand.c2
-rw-r--r--drivers/mtd/nand/mtk_ecc.c530
-rw-r--r--drivers/mtd/nand/mtk_ecc.h50
-rw-r--r--drivers/mtd/nand/mtk_nand.c1526
-rw-r--r--drivers/mtd/nand/nand_base.c2
-rw-r--r--drivers/mtd/nand/nand_ids.c1
-rw-r--r--drivers/mtd/nand/omap2.c11
-rw-r--r--drivers/mtd/nand/sunxi_nand.c397
-rw-r--r--drivers/mtd/nand/xway_nand.c231
-rw-r--r--drivers/mtd/onenand/onenand_base.c4
-rw-r--r--drivers/mtd/spi-nor/Kconfig27
-rw-r--r--drivers/mtd/spi-nor/Makefile3
-rw-r--r--drivers/mtd/spi-nor/atmel-quadspi.c732
-rw-r--r--drivers/mtd/spi-nor/cadence-quadspi.c1299
-rw-r--r--drivers/mtd/spi-nor/fsl-quadspi.c29
-rw-r--r--drivers/mtd/spi-nor/hisi-sfc.c489
-rw-r--r--drivers/mtd/spi-nor/mtk-quadspi.c43
-rw-r--r--drivers/mtd/spi-nor/nxp-spifi.c25
-rw-r--r--drivers/mtd/spi-nor/spi-nor.c127
-rw-r--r--drivers/mtd/ssfdc.c3
-rw-r--r--drivers/mtd/tests/nandbiterrs.c2
-rw-r--r--drivers/net/ethernet/atheros/alx/main.c12
-rw-r--r--drivers/net/ethernet/intel/e1000e/netdev.c6
-rw-r--r--drivers/net/ethernet/intel/fm10k/fm10k_pci.c11
-rw-r--r--drivers/net/ethernet/intel/i40e/i40e_main.c9
-rw-r--r--drivers/net/ethernet/intel/igb/igb_main.c10
-rw-r--r--drivers/net/ethernet/intel/ixgbe/ixgbe_main.c9
-rw-r--r--drivers/nvme/host/pci.c15
-rw-r--r--drivers/pci/Kconfig2
-rw-r--r--drivers/pci/bus.c31
-rw-r--r--drivers/pci/ecam.c6
-rw-r--r--drivers/pci/host/Kconfig49
-rw-r--r--drivers/pci/host/Makefile2
-rw-r--r--drivers/pci/host/pci-aardvark.c1001
-rw-r--r--drivers/pci/host/pci-dra7xx.c4
-rw-r--r--drivers/pci/host/pci-host-common.c44
-rw-r--r--drivers/pci/host/pci-host-generic.c13
-rw-r--r--drivers/pci/host/pci-hyperv.c30
-rw-r--r--drivers/pci/host/pci-keystone.c10
-rw-r--r--drivers/pci/host/pci-layerscape.c10
-rw-r--r--drivers/pci/host/pci-mvebu.c28
-rw-r--r--drivers/pci/host/pci-rcar-gen2.c27
-rw-r--r--drivers/pci/host/pci-tegra.c99
-rw-r--r--drivers/pci/host/pci-thunder-ecam.c11
-rw-r--r--drivers/pci/host/pci-thunder-pem.c14
-rw-r--r--drivers/pci/host/pci-versatile.c29
-rw-r--r--drivers/pci/host/pci-xgene.c24
-rw-r--r--drivers/pci/host/pcie-altera.c83
-rw-r--r--drivers/pci/host/pcie-armada8k.c14
-rw-r--r--drivers/pci/host/pcie-artpec6.c280
-rw-r--r--drivers/pci/host/pcie-designware-plat.c10
-rw-r--r--drivers/pci/host/pcie-designware.c34
-rw-r--r--drivers/pci/host/pcie-hisi.c13
-rw-r--r--drivers/pci/host/pcie-iproc.c4
-rw-r--r--drivers/pci/host/pcie-rcar.c44
-rw-r--r--drivers/pci/host/pcie-xilinx-nwl.c20
-rw-r--r--drivers/pci/host/pcie-xilinx.c22
-rw-r--r--drivers/pci/hotplug/acpiphp_glue.c4
-rw-r--r--drivers/pci/hotplug/pciehp_hpc.c4
-rw-r--r--drivers/pci/msi.c266
-rw-r--r--drivers/pci/pci-driver.c5
-rw-r--r--drivers/pci/pci-sysfs.c5
-rw-r--r--drivers/pci/pci.c281
-rw-r--r--drivers/pci/pci.h11
-rw-r--r--drivers/pci/pcie/Kconfig5
-rw-r--r--drivers/pci/pcie/aspm.c2
-rw-r--r--drivers/pci/pcie/pcie-dpc.c19
-rw-r--r--drivers/pci/pcie/portdrv_core.c3
-rw-r--r--drivers/pci/pcie/portdrv_pci.c52
-rw-r--r--drivers/pci/probe.c22
-rw-r--r--drivers/pci/proc.c9
-rw-r--r--drivers/pci/quirks.c17
-rw-r--r--drivers/pci/remove.c2
-rw-r--r--drivers/pci/setup-bus.c68
-rw-r--r--drivers/s390/char/sclp_early.c12
-rw-r--r--drivers/s390/char/sclp_ocf.c23
-rw-r--r--drivers/scsi/lpfc/lpfc_init.c15
-rw-r--r--drivers/usb/host/xhci-pci.c2
-rw-r--r--fs/ceph/addr.c77
-rw-r--r--fs/ceph/cache.c2
-rw-r--r--fs/ceph/caps.c873
-rw-r--r--fs/ceph/dir.c73
-rw-r--r--fs/ceph/file.c77
-rw-r--r--fs/ceph/inode.c40
-rw-r--r--fs/ceph/ioctl.c30
-rw-r--r--fs/ceph/mds_client.c358
-rw-r--r--fs/ceph/mds_client.h19
-rw-r--r--fs/ceph/snap.c10
-rw-r--r--fs/ceph/super.c43
-rw-r--r--fs/ceph/super.h48
-rw-r--r--fs/ceph/xattr.c101
-rw-r--r--fs/orangefs/downcall.h5
-rw-r--r--fs/orangefs/file.c36
-rw-r--r--fs/orangefs/orangefs-cache.c4
-rw-r--r--fs/orangefs/orangefs-dev-proto.h2
-rw-r--r--fs/orangefs/orangefs-sysfs.c127
-rw-r--r--fs/orangefs/orangefs-utils.c2
-rw-r--r--fs/orangefs/upcall.h12
-rw-r--r--include/asm-generic/vmlinux.lds.h2
-rw-r--r--include/drm/drm_dp_helper.h2
-rw-r--r--include/kvm/arm_vgic.h438
-rw-r--r--include/kvm/vgic/vgic.h246
-rw-r--r--include/linux/ceph/ceph_fs.h55
-rw-r--r--include/linux/ceph/decode.h55
-rw-r--r--include/linux/ceph/libceph.h4
-rw-r--r--include/linux/ceph/mon_client.h7
-rw-r--r--include/linux/ceph/msgpool.h1
-rw-r--r--include/linux/ceph/osd_client.h1
-rw-r--r--include/linux/ceph/osdmap.h15
-rw-r--r--include/linux/ceph/string_table.h62
-rw-r--r--include/linux/context_tracking.h38
-rw-r--r--include/linux/export.h2
-rw-r--r--include/linux/irqchip/arm-gic-v3.h213
-rw-r--r--include/linux/kconfig.h31
-rw-r--r--include/linux/kvm_host.h58
-rw-r--r--include/linux/mtd/nand.h1
-rw-r--r--include/linux/mtd/spi-nor.h8
-rw-r--r--include/linux/page_ref.h9
-rw-r--r--include/linux/pci-acpi.h2
-rw-r--r--include/linux/pci-ecam.h (renamed from drivers/pci/ecam.h)4
-rw-r--r--include/linux/pci.h93
-rw-r--r--include/trace/events/kvm.h5
-rw-r--r--include/uapi/linux/kvm.h13
-rw-r--r--lib/Kconfig.debug2
-rw-r--r--mm/gup.c1
-rw-r--r--net/ceph/Makefile2
-rw-r--r--net/ceph/ceph_common.c2
-rw-r--r--net/ceph/ceph_fs.c30
-rw-r--r--net/ceph/debugfs.c12
-rw-r--r--net/ceph/mon_client.c4
-rw-r--r--net/ceph/msgpool.c1
-rw-r--r--net/ceph/osd_client.c49
-rw-r--r--net/ceph/osdmap.c58
-rw-r--r--net/ceph/string_table.c111
-rw-r--r--scripts/Kbuild.include2
-rw-r--r--scripts/Makefile2
-rw-r--r--scripts/Makefile.build2
-rw-r--r--scripts/Makefile.clean4
-rw-r--r--scripts/Makefile.gcc-plugins43
-rw-r--r--scripts/Makefile.host55
-rw-r--r--scripts/Makefile.lib7
-rw-r--r--scripts/basic/bin2c.c3
-rwxr-xr-xscripts/coccicheck96
-rw-r--r--scripts/coccinelle/free/devm_free.cocci26
-rw-r--r--scripts/coccinelle/free/ifnullfree.cocci4
-rw-r--r--scripts/coccinelle/free/kfree.cocci18
-rw-r--r--scripts/coccinelle/free/kfreeaddr.cocci6
-rw-r--r--scripts/coccinelle/iterators/device_node_continue.cocci3
-rw-r--r--scripts/coccinelle/misc/noderef.cocci18
-rwxr-xr-xscripts/gcc-plugin.sh51
-rw-r--r--scripts/gcc-plugins/Makefile27
-rw-r--r--scripts/gcc-plugins/cyc_complexity_plugin.c73
-rw-r--r--scripts/gcc-plugins/gcc-common.h830
-rw-r--r--scripts/gcc-plugins/gcc-generate-gimple-pass.h175
-rw-r--r--scripts/gcc-plugins/gcc-generate-ipa-pass.h289
-rw-r--r--scripts/gcc-plugins/gcc-generate-rtl-pass.h175
-rw-r--r--scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h175
-rw-r--r--scripts/gcc-plugins/sancov_plugin.c144
-rwxr-xr-xscripts/link-vmlinux.sh2
-rwxr-xr-xscripts/package/builddeb12
-rwxr-xr-xscripts/setlocalversion2
-rw-r--r--virt/kvm/Kconfig3
-rw-r--r--virt/kvm/arm/hyp/vgic-v2-sr.c15
-rw-r--r--virt/kvm/arm/vgic-v2-emul.c856
-rw-r--r--virt/kvm/arm/vgic-v2.c274
-rw-r--r--virt/kvm/arm/vgic-v3-emul.c1074
-rw-r--r--virt/kvm/arm/vgic-v3.c279
-rw-r--r--virt/kvm/arm/vgic.c2417
-rw-r--r--virt/kvm/arm/vgic.h140
-rw-r--r--virt/kvm/arm/vgic/vgic-init.c9
-rw-r--r--virt/kvm/arm/vgic/vgic-its.c1500
-rw-r--r--virt/kvm/arm/vgic/vgic-kvm-device.c22
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v2.c10
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio-v3.c247
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.c64
-rw-r--r--virt/kvm/arm/vgic/vgic-mmio.h31
-rw-r--r--virt/kvm/arm/vgic/vgic-v2.c12
-rw-r--r--virt/kvm/arm/vgic/vgic-v3.c29
-rw-r--r--virt/kvm/arm/vgic/vgic.c119
-rw-r--r--virt/kvm/arm/vgic/vgic.h38
-rw-r--r--virt/kvm/irqchip.c7
-rw-r--r--virt/kvm/kvm_main.c110
391 files changed, 24818 insertions, 11628 deletions
diff --git a/.cocciconfig b/.cocciconfig
new file mode 100644
index 000000000000..43967c6b2015
--- /dev/null
+++ b/.cocciconfig
@@ -0,0 +1,3 @@
+[spatch]
+ options = --timeout 200
+ options = --use-gitgrep
diff --git a/.gitignore b/.gitignore
index 0c320bf02586..c2ed4ecb0acd 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,6 +37,7 @@ modules.builtin
Module.symvers
*.dwo
*.su
+*.c.[012]*.*
#
# Top-level generic files
@@ -66,6 +67,7 @@ Module.symvers
#
!.gitignore
!.mailmap
+!.cocciconfig
#
# Generated include files
diff --git a/Documentation/PCI/MSI-HOWTO.txt b/Documentation/PCI/MSI-HOWTO.txt
index 1179850f453c..c55df2911136 100644
--- a/Documentation/PCI/MSI-HOWTO.txt
+++ b/Documentation/PCI/MSI-HOWTO.txt
@@ -78,422 +78,111 @@ CONFIG_PCI_MSI option.
4.2 Using MSI
-Most of the hard work is done for the driver in the PCI layer. It simply
-has to request that the PCI layer set up the MSI capability for this
+Most of the hard work is done for the driver in the PCI layer. The driver
+simply has to request that the PCI layer set up the MSI capability for this
device.
-4.2.1 pci_enable_msi
+To automatically use MSI or MSI-X interrupt vectors, use the following
+function:
-int pci_enable_msi(struct pci_dev *dev)
+ int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+ unsigned int max_vecs, unsigned int flags);
-A successful call allocates ONE interrupt to the device, regardless
-of how many MSIs the device supports. The device is switched from
-pin-based interrupt mode to MSI mode. The dev->irq number is changed
-to a new number which represents the message signaled interrupt;
-consequently, this function should be called before the driver calls
-request_irq(), because an MSI is delivered via a vector that is
-different from the vector of a pin-based interrupt.
+which allocates up to max_vecs interrupt vectors for a PCI device. It
+returns the number of vectors allocated or a negative error. If the device
+has a requirements for a minimum number of vectors the driver can pass a
+min_vecs argument set to this limit, and the PCI core will return -ENOSPC
+if it can't meet the minimum number of vectors.
-4.2.2 pci_enable_msi_range
+The flags argument should normally be set to 0, but can be used to pass the
+PCI_IRQ_NOMSI and PCI_IRQ_NOMSIX flag in case a device claims to support
+MSI or MSI-X, but the support is broken, or to pass PCI_IRQ_NOLEGACY in
+case the device does not support legacy interrupt lines.
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+By default this function will spread the interrupts around the available
+CPUs, but this feature can be disabled by passing the PCI_IRQ_NOAFFINITY
+flag.
-This function allows a device driver to request any number of MSI
-interrupts within specified range from 'minvec' to 'maxvec'.
+To get the Linux IRQ numbers passed to request_irq() and free_irq() and the
+vectors, use the following function:
-If this function returns a positive number it indicates the number of
-MSI interrupts that have been successfully allocated. In this case
-the device is switched from pin-based interrupt mode to MSI mode and
-updates dev->irq to be the lowest of the new interrupts assigned to it.
-The other interrupts assigned to the device are in the range dev->irq
-to dev->irq + returned value - 1. Device driver can use the returned
-number of successfully allocated MSI interrupts to further allocate
-and initialize device resources.
+ int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to request any more MSI interrupts for
-this device.
+Any allocated resources should be freed before removing the device using
+the following function:
-This function should be called before the driver calls request_irq(),
-because MSI interrupts are delivered via vectors that are different
-from the vector of a pin-based interrupt.
+ void pci_free_irq_vectors(struct pci_dev *dev);
-It is ideal if drivers can cope with a variable number of MSI interrupts;
-there are many reasons why the platform may not be able to provide the
-exact number that a driver asks for.
+If a device supports both MSI-X and MSI capabilities, this API will use the
+MSI-X facilities in preference to the MSI facilities. MSI-X supports any
+number of interrupts between 1 and 2048. In contrast, MSI is restricted to
+a maximum of 32 interrupts (and must be a power of two). In addition, the
+MSI interrupt vectors must be allocated consecutively, so the system might
+not be able to allocate as many vectors for MSI as it could for MSI-X. On
+some platforms, MSI interrupts must all be targeted at the same set of CPUs
+whereas MSI-X interrupts can all be targeted at different CPUs.
-There could be devices that can not operate with just any number of MSI
-interrupts within a range. See chapter 4.3.1.3 to get the idea how to
-handle such devices for MSI-X - the same logic applies to MSI.
+If a device supports neither MSI-X or MSI it will fall back to a single
+legacy IRQ vector.
-4.2.1.1 Maximum possible number of MSI interrupts
+The typical usage of MSI or MSI-X interrupts is to allocate as many vectors
+as possible, likely up to the limit supported by the device. If nvec is
+larger than the number supported by the device it will automatically be
+capped to the supported limit, so there is no need to query the number of
+vectors supported beforehand:
-The typical usage of MSI interrupts is to allocate as many vectors as
-possible, likely up to the limit returned by pci_msi_vec_count() function:
-
-static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec)
-{
- return pci_enable_msi_range(pdev, 1, nvec);
-}
-
-Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive,
-the value of 0 would be meaningless and could result in error.
-
-Some devices have a minimal limit on number of MSI interrupts.
-In this case the function could look like this:
-
-static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec)
-{
- return pci_enable_msi_range(pdev, FOO_DRIVER_MINIMUM_NVEC, nvec);
-}
-
-4.2.1.2 Exact number of MSI interrupts
+ nvec = pci_alloc_irq_vectors(pdev, 1, nvec, 0);
+ if (nvec < 0)
+ goto out_err;
If a driver is unable or unwilling to deal with a variable number of MSI
-interrupts it could request a particular number of interrupts by passing
-that number to pci_enable_msi_range() function as both 'minvec' and 'maxvec'
-parameters:
-
-static int foo_driver_enable_msi(struct pci_dev *pdev, int nvec)
-{
- return pci_enable_msi_range(pdev, nvec, nvec);
-}
-
-Note, unlike pci_enable_msi_exact() function, which could be also used to
-enable a particular number of MSI-X interrupts, pci_enable_msi_range()
-returns either a negative errno or 'nvec' (not negative errno or 0 - as
-pci_enable_msi_exact() does).
-
-4.2.1.3 Single MSI mode
-
-The most notorious example of the request type described above is
-enabling the single MSI mode for a device. It could be done by passing
-two 1s as 'minvec' and 'maxvec':
-
-static int foo_driver_enable_single_msi(struct pci_dev *pdev)
-{
- return pci_enable_msi_range(pdev, 1, 1);
-}
-
-Note, unlike pci_enable_msi() function, which could be also used to
-enable the single MSI mode, pci_enable_msi_range() returns either a
-negative errno or 1 (not negative errno or 0 - as pci_enable_msi()
-does).
-
-4.2.3 pci_enable_msi_exact
-
-int pci_enable_msi_exact(struct pci_dev *dev, int nvec)
-
-This variation on pci_enable_msi_range() call allows a device driver to
-request exactly 'nvec' MSIs.
-
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to request any more MSI interrupts for
-this device.
-
-By contrast with pci_enable_msi_range() function, pci_enable_msi_exact()
-returns zero in case of success, which indicates MSI interrupts have been
-successfully allocated.
-
-4.2.4 pci_disable_msi
-
-void pci_disable_msi(struct pci_dev *dev)
-
-This function should be used to undo the effect of pci_enable_msi_range().
-Calling it restores dev->irq to the pin-based interrupt number and frees
-the previously allocated MSIs. The interrupts may subsequently be assigned
-to another device, so drivers should not cache the value of dev->irq.
-
-Before calling this function, a device driver must always call free_irq()
-on any interrupt for which it previously called request_irq().
-Failure to do so results in a BUG_ON(), leaving the device with
-MSI enabled and thus leaking its vector.
-
-4.2.4 pci_msi_vec_count
-
-int pci_msi_vec_count(struct pci_dev *dev)
-
-This function could be used to retrieve the number of MSI vectors the
-device requested (via the Multiple Message Capable register). The MSI
-specification only allows the returned value to be a power of two,
-up to a maximum of 2^5 (32).
-
-If this function returns a negative number, it indicates the device is
-not capable of sending MSIs.
-
-If this function returns a positive number, it indicates the maximum
-number of MSI interrupt vectors that could be allocated.
-
-4.3 Using MSI-X
-
-The MSI-X capability is much more flexible than the MSI capability.
-It supports up to 2048 interrupts, each of which can be controlled
-independently. To support this flexibility, drivers must use an array of
-`struct msix_entry':
-
-struct msix_entry {
- u16 vector; /* kernel uses to write alloc vector */
- u16 entry; /* driver uses to specify entry */
-};
-
-This allows for the device to use these interrupts in a sparse fashion;
-for example, it could use interrupts 3 and 1027 and yet allocate only a
-two-element array. The driver is expected to fill in the 'entry' value
-in each element of the array to indicate for which entries the kernel
-should assign interrupts; it is invalid to fill in two entries with the
-same number.
-
-4.3.1 pci_enable_msix_range
-
-int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
- int minvec, int maxvec)
-
-Calling this function asks the PCI subsystem to allocate any number of
-MSI-X interrupts within specified range from 'minvec' to 'maxvec'.
-The 'entries' argument is a pointer to an array of msix_entry structs
-which should be at least 'maxvec' entries in size.
-
-On success, the device is switched into MSI-X mode and the function
-returns the number of MSI-X interrupts that have been successfully
-allocated. In this case the 'vector' member in entries numbered from
-0 to the returned value - 1 is populated with the interrupt number;
-the driver should then call request_irq() for each 'vector' that it
-decides to use. The device driver is responsible for keeping track of the
-interrupts assigned to the MSI-X vectors so it can free them again later.
-Device driver can use the returned number of successfully allocated MSI-X
-interrupts to further allocate and initialize device resources.
-
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to allocate any more MSI-X interrupts for
-this device.
-
-This function, in contrast with pci_enable_msi_range(), does not adjust
-dev->irq. The device will not generate interrupts for this interrupt
-number once MSI-X is enabled.
-
-Device drivers should normally call this function once per device
-during the initialization phase.
-
-It is ideal if drivers can cope with a variable number of MSI-X interrupts;
-there are many reasons why the platform may not be able to provide the
-exact number that a driver asks for.
-
-There could be devices that can not operate with just any number of MSI-X
-interrupts within a range. E.g., an network adapter might need let's say
-four vectors per each queue it provides. Therefore, a number of MSI-X
-interrupts allocated should be a multiple of four. In this case interface
-pci_enable_msix_range() can not be used alone to request MSI-X interrupts
-(since it can allocate any number within the range, without any notion of
-the multiple of four) and the device driver should master a custom logic
-to request the required number of MSI-X interrupts.
-
-4.3.1.1 Maximum possible number of MSI-X interrupts
-
-The typical usage of MSI-X interrupts is to allocate as many vectors as
-possible, likely up to the limit returned by pci_msix_vec_count() function:
-
-static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
-{
- return pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
- 1, nvec);
-}
-
-Note the value of 'minvec' parameter is 1. As 'minvec' is inclusive,
-the value of 0 would be meaningless and could result in error.
-
-Some devices have a minimal limit on number of MSI-X interrupts.
-In this case the function could look like this:
-
-static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
-{
- return pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
- FOO_DRIVER_MINIMUM_NVEC, nvec);
-}
-
-4.3.1.2 Exact number of MSI-X interrupts
-
-If a driver is unable or unwilling to deal with a variable number of MSI-X
-interrupts it could request a particular number of interrupts by passing
-that number to pci_enable_msix_range() function as both 'minvec' and 'maxvec'
-parameters:
-
-static int foo_driver_enable_msix(struct foo_adapter *adapter, int nvec)
-{
- return pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
- nvec, nvec);
-}
-
-Note, unlike pci_enable_msix_exact() function, which could be also used to
-enable a particular number of MSI-X interrupts, pci_enable_msix_range()
-returns either a negative errno or 'nvec' (not negative errno or 0 - as
-pci_enable_msix_exact() does).
-
-4.3.1.3 Specific requirements to the number of MSI-X interrupts
-
-As noted above, there could be devices that can not operate with just any
-number of MSI-X interrupts within a range. E.g., let's assume a device that
-is only capable sending the number of MSI-X interrupts which is a power of
-two. A routine that enables MSI-X mode for such device might look like this:
-
-/*
- * Assume 'minvec' and 'maxvec' are non-zero
- */
-static int foo_driver_enable_msix(struct foo_adapter *adapter,
- int minvec, int maxvec)
-{
- int rc;
-
- minvec = roundup_pow_of_two(minvec);
- maxvec = rounddown_pow_of_two(maxvec);
-
- if (minvec > maxvec)
- return -ERANGE;
-
-retry:
- rc = pci_enable_msix_range(adapter->pdev, adapter->msix_entries,
- maxvec, maxvec);
- /*
- * -ENOSPC is the only error code allowed to be analyzed
- */
- if (rc == -ENOSPC) {
- if (maxvec == 1)
- return -ENOSPC;
-
- maxvec /= 2;
-
- if (minvec > maxvec)
- return -ENOSPC;
-
- goto retry;
- }
-
- return rc;
-}
-
-Note how pci_enable_msix_range() return value is analyzed for a fallback -
-any error code other than -ENOSPC indicates a fatal error and should not
-be retried.
-
-4.3.2 pci_enable_msix_exact
-
-int pci_enable_msix_exact(struct pci_dev *dev,
- struct msix_entry *entries, int nvec)
-
-This variation on pci_enable_msix_range() call allows a device driver to
-request exactly 'nvec' MSI-Xs.
-
-If this function returns a negative number, it indicates an error and
-the driver should not attempt to allocate any more MSI-X interrupts for
-this device.
-
-By contrast with pci_enable_msix_range() function, pci_enable_msix_exact()
-returns zero in case of success, which indicates MSI-X interrupts have been
-successfully allocated.
-
-Another version of a routine that enables MSI-X mode for a device with
-specific requirements described in chapter 4.3.1.3 might look like this:
-
-/*
- * Assume 'minvec' and 'maxvec' are non-zero
- */
-static int foo_driver_enable_msix(struct foo_adapter *adapter,
- int minvec, int maxvec)
-{
- int rc;
-
- minvec = roundup_pow_of_two(minvec);
- maxvec = rounddown_pow_of_two(maxvec);
-
- if (minvec > maxvec)
- return -ERANGE;
-
-retry:
- rc = pci_enable_msix_exact(adapter->pdev,
- adapter->msix_entries, maxvec);
-
- /*
- * -ENOSPC is the only error code allowed to be analyzed
- */
- if (rc == -ENOSPC) {
- if (maxvec == 1)
- return -ENOSPC;
-
- maxvec /= 2;
-
- if (minvec > maxvec)
- return -ENOSPC;
-
- goto retry;
- } else if (rc < 0) {
- return rc;
- }
-
- return maxvec;
-}
-
-4.3.3 pci_disable_msix
-
-void pci_disable_msix(struct pci_dev *dev)
-
-This function should be used to undo the effect of pci_enable_msix_range().
-It frees the previously allocated MSI-X interrupts. The interrupts may
-subsequently be assigned to another device, so drivers should not cache
-the value of the 'vector' elements over a call to pci_disable_msix().
-
-Before calling this function, a device driver must always call free_irq()
-on any interrupt for which it previously called request_irq().
-Failure to do so results in a BUG_ON(), leaving the device with
-MSI-X enabled and thus leaking its vector.
-
-4.3.3 The MSI-X Table
-
-The MSI-X capability specifies a BAR and offset within that BAR for the
-MSI-X Table. This address is mapped by the PCI subsystem, and should not
-be accessed directly by the device driver. If the driver wishes to
-mask or unmask an interrupt, it should call disable_irq() / enable_irq().
+interrupts it can request a particular number of interrupts by passing that
+number to pci_alloc_irq_vectors() function as both 'min_vecs' and
+'max_vecs' parameters:
-4.3.4 pci_msix_vec_count
+ ret = pci_alloc_irq_vectors(pdev, nvec, nvec, 0);
+ if (ret < 0)
+ goto out_err;
-int pci_msix_vec_count(struct pci_dev *dev)
+The most notorious example of the request type described above is enabling
+the single MSI mode for a device. It could be done by passing two 1s as
+'min_vecs' and 'max_vecs':
-This function could be used to retrieve number of entries in the device
-MSI-X table.
+ ret = pci_alloc_irq_vectors(pdev, 1, 1, 0);
+ if (ret < 0)
+ goto out_err;
-If this function returns a negative number, it indicates the device is
-not capable of sending MSI-Xs.
+Some devices might not support using legacy line interrupts, in which case
+the PCI_IRQ_NOLEGACY flag can be used to fail the request if the platform
+can't provide MSI or MSI-X interrupts:
-If this function returns a positive number, it indicates the maximum
-number of MSI-X interrupt vectors that could be allocated.
+ nvec = pci_alloc_irq_vectors(pdev, 1, nvec, PCI_IRQ_NOLEGACY);
+ if (nvec < 0)
+ goto out_err;
-4.4 Handling devices implementing both MSI and MSI-X capabilities
+4.3 Legacy APIs
-If a device implements both MSI and MSI-X capabilities, it can
-run in either MSI mode or MSI-X mode, but not both simultaneously.
-This is a requirement of the PCI spec, and it is enforced by the
-PCI layer. Calling pci_enable_msi_range() when MSI-X is already
-enabled or pci_enable_msix_range() when MSI is already enabled
-results in an error. If a device driver wishes to switch between MSI
-and MSI-X at runtime, it must first quiesce the device, then switch
-it back to pin-interrupt mode, before calling pci_enable_msi_range()
-or pci_enable_msix_range() and resuming operation. This is not expected
-to be a common operation but may be useful for debugging or testing
-during development.
+The following old APIs to enable and disable MSI or MSI-X interrupts should
+not be used in new code:
-4.5 Considerations when using MSIs
+ pci_enable_msi() /* deprecated */
+ pci_enable_msi_range() /* deprecated */
+ pci_enable_msi_exact() /* deprecated */
+ pci_disable_msi() /* deprecated */
+ pci_enable_msix_range() /* deprecated */
+ pci_enable_msix_exact() /* deprecated */
+ pci_disable_msix() /* deprecated */
-4.5.1 Choosing between MSI-X and MSI
+Additionally there are APIs to provide the number of supported MSI or MSI-X
+vectors: pci_msi_vec_count() and pci_msix_vec_count(). In general these
+should be avoided in favor of letting pci_alloc_irq_vectors() cap the
+number of vectors. If you have a legitimate special use case for the count
+of vectors we might have to revisit that decision and add a
+pci_nr_irq_vectors() helper that handles MSI and MSI-X transparently.
-If your device supports both MSI-X and MSI capabilities, you should use
-the MSI-X facilities in preference to the MSI facilities. As mentioned
-above, MSI-X supports any number of interrupts between 1 and 2048.
-In contrast, MSI is restricted to a maximum of 32 interrupts (and
-must be a power of two). In addition, the MSI interrupt vectors must
-be allocated consecutively, so the system might not be able to allocate
-as many vectors for MSI as it could for MSI-X. On some platforms, MSI
-interrupts must all be targeted at the same set of CPUs whereas MSI-X
-interrupts can all be targeted at different CPUs.
+4.4 Considerations when using MSIs
-4.5.2 Spinlocks
+4.4.1 Spinlocks
Most device drivers have a per-device spinlock which is taken in the
interrupt handler. With pin-based interrupts or a single MSI, it is not
@@ -505,7 +194,7 @@ acquire the spinlock. Such deadlocks can be avoided by using
spin_lock_irqsave() or spin_lock_irq() which disable local interrupts
and acquire the lock (see Documentation/DocBook/kernel-locking).
-4.6 How to tell whether MSI/MSI-X is enabled on a device
+4.5 How to tell whether MSI/MSI-X is enabled on a device
Using 'lspci -v' (as root) may show some devices with "MSI", "Message
Signalled Interrupts" or "MSI-X" capabilities. Each of these capabilities
diff --git a/Documentation/coccinelle.txt b/Documentation/coccinelle.txt
index 7f773d51fdd9..01fb1dae3163 100644
--- a/Documentation/coccinelle.txt
+++ b/Documentation/coccinelle.txt
@@ -38,6 +38,15 @@ as a regular user, and install it with
sudo make install
+ Supplemental documentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+For supplemental documentation refer to the wiki:
+
+https://bottest.wiki.kernel.org/coccicheck
+
+The wiki documentation always refers to the linux-next version of the script.
+
Using Coccinelle on the Linux kernel
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -94,11 +103,26 @@ To enable verbose messages set the V= variable, for example:
make coccicheck MODE=report V=1
+ Coccinelle parallelization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
By default, coccicheck tries to run as parallel as possible. To change
the parallelism, set the J= variable. For example, to run across 4 CPUs:
make coccicheck MODE=report J=4
+As of Coccinelle 1.0.2 Coccinelle uses Ocaml parmap for parallelization,
+if support for this is detected you will benefit from parmap parallelization.
+
+When parmap is enabled coccicheck will enable dynamic load balancing by using
+'--chunksize 1' argument, this ensures we keep feeding threads with work
+one by one, so that we avoid the situation where most work gets done by only
+a few threads. With dynamic load balancing, if a thread finishes early we keep
+feeding it more work.
+
+When parmap is enabled, if an error occurs in Coccinelle, this error
+value is propagated back, the return value of the 'make coccicheck'
+captures this return value.
Using Coccinelle with a single semantic patch
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -142,15 +166,118 @@ semantic patch as shown in the previous section.
The "report" mode is the default. You can select another one with the
MODE variable explained above.
+ Debugging Coccinelle SmPL patches
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using coccicheck is best as it provides in the spatch command line
+include options matching the options used when we compile the kernel.
+You can learn what these options are by using V=1, you could then
+manually run Coccinelle with debug options added.
+
+Alternatively you can debug running Coccinelle against SmPL patches
+by asking for stderr to be redirected to stderr, by default stderr
+is redirected to /dev/null, if you'd like to capture stderr you
+can specify the DEBUG_FILE="file.txt" option to coccicheck. For
+instance:
+
+ rm -f cocci.err
+ make coccicheck COCCI=scripts/coccinelle/free/kfree.cocci MODE=report DEBUG_FILE=cocci.err
+ cat cocci.err
+
+You can use SPFLAGS to add debugging flags, for instance you may want to
+add both --profile --show-trying to SPFLAGS when debugging. For instance
+you may want to use:
+
+ rm -f err.log
+ export COCCI=scripts/coccinelle/misc/irqf_oneshot.cocci
+ make coccicheck DEBUG_FILE="err.log" MODE=report SPFLAGS="--profile --show-trying" M=./drivers/mfd/arizona-irq.c
+
+err.log will now have the profiling information, while stdout will
+provide some progress information as Coccinelle moves forward with
+work.
+
+DEBUG_FILE support is only supported when using coccinelle >= 1.2.
+
+ .cocciconfig support
+~~~~~~~~~~~~~~~~~~~~~~
+
+Coccinelle supports reading .cocciconfig for default Coccinelle options that
+should be used every time spatch is spawned, the order of precedence for
+variables for .cocciconfig is as follows:
+
+ o Your current user's home directory is processed first
+ o Your directory from which spatch is called is processed next
+ o The directory provided with the --dir option is processed last, if used
+
+Since coccicheck runs through make, it naturally runs from the kernel
+proper dir, as such the second rule above would be implied for picking up a
+.cocciconfig when using 'make coccicheck'.
+
+'make coccicheck' also supports using M= targets.If you do not supply
+any M= target, it is assumed you want to target the entire kernel.
+The kernel coccicheck script has:
+
+ if [ "$KBUILD_EXTMOD" = "" ] ; then
+ OPTIONS="--dir $srctree $COCCIINCLUDE"
+ else
+ OPTIONS="--dir $KBUILD_EXTMOD $COCCIINCLUDE"
+ fi
+
+KBUILD_EXTMOD is set when an explicit target with M= is used. For both cases
+the spatch --dir argument is used, as such third rule applies when whether M=
+is used or not, and when M= is used the target directory can have its own
+.cocciconfig file. When M= is not passed as an argument to coccicheck the
+target directory is the same as the directory from where spatch was called.
+
+If not using the kernel's coccicheck target, keep the above precedence
+order logic of .cocciconfig reading. If using the kernel's coccicheck target,
+override any of the kernel's .coccicheck's settings using SPFLAGS.
+
+We help Coccinelle when used against Linux with a set of sensible defaults
+options for Linux with our own Linux .cocciconfig. This hints to coccinelle
+git can be used for 'git grep' queries over coccigrep. A timeout of 200
+seconds should suffice for now.
+
+The options picked up by coccinelle when reading a .cocciconfig do not appear
+as arguments to spatch processes running on your system, to confirm what
+options will be used by Coccinelle run:
+
+ spatch --print-options-only
+
+You can override with your own preferred index option by using SPFLAGS. Take
+note that when there are conflicting options Coccinelle takes precedence for
+the last options passed. Using .cocciconfig is possible to use idutils, however
+given the order of precedence followed by Coccinelle, since the kernel now
+carries its own .cocciconfig, you will need to use SPFLAGS to use idutils if
+desired. See below section "Additional flags" for more details on how to use
+idutils.
+
Additional flags
~~~~~~~~~~~~~~~~~~
Additional flags can be passed to spatch through the SPFLAGS
-variable.
+variable. This works as Coccinelle respects the last flags
+given to it when options are in conflict.
make SPFLAGS=--use-glimpse coccicheck
+
+Coccinelle supports idutils as well but requires coccinelle >= 1.0.6.
+When no ID file is specified coccinelle assumes your ID database file
+is in the file .id-utils.index on the top level of the kernel, coccinelle
+carries a script scripts/idutils_index.sh which creates the database with
+
+ mkid -i C --output .id-utils.index
+
+If you have another database filename you can also just symlink with this
+name.
+
make SPFLAGS=--use-idutils coccicheck
+Alternatively you can specify the database filename explicitly, for
+instance:
+
+ make SPFLAGS="--use-idutils /full-path/to/ID" coccicheck
+
See spatch --help to learn more about spatch options.
Note that the '--use-glimpse' and '--use-idutils' options
@@ -159,6 +286,25 @@ thus active by default. However, by indexing the code with
one of these tools, and according to the cocci file used,
spatch could proceed the entire code base more quickly.
+ SmPL patch specific options
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+SmPL patches can have their own requirements for options passed
+to Coccinelle. SmPL patch specific options can be provided by
+providing them at the top of the SmPL patch, for instance:
+
+// Options: --no-includes --include-headers
+
+ SmPL patch Coccinelle requirements
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As Coccinelle features get added some more advanced SmPL patches
+may require newer versions of Coccinelle. If an SmPL patch requires
+at least a version of Coccinelle, this can be specified as follows,
+as an example if requiring at least Coccinelle >= 1.0.5:
+
+// Requires: 1.0.5
+
Proposing new semantic patches
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt b/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
index 21055e210234..c1359f4d48d7 100644
--- a/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
+++ b/Documentation/devicetree/bindings/memory-controllers/omap-gpmc.txt
@@ -46,6 +46,10 @@ Required properties:
0 maps to GPMC_WAIT0 pin.
- gpio-cells: Must be set to 2
+Required properties when using NAND prefetch dma:
+ - dmas GPMC NAND prefetch dma channel
+ - dma-names Must be set to "rxtx"
+
Timing properties for child nodes. All are optional and default to 0.
- gpmc,sync-clk-ps: Minimum clock period for synchronous mode, in picoseconds
@@ -137,7 +141,8 @@ Example for an AM33xx board:
ti,hwmods = "gpmc";
reg = <0x50000000 0x2000>;
interrupts = <100>;
-
+ dmas = <&edma 52 0>;
+ dma-names = "rxtx";
gpmc,num-cs = <8>;
gpmc,num-waitpins = <2>;
#address-cells = <2>;
diff --git a/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt b/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt
new file mode 100644
index 000000000000..489807005eda
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/atmel-quadspi.txt
@@ -0,0 +1,32 @@
+* Atmel Quad Serial Peripheral Interface (QSPI)
+
+Required properties:
+- compatible: Should be "atmel,sama5d2-qspi".
+- reg: Should contain the locations and lengths of the base registers
+ and the mapped memory.
+- reg-names: Should contain the resource reg names:
+ - qspi_base: configuration register address space
+ - qspi_mmap: memory mapped address space
+- interrupts: Should contain the interrupt for the device.
+- clocks: The phandle of the clock needed by the QSPI controller.
+- #address-cells: Should be <1>.
+- #size-cells: Should be <0>.
+
+Example:
+
+spi@f0020000 {
+ compatible = "atmel,sama5d2-qspi";
+ reg = <0xf0020000 0x100>, <0xd0000000 0x8000000>;
+ reg-names = "qspi_base", "qspi_mmap";
+ interrupts = <52 IRQ_TYPE_LEVEL_HIGH 7>;
+ clocks = <&spi0_clk>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ pinctrl-names = "default";
+ pinctrl-0 = <&pinctrl_spi0_default>;
+ status = "okay";
+
+ m25p80@0 {
+ ...
+ };
+};
diff --git a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
index 7066597c9a81..b40f3a492800 100644
--- a/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
+++ b/Documentation/devicetree/bindings/mtd/brcm,brcmnand.txt
@@ -27,6 +27,7 @@ Required properties:
brcm,brcmnand-v6.2
brcm,brcmnand-v7.0
brcm,brcmnand-v7.1
+ brcm,brcmnand-v7.2
brcm,brcmnand
- reg : the register start and length for NAND register region.
(optional) Flash DMA register range (if present)
diff --git a/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
new file mode 100644
index 000000000000..f248056da24c
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/cadence-quadspi.txt
@@ -0,0 +1,56 @@
+* Cadence Quad SPI controller
+
+Required properties:
+- compatible : Should be "cdns,qspi-nor".
+- reg : Contains two entries, each of which is a tuple consisting of a
+ physical address and length. The first entry is the address and
+ length of the controller register set. The second entry is the
+ address and length of the QSPI Controller data area.
+- interrupts : Unit interrupt specifier for the controller interrupt.
+- clocks : phandle to the Quad SPI clock.
+- cdns,fifo-depth : Size of the data FIFO in words.
+- cdns,fifo-width : Bus width of the data FIFO in bytes.
+- cdns,trigger-address : 32-bit indirect AHB trigger address.
+
+Optional properties:
+- cdns,is-decoded-cs : Flag to indicate whether decoder is used or not.
+
+Optional subnodes:
+Subnodes of the Cadence Quad SPI controller are spi slave nodes with additional
+custom properties:
+- cdns,read-delay : Delay for read capture logic, in clock cycles
+- cdns,tshsl-ns : Delay in nanoseconds for the length that the master
+ mode chip select outputs are de-asserted between
+ transactions.
+- cdns,tsd2d-ns : Delay in nanoseconds between one chip select being
+ de-activated and the activation of another.
+- cdns,tchsh-ns : Delay in nanoseconds between last bit of current
+ transaction and deasserting the device chip select
+ (qspi_n_ss_out).
+- cdns,tslch-ns : Delay in nanoseconds between setting qspi_n_ss_out low
+ and first bit transfer.
+
+Example:
+
+ qspi: spi@ff705000 {
+ compatible = "cdns,qspi-nor";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0xff705000 0x1000>,
+ <0xffa00000 0x1000>;
+ interrupts = <0 151 4>;
+ clocks = <&qspi_clk>;
+ cdns,is-decoded-cs;
+ cdns,fifo-depth = <128>;
+ cdns,fifo-width = <4>;
+ cdns,trigger-address = <0x00000000>;
+
+ flash0: n25q00@0 {
+ ...
+ cdns,read-delay = <4>;
+ cdns,tshsl-ns = <50>;
+ cdns,tsd2d-ns = <50>;
+ cdns,tchsh-ns = <4>;
+ cdns,tslch-ns = <4>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
index 3ee7e202657c..174f68c26c1b 100644
--- a/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/gpmc-nand.txt
@@ -39,7 +39,7 @@ Optional properties:
"prefetch-polled" Prefetch polled mode (default)
"polled" Polled mode, without prefetch
- "prefetch-dma" Prefetch enabled sDMA mode
+ "prefetch-dma" Prefetch enabled DMA mode
"prefetch-irq" Prefetch enabled irq mode
- elm_id: <deprecated> use "ti,elm-id" instead
diff --git a/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt b/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt
new file mode 100644
index 000000000000..74981520d6dd
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/hisilicon,fmc-spi-nor.txt
@@ -0,0 +1,24 @@
+HiSilicon SPI-NOR Flash Controller
+
+Required properties:
+- compatible : Should be "hisilicon,fmc-spi-nor" and one of the following strings:
+ "hisilicon,hi3519-spi-nor"
+- address-cells : Should be 1.
+- size-cells : Should be 0.
+- reg : Offset and length of the register set for the controller device.
+- reg-names : Must include the following two entries: "control", "memory".
+- clocks : handle to spi-nor flash controller clock.
+
+Example:
+spi-nor-controller@10000000 {
+ compatible = "hisilicon,hi3519-spi-nor", "hisilicon,fmc-spi-nor";
+ #address-cells = <1>;
+ #size-cells = <0>;
+ reg = <0x10000000 0x1000>, <0x14000000 0x1000000>;
+ reg-names = "control", "memory";
+ clocks = <&clock HI3519_FMC_CLK>;
+ spi-nor@0 {
+ compatible = "jedec,spi-nor";
+ reg = <0>;
+ };
+};
diff --git a/Documentation/devicetree/bindings/mtd/mtk-nand.txt b/Documentation/devicetree/bindings/mtd/mtk-nand.txt
new file mode 100644
index 000000000000..069c192ed5c2
--- /dev/null
+++ b/Documentation/devicetree/bindings/mtd/mtk-nand.txt
@@ -0,0 +1,160 @@
+MTK SoCs NAND FLASH controller (NFC) DT binding
+
+This file documents the device tree bindings for MTK SoCs NAND controllers.
+The functional split of the controller requires two drivers to operate:
+the nand controller interface driver and the ECC engine driver.
+
+The hardware description for both devices must be captured as device
+tree nodes.
+
+1) NFC NAND Controller Interface (NFI):
+=======================================
+
+The first part of NFC is NAND Controller Interface (NFI) HW.
+Required NFI properties:
+- compatible: Should be "mediatek,mtxxxx-nfc".
+- reg: Base physical address and size of NFI.
+- interrupts: Interrupts of NFI.
+- clocks: NFI required clocks.
+- clock-names: NFI clocks internal name.
+- status: Disabled default. Then set "okay" by platform.
+- ecc-engine: Required ECC Engine node.
+- #address-cells: NAND chip index, should be 1.
+- #size-cells: Should be 0.
+
+Example:
+
+ nandc: nfi@1100d000 {
+ compatible = "mediatek,mt2701-nfc";
+ reg = <0 0x1100d000 0 0x1000>;
+ interrupts = <GIC_SPI 56 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&pericfg CLK_PERI_NFI>,
+ <&pericfg CLK_PERI_NFI_PAD>;
+ clock-names = "nfi_clk", "pad_clk";
+ status = "disabled";
+ ecc-engine = <&bch>;
+ #address-cells = <1>;
+ #size-cells = <0>;
+ };
+
+Platform related properties, should be set in {platform_name}.dts:
+- children nodes: NAND chips.
+
+Children nodes properties:
+- reg: Chip Select Signal, default 0.
+ Set as reg = <0>, <1> when need 2 CS.
+Optional:
+- nand-on-flash-bbt: Store BBT on NAND Flash.
+- nand-ecc-mode: the NAND ecc mode (check driver for supported modes)
+- nand-ecc-step-size: Number of data bytes covered by a single ECC step.
+ valid values: 512 and 1024.
+ 1024 is recommended for large page NANDs.
+- nand-ecc-strength: Number of bits to correct per ECC step.
+ The valid values that the controller supports are: 4, 6,
+ 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36, 40, 44,
+ 48, 52, 56, 60.
+ The strength should be calculated as follows:
+ E = (S - F) * 8 / 14
+ S = O / (P / Q)
+ E : nand-ecc-strength.
+ S : spare size per sector.
+ F : FDM size, should be in the range [1,8].
+ It is used to store free oob data.
+ O : oob size.
+ P : page size.
+ Q : nand-ecc-step-size.
+ If the result does not match any one of the listed
+ choices above, please select the smaller valid value from
+ the list.
+ (otherwise the driver will do the adjustment at runtime)
+- pinctrl-names: Default NAND pin GPIO setting name.
+- pinctrl-0: GPIO setting node.
+
+Example:
+ &pio {
+ nand_pins_default: nanddefault {
+ pins_dat {
+ pinmux = <MT2701_PIN_111_MSDC0_DAT7__FUNC_NLD7>,
+ <MT2701_PIN_112_MSDC0_DAT6__FUNC_NLD6>,
+ <MT2701_PIN_114_MSDC0_DAT4__FUNC_NLD4>,
+ <MT2701_PIN_118_MSDC0_DAT3__FUNC_NLD3>,
+ <MT2701_PIN_121_MSDC0_DAT0__FUNC_NLD0>,
+ <MT2701_PIN_120_MSDC0_DAT1__FUNC_NLD1>,
+ <MT2701_PIN_113_MSDC0_DAT5__FUNC_NLD5>,
+ <MT2701_PIN_115_MSDC0_RSTB__FUNC_NLD8>,
+ <MT2701_PIN_119_MSDC0_DAT2__FUNC_NLD2>;
+ input-enable;
+ drive-strength = <MTK_DRIVE_8mA>;
+ bias-pull-up;
+ };
+
+ pins_we {
+ pinmux = <MT2701_PIN_117_MSDC0_CLK__FUNC_NWEB>;
+ drive-strength = <MTK_DRIVE_8mA>;
+ bias-pull-up = <MTK_PUPD_SET_R1R0_10>;
+ };
+
+ pins_ale {
+ pinmux = <MT2701_PIN_116_MSDC0_CMD__FUNC_NALE>;
+ drive-strength = <MTK_DRIVE_8mA>;
+ bias-pull-down = <MTK_PUPD_SET_R1R0_10>;
+ };
+ };
+ };
+
+ &nandc {
+ status = "okay";
+ pinctrl-names = "default";
+ pinctrl-0 = <&nand_pins_default>;
+ nand@0 {
+ reg = <0>;
+ nand-on-flash-bbt;
+ nand-ecc-mode = "hw";
+ nand-ecc-strength = <24>;
+ nand-ecc-step-size = <1024>;
+ };
+ };
+
+NAND chip optional subnodes:
+- Partitions, see Documentation/devicetree/bindings/mtd/partition.txt
+
+Example:
+ nand@0 {
+ partitions {
+ compatible = "fixed-partitions";
+ #address-cells = <1>;
+ #size-cells = <1>;
+
+ preloader@0 {
+ label = "pl";
+ read-only;
+ reg = <0x00000000 0x00400000>;
+ };
+ android@0x00400000 {
+ label = "android";
+ reg = <0x00400000 0x12c00000>;
+ };
+ };
+ };
+
+2) ECC Engine:
+==============
+
+Required BCH properties:
+- compatible: Should be "mediatek,mtxxxx-ecc".
+- reg: Base physical address and size of ECC.
+- interrupts: Interrupts of ECC.
+- clocks: ECC required clocks.
+- clock-names: ECC clocks internal name.
+- status: Disabled default. Then set "okay" by platform.
+
+Example:
+
+ bch: ecc@1100e000 {
+ compatible = "mediatek,mt2701-ecc";
+ reg = <0 0x1100e000 0 0x1000>;
+ interrupts = <GIC_SPI 55 IRQ_TYPE_LEVEL_LOW>;
+ clocks = <&pericfg CLK_PERI_NFI_ECC>;
+ clock-names = "nfiecc_clk";
+ status = "disabled";
+ };
diff --git a/Documentation/devicetree/bindings/mtd/sunxi-nand.txt b/Documentation/devicetree/bindings/mtd/sunxi-nand.txt
index 086d6f44c4b9..f322f56aef74 100644
--- a/Documentation/devicetree/bindings/mtd/sunxi-nand.txt
+++ b/Documentation/devicetree/bindings/mtd/sunxi-nand.txt
@@ -11,10 +11,16 @@ Required properties:
* "ahb" : AHB gating clock
* "mod" : nand controller clock
+Optional properties:
+- dmas : shall reference DMA channel associated to the NAND controller.
+- dma-names : shall be "rxtx".
+
Optional children nodes:
Children nodes represent the available nand chips.
Optional properties:
+- reset : phandle + reset specifier pair
+- reset-names : must contain "ahb"
- allwinner,rb : shall contain the native Ready/Busy ids.
or
- rb-gpios : shall contain the gpios used as R/B pins.
diff --git a/Documentation/devicetree/bindings/pci/aardvark-pci.txt b/Documentation/devicetree/bindings/pci/aardvark-pci.txt
new file mode 100644
index 000000000000..bbcd9f4c501f
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/aardvark-pci.txt
@@ -0,0 +1,56 @@
+Aardvark PCIe controller
+
+This PCIe controller is used on the Marvell Armada 3700 ARM64 SoC.
+
+The Device Tree node describing an Aardvark PCIe controller must
+contain the following properties:
+
+ - compatible: Should be "marvell,armada-3700-pcie"
+ - reg: range of registers for the PCIe controller
+ - interrupts: the interrupt line of the PCIe controller
+ - #address-cells: set to <3>
+ - #size-cells: set to <2>
+ - device_type: set to "pci"
+ - ranges: ranges for the PCI memory and I/O regions
+ - #interrupt-cells: set to <1>
+ - msi-controller: indicates that the PCIe controller can itself
+ handle MSI interrupts
+ - msi-parent: pointer to the MSI controller to be used
+ - interrupt-map-mask and interrupt-map: standard PCI properties to
+ define the mapping of the PCIe interface to interrupt numbers.
+ - bus-range: PCI bus numbers covered
+
+In addition, the Device Tree describing an Aardvark PCIe controller
+must include a sub-node that describes the legacy interrupt controller
+built into the PCIe controller. This sub-node must have the following
+properties:
+
+ - interrupt-controller
+ - #interrupt-cells: set to <1>
+
+Example:
+
+ pcie0: pcie@d0070000 {
+ compatible = "marvell,armada-3700-pcie";
+ device_type = "pci";
+ status = "disabled";
+ reg = <0 0xd0070000 0 0x20000>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ bus-range = <0x00 0xff>;
+ interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>;
+ #interrupt-cells = <1>;
+ msi-controller;
+ msi-parent = <&pcie0>;
+ ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x1000000 /* Port 0 MEM */
+ 0x81000000 0 0xe9000000 0 0xe9000000 0 0x10000>; /* Port 0 IO*/
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc 0>,
+ <0 0 0 2 &pcie_intc 1>,
+ <0 0 0 3 &pcie_intc 2>,
+ <0 0 0 4 &pcie_intc 3>;
+ pcie_intc: interrupt-controller {
+ interrupt-controller;
+ #interrupt-cells = <1>;
+ };
+ };
diff --git a/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt
new file mode 100644
index 000000000000..330a45b5f0b5
--- /dev/null
+++ b/Documentation/devicetree/bindings/pci/axis,artpec6-pcie.txt
@@ -0,0 +1,46 @@
+* Axis ARTPEC-6 PCIe interface
+
+This PCIe host controller is based on the Synopsys DesignWare PCIe IP
+and thus inherits all the common properties defined in designware-pcie.txt.
+
+Required properties:
+- compatible: "axis,artpec6-pcie", "snps,dw-pcie"
+- reg: base addresses and lengths of the PCIe controller (DBI),
+ the phy controller, and configuration address space.
+- reg-names: Must include the following entries:
+ - "dbi"
+ - "phy"
+ - "config"
+- interrupts: A list of interrupt outputs of the controller. Must contain an
+ entry for each entry in the interrupt-names property.
+- interrupt-names: Must include the following entries:
+ - "msi": The interrupt that is asserted when an MSI is received
+- axis,syscon-pcie: A phandle pointing to the ARTPEC-6 system controller,
+ used to enable and control the Synopsys IP.
+
+Example:
+
+ pcie@f8050000 {
+ compatible = "axis,artpec6-pcie", "snps,dw-pcie";
+ reg = <0xf8050000 0x2000
+ 0xf8040000 0x1000
+ 0xc0000000 0x1000>;
+ reg-names = "dbi", "phy", "config";
+ #address-cells = <3>;
+ #size-cells = <2>;
+ device_type = "pci";
+ /* downstream I/O */
+ ranges = <0x81000000 0 0x00010000 0xc0010000 0 0x00010000
+ /* non-prefetchable memory */
+ 0x82000000 0 0xc0020000 0xc0020000 0 0x1ffe0000>;
+ num-lanes = <2>;
+ interrupts = <GIC_SPI 148 IRQ_TYPE_LEVEL_HIGH>;
+ interrupt-names = "msi";
+ #interrupt-cells = <1>;
+ interrupt-map-mask = <0 0 0 0x7>;
+ interrupt-map = <0 0 0 1 &intc GIC_SPI 144 IRQ_TYPE_LEVEL_HIGH>,
+ <0 0 0 2 &intc GIC_SPI 145 IRQ_TYPE_LEVEL_HIGH>,
+ <0 0 0 3 &intc GIC_SPI 146 IRQ_TYPE_LEVEL_HIGH>,
+ <0 0 0 4 &intc GIC_SPI 147 IRQ_TYPE_LEVEL_HIGH>;
+ axis,syscon-pcie = <&syscon>;
+ };
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 8ea834f6b289..5385cba941d2 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -3,6 +3,7 @@
*.bc
*.bin
*.bz2
+*.c.[012]*.*
*.cis
*.cpio
*.csp
diff --git a/Documentation/gcc-plugins.txt b/Documentation/gcc-plugins.txt
new file mode 100644
index 000000000000..891c69464434
--- /dev/null
+++ b/Documentation/gcc-plugins.txt
@@ -0,0 +1,87 @@
+GCC plugin infrastructure
+=========================
+
+
+1. Introduction
+===============
+
+GCC plugins are loadable modules that provide extra features to the
+compiler [1]. They are useful for runtime instrumentation and static analysis.
+We can analyse, change and add further code during compilation via
+callbacks [2], GIMPLE [3], IPA [4] and RTL passes [5].
+
+The GCC plugin infrastructure of the kernel supports all gcc versions from
+4.5 to 6.0, building out-of-tree modules, cross-compilation and building in a
+separate directory.
+Plugin source files have to be compilable by both a C and a C++ compiler as well
+because gcc versions 4.5 and 4.6 are compiled by a C compiler,
+gcc-4.7 can be compiled by a C or a C++ compiler,
+and versions 4.8+ can only be compiled by a C++ compiler.
+
+Currently the GCC plugin infrastructure supports only the x86, arm and arm64
+architectures.
+
+This infrastructure was ported from grsecurity [6] and PaX [7].
+
+--
+[1] https://gcc.gnu.org/onlinedocs/gccint/Plugins.html
+[2] https://gcc.gnu.org/onlinedocs/gccint/Plugin-API.html#Plugin-API
+[3] https://gcc.gnu.org/onlinedocs/gccint/GIMPLE.html
+[4] https://gcc.gnu.org/onlinedocs/gccint/IPA.html
+[5] https://gcc.gnu.org/onlinedocs/gccint/RTL.html
+[6] https://grsecurity.net/
+[7] https://pax.grsecurity.net/
+
+
+2. Files
+========
+
+$(src)/scripts/gcc-plugins
+ This is the directory of the GCC plugins.
+
+$(src)/scripts/gcc-plugins/gcc-common.h
+ This is a compatibility header for GCC plugins.
+ It should be always included instead of individual gcc headers.
+
+$(src)/scripts/gcc-plugin.sh
+ This script checks the availability of the included headers in
+ gcc-common.h and chooses the proper host compiler to build the plugins
+ (gcc-4.7 can be built by either gcc or g++).
+
+$(src)/scripts/gcc-plugins/gcc-generate-gimple-pass.h
+$(src)/scripts/gcc-plugins/gcc-generate-ipa-pass.h
+$(src)/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h
+$(src)/scripts/gcc-plugins/gcc-generate-rtl-pass.h
+ These headers automatically generate the registration structures for
+ GIMPLE, SIMPLE_IPA, IPA and RTL passes. They support all gcc versions
+ from 4.5 to 6.0.
+ They should be preferred to creating the structures by hand.
+
+
+3. Usage
+========
+
+You must install the gcc plugin headers for your gcc version,
+e.g., on Ubuntu for gcc-4.9:
+
+ apt-get install gcc-4.9-plugin-dev
+
+Enable a GCC plugin based feature in the kernel config:
+
+ CONFIG_GCC_PLUGIN_CYC_COMPLEXITY = y
+
+To compile only the plugin(s):
+
+ make gcc-plugins
+
+or just run the kernel make and compile the whole kernel with
+the cyclomatic complexity GCC plugin.
+
+
+4. How to add a new GCC plugin
+==============================
+
+The GCC plugins are in $(src)/scripts/gcc-plugins/. You can use a file or a directory
+here. It must be added to $(src)/scripts/gcc-plugins/Makefile,
+$(src)/scripts/Makefile.gcc-plugins and $(src)/arch/Kconfig.
+See the cyc_complexity_plugin.c (CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) GCC plugin.
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index e24aa11e8f8a..642029012059 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3021,6 +3021,8 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
resource_alignment=
Format:
[<order of align>@][<domain>:]<bus>:<slot>.<func>[; ...]
+ [<order of align>@]pci:<vendor>:<device>\
+ [:<subvendor>:<subdevice>][; ...]
Specifies alignment and device to reassign
aligned memory resources.
If <order of align> is not specified,
@@ -3039,6 +3041,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
hpmemsize=nn[KMG] The fixed amount of bus space which is
reserved for hotplug bridge's memory window.
Default size is 2 megabytes.
+ hpbussize=nn The minimum amount of additional bus numbers
+ reserved for buses below a hotplug bridge.
+ Default is 1.
realloc= Enable/disable reallocating PCI bridge resources
if allocations done by BIOS are too small to
accommodate resources required by all child
@@ -3070,6 +3075,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
compat Treat PCIe ports as PCI-to-PCI bridges, disable the PCIe
ports driver.
+ pcie_port_pm= [PCIE] PCIe port power management handling:
+ off Disable power management of all PCIe ports
+ force Forcibly enable power management of all PCIe ports
+
pcie_pme= [PCIE,PM] Native PCIe PME signaling options:
nomsi Do not use MSI for native PCIe PME signaling (this makes
all PCIe root ports use INTx for all services).
diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index a4482cce4bae..5237e1b2fd66 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1482,6 +1482,11 @@ struct kvm_irq_routing_msi {
__u32 pad;
};
+On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
+feature of KVM_CAP_X2APIC_API capability is enabled. If it is enabled,
+address_hi bits 31-8 provide bits 31-8 of the destination id. Bits 7-0 of
+address_hi must be zero.
+
struct kvm_irq_routing_s390_adapter {
__u64 ind_addr;
__u64 summary_addr;
@@ -1583,6 +1588,17 @@ struct kvm_lapic_state {
Reads the Local APIC registers and copies them into the input argument. The
data format and layout are the same as documented in the architecture manual.
+If KVM_X2APIC_API_USE_32BIT_IDS feature of KVM_CAP_X2APIC_API is
+enabled, then the format of APIC_ID register depends on the APIC mode
+(reported by MSR_IA32_APICBASE) of its VCPU. x2APIC stores APIC ID in
+the APIC_ID register (bytes 32-35). xAPIC only allows an 8-bit APIC ID
+which is stored in bits 31-24 of the APIC register, or equivalently in
+byte 35 of struct kvm_lapic_state's regs field. KVM_GET_LAPIC must then
+be called after MSR_IA32_APICBASE has been set with KVM_SET_MSR.
+
+If KVM_X2APIC_API_USE_32BIT_IDS feature is disabled, struct kvm_lapic_state
+always uses xAPIC format.
+
4.58 KVM_SET_LAPIC
@@ -1600,6 +1616,10 @@ struct kvm_lapic_state {
Copies the input argument into the Local APIC registers. The data format
and layout are the same as documented in the architecture manual.
+The format of the APIC ID register (bytes 32-35 of struct kvm_lapic_state's
+regs field) depends on the state of the KVM_CAP_X2APIC_API capability.
+See the note in KVM_GET_LAPIC.
+
4.59 KVM_IOEVENTFD
@@ -2032,6 +2052,12 @@ registers, find a list below:
MIPS | KVM_REG_MIPS_CP0_CONFIG5 | 32
MIPS | KVM_REG_MIPS_CP0_CONFIG7 | 32
MIPS | KVM_REG_MIPS_CP0_ERROREPC | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH1 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH2 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH3 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH4 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH5 | 64
+ MIPS | KVM_REG_MIPS_CP0_KSCRATCH6 | 64
MIPS | KVM_REG_MIPS_COUNT_CTL | 64
MIPS | KVM_REG_MIPS_COUNT_RESUME | 64
MIPS | KVM_REG_MIPS_COUNT_HZ | 64
@@ -2156,7 +2182,7 @@ after pausing the vcpu, but before it is resumed.
4.71 KVM_SIGNAL_MSI
Capability: KVM_CAP_SIGNAL_MSI
-Architectures: x86
+Architectures: x86 arm64
Type: vm ioctl
Parameters: struct kvm_msi (in)
Returns: >0 on delivery, 0 if guest blocked the MSI, and -1 on error
@@ -2169,10 +2195,22 @@ struct kvm_msi {
__u32 address_hi;
__u32 data;
__u32 flags;
- __u8 pad[16];
+ __u32 devid;
+ __u8 pad[12];
};
-No flags are defined so far. The corresponding field must be 0.
+flags: KVM_MSI_VALID_DEVID: devid contains a valid value
+devid: If KVM_MSI_VALID_DEVID is set, contains a unique device identifier
+ for the device that wrote the MSI message.
+ For PCI, this is usually a BFD identifier in the lower 16 bits.
+
+The per-VM KVM_CAP_MSI_DEVID capability advertises the need to provide
+the device ID. If this capability is not set, userland cannot rely on
+the kernel to allow the KVM_MSI_VALID_DEVID flag being set.
+
+On x86, address_hi is ignored unless the KVM_CAP_X2APIC_API capability is
+enabled. If it is enabled, address_hi bits 31-8 provide bits 31-8 of the
+destination id. Bits 7-0 of address_hi must be zero.
4.71 KVM_CREATE_PIT2
@@ -2520,6 +2558,7 @@ Parameters: struct kvm_device_attr
Returns: 0 on success, -1 on error
Errors:
ENXIO: The group or attribute is unknown/unsupported for this device
+ or hardware support is missing.
EPERM: The attribute cannot (currently) be accessed this way
(e.g. read-only attribute, or attribute that only makes
sense when the device is in a different state)
@@ -2547,6 +2586,7 @@ Parameters: struct kvm_device_attr
Returns: 0 on success, -1 on error
Errors:
ENXIO: The group or attribute is unknown/unsupported for this device
+ or hardware support is missing.
Tests whether a device supports a particular attribute. A successful
return indicates the attribute is implemented. It does not necessarily
@@ -3803,6 +3843,42 @@ Allows use of runtime-instrumentation introduced with zEC12 processor.
Will return -EINVAL if the machine does not support runtime-instrumentation.
Will return -EBUSY if a VCPU has already been created.
+7.7 KVM_CAP_X2APIC_API
+
+Architectures: x86
+Parameters: args[0] - features that should be enabled
+Returns: 0 on success, -EINVAL when args[0] contains invalid features
+
+Valid feature flags in args[0] are
+
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
+Enabling KVM_X2APIC_API_USE_32BIT_IDS changes the behavior of
+KVM_SET_GSI_ROUTING, KVM_SIGNAL_MSI, KVM_SET_LAPIC, and KVM_GET_LAPIC,
+allowing the use of 32-bit APIC IDs. See KVM_CAP_X2APIC_API in their
+respective sections.
+
+KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK must be enabled for x2APIC to work
+in logical mode or with more than 255 VCPUs. Otherwise, KVM treats 0xff
+as a broadcast even in x2APIC mode in order to support physical x2APIC
+without interrupt remapping. This is undesirable in logical mode,
+where 0xff represents CPUs 0-7 in cluster 0.
+
+7.8 KVM_CAP_S390_USER_INSTR0
+
+Architectures: s390
+Parameters: none
+
+With this capability enabled, all illegal instructions 0x0000 (2 bytes) will
+be intercepted and forwarded to user space. User space can use this
+mechanism e.g. to realize 2-byte software breakpoints. The kernel will
+not inject an operating exception for these instructions, user space has
+to take care of that.
+
+This capability can be enabled dynamically even if VCPUs were already
+created and are running.
+
8. Other capabilities.
----------------------
diff --git a/Documentation/virtual/kvm/devices/arm-vgic.txt b/Documentation/virtual/kvm/devices/arm-vgic.txt
index 59541d49e15c..89182f80cc7f 100644
--- a/Documentation/virtual/kvm/devices/arm-vgic.txt
+++ b/Documentation/virtual/kvm/devices/arm-vgic.txt
@@ -4,16 +4,22 @@ ARM Virtual Generic Interrupt Controller (VGIC)
Device types supported:
KVM_DEV_TYPE_ARM_VGIC_V2 ARM Generic Interrupt Controller v2.0
KVM_DEV_TYPE_ARM_VGIC_V3 ARM Generic Interrupt Controller v3.0
+ KVM_DEV_TYPE_ARM_VGIC_ITS ARM Interrupt Translation Service Controller
-Only one VGIC instance may be instantiated through either this API or the
-legacy KVM_CREATE_IRQCHIP api. The created VGIC will act as the VM interrupt
-controller, requiring emulated user-space devices to inject interrupts to the
-VGIC instead of directly to CPUs.
+Only one VGIC instance of the V2/V3 types above may be instantiated through
+either this API or the legacy KVM_CREATE_IRQCHIP api. The created VGIC will
+act as the VM interrupt controller, requiring emulated user-space devices to
+inject interrupts to the VGIC instead of directly to CPUs.
Creating a guest GICv3 device requires a host GICv3 as well.
GICv3 implementations with hardware compatibility support allow a guest GICv2
as well.
+Creating a virtual ITS controller requires a host GICv3 (but does not depend
+on having physical ITS controllers).
+There can be multiple ITS controllers per guest, each of them has to have
+a separate, non-overlapping MMIO region.
+
Groups:
KVM_DEV_ARM_VGIC_GRP_ADDR
Attributes:
@@ -39,6 +45,13 @@ Groups:
Only valid for KVM_DEV_TYPE_ARM_VGIC_V3.
This address needs to be 64K aligned.
+ KVM_VGIC_V3_ADDR_TYPE_ITS (rw, 64-bit)
+ Base address in the guest physical address space of the GICv3 ITS
+ control register frame. The ITS allows MSI(-X) interrupts to be
+ injected into guests. This extension is optional. If the kernel
+ does not support the ITS, the call returns -ENODEV.
+ Only valid for KVM_DEV_TYPE_ARM_VGIC_ITS.
+ This address needs to be 64K aligned and the region covers 128K.
KVM_DEV_ARM_VGIC_GRP_DIST_REGS
Attributes:
@@ -109,8 +122,8 @@ Groups:
KVM_DEV_ARM_VGIC_GRP_CTRL
Attributes:
KVM_DEV_ARM_VGIC_CTRL_INIT
- request the initialization of the VGIC, no additional parameter in
- kvm_device_attr.addr.
+ request the initialization of the VGIC or ITS, no additional parameter
+ in kvm_device_attr.addr.
Errors:
-ENXIO: VGIC not properly configured as required prior to calling
this attribute
diff --git a/Documentation/virtual/kvm/devices/vm.txt b/Documentation/virtual/kvm/devices/vm.txt
index a9ea8774a45f..b6cda49f2ba4 100644
--- a/Documentation/virtual/kvm/devices/vm.txt
+++ b/Documentation/virtual/kvm/devices/vm.txt
@@ -20,7 +20,8 @@ Enables Collaborative Memory Management Assist (CMMA) for the virtual machine.
1.2. ATTRIBUTE: KVM_S390_VM_MEM_CLR_CMMA
Parameters: none
-Returns: 0
+Returns: -EINVAL if CMMA was not enabled
+ 0 otherwise
Clear the CMMA status for all guest pages, so any pages the guest marked
as unused are again used any may not be reclaimed by the host.
@@ -85,6 +86,90 @@ Returns: -EBUSY in case 1 or more vcpus are already activated (only in write
-ENOMEM if not enough memory is available to process the ioctl
0 in case of success
+2.3. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_FEAT (r/o)
+
+Allows user space to retrieve available cpu features. A feature is available if
+provided by the hardware and supported by kvm. In theory, cpu features could
+even be completely emulated by kvm.
+
+struct kvm_s390_vm_cpu_feat {
+ __u64 feat[16]; # Bitmap (1 = feature available), MSB 0 bit numbering
+};
+
+Parameters: address of a buffer to load the feature list from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ 0 in case of success.
+
+2.4. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_FEAT (r/w)
+
+Allows user space to retrieve or change enabled cpu features for all VCPUs of a
+VM. Features that are not available cannot be enabled.
+
+See 2.3. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the feature list from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ -EINVAL if a cpu feature that is not available is to be enabled.
+ -EBUSY if at least one VCPU has already been defined.
+ 0 in case of success.
+
+2.5. ATTRIBUTE: KVM_S390_VM_CPU_MACHINE_SUBFUNC (r/o)
+
+Allows user space to retrieve available cpu subfunctions without any filtering
+done by a set IBC. These subfunctions are indicated to the guest VCPU via
+query or "test bit" subfunctions and used e.g. by cpacf functions, plo and ptff.
+
+A subfunction block is only valid if KVM_S390_VM_CPU_MACHINE contains the
+STFL(E) bit introducing the affected instruction. If the affected instruction
+indicates subfunctions via a "query subfunction", the response block is
+contained in the returned struct. If the affected instruction
+indicates subfunctions via a "test bit" mechanism, the subfunction codes are
+contained in the returned struct in MSB 0 bit numbering.
+
+struct kvm_s390_vm_cpu_subfunc {
+ u8 plo[32]; # always valid (ESA/390 feature)
+ u8 ptff[16]; # valid with TOD-clock steering
+ u8 kmac[16]; # valid with Message-Security-Assist
+ u8 kmc[16]; # valid with Message-Security-Assist
+ u8 km[16]; # valid with Message-Security-Assist
+ u8 kimd[16]; # valid with Message-Security-Assist
+ u8 klmd[16]; # valid with Message-Security-Assist
+ u8 pckmo[16]; # valid with Message-Security-Assist-Extension 3
+ u8 kmctr[16]; # valid with Message-Security-Assist-Extension 4
+ u8 kmf[16]; # valid with Message-Security-Assist-Extension 4
+ u8 kmo[16]; # valid with Message-Security-Assist-Extension 4
+ u8 pcc[16]; # valid with Message-Security-Assist-Extension 4
+ u8 ppno[16]; # valid with Message-Security-Assist-Extension 5
+ u8 reserved[1824]; # reserved for future instructions
+};
+
+Parameters: address of a buffer to load the subfunction blocks from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ 0 in case of success.
+
+2.6. ATTRIBUTE: KVM_S390_VM_CPU_PROCESSOR_SUBFUNC (r/w)
+
+Allows user space to retrieve or change cpu subfunctions to be indicated for
+all VCPUs of a VM. This attribute will only be available if kernel and
+hardware support are in place.
+
+The kernel uses the configured subfunction blocks for indication to
+the guest. A subfunction block will only be used if the associated STFL(E) bit
+has not been disabled by user space (so the instruction to be queried is
+actually available for the guest).
+
+As long as no data has been written, a read will fail. The IBC will be used
+to determine available subfunctions in this case, this will guarantee backward
+compatibility.
+
+See 2.5. for a description of the parameter struct.
+
+Parameters: address of a buffer to store/load the subfunction blocks from.
+Returns: -EFAULT if the given address is not accessible from kernel space.
+ -EINVAL when reading, if there was no write yet.
+ -EBUSY if at least one VCPU has already been defined.
+ 0 in case of success.
+
3. GROUP: KVM_S390_VM_TOD
Architectures: s390
diff --git a/Documentation/virtual/kvm/locking.txt b/Documentation/virtual/kvm/locking.txt
index 19f94a6b9bb0..f2491a8c68b4 100644
--- a/Documentation/virtual/kvm/locking.txt
+++ b/Documentation/virtual/kvm/locking.txt
@@ -89,7 +89,7 @@ In mmu_spte_clear_track_bits():
old_spte = *spte;
/* 'if' condition is satisfied. */
- if (old_spte.Accssed == 1 &&
+ if (old_spte.Accessed == 1 &&
old_spte.W == 0)
spte = 0ull;
on fast page fault path:
@@ -102,7 +102,7 @@ In mmu_spte_clear_track_bits():
old_spte = xchg(spte, 0ull)
- if (old_spte.Accssed == 1)
+ if (old_spte.Accessed == 1)
kvm_set_pfn_accessed(spte.pfn);
if (old_spte.Dirty == 1)
kvm_set_pfn_dirty(spte.pfn);
diff --git a/MAINTAINERS b/MAINTAINERS
index 25f43204014d..10074ff03c57 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5094,6 +5094,15 @@ L: linux-scsi@vger.kernel.org
S: Odd Fixes (e.g., new signatures)
F: drivers/scsi/fdomain.*
+GCC PLUGINS
+M: Kees Cook <keescook@chromium.org>
+R: Emese Revfy <re.emese@gmail.com>
+L: kernel-hardening@lists.openwall.com
+S: Maintained
+F: scripts/gcc-plugins/
+F: scripts/gcc-plugin.sh
+F: Documentation/gcc-plugins.txt
+
GCOV BASED KERNEL PROFILING
M: Peter Oberparleiter <oberpar@linux.vnet.ibm.com>
S: Maintained
@@ -8874,6 +8883,7 @@ L: linux-pci@vger.kernel.org
Q: http://patchwork.ozlabs.org/project/linux-pci/list/
T: git git://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git
S: Supported
+F: Documentation/devicetree/bindings/pci/
F: Documentation/PCI/
F: drivers/pci/
F: include/linux/pci*
@@ -8937,6 +8947,13 @@ L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
S: Maintained
F: drivers/pci/host/*mvebu*
+PCI DRIVER FOR AARDVARK (Marvell Armada 3700)
+M: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+L: linux-pci@vger.kernel.org
+L: linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
+S: Maintained
+F: drivers/pci/host/pci-aardvark.c
+
PCI DRIVER FOR NVIDIA TEGRA
M: Thierry Reding <thierry.reding@gmail.com>
L: linux-tegra@vger.kernel.org
@@ -9019,6 +9036,15 @@ S: Maintained
F: Documentation/devicetree/bindings/pci/xgene-pci-msi.txt
F: drivers/pci/host/pci-xgene-msi.c
+PCIE DRIVER FOR AXIS ARTPEC
+M: Niklas Cassel <niklas.cassel@axis.com>
+M: Jesper Nilsson <jesper.nilsson@axis.com>
+L: linux-arm-kernel@axis.com
+L: linux-pci@vger.kernel.org
+S: Maintained
+F: Documentation/devicetree/bindings/pci/axis,artpec*
+F: drivers/pci/host/*artpec*
+
PCIE DRIVER FOR HISILICON
M: Zhou Wang <wangzhou1@hisilicon.com>
M: Gabriele Paoloni <gabriele.paoloni@huawei.com>
diff --git a/Makefile b/Makefile
index 393b6159ae92..d6d401bf9d95 100644
--- a/Makefile
+++ b/Makefile
@@ -371,26 +371,27 @@ CFLAGS_KERNEL =
AFLAGS_KERNEL =
LDFLAGS_vmlinux =
CFLAGS_GCOV = -fprofile-arcs -ftest-coverage -fno-tree-loop-im
-CFLAGS_KCOV = -fsanitize-coverage=trace-pc
+CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,)
# Use USERINCLUDE when you must reference the UAPI directories only.
USERINCLUDE := \
-I$(srctree)/arch/$(hdr-arch)/include/uapi \
- -Iarch/$(hdr-arch)/include/generated/uapi \
+ -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \
-I$(srctree)/include/uapi \
- -Iinclude/generated/uapi \
+ -I$(objtree)/include/generated/uapi \
-include $(srctree)/include/linux/kconfig.h
# Use LINUXINCLUDE when you must reference the include/ directory.
# Needed to be compatible with the O= option
LINUXINCLUDE := \
-I$(srctree)/arch/$(hdr-arch)/include \
- -Iarch/$(hdr-arch)/include/generated/uapi \
- -Iarch/$(hdr-arch)/include/generated \
+ -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \
+ -I$(objtree)/arch/$(hdr-arch)/include/generated \
$(if $(KBUILD_SRC), -I$(srctree)/include) \
- -Iinclude \
- $(USERINCLUDE)
+ -I$(objtree)/include
+
+LINUXINCLUDE += $(filter-out $(LINUXINCLUDE),$(USERINCLUDE))
KBUILD_CPPFLAGS := -D__KERNEL__
@@ -554,7 +555,7 @@ ifeq ($(KBUILD_EXTMOD),)
# in parallel
PHONY += scripts
scripts: scripts_basic include/config/auto.conf include/config/tristate.conf \
- asm-generic
+ asm-generic gcc-plugins
$(Q)$(MAKE) $(build)=$(@)
# Objects we will link into vmlinux / subdirs we need to visit
@@ -635,6 +636,15 @@ endif
# Tell gcc to never replace conditional load with a non-conditional one
KBUILD_CFLAGS += $(call cc-option,--param=allow-store-data-races=0)
+PHONY += gcc-plugins
+gcc-plugins: scripts_basic
+ifdef CONFIG_GCC_PLUGINS
+ $(Q)$(MAKE) $(build)=scripts/gcc-plugins
+endif
+ @:
+
+include scripts/Makefile.gcc-plugins
+
ifdef CONFIG_READABLE_ASM
# Disable optimizations that make assembler listings hard to read.
# reorder blocks reorders the control in the function
@@ -666,21 +676,11 @@ endif
endif
# Find arch-specific stack protector compiler sanity-checking script.
ifdef CONFIG_CC_STACKPROTECTOR
- stackp-path := $(srctree)/scripts/gcc-$(ARCH)_$(BITS)-has-stack-protector.sh
- ifneq ($(wildcard $(stackp-path)),)
- stackp-check := $(stackp-path)
- endif
+ stackp-path := $(srctree)/scripts/gcc-$(SRCARCH)_$(BITS)-has-stack-protector.sh
+ stackp-check := $(wildcard $(stackp-path))
endif
KBUILD_CFLAGS += $(stackp-flag)
-ifdef CONFIG_KCOV
- ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
- $(warning Cannot use CONFIG_KCOV: \
- -fsanitize-coverage=trace-pc is not supported by compiler)
- CFLAGS_KCOV =
- endif
-endif
-
ifeq ($(cc-name),clang)
KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,)
KBUILD_CPPFLAGS += $(call cc-option,-Wno-unknown-warning-option,)
@@ -1019,7 +1019,7 @@ prepare1: prepare2 $(version_h) include/generated/utsrelease.h \
archprepare: archheaders archscripts prepare1 scripts_basic
-prepare0: archprepare
+prepare0: archprepare gcc-plugins
$(Q)$(MAKE) $(build)=.
# All the preparing..
@@ -1531,6 +1531,7 @@ clean: $(clean-dirs)
-o -name '.*.d' -o -name '.*.tmp' -o -name '*.mod.c' \
-o -name '*.symtypes' -o -name 'modules.order' \
-o -name modules.builtin -o -name '.tmp_*.o.*' \
+ -o -name '*.c.[012]*.*' \
-o -name '*.gcno' \) -type f -print | xargs rm -f
# Generate tags for editors
@@ -1641,7 +1642,7 @@ endif
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
$(build)=$(build-dir)
# Make sure the latest headers are built for Documentation
-Documentation/: headers_install
+Documentation/ samples/: headers_install
%/: prepare scripts FORCE
$(cmd_crmodverdir)
$(Q)$(MAKE) KBUILD_MODULES=$(if $(CONFIG_MODULES),1) \
diff --git a/arch/Kconfig b/arch/Kconfig
index 15996290fed4..bd8056b5b246 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -357,6 +357,43 @@ config SECCOMP_FILTER
See Documentation/prctl/seccomp_filter.txt for details.
+config HAVE_GCC_PLUGINS
+ bool
+ help
+ An arch should select this symbol if it supports building with
+ GCC plugins.
+
+menuconfig GCC_PLUGINS
+ bool "GCC plugins"
+ depends on HAVE_GCC_PLUGINS
+ depends on !COMPILE_TEST
+ help
+ GCC plugins are loadable modules that provide extra features to the
+ compiler. They are useful for runtime instrumentation and static analysis.
+
+ See Documentation/gcc-plugins.txt for details.
+
+config GCC_PLUGIN_CYC_COMPLEXITY
+ bool "Compute the cyclomatic complexity of a function"
+ depends on GCC_PLUGINS
+ help
+ The complexity M of a function's control flow graph is defined as:
+ M = E - N + 2P
+ where
+
+ E = the number of edges
+ N = the number of nodes
+ P = the number of connected components (exit nodes).
+
+config GCC_PLUGIN_SANCOV
+ bool
+ depends on GCC_PLUGINS
+ help
+ This plugin inserts a __sanitizer_cov_trace_pc() call at the start of
+ basic blocks. It supports all gcc versions with plugin support (from
+ gcc-4.5 on). It is based on the commit "Add fuzzing coverage support"
+ by Dmitry Vyukov <dvyukov@google.com>.
+
config HAVE_CC_STACKPROTECTOR
bool
help
diff --git a/arch/alpha/boot/Makefile b/arch/alpha/boot/Makefile
index 8399bd0e68e8..0cbe4c59d3ce 100644
--- a/arch/alpha/boot/Makefile
+++ b/arch/alpha/boot/Makefile
@@ -15,7 +15,7 @@ targets := vmlinux.gz vmlinux \
OBJSTRIP := $(obj)/tools/objstrip
HOSTCFLAGS := -Wall -I$(objtree)/usr/include
-BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS += -I$(objtree)/$(obj) -I$(srctree)/$(obj)
# SRM bootable image. Copy to offset 512 of a partition.
$(obj)/bootimage: $(addprefix $(obj)/tools/,mkbb lxboot bootlx) $(obj)/vmlinux.nh
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 14b4cf75ceec..2d601d769a1c 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -54,6 +54,7 @@ config ARM
select HAVE_FTRACE_MCOUNT_RECORD if (!XIP_KERNEL)
select HAVE_FUNCTION_GRAPH_TRACER if (!THUMB2_KERNEL)
select HAVE_FUNCTION_TRACER if (!XIP_KERNEL)
+ select HAVE_GCC_PLUGINS
select HAVE_GENERIC_DMA_COHERENT
select HAVE_HW_BREAKPOINT if (PERF_EVENTS && (CPU_V6 || CPU_V6K || CPU_V7))
select HAVE_IDE if PCI || ISA || PCMCIA
@@ -699,7 +700,7 @@ config ARCH_VIRT
depends on ARCH_MULTI_V7
select ARM_AMBA
select ARM_GIC
- select ARM_GIC_V2M if PCI_MSI
+ select ARM_GIC_V2M if PCI
select ARM_GIC_V3
select ARM_PSCI
select HAVE_ARM_ARCH_TIMER
diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
index 3d5a5cd071bd..58faff5f1eb2 100644
--- a/arch/arm/include/asm/kvm_asm.h
+++ b/arch/arm/include/asm/kvm_asm.h
@@ -66,6 +66,8 @@ extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
extern int __kvm_vcpu_run(struct kvm_vcpu *vcpu);
extern void __init_stage2_translation(void);
+
+extern void __kvm_hyp_reset(unsigned long);
#endif
#endif /* __ARM_KVM_ASM_H__ */
diff --git a/arch/arm/include/asm/kvm_host.h b/arch/arm/include/asm/kvm_host.h
index 96387d477e91..de338d93d11b 100644
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -241,8 +241,7 @@ int kvm_arm_coproc_set_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *);
int handle_exit(struct kvm_vcpu *vcpu, struct kvm_run *run,
int exception_index);
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
- phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
unsigned long hyp_stack_ptr,
unsigned long vector_ptr)
{
@@ -251,18 +250,13 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
* code. The init code doesn't need to preserve these
* registers as r0-r3 are already callee saved according to
* the AAPCS.
- * Note that we slightly misuse the prototype by casing the
+ * Note that we slightly misuse the prototype by casting the
* stack pointer to a void *.
- *
- * We don't have enough registers to perform the full init in
- * one go. Install the boot PGD first, and then install the
- * runtime PGD, stack pointer and vectors. The PGDs are always
- * passed as the third argument, in order to be passed into
- * r2-r3 to the init code (yes, this is compliant with the
- * PCS!).
- */
- kvm_call_hyp(NULL, 0, boot_pgd_ptr);
+ * The PGDs are always passed as the third argument, in order
+ * to be passed into r2-r3 to the init code (yes, this is
+ * compliant with the PCS!).
+ */
kvm_call_hyp((void*)hyp_stack_ptr, vector_ptr, pgd_ptr);
}
@@ -272,16 +266,13 @@ static inline void __cpu_init_stage2(void)
kvm_call_hyp(__init_stage2_translation);
}
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
phys_addr_t phys_idmap_start)
{
- /*
- * TODO
- * kvm_call_reset(boot_pgd_ptr, phys_idmap_start);
- */
+ kvm_call_hyp((void *)virt_to_idmap(__kvm_hyp_reset), vector_ptr);
}
-static inline int kvm_arch_dev_ioctl_check_extension(long ext)
+static inline int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
{
return 0;
}
diff --git a/arch/arm/include/asm/kvm_hyp.h b/arch/arm/include/asm/kvm_hyp.h
index f0e860761380..6eaff28f2ff3 100644
--- a/arch/arm/include/asm/kvm_hyp.h
+++ b/arch/arm/include/asm/kvm_hyp.h
@@ -25,9 +25,6 @@
#define __hyp_text __section(.hyp.text) notrace
-#define kern_hyp_va(v) (v)
-#define hyp_kern_va(v) (v)
-
#define __ACCESS_CP15(CRn, Op1, CRm, Op2) \
"mrc", "mcr", __stringify(p15, Op1, %0, CRn, CRm, Op2), u32
#define __ACCESS_CP15_64(Op1, CRm) \
diff --git a/arch/arm/include/asm/kvm_mmu.h b/arch/arm/include/asm/kvm_mmu.h
index f9a65061130b..3bb803d6814b 100644
--- a/arch/arm/include/asm/kvm_mmu.h
+++ b/arch/arm/include/asm/kvm_mmu.h
@@ -26,16 +26,7 @@
* We directly use the kernel VA for the HYP, as we can directly share
* the mapping (HTTBR "covers" TTBR1).
*/
-#define HYP_PAGE_OFFSET_MASK UL(~0)
-#define HYP_PAGE_OFFSET PAGE_OFFSET
-#define KERN_TO_HYP(kva) (kva)
-
-/*
- * Our virtual mapping for the boot-time MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the vectors
- * page, where no kernel data will ever be shared with HYP.
- */
-#define TRAMPOLINE_VA UL(CONFIG_VECTORS_BASE)
+#define kern_hyp_va(kva) (kva)
/*
* KVM_MMU_CACHE_MIN_PAGES is the number of stage2 page table translation levels.
@@ -49,9 +40,8 @@
#include <asm/pgalloc.h>
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
@@ -65,7 +55,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
diff --git a/arch/arm/include/asm/mach/pci.h b/arch/arm/include/asm/mach/pci.h
index 0070e8520cd4..2d88af5be45f 100644
--- a/arch/arm/include/asm/mach/pci.h
+++ b/arch/arm/include/asm/mach/pci.h
@@ -22,6 +22,7 @@ struct hw_pci {
struct msi_controller *msi_ctrl;
struct pci_ops *ops;
int nr_controllers;
+ unsigned int io_optional:1;
void **private_data;
int (*setup)(int nr, struct pci_sys_data *);
struct pci_bus *(*scan)(int nr, struct pci_sys_data *);
diff --git a/arch/arm/include/asm/pgtable.h b/arch/arm/include/asm/pgtable.h
index d62204060cbe..a8d656d9aec7 100644
--- a/arch/arm/include/asm/pgtable.h
+++ b/arch/arm/include/asm/pgtable.h
@@ -97,7 +97,9 @@ extern pgprot_t pgprot_s2_device;
#define PAGE_READONLY_EXEC _MOD_PROT(pgprot_user, L_PTE_USER | L_PTE_RDONLY)
#define PAGE_KERNEL _MOD_PROT(pgprot_kernel, L_PTE_XN)
#define PAGE_KERNEL_EXEC pgprot_kernel
-#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP)
+#define PAGE_HYP _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_XN)
+#define PAGE_HYP_EXEC _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY)
+#define PAGE_HYP_RO _MOD_PROT(pgprot_kernel, L_PTE_HYP | L_PTE_RDONLY | L_PTE_XN)
#define PAGE_HYP_DEVICE _MOD_PROT(pgprot_hyp_device, L_PTE_HYP)
#define PAGE_S2 _MOD_PROT(pgprot_s2, L_PTE_S2_RDONLY)
#define PAGE_S2_DEVICE _MOD_PROT(pgprot_s2_device, L_PTE_S2_RDONLY)
diff --git a/arch/arm/include/asm/virt.h b/arch/arm/include/asm/virt.h
index d4ceaf5f299b..a2e75b84e2ae 100644
--- a/arch/arm/include/asm/virt.h
+++ b/arch/arm/include/asm/virt.h
@@ -80,6 +80,10 @@ static inline bool is_kernel_in_hyp_mode(void)
return false;
}
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
/* The section containing the hypervisor text */
extern char __hyp_text_start[];
extern char __hyp_text_end[];
diff --git a/arch/arm/kernel/bios32.c b/arch/arm/kernel/bios32.c
index 05e61a2eeabe..2f0e07735d1d 100644
--- a/arch/arm/kernel/bios32.c
+++ b/arch/arm/kernel/bios32.c
@@ -410,7 +410,8 @@ static int pcibios_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
return irq;
}
-static int pcibios_init_resources(int busnr, struct pci_sys_data *sys)
+static int pcibios_init_resource(int busnr, struct pci_sys_data *sys,
+ int io_optional)
{
int ret;
struct resource_entry *window;
@@ -420,6 +421,14 @@ static int pcibios_init_resources(int busnr, struct pci_sys_data *sys)
&iomem_resource, sys->mem_offset);
}
+ /*
+ * If a platform says I/O port support is optional, we don't add
+ * the default I/O space. The platform is responsible for adding
+ * any I/O space it needs.
+ */
+ if (io_optional)
+ return 0;
+
resource_list_for_each_entry(window, &sys->resources)
if (resource_type(window->res) == IORESOURCE_IO)
return 0;
@@ -466,7 +475,7 @@ static void pcibios_init_hw(struct device *parent, struct hw_pci *hw,
if (ret > 0) {
struct pci_host_bridge *host_bridge;
- ret = pcibios_init_resources(nr, sys);
+ ret = pcibios_init_resource(nr, sys, hw->io_optional);
if (ret) {
kfree(sys);
break;
@@ -515,25 +524,23 @@ void pci_common_init_dev(struct device *parent, struct hw_pci *hw)
list_for_each_entry(sys, &head, node) {
struct pci_bus *bus = sys->bus;
- if (!pci_has_flag(PCI_PROBE_ONLY)) {
+ /*
+ * We insert PCI resources into the iomem_resource and
+ * ioport_resource trees in either pci_bus_claim_resources()
+ * or pci_bus_assign_resources().
+ */
+ if (pci_has_flag(PCI_PROBE_ONLY)) {
+ pci_bus_claim_resources(bus);
+ } else {
struct pci_bus *child;
- /*
- * Size the bridge windows.
- */
pci_bus_size_bridges(bus);
-
- /*
- * Assign resources.
- */
pci_bus_assign_resources(bus);
list_for_each_entry(child, &bus->children, node)
pcie_bus_configure_settings(child);
}
- /*
- * Tell drivers about devices found.
- */
+
pci_bus_add_devices(bus);
}
}
@@ -590,18 +597,6 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
return start;
}
-/**
- * pcibios_enable_device - Enable I/O and memory.
- * @dev: PCI device to be enabled
- */
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
- if (pci_has_flag(PCI_PROBE_ONLY))
- return 0;
-
- return pci_enable_resources(dev, mask);
-}
-
int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
enum pci_mmap_state mmap_state, int write_combine)
{
diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
index 02abfff68ee5..95a000515e43 100644
--- a/arch/arm/kvm/Kconfig
+++ b/arch/arm/kvm/Kconfig
@@ -46,13 +46,6 @@ config KVM_ARM_HOST
---help---
Provides host support for ARM processors.
-config KVM_NEW_VGIC
- bool "New VGIC implementation"
- depends on KVM
- default y
- ---help---
- uses the new VGIC implementation
-
source drivers/vhost/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile
index a596b58f6d37..5e28df80dca7 100644
--- a/arch/arm/kvm/Makefile
+++ b/arch/arm/kvm/Makefile
@@ -22,7 +22,6 @@ obj-y += kvm-arm.o init.o interrupts.o
obj-y += arm.o handle_exit.o guest.o mmu.o emulate.o reset.o
obj-y += coproc.o coproc_a15.o coproc_a7.o mmio.o psci.o perf.o
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
obj-y += $(KVM)/arm/vgic/vgic.o
obj-y += $(KVM)/arm/vgic/vgic-init.o
obj-y += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,9 +29,4 @@ obj-y += $(KVM)/arm/vgic/vgic-v2.o
obj-y += $(KVM)/arm/vgic/vgic-mmio.o
obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o
obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-obj-y += $(KVM)/arm/vgic.o
-obj-y += $(KVM)/arm/vgic-v2.o
-obj-y += $(KVM)/arm/vgic-v2-emul.o
-endif
obj-y += $(KVM)/arm/arch_timer.o
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index f1bde7c4e736..d94bb9093ead 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -20,6 +20,7 @@
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kvm_host.h>
+#include <linux/list.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/fs.h>
@@ -122,7 +123,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
if (ret)
goto out_fail_alloc;
- ret = create_hyp_mappings(kvm, kvm + 1);
+ ret = create_hyp_mappings(kvm, kvm + 1, PAGE_HYP);
if (ret)
goto out_free_stage2_pgd;
@@ -201,7 +202,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = KVM_MAX_VCPUS;
break;
default:
- r = kvm_arch_dev_ioctl_check_extension(ext);
+ r = kvm_arch_dev_ioctl_check_extension(kvm, ext);
break;
}
return r;
@@ -239,7 +240,7 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
if (err)
goto free_vcpu;
- err = create_hyp_mappings(vcpu, vcpu + 1);
+ err = create_hyp_mappings(vcpu, vcpu + 1, PAGE_HYP);
if (err)
goto vcpu_uninit;
@@ -377,7 +378,7 @@ void force_vm_exit(const cpumask_t *mask)
/**
* need_new_vmid_gen - check that the VMID is still valid
- * @kvm: The VM's VMID to checkt
+ * @kvm: The VM's VMID to check
*
* return true if there is a new generation of VMIDs being used
*
@@ -616,7 +617,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
* Enter the guest
*/
trace_kvm_entry(*vcpu_pc(vcpu));
- __kvm_guest_enter();
+ guest_enter_irqoff();
vcpu->mode = IN_GUEST_MODE;
ret = kvm_call_hyp(__kvm_vcpu_run, vcpu);
@@ -642,14 +643,14 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
local_irq_enable();
/*
- * We do local_irq_enable() before calling kvm_guest_exit() so
+ * We do local_irq_enable() before calling guest_exit() so
* that if a timer interrupt hits while running the guest we
* account that tick as being spent in the guest. We enable
- * preemption after calling kvm_guest_exit() so that if we get
+ * preemption after calling guest_exit() so that if we get
* preempted we make sure ticks after that is not counted as
* guest time.
*/
- kvm_guest_exit();
+ guest_exit();
trace_kvm_exit(ret, kvm_vcpu_trap_get_class(vcpu), *vcpu_pc(vcpu));
/*
@@ -1039,7 +1040,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
static void cpu_init_hyp_mode(void *dummy)
{
- phys_addr_t boot_pgd_ptr;
phys_addr_t pgd_ptr;
unsigned long hyp_stack_ptr;
unsigned long stack_page;
@@ -1048,13 +1048,12 @@ static void cpu_init_hyp_mode(void *dummy)
/* Switch from the HYP stub to our own HYP init vector */
__hyp_set_vectors(kvm_get_idmap_vector());
- boot_pgd_ptr = kvm_mmu_get_boot_httbr();
pgd_ptr = kvm_mmu_get_httbr();
stack_page = __this_cpu_read(kvm_arm_hyp_stack_page);
hyp_stack_ptr = stack_page + PAGE_SIZE;
vector_ptr = (unsigned long)kvm_ksym_ref(__kvm_hyp_vector);
- __cpu_init_hyp_mode(boot_pgd_ptr, pgd_ptr, hyp_stack_ptr, vector_ptr);
+ __cpu_init_hyp_mode(pgd_ptr, hyp_stack_ptr, vector_ptr);
__cpu_init_stage2();
kvm_arm_init_debug();
@@ -1076,15 +1075,9 @@ static void cpu_hyp_reinit(void)
static void cpu_hyp_reset(void)
{
- phys_addr_t boot_pgd_ptr;
- phys_addr_t phys_idmap_start;
-
- if (!is_kernel_in_hyp_mode()) {
- boot_pgd_ptr = kvm_mmu_get_boot_httbr();
- phys_idmap_start = kvm_get_idmap_start();
-
- __cpu_reset_hyp_mode(boot_pgd_ptr, phys_idmap_start);
- }
+ if (!is_kernel_in_hyp_mode())
+ __cpu_reset_hyp_mode(hyp_default_vectors,
+ kvm_get_idmap_start());
}
static void _kvm_arch_hardware_enable(void *discard)
@@ -1294,14 +1287,14 @@ static int init_hyp_mode(void)
* Map the Hyp-code called directly from the host
*/
err = create_hyp_mappings(kvm_ksym_ref(__hyp_text_start),
- kvm_ksym_ref(__hyp_text_end));
+ kvm_ksym_ref(__hyp_text_end), PAGE_HYP_EXEC);
if (err) {
kvm_err("Cannot map world-switch code\n");
goto out_err;
}
err = create_hyp_mappings(kvm_ksym_ref(__start_rodata),
- kvm_ksym_ref(__end_rodata));
+ kvm_ksym_ref(__end_rodata), PAGE_HYP_RO);
if (err) {
kvm_err("Cannot map rodata section\n");
goto out_err;
@@ -1312,7 +1305,8 @@ static int init_hyp_mode(void)
*/
for_each_possible_cpu(cpu) {
char *stack_page = (char *)per_cpu(kvm_arm_hyp_stack_page, cpu);
- err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE);
+ err = create_hyp_mappings(stack_page, stack_page + PAGE_SIZE,
+ PAGE_HYP);
if (err) {
kvm_err("Cannot map hyp stack\n");
@@ -1324,7 +1318,7 @@ static int init_hyp_mode(void)
kvm_cpu_context_t *cpu_ctxt;
cpu_ctxt = per_cpu_ptr(kvm_host_cpu_state, cpu);
- err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1);
+ err = create_hyp_mappings(cpu_ctxt, cpu_ctxt + 1, PAGE_HYP);
if (err) {
kvm_err("Cannot map host CPU state: %d\n", err);
@@ -1332,10 +1326,6 @@ static int init_hyp_mode(void)
}
}
-#ifndef CONFIG_HOTPLUG_CPU
- free_boot_hyp_pgd();
-#endif
-
/* set size of VMID supported by CPU */
kvm_vmid_bits = kvm_get_vmid_bits();
kvm_info("%d-bit VMID\n", kvm_vmid_bits);
diff --git a/arch/arm/kvm/emulate.c b/arch/arm/kvm/emulate.c
index a494def3f195..af93e3ffc9f3 100644
--- a/arch/arm/kvm/emulate.c
+++ b/arch/arm/kvm/emulate.c
@@ -210,7 +210,7 @@ bool kvm_condition_valid(struct kvm_vcpu *vcpu)
* @vcpu: The VCPU pointer
*
* When exceptions occur while instructions are executed in Thumb IF-THEN
- * blocks, the ITSTATE field of the CPSR is not advanved (updated), so we have
+ * blocks, the ITSTATE field of the CPSR is not advanced (updated), so we have
* to do this little bit of work manually. The fields map like this:
*
* IT[7:0] -> CPSR[26:25],CPSR[15:10]
diff --git a/arch/arm/kvm/guest.c b/arch/arm/kvm/guest.c
index 9093ed0f8b2a..9aca92074f85 100644
--- a/arch/arm/kvm/guest.c
+++ b/arch/arm/kvm/guest.c
@@ -182,7 +182,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
/**
* kvm_arm_copy_reg_indices - get indices of all registers.
*
- * We do core registers right here, then we apppend coproc regs.
+ * We do core registers right here, then we append coproc regs.
*/
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
diff --git a/arch/arm/kvm/init.S b/arch/arm/kvm/init.S
index 1f9ae17476f9..bf89c919efc1 100644
--- a/arch/arm/kvm/init.S
+++ b/arch/arm/kvm/init.S
@@ -32,23 +32,13 @@
* r2,r3 = Hypervisor pgd pointer
*
* The init scenario is:
- * - We jump in HYP with four parameters: boot HYP pgd, runtime HYP pgd,
- * runtime stack, runtime vectors
- * - Enable the MMU with the boot pgd
- * - Jump to a target into the trampoline page (remember, this is the same
- * physical page!)
- * - Now switch to the runtime pgd (same VA, and still the same physical
- * page!)
+ * - We jump in HYP with 3 parameters: runtime HYP pgd, runtime stack,
+ * runtime vectors
* - Invalidate TLBs
* - Set stack and vectors
+ * - Setup the page tables
+ * - Enable the MMU
* - Profit! (or eret, if you only care about the code).
- *
- * As we only have four registers available to pass parameters (and we
- * need six), we split the init in two phases:
- * - Phase 1: r0 = 0, r1 = 0, r2,r3 contain the boot PGD.
- * Provides the basic HYP init, and enable the MMU.
- * - Phase 2: r0 = ToS, r1 = vectors, r2,r3 contain the runtime PGD.
- * Switches to the runtime PGD, set stack and vectors.
*/
.text
@@ -68,8 +58,11 @@ __kvm_hyp_init:
W(b) .
__do_hyp_init:
- cmp r0, #0 @ We have a SP?
- bne phase2 @ Yes, second stage init
+ @ Set stack pointer
+ mov sp, r0
+
+ @ Set HVBAR to point to the HYP vectors
+ mcr p15, 4, r1, c12, c0, 0 @ HVBAR
@ Set the HTTBR to point to the hypervisor PGD pointer passed
mcrr p15, 4, rr_lo_hi(r2, r3), c2
@@ -114,34 +107,25 @@ __do_hyp_init:
THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) )
orr r1, r1, r2
orr r0, r0, r1
- isb
mcr p15, 4, r0, c1, c0, 0 @ HSCR
+ isb
- @ End of init phase-1
eret
-phase2:
- @ Set stack pointer
- mov sp, r0
-
- @ Set HVBAR to point to the HYP vectors
- mcr p15, 4, r1, c12, c0, 0 @ HVBAR
-
- @ Jump to the trampoline page
- ldr r0, =TRAMPOLINE_VA
- adr r1, target
- bfi r0, r1, #0, #PAGE_SHIFT
- ret r0
+ @ r0 : stub vectors address
+ENTRY(__kvm_hyp_reset)
+ /* We're now in idmap, disable MMU */
+ mrc p15, 4, r1, c1, c0, 0 @ HSCTLR
+ ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_C | HSCTLR_I)
+ bic r1, r1, r2
+ mcr p15, 4, r1, c1, c0, 0 @ HSCTLR
-target: @ We're now in the trampoline code, switch page tables
- mcrr p15, 4, rr_lo_hi(r2, r3), c2
+ /* Install stub vectors */
+ mcr p15, 4, r0, c12, c0, 0 @ HVBAR
isb
- @ Invalidate the old TLBs
- mcr p15, 4, r0, c8, c7, 0 @ TLBIALLH
- dsb ish
-
eret
+ENDPROC(__kvm_hyp_reset)
.ltorg
diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
index 45c43aecb8f2..bda27b6b1aa2 100644
--- a/arch/arm/kvm/mmu.c
+++ b/arch/arm/kvm/mmu.c
@@ -32,8 +32,6 @@
#include "trace.h"
-extern char __hyp_idmap_text_start[], __hyp_idmap_text_end[];
-
static pgd_t *boot_hyp_pgd;
static pgd_t *hyp_pgd;
static pgd_t *merged_hyp_pgd;
@@ -484,28 +482,6 @@ static void unmap_hyp_range(pgd_t *pgdp, phys_addr_t start, u64 size)
}
/**
- * free_boot_hyp_pgd - free HYP boot page tables
- *
- * Free the HYP boot page tables. The bounce page is also freed.
- */
-void free_boot_hyp_pgd(void)
-{
- mutex_lock(&kvm_hyp_pgd_mutex);
-
- if (boot_hyp_pgd) {
- unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
- unmap_hyp_range(boot_hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
- free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
- boot_hyp_pgd = NULL;
- }
-
- if (hyp_pgd)
- unmap_hyp_range(hyp_pgd, TRAMPOLINE_VA, PAGE_SIZE);
-
- mutex_unlock(&kvm_hyp_pgd_mutex);
-}
-
-/**
* free_hyp_pgds - free Hyp-mode page tables
*
* Assumes hyp_pgd is a page table used strictly in Hyp-mode and
@@ -519,15 +495,20 @@ void free_hyp_pgds(void)
{
unsigned long addr;
- free_boot_hyp_pgd();
-
mutex_lock(&kvm_hyp_pgd_mutex);
+ if (boot_hyp_pgd) {
+ unmap_hyp_range(boot_hyp_pgd, hyp_idmap_start, PAGE_SIZE);
+ free_pages((unsigned long)boot_hyp_pgd, hyp_pgd_order);
+ boot_hyp_pgd = NULL;
+ }
+
if (hyp_pgd) {
+ unmap_hyp_range(hyp_pgd, hyp_idmap_start, PAGE_SIZE);
for (addr = PAGE_OFFSET; virt_addr_valid(addr); addr += PGDIR_SIZE)
- unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
for (addr = VMALLOC_START; is_vmalloc_addr((void*)addr); addr += PGDIR_SIZE)
- unmap_hyp_range(hyp_pgd, KERN_TO_HYP(addr), PGDIR_SIZE);
+ unmap_hyp_range(hyp_pgd, kern_hyp_va(addr), PGDIR_SIZE);
free_pages((unsigned long)hyp_pgd, hyp_pgd_order);
hyp_pgd = NULL;
@@ -679,17 +660,18 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr)
* create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode
* @from: The virtual kernel start address of the range
* @to: The virtual kernel end address of the range (exclusive)
+ * @prot: The protection to be applied to this range
*
* The same virtual address as the kernel virtual address is also used
* in Hyp-mode mapping (modulo HYP_PAGE_OFFSET) to the same underlying
* physical pages.
*/
-int create_hyp_mappings(void *from, void *to)
+int create_hyp_mappings(void *from, void *to, pgprot_t prot)
{
phys_addr_t phys_addr;
unsigned long virt_addr;
- unsigned long start = KERN_TO_HYP((unsigned long)from);
- unsigned long end = KERN_TO_HYP((unsigned long)to);
+ unsigned long start = kern_hyp_va((unsigned long)from);
+ unsigned long end = kern_hyp_va((unsigned long)to);
if (is_kernel_in_hyp_mode())
return 0;
@@ -704,7 +686,7 @@ int create_hyp_mappings(void *from, void *to)
err = __create_hyp_mappings(hyp_pgd, virt_addr,
virt_addr + PAGE_SIZE,
__phys_to_pfn(phys_addr),
- PAGE_HYP);
+ prot);
if (err)
return err;
}
@@ -723,8 +705,8 @@ int create_hyp_mappings(void *from, void *to)
*/
int create_hyp_io_mappings(void *from, void *to, phys_addr_t phys_addr)
{
- unsigned long start = KERN_TO_HYP((unsigned long)from);
- unsigned long end = KERN_TO_HYP((unsigned long)to);
+ unsigned long start = kern_hyp_va((unsigned long)from);
+ unsigned long end = kern_hyp_va((unsigned long)to);
if (is_kernel_in_hyp_mode())
return 0;
@@ -1687,14 +1669,6 @@ phys_addr_t kvm_mmu_get_httbr(void)
return virt_to_phys(hyp_pgd);
}
-phys_addr_t kvm_mmu_get_boot_httbr(void)
-{
- if (__kvm_cpu_uses_extended_idmap())
- return virt_to_phys(merged_hyp_pgd);
- else
- return virt_to_phys(boot_hyp_pgd);
-}
-
phys_addr_t kvm_get_idmap_vector(void)
{
return hyp_idmap_vector;
@@ -1705,6 +1679,22 @@ phys_addr_t kvm_get_idmap_start(void)
return hyp_idmap_start;
}
+static int kvm_map_idmap_text(pgd_t *pgd)
+{
+ int err;
+
+ /* Create the idmap in the boot page tables */
+ err = __create_hyp_mappings(pgd,
+ hyp_idmap_start, hyp_idmap_end,
+ __phys_to_pfn(hyp_idmap_start),
+ PAGE_HYP_EXEC);
+ if (err)
+ kvm_err("Failed to idmap %lx-%lx\n",
+ hyp_idmap_start, hyp_idmap_end);
+
+ return err;
+}
+
int kvm_mmu_init(void)
{
int err;
@@ -1719,28 +1709,41 @@ int kvm_mmu_init(void)
*/
BUG_ON((hyp_idmap_start ^ (hyp_idmap_end - 1)) & PAGE_MASK);
- hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
- boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+ kvm_info("IDMAP page: %lx\n", hyp_idmap_start);
+ kvm_info("HYP VA range: %lx:%lx\n",
+ kern_hyp_va(PAGE_OFFSET), kern_hyp_va(~0UL));
- if (!hyp_pgd || !boot_hyp_pgd) {
- kvm_err("Hyp mode PGD not allocated\n");
- err = -ENOMEM;
+ if (hyp_idmap_start >= kern_hyp_va(PAGE_OFFSET) &&
+ hyp_idmap_start < kern_hyp_va(~0UL)) {
+ /*
+ * The idmap page is intersecting with the VA space,
+ * it is not safe to continue further.
+ */
+ kvm_err("IDMAP intersecting with HYP VA, unable to continue\n");
+ err = -EINVAL;
goto out;
}
- /* Create the idmap in the boot page tables */
- err = __create_hyp_mappings(boot_hyp_pgd,
- hyp_idmap_start, hyp_idmap_end,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP);
-
- if (err) {
- kvm_err("Failed to idmap %lx-%lx\n",
- hyp_idmap_start, hyp_idmap_end);
+ hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, hyp_pgd_order);
+ if (!hyp_pgd) {
+ kvm_err("Hyp mode PGD not allocated\n");
+ err = -ENOMEM;
goto out;
}
if (__kvm_cpu_uses_extended_idmap()) {
+ boot_hyp_pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
+ hyp_pgd_order);
+ if (!boot_hyp_pgd) {
+ kvm_err("Hyp boot PGD not allocated\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ err = kvm_map_idmap_text(boot_hyp_pgd);
+ if (err)
+ goto out;
+
merged_hyp_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO);
if (!merged_hyp_pgd) {
kvm_err("Failed to allocate extra HYP pgd\n");
@@ -1748,29 +1751,10 @@ int kvm_mmu_init(void)
}
__kvm_extend_hypmap(boot_hyp_pgd, hyp_pgd, merged_hyp_pgd,
hyp_idmap_start);
- return 0;
- }
-
- /* Map the very same page at the trampoline VA */
- err = __create_hyp_mappings(boot_hyp_pgd,
- TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP);
- if (err) {
- kvm_err("Failed to map trampoline @%lx into boot HYP pgd\n",
- TRAMPOLINE_VA);
- goto out;
- }
-
- /* Map the same page again into the runtime page tables */
- err = __create_hyp_mappings(hyp_pgd,
- TRAMPOLINE_VA, TRAMPOLINE_VA + PAGE_SIZE,
- __phys_to_pfn(hyp_idmap_start),
- PAGE_HYP);
- if (err) {
- kvm_err("Failed to map trampoline @%lx into runtime HYP pgd\n",
- TRAMPOLINE_VA);
- goto out;
+ } else {
+ err = kvm_map_idmap_text(hyp_pgd);
+ if (err)
+ goto out;
}
return 0;
diff --git a/arch/arm/kvm/reset.c b/arch/arm/kvm/reset.c
index 0048b5a62a50..4b5e802e57d1 100644
--- a/arch/arm/kvm/reset.c
+++ b/arch/arm/kvm/reset.c
@@ -52,7 +52,7 @@ static const struct kvm_irq_level cortexa_vtimer_irq = {
* @vcpu: The VCPU pointer
*
* This function finds the right table above and sets the registers on the
- * virtual CPU struct to their architectually defined reset values.
+ * virtual CPU struct to their architecturally defined reset values.
*/
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
{
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index 9f8b99e20557..69c8787bec7d 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -3,6 +3,7 @@ config ARM64
select ACPI_CCA_REQUIRED if ACPI
select ACPI_GENERIC_GSI if ACPI
select ACPI_REDUCED_HARDWARE_ONLY if ACPI
+ select ACPI_MCFG if ACPI
select ARCH_HAS_DEVMEM_IS_ALLOWED
select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI
select ARCH_HAS_ATOMIC64_DEC_IF_POSITIVE
@@ -22,9 +23,9 @@ config ARM64
select ARM_ARCH_TIMER
select ARM_GIC
select AUDIT_ARCH_COMPAT_GENERIC
- select ARM_GIC_V2M if PCI_MSI
+ select ARM_GIC_V2M if PCI
select ARM_GIC_V3
- select ARM_GIC_V3_ITS if PCI_MSI
+ select ARM_GIC_V3_ITS if PCI
select ARM_PSCI_FW
select BUILDTIME_EXTABLE_SORT
select CLONE_BACKWARDS
@@ -78,6 +79,7 @@ config ARM64
select HAVE_FTRACE_MCOUNT_RECORD
select HAVE_FUNCTION_TRACER
select HAVE_FUNCTION_GRAPH_TRACER
+ select HAVE_GCC_PLUGINS
select HAVE_GENERIC_DMA_COHERENT
select HAVE_HW_BREAKPOINT if PERF_EVENTS
select HAVE_IRQ_TIME_ACCOUNTING
@@ -101,6 +103,7 @@ config ARM64
select OF_EARLY_FLATTREE
select OF_NUMA if NUMA && OF
select OF_RESERVED_MEM
+ select PCI_ECAM if ACPI
select PERF_USE_VMALLOC
select POWER_RESET
select POWER_SUPPLY
diff --git a/arch/arm64/boot/dts/marvell/armada-3720-db.dts b/arch/arm64/boot/dts/marvell/armada-3720-db.dts
index 86110a6ae330..1372e9a6aaa4 100644
--- a/arch/arm64/boot/dts/marvell/armada-3720-db.dts
+++ b/arch/arm64/boot/dts/marvell/armada-3720-db.dts
@@ -76,3 +76,8 @@
&usb3 {
status = "okay";
};
+
+/* CON17 (PCIe) / CON12 (mini-PCIe) */
+&pcie0 {
+ status = "okay";
+};
diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
index eb29280962d7..c4762538ec01 100644
--- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
+++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi
@@ -176,5 +176,30 @@
<0x1d40000 0x40000>; /* GICR */
};
};
+
+ pcie0: pcie@d0070000 {
+ compatible = "marvell,armada-3700-pcie";
+ device_type = "pci";
+ status = "disabled";
+ reg = <0 0xd0070000 0 0x20000>;
+ #address-cells = <3>;
+ #size-cells = <2>;
+ bus-range = <0x00 0xff>;
+ interrupts = <GIC_SPI 29 IRQ_TYPE_LEVEL_HIGH>;
+ #interrupt-cells = <1>;
+ msi-parent = <&pcie0>;
+ msi-controller;
+ ranges = <0x82000000 0 0xe8000000 0 0xe8000000 0 0x1000000 /* Port 0 MEM */
+ 0x81000000 0 0xe9000000 0 0xe9000000 0 0x10000>; /* Port 0 IO*/
+ interrupt-map-mask = <0 0 0 7>;
+ interrupt-map = <0 0 0 1 &pcie_intc 0>,
+ <0 0 0 2 &pcie_intc 1>,
+ <0 0 0 3 &pcie_intc 2>,
+ <0 0 0 4 &pcie_intc 3>;
+ pcie_intc: interrupt-controller {
+ interrupt-controller;
+ #interrupt-cells = <1>;
+ };
+ };
};
};
diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h
index 49dd1bd3ea50..7099f26e3702 100644
--- a/arch/arm64/include/asm/cpufeature.h
+++ b/arch/arm64/include/asm/cpufeature.h
@@ -36,8 +36,9 @@
#define ARM64_HAS_VIRT_HOST_EXTN 11
#define ARM64_WORKAROUND_CAVIUM_27456 12
#define ARM64_HAS_32BIT_EL0 13
+#define ARM64_HYP_OFFSET_LOW 14
-#define ARM64_NCAPS 14
+#define ARM64_NCAPS 15
#ifndef __ASSEMBLY__
diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h
index 2cdb6b551ac6..4b5c977af465 100644
--- a/arch/arm64/include/asm/kvm_arm.h
+++ b/arch/arm64/include/asm/kvm_arm.h
@@ -178,7 +178,7 @@
/* Hyp System Trap Register */
#define HSTR_EL2_T(x) (1 << x)
-/* Hyp Coproccessor Trap Register Shifts */
+/* Hyp Coprocessor Trap Register Shifts */
#define CPTR_EL2_TFP_SHIFT 10
/* Hyp Coprocessor Trap Register */
diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
index 49095fc4b482..3eda975837d0 100644
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -47,8 +47,7 @@
int __attribute_const__ kvm_target_cpu(void);
int kvm_reset_vcpu(struct kvm_vcpu *vcpu);
-int kvm_arch_dev_ioctl_check_extension(long ext);
-unsigned long kvm_hyp_reset_entry(void);
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext);
void __extended_idmap_trampoline(phys_addr_t boot_pgd, phys_addr_t idmap_start);
struct kvm_arch {
@@ -348,8 +347,7 @@ int kvm_perf_teardown(void);
struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr);
-static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
- phys_addr_t pgd_ptr,
+static inline void __cpu_init_hyp_mode(phys_addr_t pgd_ptr,
unsigned long hyp_stack_ptr,
unsigned long vector_ptr)
{
@@ -357,19 +355,14 @@ static inline void __cpu_init_hyp_mode(phys_addr_t boot_pgd_ptr,
* Call initialization code, and switch to the full blown
* HYP code.
*/
- __kvm_call_hyp((void *)boot_pgd_ptr, pgd_ptr,
- hyp_stack_ptr, vector_ptr);
+ __kvm_call_hyp((void *)pgd_ptr, hyp_stack_ptr, vector_ptr);
}
-static inline void __cpu_reset_hyp_mode(phys_addr_t boot_pgd_ptr,
+void __kvm_hyp_teardown(void);
+static inline void __cpu_reset_hyp_mode(unsigned long vector_ptr,
phys_addr_t phys_idmap_start)
{
- /*
- * Call reset code, and switch back to stub hyp vectors.
- * Uses __kvm_call_hyp() to avoid kaslr's kvm_ksym_ref() translation.
- */
- __kvm_call_hyp((void *)kvm_hyp_reset_entry(),
- boot_pgd_ptr, phys_idmap_start);
+ kvm_call_hyp(__kvm_hyp_teardown, phys_idmap_start);
}
static inline void kvm_arch_hardware_unsetup(void) {}
diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h
index 44eaff70da6a..cff510574fae 100644
--- a/arch/arm64/include/asm/kvm_hyp.h
+++ b/arch/arm64/include/asm/kvm_hyp.h
@@ -25,29 +25,6 @@
#define __hyp_text __section(.hyp.text) notrace
-static inline unsigned long __kern_hyp_va(unsigned long v)
-{
- asm volatile(ALTERNATIVE("and %0, %0, %1",
- "nop",
- ARM64_HAS_VIRT_HOST_EXTN)
- : "+r" (v) : "i" (HYP_PAGE_OFFSET_MASK));
- return v;
-}
-
-#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
-
-static inline unsigned long __hyp_kern_va(unsigned long v)
-{
- u64 offset = PAGE_OFFSET - HYP_PAGE_OFFSET;
- asm volatile(ALTERNATIVE("add %0, %0, %1",
- "nop",
- ARM64_HAS_VIRT_HOST_EXTN)
- : "+r" (v) : "r" (offset));
- return v;
-}
-
-#define hyp_kern_va(v) (typeof(v))(__hyp_kern_va((unsigned long)(v)))
-
#define read_sysreg_elx(r,nvh,vh) \
({ \
u64 reg; \
diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
index f05ac27d033e..b6bb83400cd8 100644
--- a/arch/arm64/include/asm/kvm_mmu.h
+++ b/arch/arm64/include/asm/kvm_mmu.h
@@ -29,21 +29,48 @@
*
* Instead, give the HYP mode its own VA region at a fixed offset from
* the kernel by just masking the top bits (which are all ones for a
- * kernel address).
+ * kernel address). We need to find out how many bits to mask.
*
- * ARMv8.1 (using VHE) does have a TTBR1_EL2, and doesn't use these
- * macros (the entire kernel runs at EL2).
+ * We want to build a set of page tables that cover both parts of the
+ * idmap (the trampoline page used to initialize EL2), and our normal
+ * runtime VA space, at the same time.
+ *
+ * Given that the kernel uses VA_BITS for its entire address space,
+ * and that half of that space (VA_BITS - 1) is used for the linear
+ * mapping, we can also limit the EL2 space to (VA_BITS - 1).
+ *
+ * The main question is "Within the VA_BITS space, does EL2 use the
+ * top or the bottom half of that space to shadow the kernel's linear
+ * mapping?". As we need to idmap the trampoline page, this is
+ * determined by the range in which this page lives.
+ *
+ * If the page is in the bottom half, we have to use the top half. If
+ * the page is in the top half, we have to use the bottom half:
+ *
+ * T = __virt_to_phys(__hyp_idmap_text_start)
+ * if (T & BIT(VA_BITS - 1))
+ * HYP_VA_MIN = 0 //idmap in upper half
+ * else
+ * HYP_VA_MIN = 1 << (VA_BITS - 1)
+ * HYP_VA_MAX = HYP_VA_MIN + (1 << (VA_BITS - 1)) - 1
+ *
+ * This of course assumes that the trampoline page exists within the
+ * VA_BITS range. If it doesn't, then it means we're in the odd case
+ * where the kernel idmap (as well as HYP) uses more levels than the
+ * kernel runtime page tables (as seen when the kernel is configured
+ * for 4k pages, 39bits VA, and yet memory lives just above that
+ * limit, forcing the idmap to use 4 levels of page tables while the
+ * kernel itself only uses 3). In this particular case, it doesn't
+ * matter which side of VA_BITS we use, as we're guaranteed not to
+ * conflict with anything.
+ *
+ * When using VHE, there are no separate hyp mappings and all KVM
+ * functionality is already mapped as part of the main kernel
+ * mappings, and none of this applies in that case.
*/
-#define HYP_PAGE_OFFSET_SHIFT VA_BITS
-#define HYP_PAGE_OFFSET_MASK ((UL(1) << HYP_PAGE_OFFSET_SHIFT) - 1)
-#define HYP_PAGE_OFFSET (PAGE_OFFSET & HYP_PAGE_OFFSET_MASK)
-/*
- * Our virtual mapping for the idmap-ed MMU-enable code. Must be
- * shared across all the page-tables. Conveniently, we use the last
- * possible page, where no kernel mapping will ever exist.
- */
-#define TRAMPOLINE_VA (HYP_PAGE_OFFSET_MASK & PAGE_MASK)
+#define HYP_PAGE_OFFSET_HIGH_MASK ((UL(1) << VA_BITS) - 1)
+#define HYP_PAGE_OFFSET_LOW_MASK ((UL(1) << (VA_BITS - 1)) - 1)
#ifdef __ASSEMBLY__
@@ -53,13 +80,33 @@
/*
* Convert a kernel VA into a HYP VA.
* reg: VA to be converted.
+ *
+ * This generates the following sequences:
+ * - High mask:
+ * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ * nop
+ * - Low mask:
+ * and x0, x0, #HYP_PAGE_OFFSET_HIGH_MASK
+ * and x0, x0, #HYP_PAGE_OFFSET_LOW_MASK
+ * - VHE:
+ * nop
+ * nop
+ *
+ * The "low mask" version works because the mask is a strict subset of
+ * the "high mask", hence performing the first mask for nothing.
+ * Should be completely invisible on any viable CPU.
*/
.macro kern_hyp_va reg
-alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
- and \reg, \reg, #HYP_PAGE_OFFSET_MASK
+alternative_if_not ARM64_HAS_VIRT_HOST_EXTN
+ and \reg, \reg, #HYP_PAGE_OFFSET_HIGH_MASK
alternative_else
nop
alternative_endif
+alternative_if_not ARM64_HYP_OFFSET_LOW
+ nop
+alternative_else
+ and \reg, \reg, #HYP_PAGE_OFFSET_LOW_MASK
+alternative_endif
.endm
#else
@@ -70,7 +117,22 @@ alternative_endif
#include <asm/mmu_context.h>
#include <asm/pgtable.h>
-#define KERN_TO_HYP(kva) ((unsigned long)kva - PAGE_OFFSET + HYP_PAGE_OFFSET)
+static inline unsigned long __kern_hyp_va(unsigned long v)
+{
+ asm volatile(ALTERNATIVE("and %0, %0, %1",
+ "nop",
+ ARM64_HAS_VIRT_HOST_EXTN)
+ : "+r" (v)
+ : "i" (HYP_PAGE_OFFSET_HIGH_MASK));
+ asm volatile(ALTERNATIVE("nop",
+ "and %0, %0, %1",
+ ARM64_HYP_OFFSET_LOW)
+ : "+r" (v)
+ : "i" (HYP_PAGE_OFFSET_LOW_MASK));
+ return v;
+}
+
+#define kern_hyp_va(v) (typeof(v))(__kern_hyp_va((unsigned long)(v)))
/*
* We currently only support a 40bit IPA.
@@ -81,9 +143,8 @@ alternative_endif
#include <asm/stage2_pgtable.h>
-int create_hyp_mappings(void *from, void *to);
+int create_hyp_mappings(void *from, void *to, pgprot_t prot);
int create_hyp_io_mappings(void *from, void *to, phys_addr_t);
-void free_boot_hyp_pgd(void);
void free_hyp_pgds(void);
void stage2_unmap_vm(struct kvm *kvm);
@@ -97,7 +158,6 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run);
void kvm_mmu_free_memory_caches(struct kvm_vcpu *vcpu);
phys_addr_t kvm_mmu_get_httbr(void);
-phys_addr_t kvm_mmu_get_boot_httbr(void);
phys_addr_t kvm_get_idmap_vector(void);
phys_addr_t kvm_get_idmap_start(void);
int kvm_mmu_init(void);
diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h
index 2813748e2f24..c3ae239db3ee 100644
--- a/arch/arm64/include/asm/pgtable-hwdef.h
+++ b/arch/arm64/include/asm/pgtable-hwdef.h
@@ -164,6 +164,7 @@
#define PTE_CONT (_AT(pteval_t, 1) << 52) /* Contiguous range */
#define PTE_PXN (_AT(pteval_t, 1) << 53) /* Privileged XN */
#define PTE_UXN (_AT(pteval_t, 1) << 54) /* User XN */
+#define PTE_HYP_XN (_AT(pteval_t, 1) << 54) /* HYP XN */
/*
* AttrIndx[2:0] encoding (mapping attributes defined in the MAIR* registers).
diff --git a/arch/arm64/include/asm/pgtable-prot.h b/arch/arm64/include/asm/pgtable-prot.h
index 29fcb33ab401..39f5252673f7 100644
--- a/arch/arm64/include/asm/pgtable-prot.h
+++ b/arch/arm64/include/asm/pgtable-prot.h
@@ -55,7 +55,9 @@
#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE)
#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT)
-#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP)
+#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN)
+#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY)
+#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN)
#define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP)
#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY)
diff --git a/arch/arm64/include/asm/virt.h b/arch/arm64/include/asm/virt.h
index bbc6a8cf83f1..1788545f25bc 100644
--- a/arch/arm64/include/asm/virt.h
+++ b/arch/arm64/include/asm/virt.h
@@ -87,6 +87,10 @@ extern void verify_cpu_run_el(void);
static inline void verify_cpu_run_el(void) {}
#endif
+/* The section containing the hypervisor idmap text */
+extern char __hyp_idmap_text_start[];
+extern char __hyp_idmap_text_end[];
+
/* The section containing the hypervisor text */
extern char __hyp_text_start[];
extern char __hyp_text_end[];
diff --git a/arch/arm64/include/uapi/asm/kvm.h b/arch/arm64/include/uapi/asm/kvm.h
index f209ea151dca..3051f86a9b5f 100644
--- a/arch/arm64/include/uapi/asm/kvm.h
+++ b/arch/arm64/include/uapi/asm/kvm.h
@@ -87,9 +87,11 @@ struct kvm_regs {
/* Supported VGICv3 address types */
#define KVM_VGIC_V3_ADDR_TYPE_DIST 2
#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3
+#define KVM_VGIC_ITS_ADDR_TYPE 4
#define KVM_VGIC_V3_DIST_SIZE SZ_64K
#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K)
+#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K)
#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */
#define KVM_ARM_VCPU_EL1_32BIT 1 /* CPU running a 32bit VM */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 916d27ad79c1..62272eac1352 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -726,6 +726,19 @@ static bool runs_at_el2(const struct arm64_cpu_capabilities *entry, int __unused
return is_kernel_in_hyp_mode();
}
+static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry,
+ int __unused)
+{
+ phys_addr_t idmap_addr = virt_to_phys(__hyp_idmap_text_start);
+
+ /*
+ * Activate the lower HYP offset only if:
+ * - the idmap doesn't clash with it,
+ * - the kernel is not running at EL2.
+ */
+ return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode();
+}
+
static const struct arm64_cpu_capabilities arm64_features[] = {
{
.desc = "GIC system register CPU interface",
@@ -803,6 +816,12 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
.field_pos = ID_AA64PFR0_EL0_SHIFT,
.min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT,
},
+ {
+ .desc = "Reduced HYP mapping offset",
+ .capability = ARM64_HYP_OFFSET_LOW,
+ .def_scope = SCOPE_SYSTEM,
+ .matches = hyp_offset_low,
+ },
{},
};
diff --git a/arch/arm64/kernel/pci.c b/arch/arm64/kernel/pci.c
index 3c4e308b40a0..acf38722457b 100644
--- a/arch/arm64/kernel/pci.c
+++ b/arch/arm64/kernel/pci.c
@@ -17,6 +17,9 @@
#include <linux/mm.h>
#include <linux/of_pci.h>
#include <linux/of_platform.h>
+#include <linux/pci.h>
+#include <linux/pci-acpi.h>
+#include <linux/pci-ecam.h>
#include <linux/slab.h>
/*
@@ -36,25 +39,17 @@ resource_size_t pcibios_align_resource(void *data, const struct resource *res,
return res->start;
}
-/**
- * pcibios_enable_device - Enable I/O and memory.
- * @dev: PCI device to be enabled
- * @mask: bitmask of BARs to enable
- */
-int pcibios_enable_device(struct pci_dev *dev, int mask)
-{
- if (pci_has_flag(PCI_PROBE_ONLY))
- return 0;
-
- return pci_enable_resources(dev, mask);
-}
-
/*
- * Try to assign the IRQ number from DT when adding a new device
+ * Try to assign the IRQ number when probing a new device
*/
-int pcibios_add_device(struct pci_dev *dev)
+int pcibios_alloc_irq(struct pci_dev *dev)
{
- dev->irq = of_irq_parse_and_map_pci(dev, 0, 0);
+ if (acpi_disabled)
+ dev->irq = of_irq_parse_and_map_pci(dev, 0, 0);
+#ifdef CONFIG_ACPI
+ else
+ return acpi_pci_irq_enable(dev);
+#endif
return 0;
}
@@ -65,13 +60,21 @@ int pcibios_add_device(struct pci_dev *dev)
int raw_pci_read(unsigned int domain, unsigned int bus,
unsigned int devfn, int reg, int len, u32 *val)
{
- return -ENXIO;
+ struct pci_bus *b = pci_find_bus(domain, bus);
+
+ if (!b)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ return b->ops->read(b, devfn, reg, len, val);
}
int raw_pci_write(unsigned int domain, unsigned int bus,
unsigned int devfn, int reg, int len, u32 val)
{
- return -ENXIO;
+ struct pci_bus *b = pci_find_bus(domain, bus);
+
+ if (!b)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ return b->ops->write(b, devfn, reg, len, val);
}
#ifdef CONFIG_NUMA
@@ -85,10 +88,124 @@ EXPORT_SYMBOL(pcibus_to_node);
#endif
#ifdef CONFIG_ACPI
-/* Root bridge scanning */
+
+struct acpi_pci_generic_root_info {
+ struct acpi_pci_root_info common;
+ struct pci_config_window *cfg; /* config space mapping */
+};
+
+int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
+{
+ struct pci_config_window *cfg = bus->sysdata;
+ struct acpi_device *adev = to_acpi_device(cfg->parent);
+ struct acpi_pci_root *root = acpi_driver_data(adev);
+
+ return root->segment;
+}
+
+int pcibios_root_bridge_prepare(struct pci_host_bridge *bridge)
+{
+ if (!acpi_disabled) {
+ struct pci_config_window *cfg = bridge->bus->sysdata;
+ struct acpi_device *adev = to_acpi_device(cfg->parent);
+ ACPI_COMPANION_SET(&bridge->dev, adev);
+ }
+
+ return 0;
+}
+
+/*
+ * Lookup the bus range for the domain in MCFG, and set up config space
+ * mapping.
+ */
+static struct pci_config_window *
+pci_acpi_setup_ecam_mapping(struct acpi_pci_root *root)
+{
+ struct resource *bus_res = &root->secondary;
+ u16 seg = root->segment;
+ struct pci_config_window *cfg;
+ struct resource cfgres;
+ unsigned int bsz;
+
+ /* Use address from _CBA if present, otherwise lookup MCFG */
+ if (!root->mcfg_addr)
+ root->mcfg_addr = pci_mcfg_lookup(seg, bus_res);
+
+ if (!root->mcfg_addr) {
+ dev_err(&root->device->dev, "%04x:%pR ECAM region not found\n",
+ seg, bus_res);
+ return NULL;
+ }
+
+ bsz = 1 << pci_generic_ecam_ops.bus_shift;
+ cfgres.start = root->mcfg_addr + bus_res->start * bsz;
+ cfgres.end = cfgres.start + resource_size(bus_res) * bsz - 1;
+ cfgres.flags = IORESOURCE_MEM;
+ cfg = pci_ecam_create(&root->device->dev, &cfgres, bus_res,
+ &pci_generic_ecam_ops);
+ if (IS_ERR(cfg)) {
+ dev_err(&root->device->dev, "%04x:%pR error %ld mapping ECAM\n",
+ seg, bus_res, PTR_ERR(cfg));
+ return NULL;
+ }
+
+ return cfg;
+}
+
+/* release_info: free resources allocated by init_info */
+static void pci_acpi_generic_release_info(struct acpi_pci_root_info *ci)
+{
+ struct acpi_pci_generic_root_info *ri;
+
+ ri = container_of(ci, struct acpi_pci_generic_root_info, common);
+ pci_ecam_free(ri->cfg);
+ kfree(ri);
+}
+
+static struct acpi_pci_root_ops acpi_pci_root_ops = {
+ .release_info = pci_acpi_generic_release_info,
+};
+
+/* Interface called from ACPI code to setup PCI host controller */
struct pci_bus *pci_acpi_scan_root(struct acpi_pci_root *root)
{
- /* TODO: Should be revisited when implementing PCI on ACPI */
- return NULL;
+ int node = acpi_get_node(root->device->handle);
+ struct acpi_pci_generic_root_info *ri;
+ struct pci_bus *bus, *child;
+
+ ri = kzalloc_node(sizeof(*ri), GFP_KERNEL, node);
+ if (!ri)
+ return NULL;
+
+ ri->cfg = pci_acpi_setup_ecam_mapping(root);
+ if (!ri->cfg) {
+ kfree(ri);
+ return NULL;
+ }
+
+ acpi_pci_root_ops.pci_ops = &ri->cfg->ops->pci_ops;
+ bus = acpi_pci_root_create(root, &acpi_pci_root_ops, &ri->common,
+ ri->cfg);
+ if (!bus)
+ return NULL;
+
+ pci_bus_size_bridges(bus);
+ pci_bus_assign_resources(bus);
+
+ list_for_each_entry(child, &bus->children, node)
+ pcie_bus_configure_settings(child);
+
+ return bus;
}
+
+void pcibios_add_bus(struct pci_bus *bus)
+{
+ acpi_pci_add_bus(bus);
+}
+
+void pcibios_remove_bus(struct pci_bus *bus)
+{
+ acpi_pci_remove_bus(bus);
+}
+
#endif
diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig
index c4f26ef91e77..9d2eff0b3ad3 100644
--- a/arch/arm64/kvm/Kconfig
+++ b/arch/arm64/kvm/Kconfig
@@ -36,6 +36,7 @@ config KVM
select HAVE_KVM_IRQFD
select KVM_ARM_VGIC_V3
select KVM_ARM_PMU if HW_PERF_EVENTS
+ select HAVE_KVM_MSI
---help---
Support hosting virtualized guest machines.
We don't support KVM with 16K page tables yet, due to the multiple
@@ -54,13 +55,6 @@ config KVM_ARM_PMU
Adds support for a virtual Performance Monitoring Unit (PMU) in
virtual machines.
-config KVM_NEW_VGIC
- bool "New VGIC implementation"
- depends on KVM
- default y
- ---help---
- uses the new VGIC implementation
-
source drivers/vhost/Kconfig
endif # VIRTUALIZATION
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index a7a958ca29d5..a5b96642a9cb 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -20,7 +20,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += emulate.o inject_fault.o regmap.o
kvm-$(CONFIG_KVM_ARM_HOST) += hyp.o hyp-init.o handle_exit.o
kvm-$(CONFIG_KVM_ARM_HOST) += guest.o debug.o reset.o sys_regs.o sys_regs_generic_v8.o
-ifeq ($(CONFIG_KVM_NEW_VGIC),y)
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-init.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-irqfd.o
@@ -30,12 +29,6 @@ kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v2.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-mmio-v3.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-kvm-device.o
-else
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v2-emul.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3.o
-kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic-v3-emul.o
-endif
+kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/vgic/vgic-its.o
kvm-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/arch_timer.o
kvm-$(CONFIG_KVM_ARM_PMU) += $(KVM)/arm/pmu.o
diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c
index 32fad75bb9ff..3f9e15722473 100644
--- a/arch/arm64/kvm/guest.c
+++ b/arch/arm64/kvm/guest.c
@@ -211,7 +211,7 @@ unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu)
/**
* kvm_arm_copy_reg_indices - get indices of all registers.
*
- * We do core registers right here, then we apppend system regs.
+ * We do core registers right here, then we append system regs.
*/
int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *uindices)
{
diff --git a/arch/arm64/kvm/hyp-init.S b/arch/arm64/kvm/hyp-init.S
index a873a6d8be90..6b29d3d9e1f2 100644
--- a/arch/arm64/kvm/hyp-init.S
+++ b/arch/arm64/kvm/hyp-init.S
@@ -53,10 +53,9 @@ __invalid:
b .
/*
- * x0: HYP boot pgd
- * x1: HYP pgd
- * x2: HYP stack
- * x3: HYP vectors
+ * x0: HYP pgd
+ * x1: HYP stack
+ * x2: HYP vectors
*/
__do_hyp_init:
@@ -110,71 +109,27 @@ __do_hyp_init:
msr sctlr_el2, x4
isb
- /* Skip the trampoline dance if we merged the boot and runtime PGDs */
- cmp x0, x1
- b.eq merged
-
- /* MMU is now enabled. Get ready for the trampoline dance */
- ldr x4, =TRAMPOLINE_VA
- adr x5, target
- bfi x4, x5, #0, #PAGE_SHIFT
- br x4
-
-target: /* We're now in the trampoline code, switch page tables */
- msr ttbr0_el2, x1
- isb
-
- /* Invalidate the old TLBs */
- tlbi alle2
- dsb sy
-
-merged:
/* Set the stack and new vectors */
+ kern_hyp_va x1
+ mov sp, x1
kern_hyp_va x2
- mov sp, x2
- kern_hyp_va x3
- msr vbar_el2, x3
+ msr vbar_el2, x2
/* Hello, World! */
eret
ENDPROC(__kvm_hyp_init)
/*
- * Reset kvm back to the hyp stub. This is the trampoline dance in
- * reverse. If kvm used an extended idmap, __extended_idmap_trampoline
- * calls this code directly in the idmap. In this case switching to the
- * boot tables is a no-op.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
+ * Reset kvm back to the hyp stub.
*/
ENTRY(__kvm_hyp_reset)
- /* We're in trampoline code in VA, switch back to boot page tables */
- msr ttbr0_el2, x0
- isb
-
- /* Ensure the PA branch doesn't find a stale tlb entry or stale code. */
- ic iallu
- tlbi alle2
- dsb sy
- isb
-
- /* Branch into PA space */
- adr x0, 1f
- bfi x1, x0, #0, #PAGE_SHIFT
- br x1
-
/* We're now in idmap, disable MMU */
-1: mrs x0, sctlr_el2
+ mrs x0, sctlr_el2
ldr x1, =SCTLR_ELx_FLAGS
bic x0, x0, x1 // Clear SCTL_M and etc
msr sctlr_el2, x0
isb
- /* Invalidate the old TLBs */
- tlbi alle2
- dsb sy
-
/* Install stub vectors */
adr_l x0, __hyp_stub_vectors
msr vbar_el2, x0
diff --git a/arch/arm64/kvm/hyp/entry.S b/arch/arm64/kvm/hyp/entry.S
index 70254a65bd5b..ce9e5e5f28cf 100644
--- a/arch/arm64/kvm/hyp/entry.S
+++ b/arch/arm64/kvm/hyp/entry.S
@@ -164,22 +164,3 @@ alternative_endif
eret
ENDPROC(__fpsimd_guest_restore)
-
-/*
- * When using the extended idmap, we don't have a trampoline page we can use
- * while we switch pages tables during __kvm_hyp_reset. Accessing the idmap
- * directly would be ideal, but if we're using the extended idmap then the
- * idmap is located above HYP_PAGE_OFFSET, and the address will be masked by
- * kvm_call_hyp using kern_hyp_va.
- *
- * x0: HYP boot pgd
- * x1: HYP phys_idmap_start
- */
-ENTRY(__extended_idmap_trampoline)
- mov x4, x1
- adr_l x3, __kvm_hyp_reset
-
- /* insert __kvm_hyp_reset()s offset into phys_idmap_start */
- bfi x4, x3, #0, #PAGE_SHIFT
- br x4
-ENDPROC(__extended_idmap_trampoline)
diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S
index 2d87f36d5cb4..f6d9694ae3b1 100644
--- a/arch/arm64/kvm/hyp/hyp-entry.S
+++ b/arch/arm64/kvm/hyp/hyp-entry.S
@@ -62,6 +62,21 @@ ENTRY(__vhe_hyp_call)
isb
ret
ENDPROC(__vhe_hyp_call)
+
+/*
+ * Compute the idmap address of __kvm_hyp_reset based on the idmap
+ * start passed as a parameter, and jump there.
+ *
+ * x0: HYP phys_idmap_start
+ */
+ENTRY(__kvm_hyp_teardown)
+ mov x4, x0
+ adr_l x3, __kvm_hyp_reset
+
+ /* insert __kvm_hyp_reset()s offset into phys_idmap_start */
+ bfi x4, x3, #0, #PAGE_SHIFT
+ br x4
+ENDPROC(__kvm_hyp_teardown)
el1_sync: // Guest trapped into EL2
save_x0_to_x3
diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c
index 4373997d1a70..ae7855f16ec2 100644
--- a/arch/arm64/kvm/hyp/switch.c
+++ b/arch/arm64/kvm/hyp/switch.c
@@ -299,9 +299,16 @@ static const char __hyp_panic_string[] = "HYP panic:\nPS:%08llx PC:%016llx ESR:%
static void __hyp_text __hyp_call_panic_nvhe(u64 spsr, u64 elr, u64 par)
{
- unsigned long str_va = (unsigned long)__hyp_panic_string;
+ unsigned long str_va;
- __hyp_do_panic(hyp_kern_va(str_va),
+ /*
+ * Force the panic string to be loaded from the literal pool,
+ * making sure it is a kernel address and not a PC-relative
+ * reference.
+ */
+ asm volatile("ldr %0, =__hyp_panic_string" : "=r" (str_va));
+
+ __hyp_do_panic(str_va,
spsr, elr,
read_sysreg(esr_el2), read_sysreg_el2(far),
read_sysreg(hpfar_el2), par,
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index b1ad730e1567..5bc460884639 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -65,7 +65,7 @@ static bool cpu_has_32bit_el1(void)
* We currently assume that the number of HW registers is uniform
* across all CPUs (see cpuinfo_sanity_check).
*/
-int kvm_arch_dev_ioctl_check_extension(long ext)
+int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext)
{
int r;
@@ -86,6 +86,12 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
case KVM_CAP_VCPU_ATTRIBUTES:
r = 1;
break;
+ case KVM_CAP_MSI_DEVID:
+ if (!kvm)
+ r = -EINVAL;
+ else
+ r = kvm->arch.vgic.msis_require_devid;
+ break;
default:
r = 0;
}
@@ -98,7 +104,7 @@ int kvm_arch_dev_ioctl_check_extension(long ext)
* @vcpu: The VCPU pointer
*
* This function finds the right table above and sets the registers on
- * the virtual CPU struct to their architectually defined reset
+ * the virtual CPU struct to their architecturally defined reset
* values.
*/
int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
@@ -132,31 +138,3 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
/* Reset timer */
return kvm_timer_vcpu_reset(vcpu, cpu_vtimer_irq);
}
-
-extern char __hyp_idmap_text_start[];
-
-unsigned long kvm_hyp_reset_entry(void)
-{
- if (!__kvm_cpu_uses_extended_idmap()) {
- unsigned long offset;
-
- /*
- * Find the address of __kvm_hyp_reset() in the trampoline page.
- * This is present in the running page tables, and the boot page
- * tables, so we call the code here to start the trampoline
- * dance in reverse.
- */
- offset = (unsigned long)__kvm_hyp_reset
- - ((unsigned long)__hyp_idmap_text_start & PAGE_MASK);
-
- return TRAMPOLINE_VA + offset;
- } else {
- /*
- * KVM is running with merged page tables, which don't have the
- * trampoline page mapped. We know the idmap is still mapped,
- * but can't be called into directly. Use
- * __extended_idmap_trampoline to do the call.
- */
- return (unsigned long)kvm_ksym_ref(__extended_idmap_trampoline);
- }
-}
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index a57d650f552c..b0b225ceca18 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1546,7 +1546,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
struct sys_reg_params *params)
{
u8 hsr_ec = kvm_vcpu_trap_get_class(vcpu);
- int cp;
+ int cp = -1;
switch(hsr_ec) {
case ESR_ELx_EC_CP15_32:
@@ -1558,7 +1558,7 @@ static void unhandled_cp_access(struct kvm_vcpu *vcpu,
cp = 14;
break;
default:
- WARN_ON((cp = -1));
+ WARN_ON(1);
}
kvm_err("Unsupported guest CP%d access at: %08lx\n",
diff --git a/arch/cris/arch-v10/drivers/axisflashmap.c b/arch/cris/arch-v10/drivers/axisflashmap.c
index 60d57c590032..bdc25aa43468 100644
--- a/arch/cris/arch-v10/drivers/axisflashmap.c
+++ b/arch/cris/arch-v10/drivers/axisflashmap.c
@@ -397,7 +397,7 @@ static int __init init_axis_flash(void)
if (!romfs_in_flash) {
/* Create an RAM device for the root partition (romfs). */
-#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) || (CONFIG_MTDRAM_ABS_POS != 0)
+#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0)
/* No use trying to boot this kernel from RAM. Panic! */
printk(KERN_EMERG "axisflashmap: Cannot create an MTD RAM "
"device due to kernel (mis)configuration!\n");
diff --git a/arch/cris/arch-v32/drivers/axisflashmap.c b/arch/cris/arch-v32/drivers/axisflashmap.c
index bd10d3ba0949..87656c41fec7 100644
--- a/arch/cris/arch-v32/drivers/axisflashmap.c
+++ b/arch/cris/arch-v32/drivers/axisflashmap.c
@@ -320,7 +320,7 @@ static int __init init_axis_flash(void)
* but its size must be configured as 0 so as not to conflict
* with our usage.
*/
-#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0) || (CONFIG_MTDRAM_ABS_POS != 0)
+#if !defined(CONFIG_MTD_MTDRAM) || (CONFIG_MTDRAM_TOTAL_SIZE != 0)
if (!romfs_in_flash && !nand_boot) {
printk(KERN_EMERG "axisflashmap: Cannot create an MTD RAM "
"device; configure CONFIG_MTD_MTDRAM with size = 0!\n");
diff --git a/arch/microblaze/include/asm/pci.h b/arch/microblaze/include/asm/pci.h
index fc3ecb55f1b2..2a120bb70e54 100644
--- a/arch/microblaze/include/asm/pci.h
+++ b/arch/microblaze/include/asm/pci.h
@@ -82,9 +82,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file,
pgprot_t prot);
#define HAVE_ARCH_PCI_RESOURCE_TO_USER
-extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
- const struct resource *rsrc,
- resource_size_t *start, resource_size_t *end);
extern void pcibios_setup_bus_devices(struct pci_bus *bus);
extern void pcibios_setup_bus_self(struct pci_bus *bus);
diff --git a/arch/microblaze/pci/pci-common.c b/arch/microblaze/pci/pci-common.c
index 14cba600da7a..81556b843a8e 100644
--- a/arch/microblaze/pci/pci-common.c
+++ b/arch/microblaze/pci/pci-common.c
@@ -219,33 +219,6 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
}
/*
- * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
- * device mapping.
- */
-static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
- pgprot_t protection,
- enum pci_mmap_state mmap_state,
- int write_combine)
-{
- pgprot_t prot = protection;
-
- /* Write combine is always 0 on non-memory space mappings. On
- * memory space, if the user didn't pass 1, we check for a
- * "prefetchable" resource. This is a bit hackish, but we use
- * this to workaround the inability of /sysfs to provide a write
- * combine bit
- */
- if (mmap_state != pci_mmap_mem)
- write_combine = 0;
- else if (write_combine == 0) {
- if (rp->flags & IORESOURCE_PREFETCH)
- write_combine = 1;
- }
-
- return pgprot_noncached(prot);
-}
-
-/*
* This one is used by /dev/mem and fbdev who have no clue about the
* PCI device, it tries to find the PCI device first and calls the
* above routine
@@ -317,9 +290,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
return -EINVAL;
vma->vm_pgoff = offset >> PAGE_SHIFT;
- vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
- vma->vm_page_prot,
- mmap_state, write_combine);
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
vma->vm_end - vma->vm_start, vma->vm_page_prot);
@@ -473,39 +444,25 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar,
const struct resource *rsrc,
resource_size_t *start, resource_size_t *end)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- resource_size_t offset = 0;
+ struct pci_bus_region region;
- if (hose == NULL)
+ if (rsrc->flags & IORESOURCE_IO) {
+ pcibios_resource_to_bus(dev->bus, &region,
+ (struct resource *) rsrc);
+ *start = region.start;
+ *end = region.end;
return;
+ }
- if (rsrc->flags & IORESOURCE_IO)
- offset = (unsigned long)hose->io_base_virt - _IO_BASE;
-
- /* We pass a fully fixed up address to userland for MMIO instead of
- * a BAR value because X is lame and expects to be able to use that
- * to pass to /dev/mem !
+ /* We pass a CPU physical address to userland for MMIO instead of a
+ * BAR value because X is lame and expects to be able to use that
+ * to pass to /dev/mem!
*
- * That means that we'll have potentially 64 bits values where some
- * userland apps only expect 32 (like X itself since it thinks only
- * Sparc has 64 bits MMIO) but if we don't do that, we break it on
- * 32 bits CHRPs :-(
- *
- * Hopefully, the sysfs insterface is immune to that gunk. Once X
- * has been fixed (and the fix spread enough), we can re-enable the
- * 2 lines below and pass down a BAR value to userland. In that case
- * we'll also have to re-enable the matching code in
- * __pci_mmap_make_offset().
- *
- * BenH.
+ * That means we may have 64-bit values where some apps only expect
+ * 32 (like X itself since it thinks only Sparc has 64-bit MMIO).
*/
-#if 0
- else if (rsrc->flags & IORESOURCE_MEM)
- offset = hose->pci_mem_offset;
-#endif
-
- *start = rsrc->start - offset;
- *end = rsrc->end - offset;
+ *start = rsrc->start;
+ *end = rsrc->end;
}
/**
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index ac91939b9b75..29867139851e 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -1488,6 +1488,7 @@ config CPU_MIPS64_R2
select CPU_SUPPORTS_HIGHMEM
select CPU_SUPPORTS_HUGEPAGES
select CPU_SUPPORTS_MSA
+ select HAVE_KVM
help
Choose this option to build a kernel for release 2 or later of the
MIPS64 architecture. Many modern embedded systems with a 64-bit
@@ -1505,6 +1506,7 @@ config CPU_MIPS64_R6
select CPU_SUPPORTS_MSA
select GENERIC_CSUM
select MIPS_O32_FP64_SUPPORT if MIPS32_O32
+ select HAVE_KVM
help
Choose this option to build a kernel for release 6 or later of the
MIPS64 architecture. New MIPS processors, starting with the Warrior
diff --git a/arch/mips/include/asm/addrspace.h b/arch/mips/include/asm/addrspace.h
index 3b0e51d5a613..c5b04e752e97 100644
--- a/arch/mips/include/asm/addrspace.h
+++ b/arch/mips/include/asm/addrspace.h
@@ -45,7 +45,7 @@
/*
* Returns the kernel segment base of a given address
*/
-#define KSEGX(a) ((_ACAST32_ (a)) & 0xe0000000)
+#define KSEGX(a) ((_ACAST32_(a)) & _ACAST32_(0xe0000000))
/*
* Returns the physical address of a CKSEGx / XKPHYS address
diff --git a/arch/mips/include/asm/kvm_host.h b/arch/mips/include/asm/kvm_host.h
index 36a391d289aa..b54bcadd8aec 100644
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -19,6 +19,9 @@
#include <linux/threads.h>
#include <linux/spinlock.h>
+#include <asm/inst.h>
+#include <asm/mipsregs.h>
+
/* MIPS KVM register ids */
#define MIPS_CP0_32(_R, _S) \
(KVM_REG_MIPS_CP0 | KVM_REG_SIZE_U32 | (8 * (_R) + (_S)))
@@ -53,6 +56,12 @@
#define KVM_REG_MIPS_CP0_CONFIG7 MIPS_CP0_32(16, 7)
#define KVM_REG_MIPS_CP0_XCONTEXT MIPS_CP0_64(20, 0)
#define KVM_REG_MIPS_CP0_ERROREPC MIPS_CP0_64(30, 0)
+#define KVM_REG_MIPS_CP0_KSCRATCH1 MIPS_CP0_64(31, 2)
+#define KVM_REG_MIPS_CP0_KSCRATCH2 MIPS_CP0_64(31, 3)
+#define KVM_REG_MIPS_CP0_KSCRATCH3 MIPS_CP0_64(31, 4)
+#define KVM_REG_MIPS_CP0_KSCRATCH4 MIPS_CP0_64(31, 5)
+#define KVM_REG_MIPS_CP0_KSCRATCH5 MIPS_CP0_64(31, 6)
+#define KVM_REG_MIPS_CP0_KSCRATCH6 MIPS_CP0_64(31, 7)
#define KVM_MAX_VCPUS 1
@@ -65,8 +74,14 @@
-/* Special address that contains the comm page, used for reducing # of traps */
-#define KVM_GUEST_COMMPAGE_ADDR 0x0
+/*
+ * Special address that contains the comm page, used for reducing # of traps
+ * This needs to be within 32Kb of 0x0 (so the zero register can be used), but
+ * preferably not at 0x0 so that most kernel NULL pointer dereferences can be
+ * caught.
+ */
+#define KVM_GUEST_COMMPAGE_ADDR ((PAGE_SIZE > 0x8000) ? 0 : \
+ (0x8000 - PAGE_SIZE))
#define KVM_GUEST_KERNEL_MODE(vcpu) ((kvm_read_c0_guest_status(vcpu->arch.cop0) & (ST0_EXL | ST0_ERL)) || \
((kvm_read_c0_guest_status(vcpu->arch.cop0) & KSU_USER) == 0))
@@ -93,9 +108,6 @@
#define KVM_INVALID_ADDR 0xdeadbeef
extern atomic_t kvm_mips_instance;
-extern kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-extern void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-extern bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
struct kvm_vm_stat {
u32 remote_tlb_flush;
@@ -126,28 +138,6 @@ struct kvm_vcpu_stat {
u32 halt_wakeup;
};
-enum kvm_mips_exit_types {
- WAIT_EXITS,
- CACHE_EXITS,
- SIGNAL_EXITS,
- INT_EXITS,
- COP_UNUSABLE_EXITS,
- TLBMOD_EXITS,
- TLBMISS_LD_EXITS,
- TLBMISS_ST_EXITS,
- ADDRERR_ST_EXITS,
- ADDRERR_LD_EXITS,
- SYSCALL_EXITS,
- RESVD_INST_EXITS,
- BREAK_INST_EXITS,
- TRAP_INST_EXITS,
- MSA_FPE_EXITS,
- FPE_EXITS,
- MSA_DISABLED_EXITS,
- FLUSH_DCACHE_EXITS,
- MAX_KVM_MIPS_EXIT_TYPES
-};
-
struct kvm_arch_memory_slot {
};
@@ -215,73 +205,6 @@ struct mips_coproc {
#define MIPS_CP0_CONFIG4_SEL 4
#define MIPS_CP0_CONFIG5_SEL 5
-/* Config0 register bits */
-#define CP0C0_M 31
-#define CP0C0_K23 28
-#define CP0C0_KU 25
-#define CP0C0_MDU 20
-#define CP0C0_MM 17
-#define CP0C0_BM 16
-#define CP0C0_BE 15
-#define CP0C0_AT 13
-#define CP0C0_AR 10
-#define CP0C0_MT 7
-#define CP0C0_VI 3
-#define CP0C0_K0 0
-
-/* Config1 register bits */
-#define CP0C1_M 31
-#define CP0C1_MMU 25
-#define CP0C1_IS 22
-#define CP0C1_IL 19
-#define CP0C1_IA 16
-#define CP0C1_DS 13
-#define CP0C1_DL 10
-#define CP0C1_DA 7
-#define CP0C1_C2 6
-#define CP0C1_MD 5
-#define CP0C1_PC 4
-#define CP0C1_WR 3
-#define CP0C1_CA 2
-#define CP0C1_EP 1
-#define CP0C1_FP 0
-
-/* Config2 Register bits */
-#define CP0C2_M 31
-#define CP0C2_TU 28
-#define CP0C2_TS 24
-#define CP0C2_TL 20
-#define CP0C2_TA 16
-#define CP0C2_SU 12
-#define CP0C2_SS 8
-#define CP0C2_SL 4
-#define CP0C2_SA 0
-
-/* Config3 Register bits */
-#define CP0C3_M 31
-#define CP0C3_ISA_ON_EXC 16
-#define CP0C3_ULRI 13
-#define CP0C3_DSPP 10
-#define CP0C3_LPA 7
-#define CP0C3_VEIC 6
-#define CP0C3_VInt 5
-#define CP0C3_SP 4
-#define CP0C3_MT 2
-#define CP0C3_SM 1
-#define CP0C3_TL 0
-
-/* MMU types, the first four entries have the same layout as the
- CP0C0_MT field. */
-enum mips_mmu_types {
- MMU_TYPE_NONE,
- MMU_TYPE_R4000,
- MMU_TYPE_RESERVED,
- MMU_TYPE_FMT,
- MMU_TYPE_R3000,
- MMU_TYPE_R6000,
- MMU_TYPE_R8000
-};
-
/* Resume Flags */
#define RESUME_FLAG_DR (1<<0) /* Reload guest nonvolatile state? */
#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
@@ -298,11 +221,6 @@ enum emulation_result {
EMULATE_PRIV_FAIL,
};
-#define MIPS3_PG_G 0x00000001 /* Global; ignore ASID if in lo0 & lo1 */
-#define MIPS3_PG_V 0x00000002 /* Valid */
-#define MIPS3_PG_NV 0x00000000
-#define MIPS3_PG_D 0x00000004 /* Dirty */
-
#define mips3_paddr_to_tlbpfn(x) \
(((unsigned long)(x) >> MIPS3_PG_SHIFT) & MIPS3_PG_FRAME)
#define mips3_tlbpfn_to_paddr(x) \
@@ -313,13 +231,11 @@ enum emulation_result {
#define VPN2_MASK 0xffffe000
#define KVM_ENTRYHI_ASID MIPS_ENTRYHI_ASID
-#define TLB_IS_GLOBAL(x) (((x).tlb_lo0 & MIPS3_PG_G) && \
- ((x).tlb_lo1 & MIPS3_PG_G))
+#define TLB_IS_GLOBAL(x) ((x).tlb_lo[0] & (x).tlb_lo[1] & ENTRYLO_G)
#define TLB_VPN2(x) ((x).tlb_hi & VPN2_MASK)
#define TLB_ASID(x) ((x).tlb_hi & KVM_ENTRYHI_ASID)
-#define TLB_IS_VALID(x, va) (((va) & (1 << PAGE_SHIFT)) \
- ? ((x).tlb_lo1 & MIPS3_PG_V) \
- : ((x).tlb_lo0 & MIPS3_PG_V))
+#define TLB_LO_IDX(x, va) (((va) >> PAGE_SHIFT) & 1)
+#define TLB_IS_VALID(x, va) ((x).tlb_lo[TLB_LO_IDX(x, va)] & ENTRYLO_V)
#define TLB_HI_VPN2_HIT(x, y) ((TLB_VPN2(x) & ~(x).tlb_mask) == \
((y) & VPN2_MASK & ~(x).tlb_mask))
#define TLB_HI_ASID_HIT(x, y) (TLB_IS_GLOBAL(x) || \
@@ -328,26 +244,23 @@ enum emulation_result {
struct kvm_mips_tlb {
long tlb_mask;
long tlb_hi;
- long tlb_lo0;
- long tlb_lo1;
+ long tlb_lo[2];
};
-#define KVM_MIPS_FPU_FPU 0x1
-#define KVM_MIPS_FPU_MSA 0x2
+#define KVM_MIPS_AUX_FPU 0x1
+#define KVM_MIPS_AUX_MSA 0x2
#define KVM_MIPS_GUEST_TLB_SIZE 64
struct kvm_vcpu_arch {
- void *host_ebase, *guest_ebase;
+ void *guest_ebase;
int (*vcpu_run)(struct kvm_run *run, struct kvm_vcpu *vcpu);
unsigned long host_stack;
unsigned long host_gp;
/* Host CP0 registers used when handling exits from guest */
unsigned long host_cp0_badvaddr;
- unsigned long host_cp0_cause;
unsigned long host_cp0_epc;
- unsigned long host_cp0_entryhi;
- uint32_t guest_inst;
+ u32 host_cp0_cause;
/* GPRS */
unsigned long gprs[32];
@@ -357,8 +270,8 @@ struct kvm_vcpu_arch {
/* FPU State */
struct mips_fpu_struct fpu;
- /* Which FPU state is loaded (KVM_MIPS_FPU_*) */
- unsigned int fpu_inuse;
+ /* Which auxiliary state is loaded (KVM_MIPS_AUX_*) */
+ unsigned int aux_inuse;
/* COP0 State */
struct mips_coproc *cop0;
@@ -370,11 +283,11 @@ struct kvm_vcpu_arch {
struct hrtimer comparecount_timer;
/* Count timer control KVM register */
- uint32_t count_ctl;
+ u32 count_ctl;
/* Count bias from the raw time */
- uint32_t count_bias;
+ u32 count_bias;
/* Frequency of timer in Hz */
- uint32_t count_hz;
+ u32 count_hz;
/* Dynamic nanosecond bias (multiple of count_period) to avoid overflow */
s64 count_dyn_bias;
/* Resume time */
@@ -388,7 +301,7 @@ struct kvm_vcpu_arch {
/* Bitmask of pending exceptions to be cleared */
unsigned long pending_exceptions_clr;
- unsigned long pending_load_cause;
+ u32 pending_load_cause;
/* Save/Restore the entryhi register when are are preempted/scheduled back in */
unsigned long preempt_entryhi;
@@ -397,8 +310,8 @@ struct kvm_vcpu_arch {
struct kvm_mips_tlb guest_tlb[KVM_MIPS_GUEST_TLB_SIZE];
/* Cached guest kernel/user ASIDs */
- uint32_t guest_user_asid[NR_CPUS];
- uint32_t guest_kernel_asid[NR_CPUS];
+ u32 guest_user_asid[NR_CPUS];
+ u32 guest_kernel_asid[NR_CPUS];
struct mm_struct guest_kernel_mm, guest_user_mm;
int last_sched_cpu;
@@ -408,6 +321,7 @@ struct kvm_vcpu_arch {
u8 fpu_enabled;
u8 msa_enabled;
+ u8 kscratch_enabled;
};
@@ -461,6 +375,18 @@ struct kvm_vcpu_arch {
#define kvm_write_c0_guest_config7(cop0, val) (cop0->reg[MIPS_CP0_CONFIG][7] = (val))
#define kvm_read_c0_guest_errorepc(cop0) (cop0->reg[MIPS_CP0_ERROR_PC][0])
#define kvm_write_c0_guest_errorepc(cop0, val) (cop0->reg[MIPS_CP0_ERROR_PC][0] = (val))
+#define kvm_read_c0_guest_kscratch1(cop0) (cop0->reg[MIPS_CP0_DESAVE][2])
+#define kvm_read_c0_guest_kscratch2(cop0) (cop0->reg[MIPS_CP0_DESAVE][3])
+#define kvm_read_c0_guest_kscratch3(cop0) (cop0->reg[MIPS_CP0_DESAVE][4])
+#define kvm_read_c0_guest_kscratch4(cop0) (cop0->reg[MIPS_CP0_DESAVE][5])
+#define kvm_read_c0_guest_kscratch5(cop0) (cop0->reg[MIPS_CP0_DESAVE][6])
+#define kvm_read_c0_guest_kscratch6(cop0) (cop0->reg[MIPS_CP0_DESAVE][7])
+#define kvm_write_c0_guest_kscratch1(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][2] = (val))
+#define kvm_write_c0_guest_kscratch2(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][3] = (val))
+#define kvm_write_c0_guest_kscratch3(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][4] = (val))
+#define kvm_write_c0_guest_kscratch4(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][5] = (val))
+#define kvm_write_c0_guest_kscratch5(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][6] = (val))
+#define kvm_write_c0_guest_kscratch6(cop0, val) (cop0->reg[MIPS_CP0_DESAVE][7] = (val))
/*
* Some of the guest registers may be modified asynchronously (e.g. from a
@@ -474,7 +400,7 @@ static inline void _kvm_atomic_set_c0_guest_reg(unsigned long *reg,
unsigned long temp;
do {
__asm__ __volatile__(
- " .set mips3 \n"
+ " .set "MIPS_ISA_ARCH_LEVEL" \n"
" " __LL "%0, %1 \n"
" or %0, %2 \n"
" " __SC "%0, %1 \n"
@@ -490,7 +416,7 @@ static inline void _kvm_atomic_clear_c0_guest_reg(unsigned long *reg,
unsigned long temp;
do {
__asm__ __volatile__(
- " .set mips3 \n"
+ " .set "MIPS_ISA_ARCH_LEVEL" \n"
" " __LL "%0, %1 \n"
" and %0, %2 \n"
" " __SC "%0, %1 \n"
@@ -507,7 +433,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
unsigned long temp;
do {
__asm__ __volatile__(
- " .set mips3 \n"
+ " .set "MIPS_ISA_ARCH_LEVEL" \n"
" " __LL "%0, %1 \n"
" and %0, %2 \n"
" or %0, %3 \n"
@@ -542,7 +468,7 @@ static inline void _kvm_atomic_change_c0_guest_reg(unsigned long *reg,
static inline bool kvm_mips_guest_can_have_fpu(struct kvm_vcpu_arch *vcpu)
{
- return (!__builtin_constant_p(cpu_has_fpu) || cpu_has_fpu) &&
+ return (!__builtin_constant_p(raw_cpu_has_fpu) || raw_cpu_has_fpu) &&
vcpu->fpu_enabled;
}
@@ -589,9 +515,11 @@ struct kvm_mips_callbacks {
void (*dequeue_io_int)(struct kvm_vcpu *vcpu,
struct kvm_mips_interrupt *irq);
int (*irq_deliver)(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
+ u32 cause);
int (*irq_clear)(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
+ u32 cause);
+ unsigned long (*num_regs)(struct kvm_vcpu *vcpu);
+ int (*copy_reg_indices)(struct kvm_vcpu *vcpu, u64 __user *indices);
int (*get_one_reg)(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg, s64 *v);
int (*set_one_reg)(struct kvm_vcpu *vcpu,
@@ -605,8 +533,13 @@ int kvm_mips_emulation_init(struct kvm_mips_callbacks **install_callbacks);
/* Debug: dump vcpu state */
int kvm_arch_vcpu_dump_regs(struct kvm_vcpu *vcpu);
-/* Trampoline ASM routine to start running in "Guest" context */
-extern int __kvm_mips_vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+extern int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu);
+
+/* Building of entry/exception code */
+int kvm_mips_entry_setup(void);
+void *kvm_mips_build_vcpu_run(void *addr);
+void *kvm_mips_build_exception(void *addr, void *handler);
+void *kvm_mips_build_exit(void *addr);
/* FPU/MSA context management */
void __kvm_save_fpu(struct kvm_vcpu_arch *vcpu);
@@ -622,11 +555,11 @@ void kvm_drop_fpu(struct kvm_vcpu *vcpu);
void kvm_lose_fpu(struct kvm_vcpu *vcpu);
/* TLB handling */
-uint32_t kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_kernel_asid(struct kvm_vcpu *vcpu);
-uint32_t kvm_get_user_asid(struct kvm_vcpu *vcpu);
+u32 kvm_get_user_asid(struct kvm_vcpu *vcpu);
-uint32_t kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
+u32 kvm_get_commpage_asid (struct kvm_vcpu *vcpu);
extern int kvm_mips_handle_kseg0_tlb_fault(unsigned long badbaddr,
struct kvm_vcpu *vcpu);
@@ -635,22 +568,24 @@ extern int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
struct kvm_vcpu *vcpu);
extern int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
- struct kvm_mips_tlb *tlb,
- unsigned long *hpa0,
- unsigned long *hpa1);
+ struct kvm_mips_tlb *tlb);
-extern enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_tlbmod(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
extern void kvm_mips_dump_host_tlbs(void);
extern void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu);
+extern int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
+ unsigned long entrylo0,
+ unsigned long entrylo1,
+ int flush_dcache_mask);
extern void kvm_mips_flush_host_tlb(int skip_kseg0);
extern int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long entryhi);
@@ -667,90 +602,90 @@ extern void kvm_mips_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
extern void kvm_mips_vcpu_put(struct kvm_vcpu *vcpu);
/* Emulation */
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu);
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause);
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu);
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause);
-extern enum emulation_result kvm_mips_emulate_inst(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_inst(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_handle_ri(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_handle_ri(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-extern enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
- uint32_t *opc,
+extern enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
extern enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
struct kvm_run *run);
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu);
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count);
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack);
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu);
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count);
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack);
void kvm_mips_init_count(struct kvm_vcpu *vcpu);
int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl);
int kvm_mips_set_count_resume(struct kvm_vcpu *vcpu, s64 count_resume);
@@ -759,27 +694,27 @@ void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu);
void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu);
enum hrtimer_restart kvm_mips_count_timeout(struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst,
- uint32_t *opc,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+ u32 *opc,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst,
- uint32_t *opc,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+ u32 *opc,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_store(uint32_t inst,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
-enum emulation_result kvm_mips_emulate_load(uint32_t inst,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu);
@@ -789,13 +724,13 @@ unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu);
unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu);
/* Dynamic binary translation */
-extern int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
- struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_cache_index(union mips_instruction inst,
+ u32 *opc, struct kvm_vcpu *vcpu);
+extern int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu);
-extern int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc,
+extern int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu);
/* Misc */
diff --git a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
index d68e685cde60..bd8b9bbe1771 100644
--- a/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
+++ b/arch/mips/include/asm/mach-cavium-octeon/cpu-feature-overrides.h
@@ -55,7 +55,7 @@
#define cpu_has_mipsmt 0
#define cpu_has_vint 0
#define cpu_has_veic 0
-#define cpu_hwrena_impl_bits 0xc0000000
+#define cpu_hwrena_impl_bits (MIPS_HWRENA_IMPL1 | MIPS_HWRENA_IMPL2)
#define cpu_has_wsbh 1
#define cpu_has_rixi (cpu_data[0].cputype != CPU_CAVIUM_OCTEON)
diff --git a/arch/mips/include/asm/mipsregs.h b/arch/mips/include/asm/mipsregs.h
index e1ca65c62f6a..def9d8d13f6e 100644
--- a/arch/mips/include/asm/mipsregs.h
+++ b/arch/mips/include/asm/mipsregs.h
@@ -53,7 +53,7 @@
#define CP0_SEGCTL2 $5, 4
#define CP0_WIRED $6
#define CP0_INFO $7
-#define CP0_HWRENA $7, 0
+#define CP0_HWRENA $7
#define CP0_BADVADDR $8
#define CP0_BADINSTR $8, 1
#define CP0_COUNT $9
@@ -533,6 +533,7 @@
#define TX49_CONF_CWFON (_ULCAST_(1) << 27)
/* Bits specific to the MIPS32/64 PRA. */
+#define MIPS_CONF_VI (_ULCAST_(1) << 3)
#define MIPS_CONF_MT (_ULCAST_(7) << 7)
#define MIPS_CONF_MT_TLB (_ULCAST_(1) << 7)
#define MIPS_CONF_MT_FTLB (_ULCAST_(4) << 7)
@@ -853,6 +854,24 @@
#define MIPS_CDMMBASE_ADDR_SHIFT 11
#define MIPS_CDMMBASE_ADDR_START 15
+/* RDHWR register numbers */
+#define MIPS_HWR_CPUNUM 0 /* CPU number */
+#define MIPS_HWR_SYNCISTEP 1 /* SYNCI step size */
+#define MIPS_HWR_CC 2 /* Cycle counter */
+#define MIPS_HWR_CCRES 3 /* Cycle counter resolution */
+#define MIPS_HWR_ULR 29 /* UserLocal */
+#define MIPS_HWR_IMPL1 30 /* Implementation dependent */
+#define MIPS_HWR_IMPL2 31 /* Implementation dependent */
+
+/* Bits in HWREna register */
+#define MIPS_HWRENA_CPUNUM (_ULCAST_(1) << MIPS_HWR_CPUNUM)
+#define MIPS_HWRENA_SYNCISTEP (_ULCAST_(1) << MIPS_HWR_SYNCISTEP)
+#define MIPS_HWRENA_CC (_ULCAST_(1) << MIPS_HWR_CC)
+#define MIPS_HWRENA_CCRES (_ULCAST_(1) << MIPS_HWR_CCRES)
+#define MIPS_HWRENA_ULR (_ULCAST_(1) << MIPS_HWR_ULR)
+#define MIPS_HWRENA_IMPL1 (_ULCAST_(1) << MIPS_HWR_IMPL1)
+#define MIPS_HWRENA_IMPL2 (_ULCAST_(1) << MIPS_HWR_IMPL2)
+
/*
* Bitfields in the TX39 family CP0 Configuration Register 3
*/
diff --git a/arch/mips/include/asm/pci.h b/arch/mips/include/asm/pci.h
index 86b239d9d75d..9b63cd41213d 100644
--- a/arch/mips/include/asm/pci.h
+++ b/arch/mips/include/asm/pci.h
@@ -80,16 +80,6 @@ extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
#define HAVE_ARCH_PCI_RESOURCE_TO_USER
-static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
- const struct resource *rsrc, resource_size_t *start,
- resource_size_t *end)
-{
- phys_addr_t size = resource_size(rsrc);
-
- *start = fixup_bigphys_addr(rsrc->start, size);
- *end = rsrc->start + size;
-}
-
/*
* Dynamic DMA mapping stuff.
* MIPS has everything mapped statically.
diff --git a/arch/mips/include/asm/setup.h b/arch/mips/include/asm/setup.h
index d7bfdeba9e84..4f5279a8308d 100644
--- a/arch/mips/include/asm/setup.h
+++ b/arch/mips/include/asm/setup.h
@@ -21,6 +21,7 @@ extern void *set_vi_handler(int n, vi_handler_t addr);
extern void *set_except_vector(int n, void *addr);
extern unsigned long ebase;
+extern unsigned int hwrena;
extern void per_cpu_trap_init(bool);
extern void cpu_cache_init(void);
diff --git a/arch/mips/include/asm/uasm.h b/arch/mips/include/asm/uasm.h
index b6ecfeee4dbe..f7929f65f7ca 100644
--- a/arch/mips/include/asm/uasm.h
+++ b/arch/mips/include/asm/uasm.h
@@ -104,8 +104,13 @@ Ip_u1s2(_bltz);
Ip_u1s2(_bltzl);
Ip_u1u2s3(_bne);
Ip_u2s3u1(_cache);
+Ip_u1u2(_cfc1);
+Ip_u2u1(_cfcmsa);
+Ip_u1u2(_ctc1);
+Ip_u2u1(_ctcmsa);
Ip_u2u1s3(_daddiu);
Ip_u3u1u2(_daddu);
+Ip_u1(_di);
Ip_u2u1msbu3(_dins);
Ip_u2u1msbu3(_dinsm);
Ip_u1u2(_divu);
@@ -141,6 +146,8 @@ Ip_u1(_mfhi);
Ip_u1(_mflo);
Ip_u1u2u3(_mtc0);
Ip_u1u2u3(_mthc0);
+Ip_u1(_mthi);
+Ip_u1(_mtlo);
Ip_u3u1u2(_mul);
Ip_u3u1u2(_or);
Ip_u2u1u3(_ori);
diff --git a/arch/mips/include/uapi/asm/inst.h b/arch/mips/include/uapi/asm/inst.h
index 8051f9aa1379..77429d1622b3 100644
--- a/arch/mips/include/uapi/asm/inst.h
+++ b/arch/mips/include/uapi/asm/inst.h
@@ -21,20 +21,20 @@
enum major_op {
spec_op, bcond_op, j_op, jal_op,
beq_op, bne_op, blez_op, bgtz_op,
- addi_op, cbcond0_op = addi_op, addiu_op, slti_op, sltiu_op,
+ addi_op, pop10_op = addi_op, addiu_op, slti_op, sltiu_op,
andi_op, ori_op, xori_op, lui_op,
cop0_op, cop1_op, cop2_op, cop1x_op,
beql_op, bnel_op, blezl_op, bgtzl_op,
- daddi_op, cbcond1_op = daddi_op, daddiu_op, ldl_op, ldr_op,
+ daddi_op, pop30_op = daddi_op, daddiu_op, ldl_op, ldr_op,
spec2_op, jalx_op, mdmx_op, msa_op = mdmx_op, spec3_op,
lb_op, lh_op, lwl_op, lw_op,
lbu_op, lhu_op, lwr_op, lwu_op,
sb_op, sh_op, swl_op, sw_op,
sdl_op, sdr_op, swr_op, cache_op,
ll_op, lwc1_op, lwc2_op, bc6_op = lwc2_op, pref_op,
- lld_op, ldc1_op, ldc2_op, beqzcjic_op = ldc2_op, ld_op,
+ lld_op, ldc1_op, ldc2_op, pop66_op = ldc2_op, ld_op,
sc_op, swc1_op, swc2_op, balc6_op = swc2_op, major_3b_op,
- scd_op, sdc1_op, sdc2_op, bnezcjialc_op = sdc2_op, sd_op
+ scd_op, sdc1_op, sdc2_op, pop76_op = sdc2_op, sd_op
};
/*
@@ -93,6 +93,50 @@ enum spec3_op {
};
/*
+ * Bits 10-6 minor opcode for r6 spec mult/div encodings
+ */
+enum mult_op {
+ mult_mult_op = 0x0,
+ mult_mul_op = 0x2,
+ mult_muh_op = 0x3,
+};
+enum multu_op {
+ multu_multu_op = 0x0,
+ multu_mulu_op = 0x2,
+ multu_muhu_op = 0x3,
+};
+enum div_op {
+ div_div_op = 0x0,
+ div_div6_op = 0x2,
+ div_mod_op = 0x3,
+};
+enum divu_op {
+ divu_divu_op = 0x0,
+ divu_divu6_op = 0x2,
+ divu_modu_op = 0x3,
+};
+enum dmult_op {
+ dmult_dmult_op = 0x0,
+ dmult_dmul_op = 0x2,
+ dmult_dmuh_op = 0x3,
+};
+enum dmultu_op {
+ dmultu_dmultu_op = 0x0,
+ dmultu_dmulu_op = 0x2,
+ dmultu_dmuhu_op = 0x3,
+};
+enum ddiv_op {
+ ddiv_ddiv_op = 0x0,
+ ddiv_ddiv6_op = 0x2,
+ ddiv_dmod_op = 0x3,
+};
+enum ddivu_op {
+ ddivu_ddivu_op = 0x0,
+ ddivu_ddivu6_op = 0x2,
+ ddivu_dmodu_op = 0x3,
+};
+
+/*
* rt field of bcond opcodes.
*/
enum rt_op {
@@ -103,7 +147,7 @@ enum rt_op {
bltzal_op, bgezal_op, bltzall_op, bgezall_op,
rt_op_0x14, rt_op_0x15, rt_op_0x16, rt_op_0x17,
rt_op_0x18, rt_op_0x19, rt_op_0x1a, rt_op_0x1b,
- bposge32_op, rt_op_0x1d, rt_op_0x1e, rt_op_0x1f
+ bposge32_op, rt_op_0x1d, rt_op_0x1e, synci_op
};
/*
@@ -238,6 +282,21 @@ enum bshfl_func {
};
/*
+ * MSA minor opcodes.
+ */
+enum msa_func {
+ msa_elm_op = 0x19,
+};
+
+/*
+ * MSA ELM opcodes.
+ */
+enum msa_elm {
+ msa_ctc_op = 0x3e,
+ msa_cfc_op = 0x7e,
+};
+
+/*
* func field for MSA MI10 format.
*/
enum msa_mi10_func {
@@ -264,7 +323,7 @@ enum mm_major_op {
mm_pool32b_op, mm_pool16b_op, mm_lhu16_op, mm_andi16_op,
mm_addiu32_op, mm_lhu32_op, mm_sh32_op, mm_lh32_op,
mm_pool32i_op, mm_pool16c_op, mm_lwsp16_op, mm_pool16d_op,
- mm_ori32_op, mm_pool32f_op, mm_reserved1_op, mm_reserved2_op,
+ mm_ori32_op, mm_pool32f_op, mm_pool32s_op, mm_reserved2_op,
mm_pool32c_op, mm_lwgp16_op, mm_lw16_op, mm_pool16e_op,
mm_xori32_op, mm_jals32_op, mm_addiupc_op, mm_reserved3_op,
mm_reserved4_op, mm_pool16f_op, mm_sb16_op, mm_beqz16_op,
@@ -360,7 +419,10 @@ enum mm_32axf_minor_op {
mm_mflo32_op = 0x075,
mm_jalrhb_op = 0x07c,
mm_tlbwi_op = 0x08d,
+ mm_mthi32_op = 0x0b5,
mm_tlbwr_op = 0x0cd,
+ mm_mtlo32_op = 0x0f5,
+ mm_di_op = 0x11d,
mm_jalrs_op = 0x13c,
mm_jalrshb_op = 0x17c,
mm_sync_op = 0x1ad,
@@ -479,6 +541,13 @@ enum mm_32f_73_minor_op {
};
/*
+ * (microMIPS) POOL32S minor opcodes.
+ */
+enum mm_32s_minor_op {
+ mm_32s_elm_op = 0x16,
+};
+
+/*
* (microMIPS) POOL16C minor opcodes.
*/
enum mm_16c_minor_op {
@@ -586,6 +655,36 @@ struct r_format { /* Register format */
;))))))
};
+struct c0r_format { /* C0 register format */
+ __BITFIELD_FIELD(unsigned int opcode : 6,
+ __BITFIELD_FIELD(unsigned int rs : 5,
+ __BITFIELD_FIELD(unsigned int rt : 5,
+ __BITFIELD_FIELD(unsigned int rd : 5,
+ __BITFIELD_FIELD(unsigned int z: 8,
+ __BITFIELD_FIELD(unsigned int sel : 3,
+ ;))))))
+};
+
+struct mfmc0_format { /* MFMC0 register format */
+ __BITFIELD_FIELD(unsigned int opcode : 6,
+ __BITFIELD_FIELD(unsigned int rs : 5,
+ __BITFIELD_FIELD(unsigned int rt : 5,
+ __BITFIELD_FIELD(unsigned int rd : 5,
+ __BITFIELD_FIELD(unsigned int re : 5,
+ __BITFIELD_FIELD(unsigned int sc : 1,
+ __BITFIELD_FIELD(unsigned int : 2,
+ __BITFIELD_FIELD(unsigned int sel : 3,
+ ;))))))))
+};
+
+struct co_format { /* C0 CO format */
+ __BITFIELD_FIELD(unsigned int opcode : 6,
+ __BITFIELD_FIELD(unsigned int co : 1,
+ __BITFIELD_FIELD(unsigned int code : 19,
+ __BITFIELD_FIELD(unsigned int func : 6,
+ ;))))
+};
+
struct p_format { /* Performance counter format (R10000) */
__BITFIELD_FIELD(unsigned int opcode : 6,
__BITFIELD_FIELD(unsigned int rs : 5,
@@ -937,6 +1036,9 @@ union mips_instruction {
struct u_format u_format;
struct c_format c_format;
struct r_format r_format;
+ struct c0r_format c0r_format;
+ struct mfmc0_format mfmc0_format;
+ struct co_format co_format;
struct p_format p_format;
struct f_format f_format;
struct ma_format ma_format;
diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c
index 1ea973b2abb1..fae2f9447792 100644
--- a/arch/mips/kernel/asm-offsets.c
+++ b/arch/mips/kernel/asm-offsets.c
@@ -339,71 +339,9 @@ void output_pm_defines(void)
}
#endif
-void output_cpuinfo_defines(void)
-{
- COMMENT(" MIPS cpuinfo offsets. ");
- DEFINE(CPUINFO_SIZE, sizeof(struct cpuinfo_mips));
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- OFFSET(CPUINFO_ASID_MASK, cpuinfo_mips, asid_mask);
-#endif
-}
-
void output_kvm_defines(void)
{
COMMENT(" KVM/MIPS Specfic offsets. ");
- DEFINE(VCPU_ARCH_SIZE, sizeof(struct kvm_vcpu_arch));
- OFFSET(VCPU_RUN, kvm_vcpu, run);
- OFFSET(VCPU_HOST_ARCH, kvm_vcpu, arch);
-
- OFFSET(VCPU_HOST_EBASE, kvm_vcpu_arch, host_ebase);
- OFFSET(VCPU_GUEST_EBASE, kvm_vcpu_arch, guest_ebase);
-
- OFFSET(VCPU_HOST_STACK, kvm_vcpu_arch, host_stack);
- OFFSET(VCPU_HOST_GP, kvm_vcpu_arch, host_gp);
-
- OFFSET(VCPU_HOST_CP0_BADVADDR, kvm_vcpu_arch, host_cp0_badvaddr);
- OFFSET(VCPU_HOST_CP0_CAUSE, kvm_vcpu_arch, host_cp0_cause);
- OFFSET(VCPU_HOST_EPC, kvm_vcpu_arch, host_cp0_epc);
- OFFSET(VCPU_HOST_ENTRYHI, kvm_vcpu_arch, host_cp0_entryhi);
-
- OFFSET(VCPU_GUEST_INST, kvm_vcpu_arch, guest_inst);
-
- OFFSET(VCPU_R0, kvm_vcpu_arch, gprs[0]);
- OFFSET(VCPU_R1, kvm_vcpu_arch, gprs[1]);
- OFFSET(VCPU_R2, kvm_vcpu_arch, gprs[2]);
- OFFSET(VCPU_R3, kvm_vcpu_arch, gprs[3]);
- OFFSET(VCPU_R4, kvm_vcpu_arch, gprs[4]);
- OFFSET(VCPU_R5, kvm_vcpu_arch, gprs[5]);
- OFFSET(VCPU_R6, kvm_vcpu_arch, gprs[6]);
- OFFSET(VCPU_R7, kvm_vcpu_arch, gprs[7]);
- OFFSET(VCPU_R8, kvm_vcpu_arch, gprs[8]);
- OFFSET(VCPU_R9, kvm_vcpu_arch, gprs[9]);
- OFFSET(VCPU_R10, kvm_vcpu_arch, gprs[10]);
- OFFSET(VCPU_R11, kvm_vcpu_arch, gprs[11]);
- OFFSET(VCPU_R12, kvm_vcpu_arch, gprs[12]);
- OFFSET(VCPU_R13, kvm_vcpu_arch, gprs[13]);
- OFFSET(VCPU_R14, kvm_vcpu_arch, gprs[14]);
- OFFSET(VCPU_R15, kvm_vcpu_arch, gprs[15]);
- OFFSET(VCPU_R16, kvm_vcpu_arch, gprs[16]);
- OFFSET(VCPU_R17, kvm_vcpu_arch, gprs[17]);
- OFFSET(VCPU_R18, kvm_vcpu_arch, gprs[18]);
- OFFSET(VCPU_R19, kvm_vcpu_arch, gprs[19]);
- OFFSET(VCPU_R20, kvm_vcpu_arch, gprs[20]);
- OFFSET(VCPU_R21, kvm_vcpu_arch, gprs[21]);
- OFFSET(VCPU_R22, kvm_vcpu_arch, gprs[22]);
- OFFSET(VCPU_R23, kvm_vcpu_arch, gprs[23]);
- OFFSET(VCPU_R24, kvm_vcpu_arch, gprs[24]);
- OFFSET(VCPU_R25, kvm_vcpu_arch, gprs[25]);
- OFFSET(VCPU_R26, kvm_vcpu_arch, gprs[26]);
- OFFSET(VCPU_R27, kvm_vcpu_arch, gprs[27]);
- OFFSET(VCPU_R28, kvm_vcpu_arch, gprs[28]);
- OFFSET(VCPU_R29, kvm_vcpu_arch, gprs[29]);
- OFFSET(VCPU_R30, kvm_vcpu_arch, gprs[30]);
- OFFSET(VCPU_R31, kvm_vcpu_arch, gprs[31]);
- OFFSET(VCPU_LO, kvm_vcpu_arch, lo);
- OFFSET(VCPU_HI, kvm_vcpu_arch, hi);
- OFFSET(VCPU_PC, kvm_vcpu_arch, pc);
- BLANK();
OFFSET(VCPU_FPR0, kvm_vcpu_arch, fpu.fpr[0]);
OFFSET(VCPU_FPR1, kvm_vcpu_arch, fpu.fpr[1]);
@@ -441,14 +379,6 @@ void output_kvm_defines(void)
OFFSET(VCPU_FCR31, kvm_vcpu_arch, fpu.fcr31);
OFFSET(VCPU_MSA_CSR, kvm_vcpu_arch, fpu.msacsr);
BLANK();
-
- OFFSET(VCPU_COP0, kvm_vcpu_arch, cop0);
- OFFSET(VCPU_GUEST_KERNEL_ASID, kvm_vcpu_arch, guest_kernel_asid);
- OFFSET(VCPU_GUEST_USER_ASID, kvm_vcpu_arch, guest_user_asid);
-
- OFFSET(COP0_TLB_HI, mips_coproc, reg[MIPS_CP0_TLB_HI][0]);
- OFFSET(COP0_STATUS, mips_coproc, reg[MIPS_CP0_STATUS][0]);
- BLANK();
}
#ifdef CONFIG_MIPS_CPS
diff --git a/arch/mips/kernel/branch.c b/arch/mips/kernel/branch.c
index 6dc3f1fdaccc..46c227fc98f5 100644
--- a/arch/mips/kernel/branch.c
+++ b/arch/mips/kernel/branch.c
@@ -790,7 +790,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
epc += 4 + (insn.i_format.simmediate << 2);
regs->cp0_epc = epc;
break;
- case beqzcjic_op:
+ case pop66_op:
if (!cpu_has_mips_r6) {
ret = -SIGILL;
break;
@@ -798,7 +798,7 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
/* Compact branch: BEQZC || JIC */
regs->cp0_epc += 8;
break;
- case bnezcjialc_op:
+ case pop76_op:
if (!cpu_has_mips_r6) {
ret = -SIGILL;
break;
@@ -809,8 +809,8 @@ int __compute_return_epc_for_insn(struct pt_regs *regs,
regs->cp0_epc += 8;
break;
#endif
- case cbcond0_op:
- case cbcond1_op:
+ case pop10_op:
+ case pop30_op:
/* Only valid for MIPS R6 */
if (!cpu_has_mips_r6) {
ret = -SIGILL;
diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c
index 4a1712b5abdf..6fb4704bd156 100644
--- a/arch/mips/kernel/traps.c
+++ b/arch/mips/kernel/traps.c
@@ -619,17 +619,17 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
perf_sw_event(PERF_COUNT_SW_EMULATION_FAULTS,
1, regs, 0);
switch (rd) {
- case 0: /* CPU number */
+ case MIPS_HWR_CPUNUM: /* CPU number */
regs->regs[rt] = smp_processor_id();
return 0;
- case 1: /* SYNCI length */
+ case MIPS_HWR_SYNCISTEP: /* SYNCI length */
regs->regs[rt] = min(current_cpu_data.dcache.linesz,
current_cpu_data.icache.linesz);
return 0;
- case 2: /* Read count register */
+ case MIPS_HWR_CC: /* Read count register */
regs->regs[rt] = read_c0_count();
return 0;
- case 3: /* Count register resolution */
+ case MIPS_HWR_CCRES: /* Count register resolution */
switch (current_cpu_type()) {
case CPU_20KC:
case CPU_25KF:
@@ -639,7 +639,7 @@ static int simulate_rdhwr(struct pt_regs *regs, int rd, int rt)
regs->regs[rt] = 2;
}
return 0;
- case 29:
+ case MIPS_HWR_ULR: /* Read UserLocal register */
regs->regs[rt] = ti->tp_value;
return 0;
default:
@@ -1859,6 +1859,7 @@ void __noreturn nmi_exception_handler(struct pt_regs *regs)
#define VECTORSPACING 0x100 /* for EI/VI mode */
unsigned long ebase;
+EXPORT_SYMBOL_GPL(ebase);
unsigned long exception_handlers[32];
unsigned long vi_handlers[64];
@@ -2063,16 +2064,22 @@ static void configure_status(void)
status_set);
}
+unsigned int hwrena;
+EXPORT_SYMBOL_GPL(hwrena);
+
/* configure HWRENA register */
static void configure_hwrena(void)
{
- unsigned int hwrena = cpu_hwrena_impl_bits;
+ hwrena = cpu_hwrena_impl_bits;
if (cpu_has_mips_r2_r6)
- hwrena |= 0x0000000f;
+ hwrena |= MIPS_HWRENA_CPUNUM |
+ MIPS_HWRENA_SYNCISTEP |
+ MIPS_HWRENA_CC |
+ MIPS_HWRENA_CCRES;
if (!noulri && cpu_has_userlocal)
- hwrena |= (1 << 29);
+ hwrena |= MIPS_HWRENA_ULR;
if (hwrena)
write_c0_hwrena(hwrena);
diff --git a/arch/mips/kvm/Kconfig b/arch/mips/kvm/Kconfig
index 2ae12825529f..7c56d6b124d1 100644
--- a/arch/mips/kvm/Kconfig
+++ b/arch/mips/kvm/Kconfig
@@ -17,6 +17,7 @@ if VIRTUALIZATION
config KVM
tristate "Kernel-based Virtual Machine (KVM) support"
depends on HAVE_KVM
+ select EXPORT_UASM
select PREEMPT_NOTIFIERS
select ANON_INODES
select KVM_MMIO
diff --git a/arch/mips/kvm/Makefile b/arch/mips/kvm/Makefile
index 637ebbebd549..847429de780d 100644
--- a/arch/mips/kvm/Makefile
+++ b/arch/mips/kvm/Makefile
@@ -7,9 +7,10 @@ EXTRA_CFLAGS += -Ivirt/kvm -Iarch/mips/kvm
common-objs-$(CONFIG_CPU_HAS_MSA) += msa.o
-kvm-objs := $(common-objs-y) mips.o emulate.o locore.o \
+kvm-objs := $(common-objs-y) mips.o emulate.o entry.o \
interrupt.o stats.o commpage.o \
dyntrans.o trap_emul.o fpu.o
+kvm-objs += mmu.o
obj-$(CONFIG_KVM) += kvm.o
obj-y += callback.o tlb.o
diff --git a/arch/mips/kvm/commpage.c b/arch/mips/kvm/commpage.c
index 2d6e976d1add..a36b77e1705c 100644
--- a/arch/mips/kvm/commpage.c
+++ b/arch/mips/kvm/commpage.c
@@ -4,7 +4,7 @@
* for more details.
*
* commpage, currently used for Virtual COP0 registers.
- * Mapped into the guest kernel @ 0x0.
+ * Mapped into the guest kernel @ KVM_GUEST_COMMPAGE_ADDR.
*
* Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
* Authors: Sanjay Lal <sanjayl@kymasys.com>
diff --git a/arch/mips/kvm/dyntrans.c b/arch/mips/kvm/dyntrans.c
index f1527a465c1b..d280894915ed 100644
--- a/arch/mips/kvm/dyntrans.c
+++ b/arch/mips/kvm/dyntrans.c
@@ -11,6 +11,7 @@
#include <linux/errno.h>
#include <linux/err.h>
+#include <linux/highmem.h>
#include <linux/kvm_host.h>
#include <linux/module.h>
#include <linux/vmalloc.h>
@@ -20,125 +21,114 @@
#include "commpage.h"
-#define SYNCI_TEMPLATE 0x041f0000
-#define SYNCI_BASE(x) (((x) >> 21) & 0x1f)
-#define SYNCI_OFFSET ((x) & 0xffff)
+/**
+ * kvm_mips_trans_replace() - Replace trapping instruction in guest memory.
+ * @vcpu: Virtual CPU.
+ * @opc: PC of instruction to replace.
+ * @replace: Instruction to write
+ */
+static int kvm_mips_trans_replace(struct kvm_vcpu *vcpu, u32 *opc,
+ union mips_instruction replace)
+{
+ unsigned long paddr, flags;
+ void *vaddr;
+
+ if (KVM_GUEST_KSEGX((unsigned long)opc) == KVM_GUEST_KSEG0) {
+ paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
+ (unsigned long)opc);
+ vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+ vaddr += paddr & ~PAGE_MASK;
+ memcpy(vaddr, (void *)&replace, sizeof(u32));
+ local_flush_icache_range((unsigned long)vaddr,
+ (unsigned long)vaddr + 32);
+ kunmap_atomic(vaddr);
+ } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
+ local_irq_save(flags);
+ memcpy((void *)opc, (void *)&replace, sizeof(u32));
+ local_flush_icache_range((unsigned long)opc,
+ (unsigned long)opc + 32);
+ local_irq_restore(flags);
+ } else {
+ kvm_err("%s: Invalid address: %p\n", __func__, opc);
+ return -EFAULT;
+ }
-#define LW_TEMPLATE 0x8c000000
-#define CLEAR_TEMPLATE 0x00000020
-#define SW_TEMPLATE 0xac000000
+ return 0;
+}
-int kvm_mips_trans_cache_index(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_index(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu)
{
- int result = 0;
- unsigned long kseg0_opc;
- uint32_t synci_inst = 0x0;
+ union mips_instruction nop_inst = { 0 };
/* Replace the CACHE instruction, with a NOP */
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
- return result;
+ return kvm_mips_trans_replace(vcpu, opc, nop_inst);
}
/*
* Address based CACHE instructions are transformed into synci(s). A little
* heavy for just D-cache invalidates, but avoids an expensive trap
*/
-int kvm_mips_trans_cache_va(uint32_t inst, uint32_t *opc,
+int kvm_mips_trans_cache_va(union mips_instruction inst, u32 *opc,
struct kvm_vcpu *vcpu)
{
- int result = 0;
- unsigned long kseg0_opc;
- uint32_t synci_inst = SYNCI_TEMPLATE, base, offset;
-
- base = (inst >> 21) & 0x1f;
- offset = inst & 0xffff;
- synci_inst |= (base << 21);
- synci_inst |= offset;
-
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&synci_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
-
- return result;
+ union mips_instruction synci_inst = { 0 };
+
+ synci_inst.i_format.opcode = bcond_op;
+ synci_inst.i_format.rs = inst.i_format.rs;
+ synci_inst.i_format.rt = synci_op;
+ if (cpu_has_mips_r6)
+ synci_inst.i_format.simmediate = inst.spec3_format.simmediate;
+ else
+ synci_inst.i_format.simmediate = inst.i_format.simmediate;
+
+ return kvm_mips_trans_replace(vcpu, opc, synci_inst);
}
-int kvm_mips_trans_mfc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mfc0(union mips_instruction inst, u32 *opc,
+ struct kvm_vcpu *vcpu)
{
- int32_t rt, rd, sel;
- uint32_t mfc0_inst;
- unsigned long kseg0_opc, flags;
-
- rt = (inst >> 16) & 0x1f;
- rd = (inst >> 11) & 0x1f;
- sel = inst & 0x7;
+ union mips_instruction mfc0_inst = { 0 };
+ u32 rd, sel;
- if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
- mfc0_inst = CLEAR_TEMPLATE;
- mfc0_inst |= ((rt & 0x1f) << 16);
- } else {
- mfc0_inst = LW_TEMPLATE;
- mfc0_inst |= ((rt & 0x1f) << 16);
- mfc0_inst |= offsetof(struct kvm_mips_commpage,
- cop0.reg[rd][sel]);
- }
+ rd = inst.c0r_format.rd;
+ sel = inst.c0r_format.sel;
- if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&mfc0_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
- } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- memcpy((void *)opc, (void *)&mfc0_inst, sizeof(uint32_t));
- local_flush_icache_range((unsigned long)opc,
- (unsigned long)opc + 32);
- local_irq_restore(flags);
+ if (rd == MIPS_CP0_ERRCTL && sel == 0) {
+ mfc0_inst.r_format.opcode = spec_op;
+ mfc0_inst.r_format.rd = inst.c0r_format.rt;
+ mfc0_inst.r_format.func = add_op;
} else {
- kvm_err("%s: Invalid address: %p\n", __func__, opc);
- return -EFAULT;
+ mfc0_inst.i_format.opcode = lw_op;
+ mfc0_inst.i_format.rt = inst.c0r_format.rt;
+ mfc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+ offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+ mfc0_inst.i_format.simmediate |= 4;
+#endif
}
- return 0;
+ return kvm_mips_trans_replace(vcpu, opc, mfc0_inst);
}
-int kvm_mips_trans_mtc0(uint32_t inst, uint32_t *opc, struct kvm_vcpu *vcpu)
+int kvm_mips_trans_mtc0(union mips_instruction inst, u32 *opc,
+ struct kvm_vcpu *vcpu)
{
- int32_t rt, rd, sel;
- uint32_t mtc0_inst = SW_TEMPLATE;
- unsigned long kseg0_opc, flags;
-
- rt = (inst >> 16) & 0x1f;
- rd = (inst >> 11) & 0x1f;
- sel = inst & 0x7;
-
- mtc0_inst |= ((rt & 0x1f) << 16);
- mtc0_inst |= offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
-
- if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
- kseg0_opc =
- CKSEG0ADDR(kvm_mips_translate_guest_kseg0_to_hpa
- (vcpu, (unsigned long) opc));
- memcpy((void *)kseg0_opc, (void *)&mtc0_inst, sizeof(uint32_t));
- local_flush_icache_range(kseg0_opc, kseg0_opc + 32);
- } else if (KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- memcpy((void *)opc, (void *)&mtc0_inst, sizeof(uint32_t));
- local_flush_icache_range((unsigned long)opc,
- (unsigned long)opc + 32);
- local_irq_restore(flags);
- } else {
- kvm_err("%s: Invalid address: %p\n", __func__, opc);
- return -EFAULT;
- }
-
- return 0;
+ union mips_instruction mtc0_inst = { 0 };
+ u32 rd, sel;
+
+ rd = inst.c0r_format.rd;
+ sel = inst.c0r_format.sel;
+
+ mtc0_inst.i_format.opcode = sw_op;
+ mtc0_inst.i_format.rt = inst.c0r_format.rt;
+ mtc0_inst.i_format.simmediate = KVM_GUEST_COMMPAGE_ADDR |
+ offsetof(struct kvm_mips_commpage, cop0.reg[rd][sel]);
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ if (sizeof(vcpu->arch.cop0->reg[0][0]) == 8)
+ mtc0_inst.i_format.simmediate |= 4;
+#endif
+
+ return kvm_mips_trans_replace(vcpu, opc, mtc0_inst);
}
diff --git a/arch/mips/kvm/emulate.c b/arch/mips/kvm/emulate.c
index 645c8a1982a7..6eb52b9c9818 100644
--- a/arch/mips/kvm/emulate.c
+++ b/arch/mips/kvm/emulate.c
@@ -52,7 +52,7 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
goto unaligned;
/* Read the instruction */
- insn.word = kvm_get_inst((uint32_t *) epc, vcpu);
+ insn.word = kvm_get_inst((u32 *) epc, vcpu);
if (insn.word == KVM_INVALID_INST)
return KVM_INVALID_INST;
@@ -161,9 +161,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
nextpc = epc;
break;
- case blez_op: /* not really i_format */
- case blezl_op:
- /* rt field assumed to be zero */
+ case blez_op: /* POP06 */
+#ifndef CONFIG_CPU_MIPSR6
+ case blezl_op: /* removed in R6 */
+#endif
+ if (insn.i_format.rt != 0)
+ goto compact_branch;
if ((long)arch->gprs[insn.i_format.rs] <= 0)
epc = epc + 4 + (insn.i_format.simmediate << 2);
else
@@ -171,9 +174,12 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
nextpc = epc;
break;
- case bgtz_op:
- case bgtzl_op:
- /* rt field assumed to be zero */
+ case bgtz_op: /* POP07 */
+#ifndef CONFIG_CPU_MIPSR6
+ case bgtzl_op: /* removed in R6 */
+#endif
+ if (insn.i_format.rt != 0)
+ goto compact_branch;
if ((long)arch->gprs[insn.i_format.rs] > 0)
epc = epc + 4 + (insn.i_format.simmediate << 2);
else
@@ -185,6 +191,40 @@ unsigned long kvm_compute_return_epc(struct kvm_vcpu *vcpu,
case cop1_op:
kvm_err("%s: unsupported cop1_op\n", __func__);
break;
+
+#ifdef CONFIG_CPU_MIPSR6
+ /* R6 added the following compact branches with forbidden slots */
+ case blezl_op: /* POP26 */
+ case bgtzl_op: /* POP27 */
+ /* only rt == 0 isn't compact branch */
+ if (insn.i_format.rt != 0)
+ goto compact_branch;
+ break;
+ case pop10_op:
+ case pop30_op:
+ /* only rs == rt == 0 is reserved, rest are compact branches */
+ if (insn.i_format.rs != 0 || insn.i_format.rt != 0)
+ goto compact_branch;
+ break;
+ case pop66_op:
+ case pop76_op:
+ /* only rs == 0 isn't compact branch */
+ if (insn.i_format.rs != 0)
+ goto compact_branch;
+ break;
+compact_branch:
+ /*
+ * If we've hit an exception on the forbidden slot, then
+ * the branch must not have been taken.
+ */
+ epc += 8;
+ nextpc = epc;
+ break;
+#else
+compact_branch:
+ /* Compact branches not supported before R6 */
+ break;
+#endif
}
return nextpc;
@@ -198,7 +238,7 @@ sigill:
return nextpc;
}
-enum emulation_result update_pc(struct kvm_vcpu *vcpu, uint32_t cause)
+enum emulation_result update_pc(struct kvm_vcpu *vcpu, u32 cause)
{
unsigned long branch_pc;
enum emulation_result er = EMULATE_DONE;
@@ -243,7 +283,7 @@ static inline int kvm_mips_count_disabled(struct kvm_vcpu *vcpu)
*
* Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
*/
-static uint32_t kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_ktime_to_count(struct kvm_vcpu *vcpu, ktime_t now)
{
s64 now_ns, periods;
u64 delta;
@@ -300,11 +340,11 @@ static inline ktime_t kvm_mips_count_time(struct kvm_vcpu *vcpu)
*
* Returns: The current value of the guest CP0_Count register.
*/
-static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
+static u32 kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
ktime_t expires, threshold;
- uint32_t count, compare;
+ u32 count, compare;
int running;
/* Calculate the biased and scaled guest CP0_Count */
@@ -315,7 +355,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
* Find whether CP0_Count has reached the closest timer interrupt. If
* not, we shouldn't inject it.
*/
- if ((int32_t)(count - compare) < 0)
+ if ((s32)(count - compare) < 0)
return count;
/*
@@ -360,7 +400,7 @@ static uint32_t kvm_mips_read_count_running(struct kvm_vcpu *vcpu, ktime_t now)
*
* Returns: The current guest CP0_Count value.
*/
-uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
+u32 kvm_mips_read_count(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -387,8 +427,7 @@ uint32_t kvm_mips_read_count(struct kvm_vcpu *vcpu)
*
* Returns: The ktime at the point of freeze.
*/
-static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
- uint32_t *count)
+static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu, u32 *count)
{
ktime_t now;
@@ -419,16 +458,16 @@ static ktime_t kvm_mips_freeze_hrtimer(struct kvm_vcpu *vcpu,
* Assumes !kvm_mips_count_disabled(@vcpu) (guest CP0_Count timer is running).
*/
static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
- ktime_t now, uint32_t count)
+ ktime_t now, u32 count)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t compare;
+ u32 compare;
u64 delta;
ktime_t expire;
/* Calculate timeout (wrap 0 to 2^32) */
compare = kvm_read_c0_guest_compare(cop0);
- delta = (u64)(uint32_t)(compare - count - 1) + 1;
+ delta = (u64)(u32)(compare - count - 1) + 1;
delta = div_u64(delta * NSEC_PER_SEC, vcpu->arch.count_hz);
expire = ktime_add_ns(now, delta);
@@ -444,7 +483,7 @@ static void kvm_mips_resume_hrtimer(struct kvm_vcpu *vcpu,
*
* Sets the CP0_Count value and updates the timer accordingly.
*/
-void kvm_mips_write_count(struct kvm_vcpu *vcpu, uint32_t count)
+void kvm_mips_write_count(struct kvm_vcpu *vcpu, u32 count)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
ktime_t now;
@@ -538,13 +577,13 @@ int kvm_mips_set_count_hz(struct kvm_vcpu *vcpu, s64 count_hz)
* If @ack, atomically acknowledge any pending timer interrupt, otherwise ensure
* any pending timer interrupt is preserved.
*/
-void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
+void kvm_mips_write_compare(struct kvm_vcpu *vcpu, u32 compare, bool ack)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
int dc;
u32 old_compare = kvm_read_c0_guest_compare(cop0);
ktime_t now;
- uint32_t count;
+ u32 count;
/* if unchanged, must just be an ack */
if (old_compare == compare) {
@@ -585,7 +624,7 @@ void kvm_mips_write_compare(struct kvm_vcpu *vcpu, uint32_t compare, bool ack)
static ktime_t kvm_mips_count_disable(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t count;
+ u32 count;
ktime_t now;
/* Stop hrtimer */
@@ -632,7 +671,7 @@ void kvm_mips_count_disable_cause(struct kvm_vcpu *vcpu)
void kvm_mips_count_enable_cause(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t count;
+ u32 count;
kvm_clear_c0_guest_cause(cop0, CAUSEF_DC);
@@ -661,7 +700,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
s64 changed = count_ctl ^ vcpu->arch.count_ctl;
s64 delta;
ktime_t expire, now;
- uint32_t count, compare;
+ u32 count, compare;
/* Only allow defined bits to be changed */
if (changed & ~(s64)(KVM_REG_MIPS_COUNT_CTL_DC))
@@ -687,7 +726,7 @@ int kvm_mips_set_count_ctl(struct kvm_vcpu *vcpu, s64 count_ctl)
*/
count = kvm_read_c0_guest_count(cop0);
compare = kvm_read_c0_guest_compare(cop0);
- delta = (u64)(uint32_t)(compare - count - 1) + 1;
+ delta = (u64)(u32)(compare - count - 1) + 1;
delta = div_u64(delta * NSEC_PER_SEC,
vcpu->arch.count_hz);
expire = ktime_add_ns(vcpu->arch.count_resume, delta);
@@ -776,7 +815,7 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
vcpu->arch.pending_exceptions);
++vcpu->stat.wait_exits;
- trace_kvm_exit(vcpu, WAIT_EXITS);
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_WAIT);
if (!vcpu->arch.pending_exceptions) {
vcpu->arch.wait = 1;
kvm_vcpu_block(vcpu);
@@ -801,9 +840,9 @@ enum emulation_result kvm_mips_emul_wait(struct kvm_vcpu *vcpu)
enum emulation_result kvm_mips_emul_tlbr(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
- kvm_err("[%#x] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
+ kvm_err("[%#lx] COP0_TLBR [%ld]\n", pc, kvm_read_c0_guest_index(cop0));
return EMULATE_FAIL;
}
@@ -813,11 +852,11 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
struct mips_coproc *cop0 = vcpu->arch.cop0;
int index = kvm_read_c0_guest_index(cop0);
struct kvm_mips_tlb *tlb = NULL;
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
if (index < 0 || index >= KVM_MIPS_GUEST_TLB_SIZE) {
kvm_debug("%s: illegal index: %d\n", __func__, index);
- kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+ kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
pc, index, kvm_read_c0_guest_entryhi(cop0),
kvm_read_c0_guest_entrylo0(cop0),
kvm_read_c0_guest_entrylo1(cop0),
@@ -834,10 +873,10 @@ enum emulation_result kvm_mips_emul_tlbwi(struct kvm_vcpu *vcpu)
tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
- tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
- tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+ tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+ tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
- kvm_debug("[%#x] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
+ kvm_debug("[%#lx] COP0_TLBWI [%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx, mask: %#lx)\n",
pc, index, kvm_read_c0_guest_entryhi(cop0),
kvm_read_c0_guest_entrylo0(cop0),
kvm_read_c0_guest_entrylo1(cop0),
@@ -851,7 +890,7 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_mips_tlb *tlb = NULL;
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
int index;
get_random_bytes(&index, sizeof(index));
@@ -867,10 +906,10 @@ enum emulation_result kvm_mips_emul_tlbwr(struct kvm_vcpu *vcpu)
tlb->tlb_mask = kvm_read_c0_guest_pagemask(cop0);
tlb->tlb_hi = kvm_read_c0_guest_entryhi(cop0);
- tlb->tlb_lo0 = kvm_read_c0_guest_entrylo0(cop0);
- tlb->tlb_lo1 = kvm_read_c0_guest_entrylo1(cop0);
+ tlb->tlb_lo[0] = kvm_read_c0_guest_entrylo0(cop0);
+ tlb->tlb_lo[1] = kvm_read_c0_guest_entrylo1(cop0);
- kvm_debug("[%#x] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
+ kvm_debug("[%#lx] COP0_TLBWR[%d] (entryhi: %#lx, entrylo0: %#lx entrylo1: %#lx)\n",
pc, index, kvm_read_c0_guest_entryhi(cop0),
kvm_read_c0_guest_entrylo0(cop0),
kvm_read_c0_guest_entrylo1(cop0));
@@ -882,14 +921,14 @@ enum emulation_result kvm_mips_emul_tlbp(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
long entryhi = kvm_read_c0_guest_entryhi(cop0);
- uint32_t pc = vcpu->arch.pc;
+ unsigned long pc = vcpu->arch.pc;
int index = -1;
index = kvm_mips_guest_tlb_lookup(vcpu, entryhi);
kvm_write_c0_guest_index(cop0, index);
- kvm_debug("[%#x] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
+ kvm_debug("[%#lx] COP0_TLBP (entryhi: %#lx), index: %d\n", pc, entryhi,
index);
return EMULATE_DONE;
@@ -922,8 +961,8 @@ unsigned int kvm_mips_config1_wrmask(struct kvm_vcpu *vcpu)
*/
unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
{
- /* Config4 is optional */
- unsigned int mask = MIPS_CONF_M;
+ /* Config4 and ULRI are optional */
+ unsigned int mask = MIPS_CONF_M | MIPS_CONF3_ULRI;
/* Permit MSA to be present if MSA is supported */
if (kvm_mips_guest_can_have_msa(&vcpu->arch))
@@ -942,7 +981,12 @@ unsigned int kvm_mips_config3_wrmask(struct kvm_vcpu *vcpu)
unsigned int kvm_mips_config4_wrmask(struct kvm_vcpu *vcpu)
{
/* Config5 is optional */
- return MIPS_CONF_M;
+ unsigned int mask = MIPS_CONF_M;
+
+ /* KScrExist */
+ mask |= (unsigned int)vcpu->arch.kscratch_enabled << 16;
+
+ return mask;
}
/**
@@ -973,14 +1017,14 @@ unsigned int kvm_mips_config5_wrmask(struct kvm_vcpu *vcpu)
return mask;
}
-enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
- uint32_t cause, struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_CP0(union mips_instruction inst,
+ u32 *opc, u32 cause,
+ struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result er = EMULATE_DONE;
- int32_t rt, rd, copz, sel, co_bit, op;
- uint32_t pc = vcpu->arch.pc;
+ u32 rt, rd, sel;
unsigned long curr_pc;
/*
@@ -992,16 +1036,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
if (er == EMULATE_FAIL)
return er;
- copz = (inst >> 21) & 0x1f;
- rt = (inst >> 16) & 0x1f;
- rd = (inst >> 11) & 0x1f;
- sel = inst & 0x7;
- co_bit = (inst >> 25) & 1;
-
- if (co_bit) {
- op = (inst) & 0xff;
-
- switch (op) {
+ if (inst.co_format.co) {
+ switch (inst.co_format.func) {
case tlbr_op: /* Read indexed TLB entry */
er = kvm_mips_emul_tlbr(vcpu);
break;
@@ -1020,47 +1056,58 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
case eret_op:
er = kvm_mips_emul_eret(vcpu);
goto dont_update_pc;
- break;
case wait_op:
er = kvm_mips_emul_wait(vcpu);
break;
}
} else {
- switch (copz) {
+ rt = inst.c0r_format.rt;
+ rd = inst.c0r_format.rd;
+ sel = inst.c0r_format.sel;
+
+ switch (inst.c0r_format.rs) {
case mfc_op:
#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
cop0->stat[rd][sel]++;
#endif
/* Get reg */
if ((rd == MIPS_CP0_COUNT) && (sel == 0)) {
- vcpu->arch.gprs[rt] = kvm_mips_read_count(vcpu);
+ vcpu->arch.gprs[rt] =
+ (s32)kvm_mips_read_count(vcpu);
} else if ((rd == MIPS_CP0_ERRCTL) && (sel == 0)) {
vcpu->arch.gprs[rt] = 0x0;
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
kvm_mips_trans_mfc0(inst, opc, vcpu);
#endif
} else {
- vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+ vcpu->arch.gprs[rt] = (s32)cop0->reg[rd][sel];
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
kvm_mips_trans_mfc0(inst, opc, vcpu);
#endif
}
- kvm_debug
- ("[%#x] MFCz[%d][%d], vcpu->arch.gprs[%d]: %#lx\n",
- pc, rd, sel, rt, vcpu->arch.gprs[rt]);
-
+ trace_kvm_hwr(vcpu, KVM_TRACE_MFC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
break;
case dmfc_op:
vcpu->arch.gprs[rt] = cop0->reg[rd][sel];
+
+ trace_kvm_hwr(vcpu, KVM_TRACE_DMFC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
break;
case mtc_op:
#ifdef CONFIG_KVM_MIPS_DEBUG_COP0_COUNTERS
cop0->stat[rd][sel]++;
#endif
+ trace_kvm_hwr(vcpu, KVM_TRACE_MTC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
+
if ((rd == MIPS_CP0_TLB_INDEX)
&& (vcpu->arch.gprs[rt] >=
KVM_MIPS_GUEST_TLB_SIZE)) {
@@ -1078,16 +1125,15 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
kvm_err("MTCz, cop0->reg[EBASE]: %#lx\n",
kvm_read_c0_guest_ebase(cop0));
} else if (rd == MIPS_CP0_TLB_HI && sel == 0) {
- uint32_t nasid =
+ u32 nasid =
vcpu->arch.gprs[rt] & KVM_ENTRYHI_ASID;
if ((KSEGX(vcpu->arch.gprs[rt]) != CKSEG0) &&
((kvm_read_c0_guest_entryhi(cop0) &
KVM_ENTRYHI_ASID) != nasid)) {
- kvm_debug("MTCz, change ASID from %#lx to %#lx\n",
+ trace_kvm_asid_change(vcpu,
kvm_read_c0_guest_entryhi(cop0)
- & KVM_ENTRYHI_ASID,
- vcpu->arch.gprs[rt]
- & KVM_ENTRYHI_ASID);
+ & KVM_ENTRYHI_ASID,
+ nasid);
/* Blow away the shadow host TLBs */
kvm_mips_flush_host_tlb(1);
@@ -1100,10 +1146,6 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
kvm_mips_write_count(vcpu, vcpu->arch.gprs[rt]);
goto done;
} else if ((rd == MIPS_CP0_COMPARE) && (sel == 0)) {
- kvm_debug("[%#x] MTCz, COMPARE %#lx <- %#lx\n",
- pc, kvm_read_c0_guest_compare(cop0),
- vcpu->arch.gprs[rt]);
-
/* If we are writing to COMPARE */
/* Clear pending timer interrupt, if any */
kvm_mips_write_compare(vcpu,
@@ -1155,7 +1197,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* it first.
*/
if (change & ST0_CU1 && !(val & ST0_FR) &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
kvm_lose_fpu(vcpu);
/*
@@ -1166,7 +1208,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* the near future.
*/
if (change & ST0_CU1 &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
change_c0_status(ST0_CU1, val);
preempt_enable();
@@ -1201,7 +1243,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* context is already loaded.
*/
if (change & MIPS_CONF5_FRE &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)
change_c0_config5(MIPS_CONF5_FRE, val);
/*
@@ -1211,7 +1253,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
* quickly enabled again in the near future.
*/
if (change & MIPS_CONF5_MSAEN &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
change_c0_config5(MIPS_CONF5_MSAEN,
val);
@@ -1219,7 +1261,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
kvm_write_c0_guest_config5(cop0, val);
} else if ((rd == MIPS_CP0_CAUSE) && (sel == 0)) {
- uint32_t old_cause, new_cause;
+ u32 old_cause, new_cause;
old_cause = kvm_read_c0_guest_cause(cop0);
new_cause = vcpu->arch.gprs[rt];
@@ -1233,20 +1275,30 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
else
kvm_mips_count_enable_cause(vcpu);
}
+ } else if ((rd == MIPS_CP0_HWRENA) && (sel == 0)) {
+ u32 mask = MIPS_HWRENA_CPUNUM |
+ MIPS_HWRENA_SYNCISTEP |
+ MIPS_HWRENA_CC |
+ MIPS_HWRENA_CCRES;
+
+ if (kvm_read_c0_guest_config3(cop0) &
+ MIPS_CONF3_ULRI)
+ mask |= MIPS_HWRENA_ULR;
+ cop0->reg[rd][sel] = vcpu->arch.gprs[rt] & mask;
} else {
cop0->reg[rd][sel] = vcpu->arch.gprs[rt];
#ifdef CONFIG_KVM_MIPS_DYN_TRANS
kvm_mips_trans_mtc0(inst, opc, vcpu);
#endif
}
-
- kvm_debug("[%#x] MTCz, cop0->reg[%d][%d]: %#lx\n", pc,
- rd, sel, cop0->reg[rd][sel]);
break;
case dmtc_op:
kvm_err("!!!!!!![%#lx]dmtc_op: rt: %d, rd: %d, sel: %d!!!!!!\n",
vcpu->arch.pc, rt, rd, sel);
+ trace_kvm_hwr(vcpu, KVM_TRACE_DMTC0,
+ KVM_TRACE_COP0(rd, sel),
+ vcpu->arch.gprs[rt]);
er = EMULATE_FAIL;
break;
@@ -1258,7 +1310,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
vcpu->arch.gprs[rt] =
kvm_read_c0_guest_status(cop0);
/* EI */
- if (inst & 0x20) {
+ if (inst.mfmc0_format.sc) {
kvm_debug("[%#lx] mfmc0_op: EI\n",
vcpu->arch.pc);
kvm_set_c0_guest_status(cop0, ST0_IE);
@@ -1272,9 +1324,8 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
case wrpgpr_op:
{
- uint32_t css =
- cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
- uint32_t pss =
+ u32 css = cop0->reg[MIPS_CP0_STATUS][2] & 0xf;
+ u32 pss =
(cop0->reg[MIPS_CP0_STATUS][2] >> 6) & 0xf;
/*
* We don't support any shadow register sets, so
@@ -1291,7 +1342,7 @@ enum emulation_result kvm_mips_emulate_CP0(uint32_t inst, uint32_t *opc,
break;
default:
kvm_err("[%#lx]MachEmulateCP0: unsupported COP0, copz: 0x%x\n",
- vcpu->arch.pc, copz);
+ vcpu->arch.pc, inst.c0r_format.rs);
er = EMULATE_FAIL;
break;
}
@@ -1312,13 +1363,14 @@ dont_update_pc:
return er;
}
-enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
+enum emulation_result kvm_mips_emulate_store(union mips_instruction inst,
+ u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DO_MMIO;
- int32_t op, base, rt, offset;
- uint32_t bytes;
+ u32 rt;
+ u32 bytes;
void *data = run->mmio.data;
unsigned long curr_pc;
@@ -1331,12 +1383,9 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
if (er == EMULATE_FAIL)
return er;
- rt = (inst >> 16) & 0x1f;
- base = (inst >> 21) & 0x1f;
- offset = inst & 0xffff;
- op = (inst >> 26) & 0x3f;
+ rt = inst.i_format.rt;
- switch (op) {
+ switch (inst.i_format.opcode) {
case sb_op:
bytes = 1;
if (bytes > sizeof(run->mmio.data)) {
@@ -1357,7 +1406,7 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
*(u8 *) data = vcpu->arch.gprs[rt];
kvm_debug("OP_SB: eaddr: %#lx, gpr: %#lx, data: %#x\n",
vcpu->arch.host_cp0_badvaddr, vcpu->arch.gprs[rt],
- *(uint8_t *) data);
+ *(u8 *) data);
break;
@@ -1379,11 +1428,11 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
run->mmio.is_write = 1;
vcpu->mmio_needed = 1;
vcpu->mmio_is_write = 1;
- *(uint32_t *) data = vcpu->arch.gprs[rt];
+ *(u32 *) data = vcpu->arch.gprs[rt];
kvm_debug("[%#lx] OP_SW: eaddr: %#lx, gpr: %#lx, data: %#x\n",
vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
- vcpu->arch.gprs[rt], *(uint32_t *) data);
+ vcpu->arch.gprs[rt], *(u32 *) data);
break;
case sh_op:
@@ -1404,15 +1453,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
run->mmio.is_write = 1;
vcpu->mmio_needed = 1;
vcpu->mmio_is_write = 1;
- *(uint16_t *) data = vcpu->arch.gprs[rt];
+ *(u16 *) data = vcpu->arch.gprs[rt];
kvm_debug("[%#lx] OP_SH: eaddr: %#lx, gpr: %#lx, data: %#x\n",
vcpu->arch.pc, vcpu->arch.host_cp0_badvaddr,
- vcpu->arch.gprs[rt], *(uint32_t *) data);
+ vcpu->arch.gprs[rt], *(u32 *) data);
break;
default:
- kvm_err("Store not yet supported");
+ kvm_err("Store not yet supported (inst=0x%08x)\n",
+ inst.word);
er = EMULATE_FAIL;
break;
}
@@ -1424,18 +1474,16 @@ enum emulation_result kvm_mips_emulate_store(uint32_t inst, uint32_t cause,
return er;
}
-enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
- struct kvm_run *run,
+enum emulation_result kvm_mips_emulate_load(union mips_instruction inst,
+ u32 cause, struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DO_MMIO;
- int32_t op, base, rt, offset;
- uint32_t bytes;
+ u32 op, rt;
+ u32 bytes;
- rt = (inst >> 16) & 0x1f;
- base = (inst >> 21) & 0x1f;
- offset = inst & 0xffff;
- op = (inst >> 26) & 0x3f;
+ rt = inst.i_format.rt;
+ op = inst.i_format.opcode;
vcpu->arch.pending_load_cause = cause;
vcpu->arch.io_gpr = rt;
@@ -1521,7 +1569,8 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
break;
default:
- kvm_err("Load not yet supported");
+ kvm_err("Load not yet supported (inst=0x%08x)\n",
+ inst.word);
er = EMULATE_FAIL;
break;
}
@@ -1529,40 +1578,15 @@ enum emulation_result kvm_mips_emulate_load(uint32_t inst, uint32_t cause,
return er;
}
-int kvm_mips_sync_icache(unsigned long va, struct kvm_vcpu *vcpu)
-{
- unsigned long offset = (va & ~PAGE_MASK);
- struct kvm *kvm = vcpu->kvm;
- unsigned long pa;
- gfn_t gfn;
- kvm_pfn_t pfn;
-
- gfn = va >> PAGE_SHIFT;
-
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx\n", __func__, gfn);
- kvm_mips_dump_host_tlbs();
- kvm_arch_vcpu_dump_regs(vcpu);
- return -1;
- }
- pfn = kvm->arch.guest_pmap[gfn];
- pa = (pfn << PAGE_SHIFT) | offset;
-
- kvm_debug("%s: va: %#lx, unmapped: %#x\n", __func__, va,
- CKSEG0ADDR(pa));
-
- local_flush_icache_range(CKSEG0ADDR(pa), 32);
- return 0;
-}
-
-enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
- uint32_t cause,
+enum emulation_result kvm_mips_emulate_cache(union mips_instruction inst,
+ u32 *opc, u32 cause,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
enum emulation_result er = EMULATE_DONE;
- int32_t offset, cache, op_inst, op, base;
+ u32 cache, op_inst, op, base;
+ s16 offset;
struct kvm_vcpu_arch *arch = &vcpu->arch;
unsigned long va;
unsigned long curr_pc;
@@ -1576,9 +1600,12 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
if (er == EMULATE_FAIL)
return er;
- base = (inst >> 21) & 0x1f;
- op_inst = (inst >> 16) & 0x1f;
- offset = (int16_t)inst;
+ base = inst.i_format.rs;
+ op_inst = inst.i_format.rt;
+ if (cpu_has_mips_r6)
+ offset = inst.spec3_format.simmediate;
+ else
+ offset = inst.i_format.simmediate;
cache = op_inst & CacheOp_Cache;
op = op_inst & CacheOp_Op;
@@ -1634,7 +1661,6 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
(cop0) & KVM_ENTRYHI_ASID));
if (index < 0) {
- vcpu->arch.host_cp0_entryhi = (va & VPN2_MASK);
vcpu->arch.host_cp0_badvaddr = va;
vcpu->arch.pc = curr_pc;
er = kvm_mips_emulate_tlbmiss_ld(cause, NULL, run,
@@ -1659,9 +1685,7 @@ enum emulation_result kvm_mips_emulate_cache(uint32_t inst, uint32_t *opc,
* We fault an entry from the guest tlb to the
* shadow host TLB
*/
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb,
- NULL,
- NULL);
+ kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
}
}
} else {
@@ -1714,20 +1738,20 @@ dont_update_pc:
return er;
}
-enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_emulate_inst(u32 cause, u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
+ union mips_instruction inst;
enum emulation_result er = EMULATE_DONE;
- uint32_t inst;
/* Fetch the instruction. */
if (cause & CAUSEF_BD)
opc += 1;
- inst = kvm_get_inst(opc, vcpu);
+ inst.word = kvm_get_inst(opc, vcpu);
- switch (((union mips_instruction)inst).r_format.opcode) {
+ switch (inst.r_format.opcode) {
case cop0_op:
er = kvm_mips_emulate_CP0(inst, opc, cause, run, vcpu);
break;
@@ -1744,15 +1768,31 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
er = kvm_mips_emulate_load(inst, cause, run, vcpu);
break;
+#ifndef CONFIG_CPU_MIPSR6
case cache_op:
++vcpu->stat.cache_exits;
- trace_kvm_exit(vcpu, CACHE_EXITS);
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
er = kvm_mips_emulate_cache(inst, opc, cause, run, vcpu);
break;
+#else
+ case spec3_op:
+ switch (inst.spec3_format.func) {
+ case cache6_op:
+ ++vcpu->stat.cache_exits;
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_CACHE);
+ er = kvm_mips_emulate_cache(inst, opc, cause, run,
+ vcpu);
+ break;
+ default:
+ goto unknown;
+ };
+ break;
+unknown:
+#endif
default:
kvm_err("Instruction emulation not supported (%p/%#x)\n", opc,
- inst);
+ inst.word);
kvm_arch_vcpu_dump_regs(vcpu);
er = EMULATE_FAIL;
break;
@@ -1761,8 +1801,8 @@ enum emulation_result kvm_mips_emulate_inst(unsigned long cause, uint32_t *opc,
return er;
}
-enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_syscall(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1796,8 +1836,8 @@ enum emulation_result kvm_mips_emulate_syscall(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1842,8 +1882,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_ld(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_ld(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1888,8 +1928,8 @@ enum emulation_result kvm_mips_emulate_tlbinv_ld(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmiss_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1932,8 +1972,8 @@ enum emulation_result kvm_mips_emulate_tlbmiss_st(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbinv_st(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -1977,7 +2017,7 @@ enum emulation_result kvm_mips_emulate_tlbinv_st(unsigned long cause,
}
/* TLBMOD: store into address matching TLB with Dirty bit off */
-enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmod(u32 cause, u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2005,8 +2045,8 @@ enum emulation_result kvm_mips_handle_tlbmod(unsigned long cause, uint32_t *opc,
return er;
}
-enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_tlbmod(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2048,8 +2088,8 @@ enum emulation_result kvm_mips_emulate_tlbmod(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpu_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2077,8 +2117,8 @@ enum emulation_result kvm_mips_emulate_fpu_exc(unsigned long cause,
return EMULATE_DONE;
}
-enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_ri_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2112,8 +2152,8 @@ enum emulation_result kvm_mips_emulate_ri_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_bp_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2147,8 +2187,8 @@ enum emulation_result kvm_mips_emulate_bp_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_trap_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2182,8 +2222,8 @@ enum emulation_result kvm_mips_emulate_trap_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msafpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2217,8 +2257,8 @@ enum emulation_result kvm_mips_emulate_msafpe_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_fpe_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2252,8 +2292,8 @@ enum emulation_result kvm_mips_emulate_fpe_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_emulate_msadis_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2287,22 +2327,7 @@ enum emulation_result kvm_mips_emulate_msadis_exc(unsigned long cause,
return er;
}
-/* ll/sc, rdhwr, sync emulation */
-
-#define OPCODE 0xfc000000
-#define BASE 0x03e00000
-#define RT 0x001f0000
-#define OFFSET 0x0000ffff
-#define LL 0xc0000000
-#define SC 0xe0000000
-#define SPEC0 0x00000000
-#define SPEC3 0x7c000000
-#define RD 0x0000f800
-#define FUNC 0x0000003f
-#define SYNC 0x0000000f
-#define RDHWR 0x0000003b
-
-enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
+enum emulation_result kvm_mips_handle_ri(u32 cause, u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
@@ -2310,7 +2335,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
struct kvm_vcpu_arch *arch = &vcpu->arch;
enum emulation_result er = EMULATE_DONE;
unsigned long curr_pc;
- uint32_t inst;
+ union mips_instruction inst;
/*
* Update PC and hold onto current PC in case there is
@@ -2325,17 +2350,22 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
if (cause & CAUSEF_BD)
opc += 1;
- inst = kvm_get_inst(opc, vcpu);
+ inst.word = kvm_get_inst(opc, vcpu);
- if (inst == KVM_INVALID_INST) {
+ if (inst.word == KVM_INVALID_INST) {
kvm_err("%s: Cannot get inst @ %p\n", __func__, opc);
return EMULATE_FAIL;
}
- if ((inst & OPCODE) == SPEC3 && (inst & FUNC) == RDHWR) {
+ if (inst.r_format.opcode == spec3_op &&
+ inst.r_format.func == rdhwr_op &&
+ inst.r_format.rs == 0 &&
+ (inst.r_format.re >> 3) == 0) {
int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
- int rd = (inst & RD) >> 11;
- int rt = (inst & RT) >> 16;
+ int rd = inst.r_format.rd;
+ int rt = inst.r_format.rt;
+ int sel = inst.r_format.re & 0x7;
+
/* If usermode, check RDHWR rd is allowed by guest HWREna */
if (usermode && !(kvm_read_c0_guest_hwrena(cop0) & BIT(rd))) {
kvm_debug("RDHWR %#x disallowed by HWREna @ %p\n",
@@ -2343,17 +2373,17 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
goto emulate_ri;
}
switch (rd) {
- case 0: /* CPU number */
- arch->gprs[rt] = 0;
+ case MIPS_HWR_CPUNUM: /* CPU number */
+ arch->gprs[rt] = vcpu->vcpu_id;
break;
- case 1: /* SYNCI length */
+ case MIPS_HWR_SYNCISTEP: /* SYNCI length */
arch->gprs[rt] = min(current_cpu_data.dcache.linesz,
current_cpu_data.icache.linesz);
break;
- case 2: /* Read count register */
- arch->gprs[rt] = kvm_mips_read_count(vcpu);
+ case MIPS_HWR_CC: /* Read count register */
+ arch->gprs[rt] = (s32)kvm_mips_read_count(vcpu);
break;
- case 3: /* Count register resolution */
+ case MIPS_HWR_CCRES: /* Count register resolution */
switch (current_cpu_data.cputype) {
case CPU_20KC:
case CPU_25KF:
@@ -2363,7 +2393,7 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
arch->gprs[rt] = 2;
}
break;
- case 29:
+ case MIPS_HWR_ULR: /* Read UserLocal register */
arch->gprs[rt] = kvm_read_c0_guest_userlocal(cop0);
break;
@@ -2371,8 +2401,12 @@ enum emulation_result kvm_mips_handle_ri(unsigned long cause, uint32_t *opc,
kvm_debug("RDHWR %#x not supported @ %p\n", rd, opc);
goto emulate_ri;
}
+
+ trace_kvm_hwr(vcpu, KVM_TRACE_RDHWR, KVM_TRACE_HWR(rd, sel),
+ vcpu->arch.gprs[rt]);
} else {
- kvm_debug("Emulate RI not supported @ %p: %#x\n", opc, inst);
+ kvm_debug("Emulate RI not supported @ %p: %#x\n",
+ opc, inst.word);
goto emulate_ri;
}
@@ -2405,19 +2439,19 @@ enum emulation_result kvm_mips_complete_mmio_load(struct kvm_vcpu *vcpu,
switch (run->mmio.len) {
case 4:
- *gpr = *(int32_t *) run->mmio.data;
+ *gpr = *(s32 *) run->mmio.data;
break;
case 2:
if (vcpu->mmio_needed == 2)
- *gpr = *(int16_t *) run->mmio.data;
+ *gpr = *(s16 *) run->mmio.data;
else
- *gpr = *(uint16_t *)run->mmio.data;
+ *gpr = *(u16 *)run->mmio.data;
break;
case 1:
if (vcpu->mmio_needed == 2)
- *gpr = *(int8_t *) run->mmio.data;
+ *gpr = *(s8 *) run->mmio.data;
else
*gpr = *(u8 *) run->mmio.data;
break;
@@ -2432,12 +2466,12 @@ done:
return er;
}
-static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
- uint32_t *opc,
+static enum emulation_result kvm_mips_emulate_exc(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_vcpu_arch *arch = &vcpu->arch;
enum emulation_result er = EMULATE_DONE;
@@ -2470,13 +2504,13 @@ static enum emulation_result kvm_mips_emulate_exc(unsigned long cause,
return er;
}
-enum emulation_result kvm_mips_check_privilege(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_check_privilege(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
int usermode = !KVM_GUEST_KERNEL_MODE(vcpu);
@@ -2566,18 +2600,18 @@ enum emulation_result kvm_mips_check_privilege(unsigned long cause,
* (2) TLB entry is present in the Guest TLB but not in the shadow, in this
* case we inject the TLB from the Guest TLB into the shadow host TLB
*/
-enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
- uint32_t *opc,
+enum emulation_result kvm_mips_handle_tlbmiss(u32 cause,
+ u32 *opc,
struct kvm_run *run,
struct kvm_vcpu *vcpu)
{
enum emulation_result er = EMULATE_DONE;
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
unsigned long va = vcpu->arch.host_cp0_badvaddr;
int index;
- kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx, entryhi: %#lx\n",
- vcpu->arch.host_cp0_badvaddr, vcpu->arch.host_cp0_entryhi);
+ kvm_debug("kvm_mips_handle_tlbmiss: badvaddr: %#lx\n",
+ vcpu->arch.host_cp0_badvaddr);
/*
* KVM would not have got the exception if this entry was valid in the
@@ -2620,13 +2654,12 @@ enum emulation_result kvm_mips_handle_tlbmiss(unsigned long cause,
}
} else {
kvm_debug("Injecting hi: %#lx, lo0: %#lx, lo1: %#lx into shadow host TLB\n",
- tlb->tlb_hi, tlb->tlb_lo0, tlb->tlb_lo1);
+ tlb->tlb_hi, tlb->tlb_lo[0], tlb->tlb_lo[1]);
/*
* OK we have a Guest TLB entry, now inject it into the
* shadow host TLB
*/
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb, NULL,
- NULL);
+ kvm_mips_handle_mapped_seg_tlb_fault(vcpu, tlb);
}
}
diff --git a/arch/mips/kvm/entry.c b/arch/mips/kvm/entry.c
new file mode 100644
index 000000000000..6a02b3a3fa65
--- /dev/null
+++ b/arch/mips/kvm/entry.c
@@ -0,0 +1,701 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Generation of main entry point for the guest, exception handling.
+ *
+ * Copyright (C) 2012 MIPS Technologies, Inc.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ *
+ * Copyright (C) 2016 Imagination Technologies Ltd.
+ */
+
+#include <linux/kvm_host.h>
+#include <asm/msa.h>
+#include <asm/setup.h>
+#include <asm/uasm.h>
+
+/* Register names */
+#define ZERO 0
+#define AT 1
+#define V0 2
+#define V1 3
+#define A0 4
+#define A1 5
+
+#if _MIPS_SIM == _MIPS_SIM_ABI32
+#define T0 8
+#define T1 9
+#define T2 10
+#define T3 11
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI32 */
+
+#if _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32
+#define T0 12
+#define T1 13
+#define T2 14
+#define T3 15
+#endif /* _MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32 */
+
+#define S0 16
+#define S1 17
+#define T9 25
+#define K0 26
+#define K1 27
+#define GP 28
+#define SP 29
+#define RA 31
+
+/* Some CP0 registers */
+#define C0_HWRENA 7, 0
+#define C0_BADVADDR 8, 0
+#define C0_ENTRYHI 10, 0
+#define C0_STATUS 12, 0
+#define C0_CAUSE 13, 0
+#define C0_EPC 14, 0
+#define C0_EBASE 15, 1
+#define C0_CONFIG5 16, 5
+#define C0_DDATA_LO 28, 3
+#define C0_ERROREPC 30, 0
+
+#define CALLFRAME_SIZ 32
+
+#ifdef CONFIG_64BIT
+#define ST0_KX_IF_64 ST0_KX
+#else
+#define ST0_KX_IF_64 0
+#endif
+
+static unsigned int scratch_vcpu[2] = { C0_DDATA_LO };
+static unsigned int scratch_tmp[2] = { C0_ERROREPC };
+
+enum label_id {
+ label_fpu_1 = 1,
+ label_msa_1,
+ label_return_to_host,
+ label_kernel_asid,
+ label_exit_common,
+};
+
+UASM_L_LA(_fpu_1)
+UASM_L_LA(_msa_1)
+UASM_L_LA(_return_to_host)
+UASM_L_LA(_kernel_asid)
+UASM_L_LA(_exit_common)
+
+static void *kvm_mips_build_enter_guest(void *addr);
+static void *kvm_mips_build_ret_from_exit(void *addr);
+static void *kvm_mips_build_ret_to_guest(void *addr);
+static void *kvm_mips_build_ret_to_host(void *addr);
+
+/**
+ * kvm_mips_entry_setup() - Perform global setup for entry code.
+ *
+ * Perform global setup for entry code, such as choosing a scratch register.
+ *
+ * Returns: 0 on success.
+ * -errno on failure.
+ */
+int kvm_mips_entry_setup(void)
+{
+ /*
+ * We prefer to use KScratchN registers if they are available over the
+ * defaults above, which may not work on all cores.
+ */
+ unsigned int kscratch_mask = cpu_data[0].kscratch_mask & 0xfc;
+
+ /* Pick a scratch register for storing VCPU */
+ if (kscratch_mask) {
+ scratch_vcpu[0] = 31;
+ scratch_vcpu[1] = ffs(kscratch_mask) - 1;
+ kscratch_mask &= ~BIT(scratch_vcpu[1]);
+ }
+
+ /* Pick a scratch register to use as a temp for saving state */
+ if (kscratch_mask) {
+ scratch_tmp[0] = 31;
+ scratch_tmp[1] = ffs(kscratch_mask) - 1;
+ kscratch_mask &= ~BIT(scratch_tmp[1]);
+ }
+
+ return 0;
+}
+
+static void kvm_mips_build_save_scratch(u32 **p, unsigned int tmp,
+ unsigned int frame)
+{
+ /* Save the VCPU scratch register value in cp0_epc of the stack frame */
+ UASM_i_MFC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+ UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+
+ /* Save the temp scratch register value in cp0_cause of stack frame */
+ if (scratch_tmp[0] == 31) {
+ UASM_i_MFC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+ UASM_i_SW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+ }
+}
+
+static void kvm_mips_build_restore_scratch(u32 **p, unsigned int tmp,
+ unsigned int frame)
+{
+ /*
+ * Restore host scratch register values saved by
+ * kvm_mips_build_save_scratch().
+ */
+ UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_epc), frame);
+ UASM_i_MTC0(p, tmp, scratch_vcpu[0], scratch_vcpu[1]);
+
+ if (scratch_tmp[0] == 31) {
+ UASM_i_LW(p, tmp, offsetof(struct pt_regs, cp0_cause), frame);
+ UASM_i_MTC0(p, tmp, scratch_tmp[0], scratch_tmp[1]);
+ }
+}
+
+/**
+ * build_set_exc_base() - Assemble code to write exception base address.
+ * @p: Code buffer pointer.
+ * @reg: Source register (generated code may set WG bit in @reg).
+ *
+ * Assemble code to modify the exception base address in the EBase register,
+ * using the appropriately sized access and setting the WG bit if necessary.
+ */
+static inline void build_set_exc_base(u32 **p, unsigned int reg)
+{
+ if (cpu_has_ebase_wg) {
+ /* Set WG so that all the bits get written */
+ uasm_i_ori(p, reg, reg, MIPS_EBASE_WG);
+ UASM_i_MTC0(p, reg, C0_EBASE);
+ } else {
+ uasm_i_mtc0(p, reg, C0_EBASE);
+ }
+}
+
+/**
+ * kvm_mips_build_vcpu_run() - Assemble function to start running a guest VCPU.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the start of the vcpu_run function to run a guest VCPU. The function
+ * conforms to the following prototype:
+ *
+ * int vcpu_run(struct kvm_run *run, struct kvm_vcpu *vcpu);
+ *
+ * The exit from the guest and return to the caller is handled by the code
+ * generated by kvm_mips_build_ret_to_host().
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_vcpu_run(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+
+ /*
+ * A0: run
+ * A1: vcpu
+ */
+
+ /* k0/k1 not being used in host kernel context */
+ UASM_i_ADDIU(&p, K1, SP, -(int)sizeof(struct pt_regs));
+ for (i = 16; i < 32; ++i) {
+ if (i == 24)
+ i = 28;
+ UASM_i_SW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+ }
+
+ /* Save host status */
+ uasm_i_mfc0(&p, V0, C0_STATUS);
+ UASM_i_SW(&p, V0, offsetof(struct pt_regs, cp0_status), K1);
+
+ /* Save scratch registers, will be used to store pointer to vcpu etc */
+ kvm_mips_build_save_scratch(&p, V1, K1);
+
+ /* VCPU scratch register has pointer to vcpu */
+ UASM_i_MTC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+
+ /* Offset into vcpu->arch */
+ UASM_i_ADDIU(&p, K1, A1, offsetof(struct kvm_vcpu, arch));
+
+ /*
+ * Save the host stack to VCPU, used for exception processing
+ * when we exit from the Guest
+ */
+ UASM_i_SW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+ /* Save the kernel gp as well */
+ UASM_i_SW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+ /*
+ * Setup status register for running the guest in UM, interrupts
+ * are disabled
+ */
+ UASM_i_LA(&p, K0, ST0_EXL | KSU_USER | ST0_BEV | ST0_KX_IF_64);
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ /* load up the new EBASE */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+ build_set_exc_base(&p, K0);
+
+ /*
+ * Now that the new EBASE has been loaded, unset BEV, set
+ * interrupt mask as it was but make sure that timer interrupts
+ * are enabled
+ */
+ uasm_i_addiu(&p, K0, ZERO, ST0_EXL | KSU_USER | ST0_IE | ST0_KX_IF_64);
+ uasm_i_andi(&p, V0, V0, ST0_IM);
+ uasm_i_or(&p, K0, K0, V0);
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ p = kvm_mips_build_enter_guest(p);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_enter_guest() - Assemble code to resume guest execution.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to resume guest execution. This code is common between the
+ * initial entry into the guest from the host, and returning from the exit
+ * handler back to the guest.
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_enter_guest(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Set Guest EPC */
+ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, pc), K1);
+ UASM_i_MTC0(&p, T0, C0_EPC);
+
+ /* Set the ASID for the Guest Kernel */
+ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, cop0), K1);
+ UASM_i_LW(&p, T0, offsetof(struct mips_coproc, reg[MIPS_CP0_STATUS][0]),
+ T0);
+ uasm_i_andi(&p, T0, T0, KSU_USER | ST0_ERL | ST0_EXL);
+ uasm_i_xori(&p, T0, T0, KSU_USER);
+ uasm_il_bnez(&p, &r, T0, label_kernel_asid);
+ UASM_i_ADDIU(&p, T1, K1,
+ offsetof(struct kvm_vcpu_arch, guest_kernel_asid));
+ /* else user */
+ UASM_i_ADDIU(&p, T1, K1,
+ offsetof(struct kvm_vcpu_arch, guest_user_asid));
+ uasm_l_kernel_asid(&l, p);
+
+ /* t1: contains the base of the ASID array, need to get the cpu id */
+ /* smp_processor_id */
+ uasm_i_lw(&p, T2, offsetof(struct thread_info, cpu), GP);
+ /* x4 */
+ uasm_i_sll(&p, T2, T2, 2);
+ UASM_i_ADDU(&p, T3, T1, T2);
+ uasm_i_lw(&p, K0, 0, T3);
+#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
+ /* x sizeof(struct cpuinfo_mips)/4 */
+ uasm_i_addiu(&p, T3, ZERO, sizeof(struct cpuinfo_mips)/4);
+ uasm_i_mul(&p, T2, T2, T3);
+
+ UASM_i_LA_mostly(&p, AT, (long)&cpu_data[0].asid_mask);
+ UASM_i_ADDU(&p, AT, AT, T2);
+ UASM_i_LW(&p, T2, uasm_rel_lo((long)&cpu_data[0].asid_mask), AT);
+ uasm_i_and(&p, K0, K0, T2);
+#else
+ uasm_i_andi(&p, K0, K0, MIPS_ENTRYHI_ASID);
+#endif
+ uasm_i_mtc0(&p, K0, C0_ENTRYHI);
+ uasm_i_ehb(&p);
+
+ /* Disable RDHWR access */
+ uasm_i_mtc0(&p, ZERO, C0_HWRENA);
+
+ /* load the guest context from VCPU and return */
+ for (i = 1; i < 32; ++i) {
+ /* Guest k0/k1 loaded later */
+ if (i == K0 || i == K1)
+ continue;
+ UASM_i_LW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+ }
+
+#ifndef CONFIG_CPU_MIPSR6
+ /* Restore hi/lo */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, hi), K1);
+ uasm_i_mthi(&p, K0);
+
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, lo), K1);
+ uasm_i_mtlo(&p, K0);
+#endif
+
+ /* Restore the guest's k0/k1 registers */
+ UASM_i_LW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+ UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+ /* Jump to guest */
+ uasm_i_eret(&p);
+
+ uasm_resolve_relocs(relocs, labels);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_exception() - Assemble first level guest exception handler.
+ * @addr: Address to start writing code.
+ * @handler: Address of common handler (within range of @addr).
+ *
+ * Assemble exception vector code for guest execution. The generated vector will
+ * branch to the common exception handler generated by kvm_mips_build_exit().
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_exception(void *addr, void *handler)
+{
+ u32 *p = addr;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Save guest k1 into scratch register */
+ UASM_i_MTC0(&p, K1, scratch_tmp[0], scratch_tmp[1]);
+
+ /* Get the VCPU pointer from the VCPU scratch register */
+ UASM_i_MFC0(&p, K1, scratch_vcpu[0], scratch_vcpu[1]);
+ UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+ /* Save guest k0 into VCPU structure */
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, gprs[K0]), K1);
+
+ /* Branch to the common handler */
+ uasm_il_b(&p, &r, label_exit_common);
+ uasm_i_nop(&p);
+
+ uasm_l_exit_common(&l, handler);
+ uasm_resolve_relocs(relocs, labels);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_exit() - Assemble common guest exit handler.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the generic guest exit handling code. This is called by the
+ * exception vectors (generated by kvm_mips_build_exception()), and calls
+ * kvm_mips_handle_exit(), then either resumes the guest or returns to the host
+ * depending on the return value.
+ *
+ * Returns: Next address after end of written function.
+ */
+void *kvm_mips_build_exit(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+ struct uasm_label labels[3];
+ struct uasm_reloc relocs[3];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /*
+ * Generic Guest exception handler. We end up here when the guest
+ * does something that causes a trap to kernel mode.
+ *
+ * Both k0/k1 registers will have already been saved (k0 into the vcpu
+ * structure, and k1 into the scratch_tmp register).
+ *
+ * The k1 register will already contain the kvm_vcpu_arch pointer.
+ */
+
+ /* Start saving Guest context to VCPU */
+ for (i = 0; i < 32; ++i) {
+ /* Guest k0/k1 saved later */
+ if (i == K0 || i == K1)
+ continue;
+ UASM_i_SW(&p, i, offsetof(struct kvm_vcpu_arch, gprs[i]), K1);
+ }
+
+#ifndef CONFIG_CPU_MIPSR6
+ /* We need to save hi/lo and restore them on the way out */
+ uasm_i_mfhi(&p, T0);
+ UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, hi), K1);
+
+ uasm_i_mflo(&p, T0);
+ UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, lo), K1);
+#endif
+
+ /* Finally save guest k1 to VCPU */
+ uasm_i_ehb(&p);
+ UASM_i_MFC0(&p, T0, scratch_tmp[0], scratch_tmp[1]);
+ UASM_i_SW(&p, T0, offsetof(struct kvm_vcpu_arch, gprs[K1]), K1);
+
+ /* Now that context has been saved, we can use other registers */
+
+ /* Restore vcpu */
+ UASM_i_MFC0(&p, A1, scratch_vcpu[0], scratch_vcpu[1]);
+ uasm_i_move(&p, S1, A1);
+
+ /* Restore run (vcpu->run) */
+ UASM_i_LW(&p, A0, offsetof(struct kvm_vcpu, run), A1);
+ /* Save pointer to run in s0, will be saved by the compiler */
+ uasm_i_move(&p, S0, A0);
+
+ /*
+ * Save Host level EPC, BadVaddr and Cause to VCPU, useful to process
+ * the exception
+ */
+ UASM_i_MFC0(&p, K0, C0_EPC);
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, pc), K1);
+
+ UASM_i_MFC0(&p, K0, C0_BADVADDR);
+ UASM_i_SW(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_badvaddr),
+ K1);
+
+ uasm_i_mfc0(&p, K0, C0_CAUSE);
+ uasm_i_sw(&p, K0, offsetof(struct kvm_vcpu_arch, host_cp0_cause), K1);
+
+ /* Now restore the host state just enough to run the handlers */
+
+ /* Switch EBASE to the one used by Linux */
+ /* load up the host EBASE */
+ uasm_i_mfc0(&p, V0, C0_STATUS);
+
+ uasm_i_lui(&p, AT, ST0_BEV >> 16);
+ uasm_i_or(&p, K0, V0, AT);
+
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ UASM_i_LA_mostly(&p, K0, (long)&ebase);
+ UASM_i_LW(&p, K0, uasm_rel_lo((long)&ebase), K0);
+ build_set_exc_base(&p, K0);
+
+ if (raw_cpu_has_fpu) {
+ /*
+ * If FPU is enabled, save FCR31 and clear it so that later
+ * ctc1's don't trigger FPE for pending exceptions.
+ */
+ uasm_i_lui(&p, AT, ST0_CU1 >> 16);
+ uasm_i_and(&p, V1, V0, AT);
+ uasm_il_beqz(&p, &r, V1, label_fpu_1);
+ uasm_i_nop(&p);
+ uasm_i_cfc1(&p, T0, 31);
+ uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.fcr31),
+ K1);
+ uasm_i_ctc1(&p, ZERO, 31);
+ uasm_l_fpu_1(&l, p);
+ }
+
+ if (cpu_has_msa) {
+ /*
+ * If MSA is enabled, save MSACSR and clear it so that later
+ * instructions don't trigger MSAFPE for pending exceptions.
+ */
+ uasm_i_mfc0(&p, T0, C0_CONFIG5);
+ uasm_i_ext(&p, T0, T0, 27, 1); /* MIPS_CONF5_MSAEN */
+ uasm_il_beqz(&p, &r, T0, label_msa_1);
+ uasm_i_nop(&p);
+ uasm_i_cfcmsa(&p, T0, MSA_CSR);
+ uasm_i_sw(&p, T0, offsetof(struct kvm_vcpu_arch, fpu.msacsr),
+ K1);
+ uasm_i_ctcmsa(&p, MSA_CSR, ZERO);
+ uasm_l_msa_1(&l, p);
+ }
+
+ /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
+ uasm_i_addiu(&p, AT, ZERO, ~(ST0_EXL | KSU_USER | ST0_IE));
+ uasm_i_and(&p, V0, V0, AT);
+ uasm_i_lui(&p, AT, ST0_CU0 >> 16);
+ uasm_i_or(&p, V0, V0, AT);
+ uasm_i_mtc0(&p, V0, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ /* Load up host GP */
+ UASM_i_LW(&p, GP, offsetof(struct kvm_vcpu_arch, host_gp), K1);
+
+ /* Need a stack before we can jump to "C" */
+ UASM_i_LW(&p, SP, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+
+ /* Saved host state */
+ UASM_i_ADDIU(&p, SP, SP, -(int)sizeof(struct pt_regs));
+
+ /*
+ * XXXKYMA do we need to load the host ASID, maybe not because the
+ * kernel entries are marked GLOBAL, need to verify
+ */
+
+ /* Restore host scratch registers, as we'll have clobbered them */
+ kvm_mips_build_restore_scratch(&p, K0, SP);
+
+ /* Restore RDHWR access */
+ UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+ uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+ uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+ /* Jump to handler */
+ /*
+ * XXXKYMA: not sure if this is safe, how large is the stack??
+ * Now jump to the kvm_mips_handle_exit() to see if we can deal
+ * with this in the kernel
+ */
+ UASM_i_LA(&p, T9, (unsigned long)kvm_mips_handle_exit);
+ uasm_i_jalr(&p, RA, T9);
+ UASM_i_ADDIU(&p, SP, SP, -CALLFRAME_SIZ);
+
+ uasm_resolve_relocs(relocs, labels);
+
+ p = kvm_mips_build_ret_from_exit(p);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_ret_from_exit() - Assemble guest exit return handler.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to handle the return from kvm_mips_handle_exit(), either
+ * resuming the guest or returning to the host depending on the return value.
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_from_exit(void *addr)
+{
+ u32 *p = addr;
+ struct uasm_label labels[2];
+ struct uasm_reloc relocs[2];
+ struct uasm_label *l = labels;
+ struct uasm_reloc *r = relocs;
+
+ memset(labels, 0, sizeof(labels));
+ memset(relocs, 0, sizeof(relocs));
+
+ /* Return from handler Make sure interrupts are disabled */
+ uasm_i_di(&p, ZERO);
+ uasm_i_ehb(&p);
+
+ /*
+ * XXXKYMA: k0/k1 could have been blown away if we processed
+ * an exception while we were handling the exception from the
+ * guest, reload k1
+ */
+
+ uasm_i_move(&p, K1, S1);
+ UASM_i_ADDIU(&p, K1, K1, offsetof(struct kvm_vcpu, arch));
+
+ /*
+ * Check return value, should tell us if we are returning to the
+ * host (handle I/O etc)or resuming the guest
+ */
+ uasm_i_andi(&p, T0, V0, RESUME_HOST);
+ uasm_il_bnez(&p, &r, T0, label_return_to_host);
+ uasm_i_nop(&p);
+
+ p = kvm_mips_build_ret_to_guest(p);
+
+ uasm_l_return_to_host(&l, p);
+ p = kvm_mips_build_ret_to_host(p);
+
+ uasm_resolve_relocs(relocs, labels);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_guest() - Assemble code to return to the guest.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the guest.
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_guest(void *addr)
+{
+ u32 *p = addr;
+
+ /* Put the saved pointer to vcpu (s1) back into the scratch register */
+ UASM_i_MTC0(&p, S1, scratch_vcpu[0], scratch_vcpu[1]);
+
+ /* Load up the Guest EBASE to minimize the window where BEV is set */
+ UASM_i_LW(&p, T0, offsetof(struct kvm_vcpu_arch, guest_ebase), K1);
+
+ /* Switch EBASE back to the one used by KVM */
+ uasm_i_mfc0(&p, V1, C0_STATUS);
+ uasm_i_lui(&p, AT, ST0_BEV >> 16);
+ uasm_i_or(&p, K0, V1, AT);
+ uasm_i_mtc0(&p, K0, C0_STATUS);
+ uasm_i_ehb(&p);
+ build_set_exc_base(&p, T0);
+
+ /* Setup status register for running guest in UM */
+ uasm_i_ori(&p, V1, V1, ST0_EXL | KSU_USER | ST0_IE);
+ UASM_i_LA(&p, AT, ~(ST0_CU0 | ST0_MX));
+ uasm_i_and(&p, V1, V1, AT);
+ uasm_i_mtc0(&p, V1, C0_STATUS);
+ uasm_i_ehb(&p);
+
+ p = kvm_mips_build_enter_guest(p);
+
+ return p;
+}
+
+/**
+ * kvm_mips_build_ret_to_host() - Assemble code to return to the host.
+ * @addr: Address to start writing code.
+ *
+ * Assemble the code to handle return from the guest exit handler
+ * (kvm_mips_handle_exit()) back to the host, i.e. to the caller of the vcpu_run
+ * function generated by kvm_mips_build_vcpu_run().
+ *
+ * Returns: Next address after end of written function.
+ */
+static void *kvm_mips_build_ret_to_host(void *addr)
+{
+ u32 *p = addr;
+ unsigned int i;
+
+ /* EBASE is already pointing to Linux */
+ UASM_i_LW(&p, K1, offsetof(struct kvm_vcpu_arch, host_stack), K1);
+ UASM_i_ADDIU(&p, K1, K1, -(int)sizeof(struct pt_regs));
+
+ /*
+ * r2/v0 is the return code, shift it down by 2 (arithmetic)
+ * to recover the err code
+ */
+ uasm_i_sra(&p, K0, V0, 2);
+ uasm_i_move(&p, V0, K0);
+
+ /* Load context saved on the host stack */
+ for (i = 16; i < 31; ++i) {
+ if (i == 24)
+ i = 28;
+ UASM_i_LW(&p, i, offsetof(struct pt_regs, regs[i]), K1);
+ }
+
+ /* Restore RDHWR access */
+ UASM_i_LA_mostly(&p, K0, (long)&hwrena);
+ uasm_i_lw(&p, K0, uasm_rel_lo((long)&hwrena), K0);
+ uasm_i_mtc0(&p, K0, C0_HWRENA);
+
+ /* Restore RA, which is the address we will return to */
+ UASM_i_LW(&p, RA, offsetof(struct pt_regs, regs[RA]), K1);
+ uasm_i_jr(&p, RA);
+ uasm_i_nop(&p);
+
+ return p;
+}
+
diff --git a/arch/mips/kvm/fpu.S b/arch/mips/kvm/fpu.S
index 531fbf5131c0..16f17c6390dd 100644
--- a/arch/mips/kvm/fpu.S
+++ b/arch/mips/kvm/fpu.S
@@ -14,13 +14,16 @@
#include <asm/mipsregs.h>
#include <asm/regdef.h>
+/* preprocessor replaces the fp in ".set fp=64" with $30 otherwise */
+#undef fp
+
.set noreorder
.set noat
LEAF(__kvm_save_fpu)
.set push
- .set mips64r2
SET_HARDFLOAT
+ .set fp=64
mfc0 t0, CP0_STATUS
sll t0, t0, 5 # is Status.FR set?
bgez t0, 1f # no: skip odd doubles
@@ -63,8 +66,8 @@ LEAF(__kvm_save_fpu)
LEAF(__kvm_restore_fpu)
.set push
- .set mips64r2
SET_HARDFLOAT
+ .set fp=64
mfc0 t0, CP0_STATUS
sll t0, t0, 5 # is Status.FR set?
bgez t0, 1f # no: skip odd doubles
diff --git a/arch/mips/kvm/interrupt.c b/arch/mips/kvm/interrupt.c
index 95f790663b0c..ad28dac6b7e9 100644
--- a/arch/mips/kvm/interrupt.c
+++ b/arch/mips/kvm/interrupt.c
@@ -22,12 +22,12 @@
#include "interrupt.h"
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
{
set_bit(priority, &vcpu->arch.pending_exceptions);
}
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority)
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority)
{
clear_bit(priority, &vcpu->arch.pending_exceptions);
}
@@ -114,10 +114,10 @@ void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
/* Deliver the interrupt of the corresponding priority, if possible. */
int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause)
+ u32 cause)
{
int allowed = 0;
- uint32_t exccode;
+ u32 exccode;
struct kvm_vcpu_arch *arch = &vcpu->arch;
struct mips_coproc *cop0 = vcpu->arch.cop0;
@@ -196,12 +196,12 @@ int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
}
int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause)
+ u32 cause)
{
return 1;
}
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause)
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause)
{
unsigned long *pending = &vcpu->arch.pending_exceptions;
unsigned long *pending_clr = &vcpu->arch.pending_exceptions_clr;
diff --git a/arch/mips/kvm/interrupt.h b/arch/mips/kvm/interrupt.h
index 2143884709e4..fb118a2c8379 100644
--- a/arch/mips/kvm/interrupt.h
+++ b/arch/mips/kvm/interrupt.h
@@ -28,17 +28,13 @@
#define MIPS_EXC_MAX 12
/* XXXSL More to follow */
-extern char __kvm_mips_vcpu_run_end[];
-extern char mips32_exception[], mips32_exceptionEnd[];
-extern char mips32_GuestException[], mips32_GuestExceptionEnd[];
-
#define C_TI (_ULCAST_(1) << 30)
#define KVM_MIPS_IRQ_DELIVER_ALL_AT_ONCE (0)
#define KVM_MIPS_IRQ_CLEAR_ALL_AT_ONCE (0)
-void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
-void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, uint32_t priority);
+void kvm_mips_queue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
+void kvm_mips_dequeue_irq(struct kvm_vcpu *vcpu, unsigned int priority);
int kvm_mips_pending_timer(struct kvm_vcpu *vcpu);
void kvm_mips_queue_timer_int_cb(struct kvm_vcpu *vcpu);
@@ -48,7 +44,7 @@ void kvm_mips_queue_io_int_cb(struct kvm_vcpu *vcpu,
void kvm_mips_dequeue_io_int_cb(struct kvm_vcpu *vcpu,
struct kvm_mips_interrupt *irq);
int kvm_mips_irq_deliver_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
+ u32 cause);
int kvm_mips_irq_clear_cb(struct kvm_vcpu *vcpu, unsigned int priority,
- uint32_t cause);
-void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, uint32_t cause);
+ u32 cause);
+void kvm_mips_deliver_interrupts(struct kvm_vcpu *vcpu, u32 cause);
diff --git a/arch/mips/kvm/locore.S b/arch/mips/kvm/locore.S
deleted file mode 100644
index 828fcfc1cd7f..000000000000
--- a/arch/mips/kvm/locore.S
+++ /dev/null
@@ -1,605 +0,0 @@
-/*
- * This file is subject to the terms and conditions of the GNU General Public
- * License. See the file "COPYING" in the main directory of this archive
- * for more details.
- *
- * Main entry point for the guest, exception handling.
- *
- * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
- * Authors: Sanjay Lal <sanjayl@kymasys.com>
- */
-
-#include <asm/asm.h>
-#include <asm/asmmacro.h>
-#include <asm/regdef.h>
-#include <asm/mipsregs.h>
-#include <asm/stackframe.h>
-#include <asm/asm-offsets.h>
-
-#define _C_LABEL(x) x
-#define MIPSX(name) mips32_ ## name
-#define CALLFRAME_SIZ 32
-
-/*
- * VECTOR
- * exception vector entrypoint
- */
-#define VECTOR(x, regmask) \
- .ent _C_LABEL(x),0; \
- EXPORT(x);
-
-#define VECTOR_END(x) \
- EXPORT(x);
-
-/* Overload, Danger Will Robinson!! */
-#define PT_HOST_USERLOCAL PT_EPC
-
-#define CP0_DDATA_LO $28,3
-
-/* Resume Flags */
-#define RESUME_FLAG_HOST (1<<1) /* Resume host? */
-
-#define RESUME_GUEST 0
-#define RESUME_HOST RESUME_FLAG_HOST
-
-/*
- * __kvm_mips_vcpu_run: entry point to the guest
- * a0: run
- * a1: vcpu
- */
- .set noreorder
-
-FEXPORT(__kvm_mips_vcpu_run)
- /* k0/k1 not being used in host kernel context */
- INT_ADDIU k1, sp, -PT_SIZE
- LONG_S $16, PT_R16(k1)
- LONG_S $17, PT_R17(k1)
- LONG_S $18, PT_R18(k1)
- LONG_S $19, PT_R19(k1)
- LONG_S $20, PT_R20(k1)
- LONG_S $21, PT_R21(k1)
- LONG_S $22, PT_R22(k1)
- LONG_S $23, PT_R23(k1)
-
- LONG_S $28, PT_R28(k1)
- LONG_S $29, PT_R29(k1)
- LONG_S $30, PT_R30(k1)
- LONG_S $31, PT_R31(k1)
-
- /* Save hi/lo */
- mflo v0
- LONG_S v0, PT_LO(k1)
- mfhi v1
- LONG_S v1, PT_HI(k1)
-
- /* Save host status */
- mfc0 v0, CP0_STATUS
- LONG_S v0, PT_STATUS(k1)
-
- /* Save DDATA_LO, will be used to store pointer to vcpu */
- mfc0 v1, CP0_DDATA_LO
- LONG_S v1, PT_HOST_USERLOCAL(k1)
-
- /* DDATA_LO has pointer to vcpu */
- mtc0 a1, CP0_DDATA_LO
-
- /* Offset into vcpu->arch */
- INT_ADDIU k1, a1, VCPU_HOST_ARCH
-
- /*
- * Save the host stack to VCPU, used for exception processing
- * when we exit from the Guest
- */
- LONG_S sp, VCPU_HOST_STACK(k1)
-
- /* Save the kernel gp as well */
- LONG_S gp, VCPU_HOST_GP(k1)
-
- /*
- * Setup status register for running the guest in UM, interrupts
- * are disabled
- */
- li k0, (ST0_EXL | KSU_USER | ST0_BEV)
- mtc0 k0, CP0_STATUS
- ehb
-
- /* load up the new EBASE */
- LONG_L k0, VCPU_GUEST_EBASE(k1)
- mtc0 k0, CP0_EBASE
-
- /*
- * Now that the new EBASE has been loaded, unset BEV, set
- * interrupt mask as it was but make sure that timer interrupts
- * are enabled
- */
- li k0, (ST0_EXL | KSU_USER | ST0_IE)
- andi v0, v0, ST0_IM
- or k0, k0, v0
- mtc0 k0, CP0_STATUS
- ehb
-
- /* Set Guest EPC */
- LONG_L t0, VCPU_PC(k1)
- mtc0 t0, CP0_EPC
-
-FEXPORT(__kvm_mips_load_asid)
- /* Set the ASID for the Guest Kernel */
- PTR_L t0, VCPU_COP0(k1)
- LONG_L t0, COP0_STATUS(t0)
- andi t0, KSU_USER | ST0_ERL | ST0_EXL
- xori t0, KSU_USER
- bnez t0, 1f /* If kernel */
- INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
- INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
-1:
- /* t1: contains the base of the ASID array, need to get the cpu id */
- LONG_L t2, TI_CPU($28) /* smp_processor_id */
- INT_SLL t2, t2, 2 /* x4 */
- REG_ADDU t3, t1, t2
- LONG_L k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- li t3, CPUINFO_SIZE/4
- mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */
- LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
- and k0, k0, t2
-#else
- andi k0, k0, MIPS_ENTRYHI_ASID
-#endif
- mtc0 k0, CP0_ENTRYHI
- ehb
-
- /* Disable RDHWR access */
- mtc0 zero, CP0_HWRENA
-
- .set noat
- /* Now load up the Guest Context from VCPU */
- LONG_L $1, VCPU_R1(k1)
- LONG_L $2, VCPU_R2(k1)
- LONG_L $3, VCPU_R3(k1)
-
- LONG_L $4, VCPU_R4(k1)
- LONG_L $5, VCPU_R5(k1)
- LONG_L $6, VCPU_R6(k1)
- LONG_L $7, VCPU_R7(k1)
-
- LONG_L $8, VCPU_R8(k1)
- LONG_L $9, VCPU_R9(k1)
- LONG_L $10, VCPU_R10(k1)
- LONG_L $11, VCPU_R11(k1)
- LONG_L $12, VCPU_R12(k1)
- LONG_L $13, VCPU_R13(k1)
- LONG_L $14, VCPU_R14(k1)
- LONG_L $15, VCPU_R15(k1)
- LONG_L $16, VCPU_R16(k1)
- LONG_L $17, VCPU_R17(k1)
- LONG_L $18, VCPU_R18(k1)
- LONG_L $19, VCPU_R19(k1)
- LONG_L $20, VCPU_R20(k1)
- LONG_L $21, VCPU_R21(k1)
- LONG_L $22, VCPU_R22(k1)
- LONG_L $23, VCPU_R23(k1)
- LONG_L $24, VCPU_R24(k1)
- LONG_L $25, VCPU_R25(k1)
-
- /* k0/k1 loaded up later */
-
- LONG_L $28, VCPU_R28(k1)
- LONG_L $29, VCPU_R29(k1)
- LONG_L $30, VCPU_R30(k1)
- LONG_L $31, VCPU_R31(k1)
-
- /* Restore hi/lo */
- LONG_L k0, VCPU_LO(k1)
- mtlo k0
-
- LONG_L k0, VCPU_HI(k1)
- mthi k0
-
-FEXPORT(__kvm_mips_load_k0k1)
- /* Restore the guest's k0/k1 registers */
- LONG_L k0, VCPU_R26(k1)
- LONG_L k1, VCPU_R27(k1)
-
- /* Jump to guest */
- eret
-EXPORT(__kvm_mips_vcpu_run_end)
-
-VECTOR(MIPSX(exception), unknown)
-/* Find out what mode we came from and jump to the proper handler. */
- mtc0 k0, CP0_ERROREPC #01: Save guest k0
- ehb #02:
-
- mfc0 k0, CP0_EBASE #02: Get EBASE
- INT_SRL k0, k0, 10 #03: Get rid of CPUNum
- INT_SLL k0, k0, 10 #04
- LONG_S k1, 0x3000(k0) #05: Save k1 @ offset 0x3000
- INT_ADDIU k0, k0, 0x2000 #06: Exception handler is
- # installed @ offset 0x2000
- j k0 #07: jump to the function
- nop #08: branch delay slot
-VECTOR_END(MIPSX(exceptionEnd))
-.end MIPSX(exception)
-
-/*
- * Generic Guest exception handler. We end up here when the guest
- * does something that causes a trap to kernel mode.
- */
-NESTED (MIPSX(GuestException), CALLFRAME_SIZ, ra)
- /* Get the VCPU pointer from DDTATA_LO */
- mfc0 k1, CP0_DDATA_LO
- INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
- /* Start saving Guest context to VCPU */
- LONG_S $0, VCPU_R0(k1)
- LONG_S $1, VCPU_R1(k1)
- LONG_S $2, VCPU_R2(k1)
- LONG_S $3, VCPU_R3(k1)
- LONG_S $4, VCPU_R4(k1)
- LONG_S $5, VCPU_R5(k1)
- LONG_S $6, VCPU_R6(k1)
- LONG_S $7, VCPU_R7(k1)
- LONG_S $8, VCPU_R8(k1)
- LONG_S $9, VCPU_R9(k1)
- LONG_S $10, VCPU_R10(k1)
- LONG_S $11, VCPU_R11(k1)
- LONG_S $12, VCPU_R12(k1)
- LONG_S $13, VCPU_R13(k1)
- LONG_S $14, VCPU_R14(k1)
- LONG_S $15, VCPU_R15(k1)
- LONG_S $16, VCPU_R16(k1)
- LONG_S $17, VCPU_R17(k1)
- LONG_S $18, VCPU_R18(k1)
- LONG_S $19, VCPU_R19(k1)
- LONG_S $20, VCPU_R20(k1)
- LONG_S $21, VCPU_R21(k1)
- LONG_S $22, VCPU_R22(k1)
- LONG_S $23, VCPU_R23(k1)
- LONG_S $24, VCPU_R24(k1)
- LONG_S $25, VCPU_R25(k1)
-
- /* Guest k0/k1 saved later */
-
- LONG_S $28, VCPU_R28(k1)
- LONG_S $29, VCPU_R29(k1)
- LONG_S $30, VCPU_R30(k1)
- LONG_S $31, VCPU_R31(k1)
-
- .set at
-
- /* We need to save hi/lo and restore them on the way out */
- mfhi t0
- LONG_S t0, VCPU_HI(k1)
-
- mflo t0
- LONG_S t0, VCPU_LO(k1)
-
- /* Finally save guest k0/k1 to VCPU */
- mfc0 t0, CP0_ERROREPC
- LONG_S t0, VCPU_R26(k1)
-
- /* Get GUEST k1 and save it in VCPU */
- PTR_LI t1, ~0x2ff
- mfc0 t0, CP0_EBASE
- and t0, t0, t1
- LONG_L t0, 0x3000(t0)
- LONG_S t0, VCPU_R27(k1)
-
- /* Now that context has been saved, we can use other registers */
-
- /* Restore vcpu */
- mfc0 a1, CP0_DDATA_LO
- move s1, a1
-
- /* Restore run (vcpu->run) */
- LONG_L a0, VCPU_RUN(a1)
- /* Save pointer to run in s0, will be saved by the compiler */
- move s0, a0
-
- /*
- * Save Host level EPC, BadVaddr and Cause to VCPU, useful to
- * process the exception
- */
- mfc0 k0,CP0_EPC
- LONG_S k0, VCPU_PC(k1)
-
- mfc0 k0, CP0_BADVADDR
- LONG_S k0, VCPU_HOST_CP0_BADVADDR(k1)
-
- mfc0 k0, CP0_CAUSE
- LONG_S k0, VCPU_HOST_CP0_CAUSE(k1)
-
- mfc0 k0, CP0_ENTRYHI
- LONG_S k0, VCPU_HOST_ENTRYHI(k1)
-
- /* Now restore the host state just enough to run the handlers */
-
- /* Switch EBASE to the one used by Linux */
- /* load up the host EBASE */
- mfc0 v0, CP0_STATUS
-
- or k0, v0, ST0_BEV
-
- mtc0 k0, CP0_STATUS
- ehb
-
- LONG_L k0, VCPU_HOST_EBASE(k1)
- mtc0 k0,CP0_EBASE
-
- /*
- * If FPU is enabled, save FCR31 and clear it so that later ctc1's don't
- * trigger FPE for pending exceptions.
- */
- and v1, v0, ST0_CU1
- beqz v1, 1f
- nop
- .set push
- SET_HARDFLOAT
- cfc1 t0, fcr31
- sw t0, VCPU_FCR31(k1)
- ctc1 zero,fcr31
- .set pop
-1:
-
-#ifdef CONFIG_CPU_HAS_MSA
- /*
- * If MSA is enabled, save MSACSR and clear it so that later
- * instructions don't trigger MSAFPE for pending exceptions.
- */
- mfc0 t0, CP0_CONFIG3
- ext t0, t0, 28, 1 /* MIPS_CONF3_MSAP */
- beqz t0, 1f
- nop
- mfc0 t0, CP0_CONFIG5
- ext t0, t0, 27, 1 /* MIPS_CONF5_MSAEN */
- beqz t0, 1f
- nop
- _cfcmsa t0, MSA_CSR
- sw t0, VCPU_MSA_CSR(k1)
- _ctcmsa MSA_CSR, zero
-1:
-#endif
-
- /* Now that the new EBASE has been loaded, unset BEV and KSU_USER */
- and v0, v0, ~(ST0_EXL | KSU_USER | ST0_IE)
- or v0, v0, ST0_CU0
- mtc0 v0, CP0_STATUS
- ehb
-
- /* Load up host GP */
- LONG_L gp, VCPU_HOST_GP(k1)
-
- /* Need a stack before we can jump to "C" */
- LONG_L sp, VCPU_HOST_STACK(k1)
-
- /* Saved host state */
- INT_ADDIU sp, sp, -PT_SIZE
-
- /*
- * XXXKYMA do we need to load the host ASID, maybe not because the
- * kernel entries are marked GLOBAL, need to verify
- */
-
- /* Restore host DDATA_LO */
- LONG_L k0, PT_HOST_USERLOCAL(sp)
- mtc0 k0, CP0_DDATA_LO
-
- /* Restore RDHWR access */
- PTR_LI k0, 0x2000000F
- mtc0 k0, CP0_HWRENA
-
- /* Jump to handler */
-FEXPORT(__kvm_mips_jump_to_handler)
- /*
- * XXXKYMA: not sure if this is safe, how large is the stack??
- * Now jump to the kvm_mips_handle_exit() to see if we can deal
- * with this in the kernel
- */
- PTR_LA t9, kvm_mips_handle_exit
- jalr.hb t9
- INT_ADDIU sp, sp, -CALLFRAME_SIZ /* BD Slot */
-
- /* Return from handler Make sure interrupts are disabled */
- di
- ehb
-
- /*
- * XXXKYMA: k0/k1 could have been blown away if we processed
- * an exception while we were handling the exception from the
- * guest, reload k1
- */
-
- move k1, s1
- INT_ADDIU k1, k1, VCPU_HOST_ARCH
-
- /*
- * Check return value, should tell us if we are returning to the
- * host (handle I/O etc)or resuming the guest
- */
- andi t0, v0, RESUME_HOST
- bnez t0, __kvm_mips_return_to_host
- nop
-
-__kvm_mips_return_to_guest:
- /* Put the saved pointer to vcpu (s1) back into the DDATA_LO Register */
- mtc0 s1, CP0_DDATA_LO
-
- /* Load up the Guest EBASE to minimize the window where BEV is set */
- LONG_L t0, VCPU_GUEST_EBASE(k1)
-
- /* Switch EBASE back to the one used by KVM */
- mfc0 v1, CP0_STATUS
- or k0, v1, ST0_BEV
- mtc0 k0, CP0_STATUS
- ehb
- mtc0 t0, CP0_EBASE
-
- /* Setup status register for running guest in UM */
- or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
- and v1, v1, ~(ST0_CU0 | ST0_MX)
- mtc0 v1, CP0_STATUS
- ehb
-
- /* Set Guest EPC */
- LONG_L t0, VCPU_PC(k1)
- mtc0 t0, CP0_EPC
-
- /* Set the ASID for the Guest Kernel */
- PTR_L t0, VCPU_COP0(k1)
- LONG_L t0, COP0_STATUS(t0)
- andi t0, KSU_USER | ST0_ERL | ST0_EXL
- xori t0, KSU_USER
- bnez t0, 1f /* If kernel */
- INT_ADDIU t1, k1, VCPU_GUEST_KERNEL_ASID /* (BD) */
- INT_ADDIU t1, k1, VCPU_GUEST_USER_ASID /* else user */
-1:
- /* t1: contains the base of the ASID array, need to get the cpu id */
- LONG_L t2, TI_CPU($28) /* smp_processor_id */
- INT_SLL t2, t2, 2 /* x4 */
- REG_ADDU t3, t1, t2
- LONG_L k0, (t3)
-#ifdef CONFIG_MIPS_ASID_BITS_VARIABLE
- li t3, CPUINFO_SIZE/4
- mul t2, t2, t3 /* x sizeof(struct cpuinfo_mips)/4 */
- LONG_L t2, (cpu_data + CPUINFO_ASID_MASK)(t2)
- and k0, k0, t2
-#else
- andi k0, k0, MIPS_ENTRYHI_ASID
-#endif
- mtc0 k0, CP0_ENTRYHI
- ehb
-
- /* Disable RDHWR access */
- mtc0 zero, CP0_HWRENA
-
- .set noat
- /* load the guest context from VCPU and return */
- LONG_L $0, VCPU_R0(k1)
- LONG_L $1, VCPU_R1(k1)
- LONG_L $2, VCPU_R2(k1)
- LONG_L $3, VCPU_R3(k1)
- LONG_L $4, VCPU_R4(k1)
- LONG_L $5, VCPU_R5(k1)
- LONG_L $6, VCPU_R6(k1)
- LONG_L $7, VCPU_R7(k1)
- LONG_L $8, VCPU_R8(k1)
- LONG_L $9, VCPU_R9(k1)
- LONG_L $10, VCPU_R10(k1)
- LONG_L $11, VCPU_R11(k1)
- LONG_L $12, VCPU_R12(k1)
- LONG_L $13, VCPU_R13(k1)
- LONG_L $14, VCPU_R14(k1)
- LONG_L $15, VCPU_R15(k1)
- LONG_L $16, VCPU_R16(k1)
- LONG_L $17, VCPU_R17(k1)
- LONG_L $18, VCPU_R18(k1)
- LONG_L $19, VCPU_R19(k1)
- LONG_L $20, VCPU_R20(k1)
- LONG_L $21, VCPU_R21(k1)
- LONG_L $22, VCPU_R22(k1)
- LONG_L $23, VCPU_R23(k1)
- LONG_L $24, VCPU_R24(k1)
- LONG_L $25, VCPU_R25(k1)
-
- /* $/k1 loaded later */
- LONG_L $28, VCPU_R28(k1)
- LONG_L $29, VCPU_R29(k1)
- LONG_L $30, VCPU_R30(k1)
- LONG_L $31, VCPU_R31(k1)
-
-FEXPORT(__kvm_mips_skip_guest_restore)
- LONG_L k0, VCPU_HI(k1)
- mthi k0
-
- LONG_L k0, VCPU_LO(k1)
- mtlo k0
-
- LONG_L k0, VCPU_R26(k1)
- LONG_L k1, VCPU_R27(k1)
-
- eret
- .set at
-
-__kvm_mips_return_to_host:
- /* EBASE is already pointing to Linux */
- LONG_L k1, VCPU_HOST_STACK(k1)
- INT_ADDIU k1,k1, -PT_SIZE
-
- /* Restore host DDATA_LO */
- LONG_L k0, PT_HOST_USERLOCAL(k1)
- mtc0 k0, CP0_DDATA_LO
-
- /*
- * r2/v0 is the return code, shift it down by 2 (arithmetic)
- * to recover the err code
- */
- INT_SRA k0, v0, 2
- move $2, k0
-
- /* Load context saved on the host stack */
- LONG_L $16, PT_R16(k1)
- LONG_L $17, PT_R17(k1)
- LONG_L $18, PT_R18(k1)
- LONG_L $19, PT_R19(k1)
- LONG_L $20, PT_R20(k1)
- LONG_L $21, PT_R21(k1)
- LONG_L $22, PT_R22(k1)
- LONG_L $23, PT_R23(k1)
-
- LONG_L $28, PT_R28(k1)
- LONG_L $29, PT_R29(k1)
- LONG_L $30, PT_R30(k1)
-
- LONG_L k0, PT_HI(k1)
- mthi k0
-
- LONG_L k0, PT_LO(k1)
- mtlo k0
-
- /* Restore RDHWR access */
- PTR_LI k0, 0x2000000F
- mtc0 k0, CP0_HWRENA
-
- /* Restore RA, which is the address we will return to */
- LONG_L ra, PT_R31(k1)
- j ra
- nop
-
-VECTOR_END(MIPSX(GuestExceptionEnd))
-.end MIPSX(GuestException)
-
-MIPSX(exceptions):
- ####
- ##### The exception handlers.
- #####
- .word _C_LABEL(MIPSX(GuestException)) # 0
- .word _C_LABEL(MIPSX(GuestException)) # 1
- .word _C_LABEL(MIPSX(GuestException)) # 2
- .word _C_LABEL(MIPSX(GuestException)) # 3
- .word _C_LABEL(MIPSX(GuestException)) # 4
- .word _C_LABEL(MIPSX(GuestException)) # 5
- .word _C_LABEL(MIPSX(GuestException)) # 6
- .word _C_LABEL(MIPSX(GuestException)) # 7
- .word _C_LABEL(MIPSX(GuestException)) # 8
- .word _C_LABEL(MIPSX(GuestException)) # 9
- .word _C_LABEL(MIPSX(GuestException)) # 10
- .word _C_LABEL(MIPSX(GuestException)) # 11
- .word _C_LABEL(MIPSX(GuestException)) # 12
- .word _C_LABEL(MIPSX(GuestException)) # 13
- .word _C_LABEL(MIPSX(GuestException)) # 14
- .word _C_LABEL(MIPSX(GuestException)) # 15
- .word _C_LABEL(MIPSX(GuestException)) # 16
- .word _C_LABEL(MIPSX(GuestException)) # 17
- .word _C_LABEL(MIPSX(GuestException)) # 18
- .word _C_LABEL(MIPSX(GuestException)) # 19
- .word _C_LABEL(MIPSX(GuestException)) # 20
- .word _C_LABEL(MIPSX(GuestException)) # 21
- .word _C_LABEL(MIPSX(GuestException)) # 22
- .word _C_LABEL(MIPSX(GuestException)) # 23
- .word _C_LABEL(MIPSX(GuestException)) # 24
- .word _C_LABEL(MIPSX(GuestException)) # 25
- .word _C_LABEL(MIPSX(GuestException)) # 26
- .word _C_LABEL(MIPSX(GuestException)) # 27
- .word _C_LABEL(MIPSX(GuestException)) # 28
- .word _C_LABEL(MIPSX(GuestException)) # 29
- .word _C_LABEL(MIPSX(GuestException)) # 30
- .word _C_LABEL(MIPSX(GuestException)) # 31
diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c
index 44da5259f390..a6ea084b4d9d 100644
--- a/arch/mips/kvm/mips.c
+++ b/arch/mips/kvm/mips.c
@@ -9,6 +9,7 @@
* Authors: Sanjay Lal <sanjayl@kymasys.com>
*/
+#include <linux/bitops.h>
#include <linux/errno.h>
#include <linux/err.h>
#include <linux/kdebug.h>
@@ -147,7 +148,7 @@ void kvm_mips_free_vcpus(struct kvm *kvm)
/* Put the pages we reserved for the guest pmap */
for (i = 0; i < kvm->arch.guest_pmap_npages; i++) {
if (kvm->arch.guest_pmap[i] != KVM_INVALID_PAGE)
- kvm_mips_release_pfn_clean(kvm->arch.guest_pmap[i]);
+ kvm_release_pfn_clean(kvm->arch.guest_pmap[i]);
}
kfree(kvm->arch.guest_pmap);
@@ -244,10 +245,27 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
}
}
+static inline void dump_handler(const char *symbol, void *start, void *end)
+{
+ u32 *p;
+
+ pr_debug("LEAF(%s)\n", symbol);
+
+ pr_debug("\t.set push\n");
+ pr_debug("\t.set noreorder\n");
+
+ for (p = start; p < (u32 *)end; ++p)
+ pr_debug("\t.word\t0x%08x\t\t# %p\n", *p, p);
+
+ pr_debug("\t.set\tpop\n");
+
+ pr_debug("\tEND(%s)\n", symbol);
+}
+
struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
{
- int err, size, offset;
- void *gebase;
+ int err, size;
+ void *gebase, *p, *handler;
int i;
struct kvm_vcpu *vcpu = kzalloc(sizeof(struct kvm_vcpu), GFP_KERNEL);
@@ -273,9 +291,6 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
else
size = 0x4000;
- /* Save Linux EBASE */
- vcpu->arch.host_ebase = (void *)read_c0_ebase();
-
gebase = kzalloc(ALIGN(size, PAGE_SIZE), GFP_KERNEL);
if (!gebase) {
@@ -285,44 +300,53 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, unsigned int id)
kvm_debug("Allocated %d bytes for KVM Exception Handlers @ %p\n",
ALIGN(size, PAGE_SIZE), gebase);
+ /*
+ * Check new ebase actually fits in CP0_EBase. The lack of a write gate
+ * limits us to the low 512MB of physical address space. If the memory
+ * we allocate is out of range, just give up now.
+ */
+ if (!cpu_has_ebase_wg && virt_to_phys(gebase) >= 0x20000000) {
+ kvm_err("CP0_EBase.WG required for guest exception base %pK\n",
+ gebase);
+ err = -ENOMEM;
+ goto out_free_gebase;
+ }
+
/* Save new ebase */
vcpu->arch.guest_ebase = gebase;
- /* Copy L1 Guest Exception handler to correct offset */
+ /* Build guest exception vectors dynamically in unmapped memory */
+ handler = gebase + 0x2000;
/* TLB Refill, EXL = 0 */
- memcpy(gebase, mips32_exception,
- mips32_exceptionEnd - mips32_exception);
+ kvm_mips_build_exception(gebase, handler);
/* General Exception Entry point */
- memcpy(gebase + 0x180, mips32_exception,
- mips32_exceptionEnd - mips32_exception);
+ kvm_mips_build_exception(gebase + 0x180, handler);
/* For vectored interrupts poke the exception code @ all offsets 0-7 */
for (i = 0; i < 8; i++) {
kvm_debug("L1 Vectored handler @ %p\n",
gebase + 0x200 + (i * VECTORSPACING));
- memcpy(gebase + 0x200 + (i * VECTORSPACING), mips32_exception,
- mips32_exceptionEnd - mips32_exception);
+ kvm_mips_build_exception(gebase + 0x200 + i * VECTORSPACING,
+ handler);
}
- /* General handler, relocate to unmapped space for sanity's sake */
- offset = 0x2000;
- kvm_debug("Installing KVM Exception handlers @ %p, %#x bytes\n",
- gebase + offset,
- mips32_GuestExceptionEnd - mips32_GuestException);
+ /* General exit handler */
+ p = handler;
+ p = kvm_mips_build_exit(p);
- memcpy(gebase + offset, mips32_GuestException,
- mips32_GuestExceptionEnd - mips32_GuestException);
+ /* Guest entry routine */
+ vcpu->arch.vcpu_run = p;
+ p = kvm_mips_build_vcpu_run(p);
-#ifdef MODULE
- offset += mips32_GuestExceptionEnd - mips32_GuestException;
- memcpy(gebase + offset, (char *)__kvm_mips_vcpu_run,
- __kvm_mips_vcpu_run_end - (char *)__kvm_mips_vcpu_run);
- vcpu->arch.vcpu_run = gebase + offset;
-#else
- vcpu->arch.vcpu_run = __kvm_mips_vcpu_run;
-#endif
+ /* Dump the generated code */
+ pr_debug("#include <asm/asm.h>\n");
+ pr_debug("#include <asm/regdef.h>\n");
+ pr_debug("\n");
+ dump_handler("kvm_vcpu_run", vcpu->arch.vcpu_run, p);
+ dump_handler("kvm_gen_exc", gebase + 0x180, gebase + 0x200);
+ dump_handler("kvm_exit", gebase + 0x2000, vcpu->arch.vcpu_run);
/* Invalidate the icache for these ranges */
local_flush_icache_range((unsigned long)gebase,
@@ -408,17 +432,19 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
kvm_mips_deliver_interrupts(vcpu,
kvm_read_c0_guest_cause(vcpu->arch.cop0));
- __kvm_guest_enter();
+ guest_enter_irqoff();
/* Disable hardware page table walking while in guest */
htw_stop();
+ trace_kvm_enter(vcpu);
r = vcpu->arch.vcpu_run(run, vcpu);
+ trace_kvm_out(vcpu);
/* Re-enable HTW before enabling interrupts */
htw_start();
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
if (vcpu->sigset_active)
@@ -507,8 +533,10 @@ static u64 kvm_mips_get_one_regs[] = {
KVM_REG_MIPS_R30,
KVM_REG_MIPS_R31,
+#ifndef CONFIG_CPU_MIPSR6
KVM_REG_MIPS_HI,
KVM_REG_MIPS_LO,
+#endif
KVM_REG_MIPS_PC,
KVM_REG_MIPS_CP0_INDEX,
@@ -539,6 +567,104 @@ static u64 kvm_mips_get_one_regs[] = {
KVM_REG_MIPS_COUNT_HZ,
};
+static u64 kvm_mips_get_one_regs_fpu[] = {
+ KVM_REG_MIPS_FCR_IR,
+ KVM_REG_MIPS_FCR_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_msa[] = {
+ KVM_REG_MIPS_MSA_IR,
+ KVM_REG_MIPS_MSA_CSR,
+};
+
+static u64 kvm_mips_get_one_regs_kscratch[] = {
+ KVM_REG_MIPS_CP0_KSCRATCH1,
+ KVM_REG_MIPS_CP0_KSCRATCH2,
+ KVM_REG_MIPS_CP0_KSCRATCH3,
+ KVM_REG_MIPS_CP0_KSCRATCH4,
+ KVM_REG_MIPS_CP0_KSCRATCH5,
+ KVM_REG_MIPS_CP0_KSCRATCH6,
+};
+
+static unsigned long kvm_mips_num_regs(struct kvm_vcpu *vcpu)
+{
+ unsigned long ret;
+
+ ret = ARRAY_SIZE(kvm_mips_get_one_regs);
+ if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+ ret += ARRAY_SIZE(kvm_mips_get_one_regs_fpu) + 48;
+ /* odd doubles */
+ if (boot_cpu_data.fpu_id & MIPS_FPIR_F64)
+ ret += 16;
+ }
+ if (kvm_mips_guest_can_have_msa(&vcpu->arch))
+ ret += ARRAY_SIZE(kvm_mips_get_one_regs_msa) + 32;
+ ret += __arch_hweight8(vcpu->arch.kscratch_enabled);
+ ret += kvm_mips_callbacks->num_regs(vcpu);
+
+ return ret;
+}
+
+static int kvm_mips_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices)
+{
+ u64 index;
+ unsigned int i;
+
+ if (copy_to_user(indices, kvm_mips_get_one_regs,
+ sizeof(kvm_mips_get_one_regs)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_mips_get_one_regs);
+
+ if (kvm_mips_guest_can_have_fpu(&vcpu->arch)) {
+ if (copy_to_user(indices, kvm_mips_get_one_regs_fpu,
+ sizeof(kvm_mips_get_one_regs_fpu)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_mips_get_one_regs_fpu);
+
+ for (i = 0; i < 32; ++i) {
+ index = KVM_REG_MIPS_FPR_32(i);
+ if (copy_to_user(indices, &index, sizeof(index)))
+ return -EFAULT;
+ ++indices;
+
+ /* skip odd doubles if no F64 */
+ if (i & 1 && !(boot_cpu_data.fpu_id & MIPS_FPIR_F64))
+ continue;
+
+ index = KVM_REG_MIPS_FPR_64(i);
+ if (copy_to_user(indices, &index, sizeof(index)))
+ return -EFAULT;
+ ++indices;
+ }
+ }
+
+ if (kvm_mips_guest_can_have_msa(&vcpu->arch)) {
+ if (copy_to_user(indices, kvm_mips_get_one_regs_msa,
+ sizeof(kvm_mips_get_one_regs_msa)))
+ return -EFAULT;
+ indices += ARRAY_SIZE(kvm_mips_get_one_regs_msa);
+
+ for (i = 0; i < 32; ++i) {
+ index = KVM_REG_MIPS_VEC_128(i);
+ if (copy_to_user(indices, &index, sizeof(index)))
+ return -EFAULT;
+ ++indices;
+ }
+ }
+
+ for (i = 0; i < 6; ++i) {
+ if (!(vcpu->arch.kscratch_enabled & BIT(i + 2)))
+ continue;
+
+ if (copy_to_user(indices, &kvm_mips_get_one_regs_kscratch[i],
+ sizeof(kvm_mips_get_one_regs_kscratch[i])))
+ return -EFAULT;
+ ++indices;
+ }
+
+ return kvm_mips_callbacks->copy_reg_indices(vcpu, indices);
+}
+
static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg)
{
@@ -554,12 +680,14 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_R0 ... KVM_REG_MIPS_R31:
v = (long)vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0];
break;
+#ifndef CONFIG_CPU_MIPSR6
case KVM_REG_MIPS_HI:
v = (long)vcpu->arch.hi;
break;
case KVM_REG_MIPS_LO:
v = (long)vcpu->arch.lo;
break;
+#endif
case KVM_REG_MIPS_PC:
v = (long)vcpu->arch.pc;
break;
@@ -688,17 +816,37 @@ static int kvm_mips_get_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_CP0_ERROREPC:
v = (long)kvm_read_c0_guest_errorepc(cop0);
break;
+ case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+ idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+ if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+ return -EINVAL;
+ switch (idx) {
+ case 2:
+ v = (long)kvm_read_c0_guest_kscratch1(cop0);
+ break;
+ case 3:
+ v = (long)kvm_read_c0_guest_kscratch2(cop0);
+ break;
+ case 4:
+ v = (long)kvm_read_c0_guest_kscratch3(cop0);
+ break;
+ case 5:
+ v = (long)kvm_read_c0_guest_kscratch4(cop0);
+ break;
+ case 6:
+ v = (long)kvm_read_c0_guest_kscratch5(cop0);
+ break;
+ case 7:
+ v = (long)kvm_read_c0_guest_kscratch6(cop0);
+ break;
+ }
+ break;
/* registers to be handled specially */
- case KVM_REG_MIPS_CP0_COUNT:
- case KVM_REG_MIPS_COUNT_CTL:
- case KVM_REG_MIPS_COUNT_RESUME:
- case KVM_REG_MIPS_COUNT_HZ:
+ default:
ret = kvm_mips_callbacks->get_one_reg(vcpu, reg, &v);
if (ret)
return ret;
break;
- default:
- return -EINVAL;
}
if ((reg->id & KVM_REG_SIZE_MASK) == KVM_REG_SIZE_U64) {
u64 __user *uaddr64 = (u64 __user *)(long)reg->addr;
@@ -755,12 +903,14 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_R1 ... KVM_REG_MIPS_R31:
vcpu->arch.gprs[reg->id - KVM_REG_MIPS_R0] = v;
break;
+#ifndef CONFIG_CPU_MIPSR6
case KVM_REG_MIPS_HI:
vcpu->arch.hi = v;
break;
case KVM_REG_MIPS_LO:
vcpu->arch.lo = v;
break;
+#endif
case KVM_REG_MIPS_PC:
vcpu->arch.pc = v;
break;
@@ -859,22 +1009,34 @@ static int kvm_mips_set_reg(struct kvm_vcpu *vcpu,
case KVM_REG_MIPS_CP0_ERROREPC:
kvm_write_c0_guest_errorepc(cop0, v);
break;
+ case KVM_REG_MIPS_CP0_KSCRATCH1 ... KVM_REG_MIPS_CP0_KSCRATCH6:
+ idx = reg->id - KVM_REG_MIPS_CP0_KSCRATCH1 + 2;
+ if (!(vcpu->arch.kscratch_enabled & BIT(idx)))
+ return -EINVAL;
+ switch (idx) {
+ case 2:
+ kvm_write_c0_guest_kscratch1(cop0, v);
+ break;
+ case 3:
+ kvm_write_c0_guest_kscratch2(cop0, v);
+ break;
+ case 4:
+ kvm_write_c0_guest_kscratch3(cop0, v);
+ break;
+ case 5:
+ kvm_write_c0_guest_kscratch4(cop0, v);
+ break;
+ case 6:
+ kvm_write_c0_guest_kscratch5(cop0, v);
+ break;
+ case 7:
+ kvm_write_c0_guest_kscratch6(cop0, v);
+ break;
+ }
+ break;
/* registers to be handled specially */
- case KVM_REG_MIPS_CP0_COUNT:
- case KVM_REG_MIPS_CP0_COMPARE:
- case KVM_REG_MIPS_CP0_CAUSE:
- case KVM_REG_MIPS_CP0_CONFIG:
- case KVM_REG_MIPS_CP0_CONFIG1:
- case KVM_REG_MIPS_CP0_CONFIG2:
- case KVM_REG_MIPS_CP0_CONFIG3:
- case KVM_REG_MIPS_CP0_CONFIG4:
- case KVM_REG_MIPS_CP0_CONFIG5:
- case KVM_REG_MIPS_COUNT_CTL:
- case KVM_REG_MIPS_COUNT_RESUME:
- case KVM_REG_MIPS_COUNT_HZ:
- return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
default:
- return -EINVAL;
+ return kvm_mips_callbacks->set_one_reg(vcpu, reg, v);
}
return 0;
}
@@ -927,23 +1089,18 @@ long kvm_arch_vcpu_ioctl(struct file *filp, unsigned int ioctl,
}
case KVM_GET_REG_LIST: {
struct kvm_reg_list __user *user_list = argp;
- u64 __user *reg_dest;
struct kvm_reg_list reg_list;
unsigned n;
if (copy_from_user(&reg_list, user_list, sizeof(reg_list)))
return -EFAULT;
n = reg_list.n;
- reg_list.n = ARRAY_SIZE(kvm_mips_get_one_regs);
+ reg_list.n = kvm_mips_num_regs(vcpu);
if (copy_to_user(user_list, &reg_list, sizeof(reg_list)))
return -EFAULT;
if (n < reg_list.n)
return -E2BIG;
- reg_dest = user_list->reg;
- if (copy_to_user(reg_dest, kvm_mips_get_one_regs,
- sizeof(kvm_mips_get_one_regs)))
- return -EFAULT;
- return 0;
+ return kvm_mips_copy_reg_indices(vcpu, user_list->reg);
}
case KVM_NMI:
/* Treat the NMI as a CPU reset */
@@ -1222,7 +1379,7 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
static void kvm_mips_set_c0_status(void)
{
- uint32_t status = read_c0_status();
+ u32 status = read_c0_status();
if (cpu_has_dsp)
status |= (ST0_MX);
@@ -1236,9 +1393,9 @@ static void kvm_mips_set_c0_status(void)
*/
int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
- uint32_t cause = vcpu->arch.host_cp0_cause;
- uint32_t exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
+ u32 exccode = (cause >> CAUSEB_EXCCODE) & 0x1f;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -1260,6 +1417,7 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvm_debug("kvm_mips_handle_exit: cause: %#x, PC: %p, kvm_run: %p, kvm_vcpu: %p\n",
cause, opc, run, vcpu);
+ trace_kvm_exit(vcpu, exccode);
/*
* Do a privilege check, if in UM most of these exit conditions end up
@@ -1279,7 +1437,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvm_debug("[%d]EXCCODE_INT @ %p\n", vcpu->vcpu_id, opc);
++vcpu->stat.int_exits;
- trace_kvm_exit(vcpu, INT_EXITS);
if (need_resched())
cond_resched();
@@ -1291,7 +1448,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
kvm_debug("EXCCODE_CPU: @ PC: %p\n", opc);
++vcpu->stat.cop_unusable_exits;
- trace_kvm_exit(vcpu, COP_UNUSABLE_EXITS);
ret = kvm_mips_callbacks->handle_cop_unusable(vcpu);
/* XXXKYMA: Might need to return to user space */
if (run->exit_reason == KVM_EXIT_IRQ_WINDOW_OPEN)
@@ -1300,7 +1456,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
case EXCCODE_MOD:
++vcpu->stat.tlbmod_exits;
- trace_kvm_exit(vcpu, TLBMOD_EXITS);
ret = kvm_mips_callbacks->handle_tlb_mod(vcpu);
break;
@@ -1310,7 +1465,6 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
badvaddr);
++vcpu->stat.tlbmiss_st_exits;
- trace_kvm_exit(vcpu, TLBMISS_ST_EXITS);
ret = kvm_mips_callbacks->handle_tlb_st_miss(vcpu);
break;
@@ -1319,61 +1473,51 @@ int kvm_mips_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu)
cause, opc, badvaddr);
++vcpu->stat.tlbmiss_ld_exits;
- trace_kvm_exit(vcpu, TLBMISS_LD_EXITS);
ret = kvm_mips_callbacks->handle_tlb_ld_miss(vcpu);
break;
case EXCCODE_ADES:
++vcpu->stat.addrerr_st_exits;
- trace_kvm_exit(vcpu, ADDRERR_ST_EXITS);
ret = kvm_mips_callbacks->handle_addr_err_st(vcpu);
break;
case EXCCODE_ADEL:
++vcpu->stat.addrerr_ld_exits;
- trace_kvm_exit(vcpu, ADDRERR_LD_EXITS);
ret = kvm_mips_callbacks->handle_addr_err_ld(vcpu);
break;
case EXCCODE_SYS:
++vcpu->stat.syscall_exits;
- trace_kvm_exit(vcpu, SYSCALL_EXITS);
ret = kvm_mips_callbacks->handle_syscall(vcpu);
break;
case EXCCODE_RI:
++vcpu->stat.resvd_inst_exits;
- trace_kvm_exit(vcpu, RESVD_INST_EXITS);
ret = kvm_mips_callbacks->handle_res_inst(vcpu);
break;
case EXCCODE_BP:
++vcpu->stat.break_inst_exits;
- trace_kvm_exit(vcpu, BREAK_INST_EXITS);
ret = kvm_mips_callbacks->handle_break(vcpu);
break;
case EXCCODE_TR:
++vcpu->stat.trap_inst_exits;
- trace_kvm_exit(vcpu, TRAP_INST_EXITS);
ret = kvm_mips_callbacks->handle_trap(vcpu);
break;
case EXCCODE_MSAFPE:
++vcpu->stat.msa_fpe_exits;
- trace_kvm_exit(vcpu, MSA_FPE_EXITS);
ret = kvm_mips_callbacks->handle_msa_fpe(vcpu);
break;
case EXCCODE_FPE:
++vcpu->stat.fpe_exits;
- trace_kvm_exit(vcpu, FPE_EXITS);
ret = kvm_mips_callbacks->handle_fpe(vcpu);
break;
case EXCCODE_MSADIS:
++vcpu->stat.msa_disabled_exits;
- trace_kvm_exit(vcpu, MSA_DISABLED_EXITS);
ret = kvm_mips_callbacks->handle_msa_disabled(vcpu);
break;
@@ -1400,11 +1544,13 @@ skip_emul:
run->exit_reason = KVM_EXIT_INTR;
ret = (-EINTR << 2) | RESUME_HOST;
++vcpu->stat.signal_exits;
- trace_kvm_exit(vcpu, SIGNAL_EXITS);
+ trace_kvm_exit(vcpu, KVM_TRACE_EXIT_SIGNAL);
}
}
if (ret == RESUME_GUEST) {
+ trace_kvm_reenter(vcpu);
+
/*
* If FPU / MSA are enabled (i.e. the guest's FPU / MSA context
* is live), restore FCR31 / MSACSR.
@@ -1450,7 +1596,7 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
* not to clobber the status register directly via the commpage.
*/
if (cpu_has_msa && sr & ST0_CU1 && !(sr & ST0_FR) &&
- vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA)
+ vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA)
kvm_lose_fpu(vcpu);
/*
@@ -1465,9 +1611,12 @@ void kvm_own_fpu(struct kvm_vcpu *vcpu)
enable_fpu_hazard();
/* If guest FPU state not active, restore it now */
- if (!(vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU)) {
+ if (!(vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU)) {
__kvm_restore_fpu(&vcpu->arch);
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_FPU);
+ } else {
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_FPU);
}
preempt_enable();
@@ -1494,8 +1643,8 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
* interacts with MSA state, so play it safe and save it first.
*/
if (!(sr & ST0_FR) &&
- (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU |
- KVM_MIPS_FPU_MSA)) == KVM_MIPS_FPU_FPU)
+ (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU |
+ KVM_MIPS_AUX_MSA)) == KVM_MIPS_AUX_FPU)
kvm_lose_fpu(vcpu);
change_c0_status(ST0_CU1 | ST0_FR, sr);
@@ -1509,22 +1658,26 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
set_c0_config5(MIPS_CONF5_MSAEN);
enable_fpu_hazard();
- switch (vcpu->arch.fpu_inuse & (KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA)) {
- case KVM_MIPS_FPU_FPU:
+ switch (vcpu->arch.aux_inuse & (KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA)) {
+ case KVM_MIPS_AUX_FPU:
/*
* Guest FPU state already loaded, only restore upper MSA state
*/
__kvm_restore_msa_upper(&vcpu->arch);
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE, KVM_TRACE_AUX_MSA);
break;
case 0:
/* Neither FPU or MSA already active, restore full MSA state */
__kvm_restore_msa(&vcpu->arch);
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_MSA;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_MSA;
if (kvm_mips_guest_has_fpu(&vcpu->arch))
- vcpu->arch.fpu_inuse |= KVM_MIPS_FPU_FPU;
+ vcpu->arch.aux_inuse |= KVM_MIPS_AUX_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_RESTORE,
+ KVM_TRACE_AUX_FPU_MSA);
break;
default:
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_ENABLE, KVM_TRACE_AUX_MSA);
break;
}
@@ -1536,13 +1689,15 @@ void kvm_own_msa(struct kvm_vcpu *vcpu)
void kvm_drop_fpu(struct kvm_vcpu *vcpu)
{
preempt_disable();
- if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+ if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
disable_msa();
- vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_MSA;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_MSA);
+ vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_MSA;
}
- if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+ if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
clear_c0_status(ST0_CU1 | ST0_FR);
- vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_DISCARD, KVM_TRACE_AUX_FPU);
+ vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
}
preempt_enable();
}
@@ -1558,25 +1713,27 @@ void kvm_lose_fpu(struct kvm_vcpu *vcpu)
*/
preempt_disable();
- if (cpu_has_msa && vcpu->arch.fpu_inuse & KVM_MIPS_FPU_MSA) {
+ if (cpu_has_msa && vcpu->arch.aux_inuse & KVM_MIPS_AUX_MSA) {
set_c0_config5(MIPS_CONF5_MSAEN);
enable_fpu_hazard();
__kvm_save_msa(&vcpu->arch);
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU_MSA);
/* Disable MSA & FPU */
disable_msa();
- if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+ if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
clear_c0_status(ST0_CU1 | ST0_FR);
disable_fpu_hazard();
}
- vcpu->arch.fpu_inuse &= ~(KVM_MIPS_FPU_FPU | KVM_MIPS_FPU_MSA);
- } else if (vcpu->arch.fpu_inuse & KVM_MIPS_FPU_FPU) {
+ vcpu->arch.aux_inuse &= ~(KVM_MIPS_AUX_FPU | KVM_MIPS_AUX_MSA);
+ } else if (vcpu->arch.aux_inuse & KVM_MIPS_AUX_FPU) {
set_c0_status(ST0_CU1);
enable_fpu_hazard();
__kvm_save_fpu(&vcpu->arch);
- vcpu->arch.fpu_inuse &= ~KVM_MIPS_FPU_FPU;
+ vcpu->arch.aux_inuse &= ~KVM_MIPS_AUX_FPU;
+ trace_kvm_aux(vcpu, KVM_TRACE_AUX_SAVE, KVM_TRACE_AUX_FPU);
/* Disable FPU */
clear_c0_status(ST0_CU1 | ST0_FR);
@@ -1638,6 +1795,10 @@ static int __init kvm_mips_init(void)
{
int ret;
+ ret = kvm_mips_entry_setup();
+ if (ret)
+ return ret;
+
ret = kvm_init(NULL, sizeof(struct kvm_vcpu), 0, THIS_MODULE);
if (ret)
@@ -1645,18 +1806,6 @@ static int __init kvm_mips_init(void)
register_die_notifier(&kvm_mips_csr_die_notifier);
- /*
- * On MIPS, kernel modules are executed from "mapped space", which
- * requires TLBs. The TLB handling code is statically linked with
- * the rest of the kernel (tlb.c) to avoid the possibility of
- * double faulting. The issue is that the TLB code references
- * routines that are part of the the KVM module, which are only
- * available once the module is loaded.
- */
- kvm_mips_gfn_to_pfn = gfn_to_pfn;
- kvm_mips_release_pfn_clean = kvm_release_pfn_clean;
- kvm_mips_is_error_pfn = is_error_pfn;
-
return 0;
}
@@ -1664,10 +1813,6 @@ static void __exit kvm_mips_exit(void)
{
kvm_exit();
- kvm_mips_gfn_to_pfn = NULL;
- kvm_mips_release_pfn_clean = NULL;
- kvm_mips_is_error_pfn = NULL;
-
unregister_die_notifier(&kvm_mips_csr_die_notifier);
}
diff --git a/arch/mips/kvm/mmu.c b/arch/mips/kvm/mmu.c
new file mode 100644
index 000000000000..57319ee57c4f
--- /dev/null
+++ b/arch/mips/kvm/mmu.c
@@ -0,0 +1,375 @@
+/*
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License. See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * KVM/MIPS MMU handling in the KVM module.
+ *
+ * Copyright (C) 2012 MIPS Technologies, Inc. All rights reserved.
+ * Authors: Sanjay Lal <sanjayl@kymasys.com>
+ */
+
+#include <linux/highmem.h>
+#include <linux/kvm_host.h>
+#include <asm/mmu_context.h>
+
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+{
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_kernel_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+{
+ int cpu = smp_processor_id();
+
+ return vcpu->arch.guest_user_asid[cpu] &
+ cpu_asid_mask(&cpu_data[cpu]);
+}
+
+static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
+{
+ int srcu_idx, err = 0;
+ kvm_pfn_t pfn;
+
+ if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
+ return 0;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+ pfn = gfn_to_pfn(kvm, gfn);
+
+ if (is_error_pfn(pfn)) {
+ kvm_err("Couldn't get pfn for gfn %#llx!\n", gfn);
+ err = -EFAULT;
+ goto out;
+ }
+
+ kvm->arch.guest_pmap[gfn] = pfn;
+out:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+ return err;
+}
+
+/* Translate guest KSEG0 addresses to Host PA */
+unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
+ unsigned long gva)
+{
+ gfn_t gfn;
+ unsigned long offset = gva & ~PAGE_MASK;
+ struct kvm *kvm = vcpu->kvm;
+
+ if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
+ kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
+ __builtin_return_address(0), gva);
+ return KVM_INVALID_PAGE;
+ }
+
+ gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
+
+ if (gfn >= kvm->arch.guest_pmap_npages) {
+ kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
+ gva);
+ return KVM_INVALID_PAGE;
+ }
+
+ if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+ return KVM_INVALID_ADDR;
+
+ return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
+}
+
+/* XXXKYMA: Must be called with interrupts disabled */
+int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
+ struct kvm_vcpu *vcpu)
+{
+ gfn_t gfn;
+ kvm_pfn_t pfn0, pfn1;
+ unsigned long vaddr = 0;
+ unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+ struct kvm *kvm = vcpu->kvm;
+ const int flush_dcache_mask = 0;
+ int ret;
+
+ if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
+ kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
+ kvm_mips_dump_host_tlbs();
+ return -1;
+ }
+
+ gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
+ if (gfn >= kvm->arch.guest_pmap_npages) {
+ kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
+ gfn, badvaddr);
+ kvm_mips_dump_host_tlbs();
+ return -1;
+ }
+ vaddr = badvaddr & (PAGE_MASK << 1);
+
+ if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
+ return -1;
+
+ if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
+ return -1;
+
+ pfn0 = kvm->arch.guest_pmap[gfn & ~0x1];
+ pfn1 = kvm->arch.guest_pmap[gfn | 0x1];
+
+ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ ENTRYLO_D | ENTRYLO_V;
+ entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ ENTRYLO_D | ENTRYLO_V;
+
+ preempt_disable();
+ entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ flush_dcache_mask);
+ preempt_enable();
+
+ return ret;
+}
+
+int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
+ struct kvm_mips_tlb *tlb)
+{
+ unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
+ struct kvm *kvm = vcpu->kvm;
+ kvm_pfn_t pfn0, pfn1;
+ int ret;
+
+ if ((tlb->tlb_hi & VPN2_MASK) == 0) {
+ pfn0 = 0;
+ pfn1 = 0;
+ } else {
+ if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[0])
+ >> PAGE_SHIFT) < 0)
+ return -1;
+
+ if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo[1])
+ >> PAGE_SHIFT) < 0)
+ return -1;
+
+ pfn0 = kvm->arch.guest_pmap[
+ mips3_tlbpfn_to_paddr(tlb->tlb_lo[0]) >> PAGE_SHIFT];
+ pfn1 = kvm->arch.guest_pmap[
+ mips3_tlbpfn_to_paddr(tlb->tlb_lo[1]) >> PAGE_SHIFT];
+ }
+
+ /* Get attributes from the Guest TLB */
+ entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ (tlb->tlb_lo[0] & ENTRYLO_D) |
+ (tlb->tlb_lo[0] & ENTRYLO_V);
+ entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ (tlb->tlb_lo[1] & ENTRYLO_D) |
+ (tlb->tlb_lo[1] & ENTRYLO_V);
+
+ kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
+ tlb->tlb_lo[0], tlb->tlb_lo[1]);
+
+ preempt_disable();
+ entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
+ kvm_mips_get_kernel_asid(vcpu) :
+ kvm_mips_get_user_asid(vcpu));
+ ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
+ tlb->tlb_mask);
+ preempt_enable();
+
+ return ret;
+}
+
+void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
+ struct kvm_vcpu *vcpu)
+{
+ unsigned long asid = asid_cache(cpu);
+
+ asid += cpu_asid_inc();
+ if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
+ if (cpu_has_vtag_icache)
+ flush_icache_all();
+
+ kvm_local_flush_tlb_all(); /* start new asid cycle */
+
+ if (!asid) /* fix version if needed */
+ asid = asid_first_version(cpu);
+ }
+
+ cpu_context(cpu, mm) = asid_cache(cpu) = asid;
+}
+
+/**
+ * kvm_mips_migrate_count() - Migrate timer.
+ * @vcpu: Virtual CPU.
+ *
+ * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
+ * if it was running prior to being cancelled.
+ *
+ * Must be called when the VCPU is migrated to a different CPU to ensure that
+ * timer expiry during guest execution interrupts the guest and causes the
+ * interrupt to be delivered in a timely manner.
+ */
+static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
+{
+ if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
+ hrtimer_restart(&vcpu->arch.comparecount_timer);
+}
+
+/* Restore ASID once we are scheduled back after preemption */
+void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
+{
+ unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
+ unsigned long flags;
+ int newasid = 0;
+
+ kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
+
+ /* Allocate new kernel and user ASIDs if needed */
+
+ local_irq_save(flags);
+
+ if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
+ asid_version_mask(cpu)) {
+ kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
+ vcpu->arch.guest_kernel_asid[cpu] =
+ vcpu->arch.guest_kernel_mm.context.asid[cpu];
+ kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
+ vcpu->arch.guest_user_asid[cpu] =
+ vcpu->arch.guest_user_mm.context.asid[cpu];
+ newasid++;
+
+ kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
+ cpu_context(cpu, current->mm));
+ kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
+ cpu, vcpu->arch.guest_kernel_asid[cpu]);
+ kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
+ vcpu->arch.guest_user_asid[cpu]);
+ }
+
+ if (vcpu->arch.last_sched_cpu != cpu) {
+ kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
+ vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
+ /*
+ * Migrate the timer interrupt to the current CPU so that it
+ * always interrupts the guest and synchronously triggers a
+ * guest timer interrupt.
+ */
+ kvm_mips_migrate_count(vcpu);
+ }
+
+ if (!newasid) {
+ /*
+ * If we preempted while the guest was executing, then reload
+ * the pre-empted ASID
+ */
+ if (current->flags & PF_VCPU) {
+ write_c0_entryhi(vcpu->arch.
+ preempt_entryhi & asid_mask);
+ ehb();
+ }
+ } else {
+ /* New ASIDs were allocated for the VM */
+
+ /*
+ * Were we in guest context? If so then the pre-empted ASID is
+ * no longer valid, we need to set it to what it should be based
+ * on the mode of the Guest (Kernel/User)
+ */
+ if (current->flags & PF_VCPU) {
+ if (KVM_GUEST_KERNEL_MODE(vcpu))
+ write_c0_entryhi(vcpu->arch.
+ guest_kernel_asid[cpu] &
+ asid_mask);
+ else
+ write_c0_entryhi(vcpu->arch.
+ guest_user_asid[cpu] &
+ asid_mask);
+ ehb();
+ }
+ }
+
+ /* restore guest state to registers */
+ kvm_mips_callbacks->vcpu_set_regs(vcpu);
+
+ local_irq_restore(flags);
+
+}
+
+/* ASID can change if another task is scheduled during preemption */
+void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
+{
+ unsigned long flags;
+ int cpu;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+
+ vcpu->arch.preempt_entryhi = read_c0_entryhi();
+ vcpu->arch.last_sched_cpu = cpu;
+
+ /* save guest state in registers */
+ kvm_mips_callbacks->vcpu_get_regs(vcpu);
+
+ if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
+ asid_version_mask(cpu))) {
+ kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
+ cpu_context(cpu, current->mm));
+ drop_mmu_context(current->mm, cpu);
+ }
+ write_c0_entryhi(cpu_asid(cpu, current->mm));
+ ehb();
+
+ local_irq_restore(flags);
+}
+
+u32 kvm_get_inst(u32 *opc, struct kvm_vcpu *vcpu)
+{
+ struct mips_coproc *cop0 = vcpu->arch.cop0;
+ unsigned long paddr, flags, vpn2, asid;
+ unsigned long va = (unsigned long)opc;
+ void *vaddr;
+ u32 inst;
+ int index;
+
+ if (KVM_GUEST_KSEGX(va) < KVM_GUEST_KSEG0 ||
+ KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG23) {
+ local_irq_save(flags);
+ index = kvm_mips_host_tlb_lookup(vcpu, va);
+ if (index >= 0) {
+ inst = *(opc);
+ } else {
+ vpn2 = va & VPN2_MASK;
+ asid = kvm_read_c0_guest_entryhi(cop0) &
+ KVM_ENTRYHI_ASID;
+ index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
+ if (index < 0) {
+ kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
+ __func__, opc, vcpu, read_c0_entryhi());
+ kvm_mips_dump_host_tlbs();
+ kvm_mips_dump_guest_tlbs(vcpu);
+ local_irq_restore(flags);
+ return KVM_INVALID_INST;
+ }
+ kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
+ &vcpu->arch.
+ guest_tlb[index]);
+ inst = *(opc);
+ }
+ local_irq_restore(flags);
+ } else if (KVM_GUEST_KSEGX(va) == KVM_GUEST_KSEG0) {
+ paddr = kvm_mips_translate_guest_kseg0_to_hpa(vcpu, va);
+ vaddr = kmap_atomic(pfn_to_page(PHYS_PFN(paddr)));
+ vaddr += paddr & ~PAGE_MASK;
+ inst = *(u32 *)vaddr;
+ kunmap_atomic(vaddr);
+ } else {
+ kvm_err("%s: illegal address: %p\n", __func__, opc);
+ return KVM_INVALID_INST;
+ }
+
+ return inst;
+}
diff --git a/arch/mips/kvm/stats.c b/arch/mips/kvm/stats.c
index 888bb67070ac..53f851a61554 100644
--- a/arch/mips/kvm/stats.c
+++ b/arch/mips/kvm/stats.c
@@ -11,27 +11,6 @@
#include <linux/kvm_host.h>
-char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES] = {
- "WAIT",
- "CACHE",
- "Signal",
- "Interrupt",
- "COP0/1 Unusable",
- "TLB Mod",
- "TLB Miss (LD)",
- "TLB Miss (ST)",
- "Address Err (ST)",
- "Address Error (LD)",
- "System Call",
- "Reserved Inst",
- "Break Inst",
- "Trap Inst",
- "MSA FPE",
- "FPE",
- "MSA Disabled",
- "D-Cache Flushes",
-};
-
char *kvm_cop0_str[N_MIPS_COPROC_REGS] = {
"Index",
"Random",
diff --git a/arch/mips/kvm/tlb.c b/arch/mips/kvm/tlb.c
index ed021ae7867a..254377d8e0b9 100644
--- a/arch/mips/kvm/tlb.c
+++ b/arch/mips/kvm/tlb.c
@@ -14,7 +14,7 @@
#include <linux/smp.h>
#include <linux/mm.h>
#include <linux/delay.h>
-#include <linux/module.h>
+#include <linux/export.h>
#include <linux/kvm_host.h>
#include <linux/srcu.h>
@@ -24,6 +24,7 @@
#include <asm/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/tlb.h>
+#include <asm/tlbdebug.h>
#undef CONFIG_MIPS_MT
#include <asm/r4kcache.h>
@@ -32,22 +33,10 @@
#define KVM_GUEST_PC_TLB 0
#define KVM_GUEST_SP_TLB 1
-#define PRIx64 "llx"
-
atomic_t kvm_mips_instance;
EXPORT_SYMBOL_GPL(kvm_mips_instance);
-/* These function pointers are initialized once the KVM module is loaded */
-kvm_pfn_t (*kvm_mips_gfn_to_pfn)(struct kvm *kvm, gfn_t gfn);
-EXPORT_SYMBOL_GPL(kvm_mips_gfn_to_pfn);
-
-void (*kvm_mips_release_pfn_clean)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_release_pfn_clean);
-
-bool (*kvm_mips_is_error_pfn)(kvm_pfn_t pfn);
-EXPORT_SYMBOL_GPL(kvm_mips_is_error_pfn);
-
-uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
@@ -55,7 +44,7 @@ uint32_t kvm_mips_get_kernel_asid(struct kvm_vcpu *vcpu)
cpu_asid_mask(&cpu_data[cpu]);
}
-uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
+static u32 kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
{
int cpu = smp_processor_id();
@@ -63,7 +52,7 @@ uint32_t kvm_mips_get_user_asid(struct kvm_vcpu *vcpu)
cpu_asid_mask(&cpu_data[cpu]);
}
-inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
+inline u32 kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
{
return vcpu->kvm->arch.commpage_tlb;
}
@@ -72,50 +61,15 @@ inline uint32_t kvm_mips_get_commpage_asid(struct kvm_vcpu *vcpu)
void kvm_mips_dump_host_tlbs(void)
{
- unsigned long old_entryhi;
- unsigned long old_pagemask;
- struct kvm_mips_tlb tlb;
unsigned long flags;
- int i;
local_irq_save(flags);
- old_entryhi = read_c0_entryhi();
- old_pagemask = read_c0_pagemask();
-
kvm_info("HOST TLBs:\n");
- kvm_info("ASID: %#lx\n", read_c0_entryhi() &
- cpu_asid_mask(&current_cpu_data));
-
- for (i = 0; i < current_cpu_data.tlbsize; i++) {
- write_c0_index(i);
- mtc0_tlbw_hazard();
-
- tlb_read();
- tlbw_use_hazard();
+ dump_tlb_regs();
+ pr_info("\n");
+ dump_tlb_all();
- tlb.tlb_hi = read_c0_entryhi();
- tlb.tlb_lo0 = read_c0_entrylo0();
- tlb.tlb_lo1 = read_c0_entrylo1();
- tlb.tlb_mask = read_c0_pagemask();
-
- kvm_info("TLB%c%3d Hi 0x%08lx ",
- (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
- i, tlb.tlb_hi);
- kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
- (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo0 >> 3) & 7);
- kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
- (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
- }
- write_c0_entryhi(old_entryhi);
- write_c0_pagemask(old_pagemask);
- mtc0_tlbw_hazard();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kvm_mips_dump_host_tlbs);
@@ -132,74 +86,24 @@ void kvm_mips_dump_guest_tlbs(struct kvm_vcpu *vcpu)
for (i = 0; i < KVM_MIPS_GUEST_TLB_SIZE; i++) {
tlb = vcpu->arch.guest_tlb[i];
kvm_info("TLB%c%3d Hi 0x%08lx ",
- (tlb.tlb_lo0 | tlb.tlb_lo1) & MIPS3_PG_V ? ' ' : '*',
+ (tlb.tlb_lo[0] | tlb.tlb_lo[1]) & ENTRYLO_V
+ ? ' ' : '*',
i, tlb.tlb_hi);
- kvm_info("Lo0=0x%09" PRIx64 " %c%c attr %lx ",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo0),
- (tlb.tlb_lo0 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo0 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo0 >> 3) & 7);
- kvm_info("Lo1=0x%09" PRIx64 " %c%c attr %lx sz=%lx\n",
- (uint64_t) mips3_tlbpfn_to_paddr(tlb.tlb_lo1),
- (tlb.tlb_lo1 & MIPS3_PG_D) ? 'D' : ' ',
- (tlb.tlb_lo1 & MIPS3_PG_G) ? 'G' : ' ',
- (tlb.tlb_lo1 >> 3) & 7, tlb.tlb_mask);
+ kvm_info("Lo0=0x%09llx %c%c attr %lx ",
+ (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[0]),
+ (tlb.tlb_lo[0] & ENTRYLO_D) ? 'D' : ' ',
+ (tlb.tlb_lo[0] & ENTRYLO_G) ? 'G' : ' ',
+ (tlb.tlb_lo[0] & ENTRYLO_C) >> ENTRYLO_C_SHIFT);
+ kvm_info("Lo1=0x%09llx %c%c attr %lx sz=%lx\n",
+ (u64) mips3_tlbpfn_to_paddr(tlb.tlb_lo[1]),
+ (tlb.tlb_lo[1] & ENTRYLO_D) ? 'D' : ' ',
+ (tlb.tlb_lo[1] & ENTRYLO_G) ? 'G' : ' ',
+ (tlb.tlb_lo[1] & ENTRYLO_C) >> ENTRYLO_C_SHIFT,
+ tlb.tlb_mask);
}
}
EXPORT_SYMBOL_GPL(kvm_mips_dump_guest_tlbs);
-static int kvm_mips_map_page(struct kvm *kvm, gfn_t gfn)
-{
- int srcu_idx, err = 0;
- kvm_pfn_t pfn;
-
- if (kvm->arch.guest_pmap[gfn] != KVM_INVALID_PAGE)
- return 0;
-
- srcu_idx = srcu_read_lock(&kvm->srcu);
- pfn = kvm_mips_gfn_to_pfn(kvm, gfn);
-
- if (kvm_mips_is_error_pfn(pfn)) {
- kvm_err("Couldn't get pfn for gfn %#" PRIx64 "!\n", gfn);
- err = -EFAULT;
- goto out;
- }
-
- kvm->arch.guest_pmap[gfn] = pfn;
-out:
- srcu_read_unlock(&kvm->srcu, srcu_idx);
- return err;
-}
-
-/* Translate guest KSEG0 addresses to Host PA */
-unsigned long kvm_mips_translate_guest_kseg0_to_hpa(struct kvm_vcpu *vcpu,
- unsigned long gva)
-{
- gfn_t gfn;
- uint32_t offset = gva & ~PAGE_MASK;
- struct kvm *kvm = vcpu->kvm;
-
- if (KVM_GUEST_KSEGX(gva) != KVM_GUEST_KSEG0) {
- kvm_err("%s/%p: Invalid gva: %#lx\n", __func__,
- __builtin_return_address(0), gva);
- return KVM_INVALID_PAGE;
- }
-
- gfn = (KVM_GUEST_CPHYSADDR(gva) >> PAGE_SHIFT);
-
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx, GVA: %#lx\n", __func__, gfn,
- gva);
- return KVM_INVALID_PAGE;
- }
-
- if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
- return KVM_INVALID_ADDR;
-
- return (kvm->arch.guest_pmap[gfn] << PAGE_SHIFT) + offset;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_translate_guest_kseg0_to_hpa);
-
/* XXXKYMA: Must be called with interrupts disabled */
/* set flush_dcache_mask == 0 if no dcache flush required */
int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
@@ -243,12 +147,12 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
/* Flush D-cache */
if (flush_dcache_mask) {
- if (entrylo0 & MIPS3_PG_V) {
+ if (entrylo0 & ENTRYLO_V) {
++vcpu->stat.flush_dcache_exits;
flush_data_cache_page((entryhi & VPN2_MASK) &
~flush_dcache_mask);
}
- if (entrylo1 & MIPS3_PG_V) {
+ if (entrylo1 & ENTRYLO_V) {
++vcpu->stat.flush_dcache_exits;
flush_data_cache_page(((entryhi & VPN2_MASK) &
~flush_dcache_mask) |
@@ -259,96 +163,35 @@ int kvm_mips_host_tlb_write(struct kvm_vcpu *vcpu, unsigned long entryhi,
/* Restore old ASID */
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
return 0;
}
-
-/* XXXKYMA: Must be called with interrupts disabled */
-int kvm_mips_handle_kseg0_tlb_fault(unsigned long badvaddr,
- struct kvm_vcpu *vcpu)
-{
- gfn_t gfn;
- kvm_pfn_t pfn0, pfn1;
- unsigned long vaddr = 0;
- unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
- int even;
- struct kvm *kvm = vcpu->kvm;
- const int flush_dcache_mask = 0;
- int ret;
-
- if (KVM_GUEST_KSEGX(badvaddr) != KVM_GUEST_KSEG0) {
- kvm_err("%s: Invalid BadVaddr: %#lx\n", __func__, badvaddr);
- kvm_mips_dump_host_tlbs();
- return -1;
- }
-
- gfn = (KVM_GUEST_CPHYSADDR(badvaddr) >> PAGE_SHIFT);
- if (gfn >= kvm->arch.guest_pmap_npages) {
- kvm_err("%s: Invalid gfn: %#llx, BadVaddr: %#lx\n", __func__,
- gfn, badvaddr);
- kvm_mips_dump_host_tlbs();
- return -1;
- }
- even = !(gfn & 0x1);
- vaddr = badvaddr & (PAGE_MASK << 1);
-
- if (kvm_mips_map_page(vcpu->kvm, gfn) < 0)
- return -1;
-
- if (kvm_mips_map_page(vcpu->kvm, gfn ^ 0x1) < 0)
- return -1;
-
- if (even) {
- pfn0 = kvm->arch.guest_pmap[gfn];
- pfn1 = kvm->arch.guest_pmap[gfn ^ 0x1];
- } else {
- pfn0 = kvm->arch.guest_pmap[gfn ^ 0x1];
- pfn1 = kvm->arch.guest_pmap[gfn];
- }
-
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
- (1 << 2) | (0x1 << 1);
- entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
- (1 << 2) | (0x1 << 1);
-
- preempt_disable();
- entryhi = (vaddr | kvm_mips_get_kernel_asid(vcpu));
- ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- flush_dcache_mask);
- preempt_enable();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_kseg0_tlb_fault);
+EXPORT_SYMBOL_GPL(kvm_mips_host_tlb_write);
int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
struct kvm_vcpu *vcpu)
{
- kvm_pfn_t pfn0, pfn1;
+ kvm_pfn_t pfn;
unsigned long flags, old_entryhi = 0, vaddr = 0;
- unsigned long entrylo0 = 0, entrylo1 = 0;
+ unsigned long entrylo[2] = { 0, 0 };
+ unsigned int pair_idx;
- pfn0 = CPHYSADDR(vcpu->arch.kseg0_commpage) >> PAGE_SHIFT;
- pfn1 = 0;
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
- (1 << 2) | (0x1 << 1);
- entrylo1 = 0;
+ pfn = PFN_DOWN(virt_to_phys(vcpu->arch.kseg0_commpage));
+ pair_idx = (badvaddr >> PAGE_SHIFT) & 1;
+ entrylo[pair_idx] = mips3_paddr_to_tlbpfn(pfn << PAGE_SHIFT) |
+ ((_page_cachable_default >> _CACHE_SHIFT) << ENTRYLO_C_SHIFT) |
+ ENTRYLO_D | ENTRYLO_V;
local_irq_save(flags);
old_entryhi = read_c0_entryhi();
vaddr = badvaddr & (PAGE_MASK << 1);
write_c0_entryhi(vaddr | kvm_mips_get_kernel_asid(vcpu));
- mtc0_tlbw_hazard();
- write_c0_entrylo0(entrylo0);
- mtc0_tlbw_hazard();
- write_c0_entrylo1(entrylo1);
- mtc0_tlbw_hazard();
+ write_c0_entrylo0(entrylo[0]);
+ write_c0_entrylo1(entrylo[1]);
write_c0_index(kvm_mips_get_commpage_asid(vcpu));
mtc0_tlbw_hazard();
tlb_write_indexed();
- mtc0_tlbw_hazard();
tlbw_use_hazard();
kvm_debug("@ %#lx idx: %2d [entryhi(R): %#lx] entrylo0 (R): 0x%08lx, entrylo1(R): 0x%08lx\n",
@@ -358,68 +201,12 @@ int kvm_mips_handle_commpage_tlb_fault(unsigned long badvaddr,
/* Restore old ASID */
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
return 0;
}
EXPORT_SYMBOL_GPL(kvm_mips_handle_commpage_tlb_fault);
-int kvm_mips_handle_mapped_seg_tlb_fault(struct kvm_vcpu *vcpu,
- struct kvm_mips_tlb *tlb,
- unsigned long *hpa0,
- unsigned long *hpa1)
-{
- unsigned long entryhi = 0, entrylo0 = 0, entrylo1 = 0;
- struct kvm *kvm = vcpu->kvm;
- kvm_pfn_t pfn0, pfn1;
- int ret;
-
- if ((tlb->tlb_hi & VPN2_MASK) == 0) {
- pfn0 = 0;
- pfn1 = 0;
- } else {
- if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
- >> PAGE_SHIFT) < 0)
- return -1;
-
- if (kvm_mips_map_page(kvm, mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
- >> PAGE_SHIFT) < 0)
- return -1;
-
- pfn0 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo0)
- >> PAGE_SHIFT];
- pfn1 = kvm->arch.guest_pmap[mips3_tlbpfn_to_paddr(tlb->tlb_lo1)
- >> PAGE_SHIFT];
- }
-
- if (hpa0)
- *hpa0 = pfn0 << PAGE_SHIFT;
-
- if (hpa1)
- *hpa1 = pfn1 << PAGE_SHIFT;
-
- /* Get attributes from the Guest TLB */
- entrylo0 = mips3_paddr_to_tlbpfn(pfn0 << PAGE_SHIFT) | (0x3 << 3) |
- (tlb->tlb_lo0 & MIPS3_PG_D) | (tlb->tlb_lo0 & MIPS3_PG_V);
- entrylo1 = mips3_paddr_to_tlbpfn(pfn1 << PAGE_SHIFT) | (0x3 << 3) |
- (tlb->tlb_lo1 & MIPS3_PG_D) | (tlb->tlb_lo1 & MIPS3_PG_V);
-
- kvm_debug("@ %#lx tlb_lo0: 0x%08lx tlb_lo1: 0x%08lx\n", vcpu->arch.pc,
- tlb->tlb_lo0, tlb->tlb_lo1);
-
- preempt_disable();
- entryhi = (tlb->tlb_hi & VPN2_MASK) | (KVM_GUEST_KERNEL_MODE(vcpu) ?
- kvm_mips_get_kernel_asid(vcpu) :
- kvm_mips_get_user_asid(vcpu));
- ret = kvm_mips_host_tlb_write(vcpu, entryhi, entrylo0, entrylo1,
- tlb->tlb_mask);
- preempt_enable();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(kvm_mips_handle_mapped_seg_tlb_fault);
-
int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
{
int i;
@@ -435,7 +222,7 @@ int kvm_mips_guest_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long entryhi)
}
kvm_debug("%s: entryhi: %#lx, index: %d lo0: %#lx, lo1: %#lx\n",
- __func__, entryhi, index, tlb[i].tlb_lo0, tlb[i].tlb_lo1);
+ __func__, entryhi, index, tlb[i].tlb_lo[0], tlb[i].tlb_lo[1]);
return index;
}
@@ -467,7 +254,6 @@ int kvm_mips_host_tlb_lookup(struct kvm_vcpu *vcpu, unsigned long vaddr)
/* Restore old ASID */
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
@@ -498,21 +284,16 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va)
if (idx > 0) {
write_c0_entryhi(UNIQUE_ENTRYHI(idx));
- mtc0_tlbw_hazard();
-
write_c0_entrylo0(0);
- mtc0_tlbw_hazard();
-
write_c0_entrylo1(0);
mtc0_tlbw_hazard();
tlb_write_indexed();
- mtc0_tlbw_hazard();
+ tlbw_use_hazard();
}
write_c0_entryhi(old_entryhi);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
@@ -540,61 +321,39 @@ void kvm_mips_flush_host_tlb(int skip_kseg0)
/* Blast 'em all away. */
for (entry = 0; entry < maxentry; entry++) {
write_c0_index(entry);
- mtc0_tlbw_hazard();
if (skip_kseg0) {
+ mtc0_tlbr_hazard();
tlb_read();
- tlbw_use_hazard();
+ tlb_read_hazard();
entryhi = read_c0_entryhi();
/* Don't blow away guest kernel entries */
if (KVM_GUEST_KSEGX(entryhi) == KVM_GUEST_KSEG0)
continue;
+
+ write_c0_pagemask(old_pagemask);
}
/* Make sure all entries differ. */
write_c0_entryhi(UNIQUE_ENTRYHI(entry));
- mtc0_tlbw_hazard();
write_c0_entrylo0(0);
- mtc0_tlbw_hazard();
write_c0_entrylo1(0);
mtc0_tlbw_hazard();
tlb_write_indexed();
- mtc0_tlbw_hazard();
+ tlbw_use_hazard();
}
- tlbw_use_hazard();
-
write_c0_entryhi(old_entryhi);
write_c0_pagemask(old_pagemask);
mtc0_tlbw_hazard();
- tlbw_use_hazard();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kvm_mips_flush_host_tlb);
-void kvm_get_new_mmu_context(struct mm_struct *mm, unsigned long cpu,
- struct kvm_vcpu *vcpu)
-{
- unsigned long asid = asid_cache(cpu);
-
- asid += cpu_asid_inc();
- if (!(asid & cpu_asid_mask(&cpu_data[cpu]))) {
- if (cpu_has_vtag_icache)
- flush_icache_all();
-
- kvm_local_flush_tlb_all(); /* start new asid cycle */
-
- if (!asid) /* fix version if needed */
- asid = asid_first_version(cpu);
- }
-
- cpu_context(cpu, mm) = asid_cache(cpu) = asid;
-}
-
void kvm_local_flush_tlb_all(void)
{
unsigned long flags;
@@ -614,185 +373,12 @@ void kvm_local_flush_tlb_all(void)
write_c0_index(entry);
mtc0_tlbw_hazard();
tlb_write_indexed();
+ tlbw_use_hazard();
entry++;
}
- tlbw_use_hazard();
write_c0_entryhi(old_ctx);
mtc0_tlbw_hazard();
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(kvm_local_flush_tlb_all);
-
-/**
- * kvm_mips_migrate_count() - Migrate timer.
- * @vcpu: Virtual CPU.
- *
- * Migrate CP0_Count hrtimer to the current CPU by cancelling and restarting it
- * if it was running prior to being cancelled.
- *
- * Must be called when the VCPU is migrated to a different CPU to ensure that
- * timer expiry during guest execution interrupts the guest and causes the
- * interrupt to be delivered in a timely manner.
- */
-static void kvm_mips_migrate_count(struct kvm_vcpu *vcpu)
-{
- if (hrtimer_cancel(&vcpu->arch.comparecount_timer))
- hrtimer_restart(&vcpu->arch.comparecount_timer);
-}
-
-/* Restore ASID once we are scheduled back after preemption */
-void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
-{
- unsigned long asid_mask = cpu_asid_mask(&cpu_data[cpu]);
- unsigned long flags;
- int newasid = 0;
-
- kvm_debug("%s: vcpu %p, cpu: %d\n", __func__, vcpu, cpu);
-
- /* Allocate new kernel and user ASIDs if needed */
-
- local_irq_save(flags);
-
- if ((vcpu->arch.guest_kernel_asid[cpu] ^ asid_cache(cpu)) &
- asid_version_mask(cpu)) {
- kvm_get_new_mmu_context(&vcpu->arch.guest_kernel_mm, cpu, vcpu);
- vcpu->arch.guest_kernel_asid[cpu] =
- vcpu->arch.guest_kernel_mm.context.asid[cpu];
- kvm_get_new_mmu_context(&vcpu->arch.guest_user_mm, cpu, vcpu);
- vcpu->arch.guest_user_asid[cpu] =
- vcpu->arch.guest_user_mm.context.asid[cpu];
- newasid++;
-
- kvm_debug("[%d]: cpu_context: %#lx\n", cpu,
- cpu_context(cpu, current->mm));
- kvm_debug("[%d]: Allocated new ASID for Guest Kernel: %#x\n",
- cpu, vcpu->arch.guest_kernel_asid[cpu]);
- kvm_debug("[%d]: Allocated new ASID for Guest User: %#x\n", cpu,
- vcpu->arch.guest_user_asid[cpu]);
- }
-
- if (vcpu->arch.last_sched_cpu != cpu) {
- kvm_debug("[%d->%d]KVM VCPU[%d] switch\n",
- vcpu->arch.last_sched_cpu, cpu, vcpu->vcpu_id);
- /*
- * Migrate the timer interrupt to the current CPU so that it
- * always interrupts the guest and synchronously triggers a
- * guest timer interrupt.
- */
- kvm_mips_migrate_count(vcpu);
- }
-
- if (!newasid) {
- /*
- * If we preempted while the guest was executing, then reload
- * the pre-empted ASID
- */
- if (current->flags & PF_VCPU) {
- write_c0_entryhi(vcpu->arch.
- preempt_entryhi & asid_mask);
- ehb();
- }
- } else {
- /* New ASIDs were allocated for the VM */
-
- /*
- * Were we in guest context? If so then the pre-empted ASID is
- * no longer valid, we need to set it to what it should be based
- * on the mode of the Guest (Kernel/User)
- */
- if (current->flags & PF_VCPU) {
- if (KVM_GUEST_KERNEL_MODE(vcpu))
- write_c0_entryhi(vcpu->arch.
- guest_kernel_asid[cpu] &
- asid_mask);
- else
- write_c0_entryhi(vcpu->arch.
- guest_user_asid[cpu] &
- asid_mask);
- ehb();
- }
- }
-
- /* restore guest state to registers */
- kvm_mips_callbacks->vcpu_set_regs(vcpu);
-
- local_irq_restore(flags);
-
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_load);
-
-/* ASID can change if another task is scheduled during preemption */
-void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
-{
- unsigned long flags;
- uint32_t cpu;
-
- local_irq_save(flags);
-
- cpu = smp_processor_id();
-
- vcpu->arch.preempt_entryhi = read_c0_entryhi();
- vcpu->arch.last_sched_cpu = cpu;
-
- /* save guest state in registers */
- kvm_mips_callbacks->vcpu_get_regs(vcpu);
-
- if (((cpu_context(cpu, current->mm) ^ asid_cache(cpu)) &
- asid_version_mask(cpu))) {
- kvm_debug("%s: Dropping MMU Context: %#lx\n", __func__,
- cpu_context(cpu, current->mm));
- drop_mmu_context(current->mm, cpu);
- }
- write_c0_entryhi(cpu_asid(cpu, current->mm));
- ehb();
-
- local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(kvm_arch_vcpu_put);
-
-uint32_t kvm_get_inst(uint32_t *opc, struct kvm_vcpu *vcpu)
-{
- struct mips_coproc *cop0 = vcpu->arch.cop0;
- unsigned long paddr, flags, vpn2, asid;
- uint32_t inst;
- int index;
-
- if (KVM_GUEST_KSEGX((unsigned long) opc) < KVM_GUEST_KSEG0 ||
- KVM_GUEST_KSEGX((unsigned long) opc) == KVM_GUEST_KSEG23) {
- local_irq_save(flags);
- index = kvm_mips_host_tlb_lookup(vcpu, (unsigned long) opc);
- if (index >= 0) {
- inst = *(opc);
- } else {
- vpn2 = (unsigned long) opc & VPN2_MASK;
- asid = kvm_read_c0_guest_entryhi(cop0) &
- KVM_ENTRYHI_ASID;
- index = kvm_mips_guest_tlb_lookup(vcpu, vpn2 | asid);
- if (index < 0) {
- kvm_err("%s: get_user_failed for %p, vcpu: %p, ASID: %#lx\n",
- __func__, opc, vcpu, read_c0_entryhi());
- kvm_mips_dump_host_tlbs();
- local_irq_restore(flags);
- return KVM_INVALID_INST;
- }
- kvm_mips_handle_mapped_seg_tlb_fault(vcpu,
- &vcpu->arch.
- guest_tlb[index],
- NULL, NULL);
- inst = *(opc);
- }
- local_irq_restore(flags);
- } else if (KVM_GUEST_KSEGX(opc) == KVM_GUEST_KSEG0) {
- paddr =
- kvm_mips_translate_guest_kseg0_to_hpa(vcpu,
- (unsigned long) opc);
- inst = *(uint32_t *) CKSEG0ADDR(paddr);
- } else {
- kvm_err("%s: illegal address: %p\n", __func__, opc);
- return KVM_INVALID_INST;
- }
-
- return inst;
-}
-EXPORT_SYMBOL_GPL(kvm_get_inst);
diff --git a/arch/mips/kvm/trace.h b/arch/mips/kvm/trace.h
index bd6437f67dc0..c858cf168078 100644
--- a/arch/mips/kvm/trace.h
+++ b/arch/mips/kvm/trace.h
@@ -17,8 +17,75 @@
#define TRACE_INCLUDE_PATH .
#define TRACE_INCLUDE_FILE trace
-/* Tracepoints for VM eists */
-extern char *kvm_mips_exit_types_str[MAX_KVM_MIPS_EXIT_TYPES];
+/*
+ * Tracepoints for VM enters
+ */
+DECLARE_EVENT_CLASS(kvm_transition,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu),
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.pc;
+ ),
+
+ TP_printk("PC: 0x%08lx",
+ __entry->pc)
+);
+
+DEFINE_EVENT(kvm_transition, kvm_enter,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_reenter,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu));
+
+DEFINE_EVENT(kvm_transition, kvm_out,
+ TP_PROTO(struct kvm_vcpu *vcpu),
+ TP_ARGS(vcpu));
+
+/* The first 32 exit reasons correspond to Cause.ExcCode */
+#define KVM_TRACE_EXIT_INT 0
+#define KVM_TRACE_EXIT_TLBMOD 1
+#define KVM_TRACE_EXIT_TLBMISS_LD 2
+#define KVM_TRACE_EXIT_TLBMISS_ST 3
+#define KVM_TRACE_EXIT_ADDRERR_LD 4
+#define KVM_TRACE_EXIT_ADDRERR_ST 5
+#define KVM_TRACE_EXIT_SYSCALL 8
+#define KVM_TRACE_EXIT_BREAK_INST 9
+#define KVM_TRACE_EXIT_RESVD_INST 10
+#define KVM_TRACE_EXIT_COP_UNUSABLE 11
+#define KVM_TRACE_EXIT_TRAP_INST 13
+#define KVM_TRACE_EXIT_MSA_FPE 14
+#define KVM_TRACE_EXIT_FPE 15
+#define KVM_TRACE_EXIT_MSA_DISABLED 21
+/* Further exit reasons */
+#define KVM_TRACE_EXIT_WAIT 32
+#define KVM_TRACE_EXIT_CACHE 33
+#define KVM_TRACE_EXIT_SIGNAL 34
+
+/* Tracepoints for VM exits */
+#define kvm_trace_symbol_exit_types \
+ { KVM_TRACE_EXIT_INT, "Interrupt" }, \
+ { KVM_TRACE_EXIT_TLBMOD, "TLB Mod" }, \
+ { KVM_TRACE_EXIT_TLBMISS_LD, "TLB Miss (LD)" }, \
+ { KVM_TRACE_EXIT_TLBMISS_ST, "TLB Miss (ST)" }, \
+ { KVM_TRACE_EXIT_ADDRERR_LD, "Address Error (LD)" }, \
+ { KVM_TRACE_EXIT_ADDRERR_ST, "Address Err (ST)" }, \
+ { KVM_TRACE_EXIT_SYSCALL, "System Call" }, \
+ { KVM_TRACE_EXIT_BREAK_INST, "Break Inst" }, \
+ { KVM_TRACE_EXIT_RESVD_INST, "Reserved Inst" }, \
+ { KVM_TRACE_EXIT_COP_UNUSABLE, "COP0/1 Unusable" }, \
+ { KVM_TRACE_EXIT_TRAP_INST, "Trap Inst" }, \
+ { KVM_TRACE_EXIT_MSA_FPE, "MSA FPE" }, \
+ { KVM_TRACE_EXIT_FPE, "FPE" }, \
+ { KVM_TRACE_EXIT_MSA_DISABLED, "MSA Disabled" }, \
+ { KVM_TRACE_EXIT_WAIT, "WAIT" }, \
+ { KVM_TRACE_EXIT_CACHE, "CACHE" }, \
+ { KVM_TRACE_EXIT_SIGNAL, "Signal" }
TRACE_EVENT(kvm_exit,
TP_PROTO(struct kvm_vcpu *vcpu, unsigned int reason),
@@ -34,10 +101,173 @@ TRACE_EVENT(kvm_exit,
),
TP_printk("[%s]PC: 0x%08lx",
- kvm_mips_exit_types_str[__entry->reason],
+ __print_symbolic(__entry->reason,
+ kvm_trace_symbol_exit_types),
__entry->pc)
);
+#define KVM_TRACE_MFC0 0
+#define KVM_TRACE_MTC0 1
+#define KVM_TRACE_DMFC0 2
+#define KVM_TRACE_DMTC0 3
+#define KVM_TRACE_RDHWR 4
+
+#define KVM_TRACE_HWR_COP0 0
+#define KVM_TRACE_HWR_HWR 1
+
+#define KVM_TRACE_COP0(REG, SEL) ((KVM_TRACE_HWR_COP0 << 8) | \
+ ((REG) << 3) | (SEL))
+#define KVM_TRACE_HWR(REG, SEL) ((KVM_TRACE_HWR_HWR << 8) | \
+ ((REG) << 3) | (SEL))
+
+#define kvm_trace_symbol_hwr_ops \
+ { KVM_TRACE_MFC0, "MFC0" }, \
+ { KVM_TRACE_MTC0, "MTC0" }, \
+ { KVM_TRACE_DMFC0, "DMFC0" }, \
+ { KVM_TRACE_DMTC0, "DMTC0" }, \
+ { KVM_TRACE_RDHWR, "RDHWR" }
+
+#define kvm_trace_symbol_hwr_cop \
+ { KVM_TRACE_HWR_COP0, "COP0" }, \
+ { KVM_TRACE_HWR_HWR, "HWR" }
+
+#define kvm_trace_symbol_hwr_regs \
+ { KVM_TRACE_COP0( 0, 0), "Index" }, \
+ { KVM_TRACE_COP0( 2, 0), "EntryLo0" }, \
+ { KVM_TRACE_COP0( 3, 0), "EntryLo1" }, \
+ { KVM_TRACE_COP0( 4, 0), "Context" }, \
+ { KVM_TRACE_COP0( 4, 2), "UserLocal" }, \
+ { KVM_TRACE_COP0( 5, 0), "PageMask" }, \
+ { KVM_TRACE_COP0( 6, 0), "Wired" }, \
+ { KVM_TRACE_COP0( 7, 0), "HWREna" }, \
+ { KVM_TRACE_COP0( 8, 0), "BadVAddr" }, \
+ { KVM_TRACE_COP0( 9, 0), "Count" }, \
+ { KVM_TRACE_COP0(10, 0), "EntryHi" }, \
+ { KVM_TRACE_COP0(11, 0), "Compare" }, \
+ { KVM_TRACE_COP0(12, 0), "Status" }, \
+ { KVM_TRACE_COP0(12, 1), "IntCtl" }, \
+ { KVM_TRACE_COP0(12, 2), "SRSCtl" }, \
+ { KVM_TRACE_COP0(13, 0), "Cause" }, \
+ { KVM_TRACE_COP0(14, 0), "EPC" }, \
+ { KVM_TRACE_COP0(15, 0), "PRId" }, \
+ { KVM_TRACE_COP0(15, 1), "EBase" }, \
+ { KVM_TRACE_COP0(16, 0), "Config" }, \
+ { KVM_TRACE_COP0(16, 1), "Config1" }, \
+ { KVM_TRACE_COP0(16, 2), "Config2" }, \
+ { KVM_TRACE_COP0(16, 3), "Config3" }, \
+ { KVM_TRACE_COP0(16, 4), "Config4" }, \
+ { KVM_TRACE_COP0(16, 5), "Config5" }, \
+ { KVM_TRACE_COP0(16, 7), "Config7" }, \
+ { KVM_TRACE_COP0(26, 0), "ECC" }, \
+ { KVM_TRACE_COP0(30, 0), "ErrorEPC" }, \
+ { KVM_TRACE_COP0(31, 2), "KScratch1" }, \
+ { KVM_TRACE_COP0(31, 3), "KScratch2" }, \
+ { KVM_TRACE_COP0(31, 4), "KScratch3" }, \
+ { KVM_TRACE_COP0(31, 5), "KScratch4" }, \
+ { KVM_TRACE_COP0(31, 6), "KScratch5" }, \
+ { KVM_TRACE_COP0(31, 7), "KScratch6" }, \
+ { KVM_TRACE_HWR( 0, 0), "CPUNum" }, \
+ { KVM_TRACE_HWR( 1, 0), "SYNCI_Step" }, \
+ { KVM_TRACE_HWR( 2, 0), "CC" }, \
+ { KVM_TRACE_HWR( 3, 0), "CCRes" }, \
+ { KVM_TRACE_HWR(29, 0), "ULR" }
+
+TRACE_EVENT(kvm_hwr,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op, unsigned int reg,
+ unsigned long val),
+ TP_ARGS(vcpu, op, reg, val),
+ TP_STRUCT__entry(
+ __field(unsigned long, val)
+ __field(u16, reg)
+ __field(u8, op)
+ ),
+
+ TP_fast_assign(
+ __entry->val = val;
+ __entry->reg = reg;
+ __entry->op = op;
+ ),
+
+ TP_printk("%s %s (%s:%u:%u) 0x%08lx",
+ __print_symbolic(__entry->op,
+ kvm_trace_symbol_hwr_ops),
+ __print_symbolic(__entry->reg,
+ kvm_trace_symbol_hwr_regs),
+ __print_symbolic(__entry->reg >> 8,
+ kvm_trace_symbol_hwr_cop),
+ (__entry->reg >> 3) & 0x1f,
+ __entry->reg & 0x7,
+ __entry->val)
+);
+
+#define KVM_TRACE_AUX_RESTORE 0
+#define KVM_TRACE_AUX_SAVE 1
+#define KVM_TRACE_AUX_ENABLE 2
+#define KVM_TRACE_AUX_DISABLE 3
+#define KVM_TRACE_AUX_DISCARD 4
+
+#define KVM_TRACE_AUX_FPU 1
+#define KVM_TRACE_AUX_MSA 2
+#define KVM_TRACE_AUX_FPU_MSA 3
+
+#define kvm_trace_symbol_aux_op \
+ { KVM_TRACE_AUX_RESTORE, "restore" }, \
+ { KVM_TRACE_AUX_SAVE, "save" }, \
+ { KVM_TRACE_AUX_ENABLE, "enable" }, \
+ { KVM_TRACE_AUX_DISABLE, "disable" }, \
+ { KVM_TRACE_AUX_DISCARD, "discard" }
+
+#define kvm_trace_symbol_aux_state \
+ { KVM_TRACE_AUX_FPU, "FPU" }, \
+ { KVM_TRACE_AUX_MSA, "MSA" }, \
+ { KVM_TRACE_AUX_FPU_MSA, "FPU & MSA" }
+
+TRACE_EVENT(kvm_aux,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int op,
+ unsigned int state),
+ TP_ARGS(vcpu, op, state),
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ __field(u8, op)
+ __field(u8, state)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.pc;
+ __entry->op = op;
+ __entry->state = state;
+ ),
+
+ TP_printk("%s %s PC: 0x%08lx",
+ __print_symbolic(__entry->op,
+ kvm_trace_symbol_aux_op),
+ __print_symbolic(__entry->state,
+ kvm_trace_symbol_aux_state),
+ __entry->pc)
+);
+
+TRACE_EVENT(kvm_asid_change,
+ TP_PROTO(struct kvm_vcpu *vcpu, unsigned int old_asid,
+ unsigned int new_asid),
+ TP_ARGS(vcpu, old_asid, new_asid),
+ TP_STRUCT__entry(
+ __field(unsigned long, pc)
+ __field(u8, old_asid)
+ __field(u8, new_asid)
+ ),
+
+ TP_fast_assign(
+ __entry->pc = vcpu->arch.pc;
+ __entry->old_asid = old_asid;
+ __entry->new_asid = new_asid;
+ ),
+
+ TP_printk("PC: 0x%08lx old: 0x%02x new: 0x%02x",
+ __entry->pc,
+ __entry->old_asid,
+ __entry->new_asid)
+);
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/mips/kvm/trap_emul.c b/arch/mips/kvm/trap_emul.c
index 6ba0fafcecbc..091553942bcb 100644
--- a/arch/mips/kvm/trap_emul.c
+++ b/arch/mips/kvm/trap_emul.c
@@ -21,7 +21,7 @@
static gpa_t kvm_trap_emul_gva_to_gpa_cb(gva_t gva)
{
gpa_t gpa;
- uint32_t kseg = KSEGX(gva);
+ gva_t kseg = KSEGX(gva);
if ((kseg == CKSEG0) || (kseg == CKSEG1))
gpa = CPHYSADDR(gva);
@@ -40,8 +40,8 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -87,15 +87,15 @@ static int kvm_trap_emul_handle_cop_unusable(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
|| KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_debug("USER/KSEG23 ADDR TLB MOD fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
er = kvm_mips_handle_tlbmod(cause, opc, run, vcpu);
@@ -111,14 +111,14 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
* when we are not using HIGHMEM. Need to address this in a
* HIGHMEM kernel
*/
- kvm_err("TLB MOD fault not handled, cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("TLB MOD fault not handled, cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
kvm_mips_dump_host_tlbs();
kvm_arch_vcpu_dump_regs(vcpu);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
} else {
- kvm_err("Illegal TLB Mod fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("Illegal TLB Mod fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
kvm_mips_dump_host_tlbs();
kvm_arch_vcpu_dump_regs(vcpu);
@@ -128,59 +128,12 @@ static int kvm_trap_emul_handle_tlb_mod(struct kvm_vcpu *vcpu)
return ret;
}
-static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
-{
- struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
- enum emulation_result er = EMULATE_DONE;
- int ret = RESUME_GUEST;
-
- if (((badvaddr & PAGE_MASK) == KVM_GUEST_COMMPAGE_ADDR)
- && KVM_GUEST_KERNEL_MODE(vcpu)) {
- if (kvm_mips_handle_commpage_tlb_fault(badvaddr, vcpu) < 0) {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- } else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
- || KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER ADDR TLB LD fault: cause %#lx, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- er = kvm_mips_handle_tlbmiss(cause, opc, run, vcpu);
- if (er == EMULATE_DONE)
- ret = RESUME_GUEST;
- else {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- } else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
- /*
- * All KSEG0 faults are handled by KVM, as the guest kernel does
- * not expect to ever get them
- */
- if (kvm_mips_handle_kseg0_tlb_fault
- (vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- } else {
- kvm_err("Illegal TLB LD fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
- kvm_mips_dump_host_tlbs();
- kvm_arch_vcpu_dump_regs(vcpu);
- run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
- ret = RESUME_HOST;
- }
- return ret;
-}
-
-static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+static int kvm_trap_emul_handle_tlb_miss(struct kvm_vcpu *vcpu, bool store)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -192,8 +145,8 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
}
} else if (KVM_GUEST_KSEGX(badvaddr) < KVM_GUEST_KSEG0
|| KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG23) {
- kvm_debug("USER ADDR TLB ST fault: PC: %#lx, BadVaddr: %#lx\n",
- vcpu->arch.pc, badvaddr);
+ kvm_debug("USER ADDR TLB %s fault: cause %#x, PC: %p, BadVaddr: %#lx\n",
+ store ? "ST" : "LD", cause, opc, badvaddr);
/*
* User Address (UA) fault, this could happen if
@@ -213,14 +166,18 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
ret = RESUME_HOST;
}
} else if (KVM_GUEST_KSEGX(badvaddr) == KVM_GUEST_KSEG0) {
+ /*
+ * All KSEG0 faults are handled by KVM, as the guest kernel does
+ * not expect to ever get them
+ */
if (kvm_mips_handle_kseg0_tlb_fault
(vcpu->arch.host_cp0_badvaddr, vcpu) < 0) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
}
} else {
- kvm_err("Illegal TLB ST fault address , cause %#lx, PC: %p, BadVaddr: %#lx\n",
- cause, opc, badvaddr);
+ kvm_err("Illegal TLB %s fault address , cause %#x, PC: %p, BadVaddr: %#lx\n",
+ store ? "ST" : "LD", cause, opc, badvaddr);
kvm_mips_dump_host_tlbs();
kvm_arch_vcpu_dump_regs(vcpu);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
@@ -229,12 +186,22 @@ static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
return ret;
}
+static int kvm_trap_emul_handle_tlb_st_miss(struct kvm_vcpu *vcpu)
+{
+ return kvm_trap_emul_handle_tlb_miss(vcpu, true);
+}
+
+static int kvm_trap_emul_handle_tlb_ld_miss(struct kvm_vcpu *vcpu)
+{
+ return kvm_trap_emul_handle_tlb_miss(vcpu, false);
+}
+
static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -251,7 +218,7 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
ret = RESUME_HOST;
}
} else {
- kvm_err("Address Error (STORE): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("Address Error (STORE): cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
@@ -262,9 +229,9 @@ static int kvm_trap_emul_handle_addr_err_st(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
unsigned long badvaddr = vcpu->arch.host_cp0_badvaddr;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -280,7 +247,7 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
ret = RESUME_HOST;
}
} else {
- kvm_err("Address Error (LOAD): cause %#lx, PC: %p, BadVaddr: %#lx\n",
+ kvm_err("Address Error (LOAD): cause %#x, PC: %p, BadVaddr: %#lx\n",
cause, opc, badvaddr);
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
ret = RESUME_HOST;
@@ -292,8 +259,8 @@ static int kvm_trap_emul_handle_addr_err_ld(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -310,8 +277,8 @@ static int kvm_trap_emul_handle_syscall(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -328,8 +295,8 @@ static int kvm_trap_emul_handle_res_inst(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -346,8 +313,8 @@ static int kvm_trap_emul_handle_break(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -364,8 +331,8 @@ static int kvm_trap_emul_handle_trap(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -382,8 +349,8 @@ static int kvm_trap_emul_handle_msa_fpe(struct kvm_vcpu *vcpu)
static int kvm_trap_emul_handle_fpe(struct kvm_vcpu *vcpu)
{
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *)vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *)vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -407,8 +374,8 @@ static int kvm_trap_emul_handle_msa_disabled(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
struct kvm_run *run = vcpu->run;
- uint32_t __user *opc = (uint32_t __user *) vcpu->arch.pc;
- unsigned long cause = vcpu->arch.host_cp0_cause;
+ u32 __user *opc = (u32 __user *) vcpu->arch.pc;
+ u32 cause = vcpu->arch.host_cp0_cause;
enum emulation_result er = EMULATE_DONE;
int ret = RESUME_GUEST;
@@ -451,24 +418,41 @@ static int kvm_trap_emul_vm_init(struct kvm *kvm)
static int kvm_trap_emul_vcpu_init(struct kvm_vcpu *vcpu)
{
+ vcpu->arch.kscratch_enabled = 0xfc;
+
return 0;
}
static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
{
struct mips_coproc *cop0 = vcpu->arch.cop0;
- uint32_t config1;
+ u32 config, config1;
int vcpu_id = vcpu->vcpu_id;
/*
* Arch specific stuff, set up config registers properly so that the
- * guest will come up as expected, for now we simulate a MIPS 24kc
+ * guest will come up as expected
*/
+#ifndef CONFIG_CPU_MIPSR6
+ /* r2-r5, simulate a MIPS 24kc */
kvm_write_c0_guest_prid(cop0, 0x00019300);
- /* Have config1, Cacheable, noncoherent, write-back, write allocate */
- kvm_write_c0_guest_config(cop0, MIPS_CONF_M | (0x3 << CP0C0_K0) |
- (0x1 << CP0C0_AR) |
- (MMU_TYPE_R4000 << CP0C0_MT));
+#else
+ /* r6+, simulate a generic QEMU machine */
+ kvm_write_c0_guest_prid(cop0, 0x00010000);
+#endif
+ /*
+ * Have config1, Cacheable, noncoherent, write-back, write allocate.
+ * Endianness, arch revision & virtually tagged icache should match
+ * host.
+ */
+ config = read_c0_config() & MIPS_CONF_AR;
+ config |= MIPS_CONF_M | CONF_CM_CACHABLE_NONCOHERENT | MIPS_CONF_MT_TLB;
+#ifdef CONFIG_CPU_BIG_ENDIAN
+ config |= CONF_BE;
+#endif
+ if (cpu_has_vtag_icache)
+ config |= MIPS_CONF_VI;
+ kvm_write_c0_guest_config(cop0, config);
/* Read the cache characteristics from the host Config1 Register */
config1 = (read_c0_config1() & ~0x7f);
@@ -478,9 +462,8 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
config1 |= ((KVM_MIPS_GUEST_TLB_SIZE - 1) << 25);
/* We unset some bits that we aren't emulating */
- config1 &=
- ~((1 << CP0C1_C2) | (1 << CP0C1_MD) | (1 << CP0C1_PC) |
- (1 << CP0C1_WR) | (1 << CP0C1_CA));
+ config1 &= ~(MIPS_CONF1_C2 | MIPS_CONF1_MD | MIPS_CONF1_PC |
+ MIPS_CONF1_WR | MIPS_CONF1_CA);
kvm_write_c0_guest_config1(cop0, config1);
/* Have config3, no tertiary/secondary caches implemented */
@@ -511,6 +494,17 @@ static int kvm_trap_emul_vcpu_setup(struct kvm_vcpu *vcpu)
return 0;
}
+static unsigned long kvm_trap_emul_num_regs(struct kvm_vcpu *vcpu)
+{
+ return 0;
+}
+
+static int kvm_trap_emul_copy_reg_indices(struct kvm_vcpu *vcpu,
+ u64 __user *indices)
+{
+ return 0;
+}
+
static int kvm_trap_emul_get_one_reg(struct kvm_vcpu *vcpu,
const struct kvm_one_reg *reg,
s64 *v)
@@ -660,6 +654,8 @@ static struct kvm_mips_callbacks kvm_trap_emul_callbacks = {
.dequeue_io_int = kvm_mips_dequeue_io_int_cb,
.irq_deliver = kvm_mips_irq_deliver_cb,
.irq_clear = kvm_mips_irq_clear_cb,
+ .num_regs = kvm_trap_emul_num_regs,
+ .copy_reg_indices = kvm_trap_emul_copy_reg_indices,
.get_one_reg = kvm_trap_emul_get_one_reg,
.set_one_reg = kvm_trap_emul_set_one_reg,
.vcpu_get_regs = kvm_trap_emul_vcpu_get_regs,
diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c
index d96e912b9d44..6dc07fba187f 100644
--- a/arch/mips/math-emu/cp1emu.c
+++ b/arch/mips/math-emu/cp1emu.c
@@ -627,8 +627,8 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
dec_insn.pc_inc +
dec_insn.next_pc_inc;
return 1;
- case cbcond0_op:
- case cbcond1_op:
+ case pop10_op:
+ case pop30_op:
if (!cpu_has_mips_r6)
break;
if (insn.i_format.rt && !insn.i_format.rs)
@@ -683,14 +683,14 @@ static int isBranchInstr(struct pt_regs *regs, struct mm_decoded_insn dec_insn,
dec_insn.next_pc_inc;
return 1;
- case beqzcjic_op:
+ case pop66_op:
if (!cpu_has_mips_r6)
break;
*contpc = regs->cp0_epc + dec_insn.pc_inc +
dec_insn.next_pc_inc;
return 1;
- case bnezcjialc_op:
+ case pop76_op:
if (!cpu_has_mips_r6)
break;
if (!insn.i_format.rs)
diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c
index ef7f925dd1b0..7a9c345e87e5 100644
--- a/arch/mips/mm/c-r4k.c
+++ b/arch/mips/mm/c-r4k.c
@@ -1206,7 +1206,7 @@ static void probe_pcache(void)
c->icache.linesz;
c->icache.waybit = __ffs(icache_size/c->icache.ways);
- if (config & 0x8) /* VI bit */
+ if (config & MIPS_CONF_VI)
c->icache.flags |= MIPS_CACHE_VTAG;
/*
diff --git a/arch/mips/mm/uasm-micromips.c b/arch/mips/mm/uasm-micromips.c
index d78178daea4b..277cf52d80e1 100644
--- a/arch/mips/mm/uasm-micromips.c
+++ b/arch/mips/mm/uasm-micromips.c
@@ -53,8 +53,13 @@ static struct insn insn_table_MM[] = {
{ insn_bltzl, 0, 0 },
{ insn_bne, M(mm_bne32_op, 0, 0, 0, 0, 0), RT | RS | BIMM },
{ insn_cache, M(mm_pool32b_op, 0, 0, mm_cache_func, 0, 0), RT | RS | SIMM },
+ { insn_cfc1, M(mm_pool32f_op, 0, 0, 0, mm_cfc1_op, mm_32f_73_op), RT | RS },
+ { insn_cfcmsa, M(mm_pool32s_op, 0, msa_cfc_op, 0, 0, mm_32s_elm_op), RD | RE },
+ { insn_ctc1, M(mm_pool32f_op, 0, 0, 0, mm_ctc1_op, mm_32f_73_op), RT | RS },
+ { insn_ctcmsa, M(mm_pool32s_op, 0, msa_ctc_op, 0, 0, mm_32s_elm_op), RD | RE },
{ insn_daddu, 0, 0 },
{ insn_daddiu, 0, 0 },
+ { insn_di, M(mm_pool32a_op, 0, 0, 0, mm_di_op, mm_pool32axf_op), RS },
{ insn_divu, M(mm_pool32a_op, 0, 0, 0, mm_divu_op, mm_pool32axf_op), RT | RS },
{ insn_dmfc0, 0, 0 },
{ insn_dmtc0, 0, 0 },
@@ -84,6 +89,8 @@ static struct insn insn_table_MM[] = {
{ insn_mfhi, M(mm_pool32a_op, 0, 0, 0, mm_mfhi32_op, mm_pool32axf_op), RS },
{ insn_mflo, M(mm_pool32a_op, 0, 0, 0, mm_mflo32_op, mm_pool32axf_op), RS },
{ insn_mtc0, M(mm_pool32a_op, 0, 0, 0, mm_mtc0_op, mm_pool32axf_op), RT | RS | RD },
+ { insn_mthi, M(mm_pool32a_op, 0, 0, 0, mm_mthi32_op, mm_pool32axf_op), RS },
+ { insn_mtlo, M(mm_pool32a_op, 0, 0, 0, mm_mtlo32_op, mm_pool32axf_op), RS },
{ insn_mul, M(mm_pool32a_op, 0, 0, 0, 0, mm_mul_op), RT | RS | RD },
{ insn_or, M(mm_pool32a_op, 0, 0, 0, 0, mm_or32_op), RT | RS | RD },
{ insn_ori, M(mm_ori32_op, 0, 0, 0, 0, 0), RT | RS | UIMM },
@@ -166,13 +173,15 @@ static void build_insn(u32 **buf, enum opcode opc, ...)
op = ip->match;
va_start(ap, opc);
if (ip->fields & RS) {
- if (opc == insn_mfc0 || opc == insn_mtc0)
+ if (opc == insn_mfc0 || opc == insn_mtc0 ||
+ opc == insn_cfc1 || opc == insn_ctc1)
op |= build_rt(va_arg(ap, u32));
else
op |= build_rs(va_arg(ap, u32));
}
if (ip->fields & RT) {
- if (opc == insn_mfc0 || opc == insn_mtc0)
+ if (opc == insn_mfc0 || opc == insn_mtc0 ||
+ opc == insn_cfc1 || opc == insn_ctc1)
op |= build_rs(va_arg(ap, u32));
else
op |= build_rt(va_arg(ap, u32));
diff --git a/arch/mips/mm/uasm-mips.c b/arch/mips/mm/uasm-mips.c
index 9c2220a45189..cec524167822 100644
--- a/arch/mips/mm/uasm-mips.c
+++ b/arch/mips/mm/uasm-mips.c
@@ -67,9 +67,14 @@ static struct insn insn_table[] = {
#else
{ insn_cache, M6(cache_op, 0, 0, 0, cache6_op), RS | RT | SIMM9 },
#endif
+ { insn_cfc1, M(cop1_op, cfc_op, 0, 0, 0, 0), RT | RD },
+ { insn_cfcmsa, M(msa_op, 0, msa_cfc_op, 0, 0, msa_elm_op), RD | RE },
+ { insn_ctc1, M(cop1_op, ctc_op, 0, 0, 0, 0), RT | RD },
+ { insn_ctcmsa, M(msa_op, 0, msa_ctc_op, 0, 0, msa_elm_op), RD | RE },
{ insn_daddiu, M(daddiu_op, 0, 0, 0, 0, 0), RS | RT | SIMM },
{ insn_daddu, M(spec_op, 0, 0, 0, 0, daddu_op), RS | RT | RD },
{ insn_dinsm, M(spec3_op, 0, 0, 0, 0, dinsm_op), RS | RT | RD | RE },
+ { insn_di, M(cop0_op, mfmc0_op, 0, 12, 0, 0), RT },
{ insn_dins, M(spec3_op, 0, 0, 0, 0, dins_op), RS | RT | RD | RE },
{ insn_divu, M(spec_op, 0, 0, 0, 0, divu_op), RS | RT },
{ insn_dmfc0, M(cop0_op, dmfc_op, 0, 0, 0, 0), RT | RD | SET},
@@ -114,7 +119,13 @@ static struct insn insn_table[] = {
{ insn_mflo, M(spec_op, 0, 0, 0, 0, mflo_op), RD },
{ insn_mtc0, M(cop0_op, mtc_op, 0, 0, 0, 0), RT | RD | SET},
{ insn_mthc0, M(cop0_op, mthc0_op, 0, 0, 0, 0), RT | RD | SET},
+ { insn_mthi, M(spec_op, 0, 0, 0, 0, mthi_op), RS },
+ { insn_mtlo, M(spec_op, 0, 0, 0, 0, mtlo_op), RS },
+#ifndef CONFIG_CPU_MIPSR6
{ insn_mul, M(spec2_op, 0, 0, 0, 0, mul_op), RS | RT | RD},
+#else
+ { insn_mul, M(spec_op, 0, 0, 0, mult_mul_op, mult_op), RS | RT | RD},
+#endif
{ insn_ori, M(ori_op, 0, 0, 0, 0, 0), RS | RT | UIMM },
{ insn_or, M(spec_op, 0, 0, 0, 0, or_op), RS | RT | RD },
#ifndef CONFIG_CPU_MIPSR6
diff --git a/arch/mips/mm/uasm.c b/arch/mips/mm/uasm.c
index ad718debc35a..3e0282d301d6 100644
--- a/arch/mips/mm/uasm.c
+++ b/arch/mips/mm/uasm.c
@@ -49,18 +49,19 @@ enum opcode {
insn_invalid,
insn_addiu, insn_addu, insn_and, insn_andi, insn_bbit0, insn_bbit1,
insn_beq, insn_beql, insn_bgez, insn_bgezl, insn_bltz, insn_bltzl,
- insn_bne, insn_cache, insn_daddiu, insn_daddu, insn_dins, insn_dinsm,
- insn_divu, insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
+ insn_bne, insn_cache, insn_cfc1, insn_cfcmsa, insn_ctc1, insn_ctcmsa,
+ insn_daddiu, insn_daddu, insn_di, insn_dins, insn_dinsm, insn_divu,
+ insn_dmfc0, insn_dmtc0, insn_drotr, insn_drotr32, insn_dsll,
insn_dsll32, insn_dsra, insn_dsrl, insn_dsrl32, insn_dsubu, insn_eret,
insn_ext, insn_ins, insn_j, insn_jal, insn_jalr, insn_jr, insn_lb,
insn_ld, insn_ldx, insn_lh, insn_ll, insn_lld, insn_lui, insn_lw,
insn_lwx, insn_mfc0, insn_mfhc0, insn_mfhi, insn_mflo, insn_mtc0,
- insn_mthc0, insn_mul, insn_or, insn_ori, insn_pref, insn_rfe,
- insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll, insn_sllv, insn_slt,
- insn_sltiu, insn_sltu, insn_sra, insn_srl, insn_srlv, insn_subu,
- insn_sw, insn_sync, insn_syscall, insn_tlbp, insn_tlbr, insn_tlbwi,
- insn_tlbwr, insn_wait, insn_wsbh, insn_xor, insn_xori, insn_yield,
- insn_lddir, insn_ldpte,
+ insn_mthc0, insn_mthi, insn_mtlo, insn_mul, insn_or, insn_ori,
+ insn_pref, insn_rfe, insn_rotr, insn_sc, insn_scd, insn_sd, insn_sll,
+ insn_sllv, insn_slt, insn_sltiu, insn_sltu, insn_sra, insn_srl,
+ insn_srlv, insn_subu, insn_sw, insn_sync, insn_syscall, insn_tlbp,
+ insn_tlbr, insn_tlbwi, insn_tlbwr, insn_wait, insn_wsbh, insn_xor,
+ insn_xori, insn_yield, insn_lddir, insn_ldpte,
};
struct insn {
@@ -268,10 +269,15 @@ I_u1s2(_bltz)
I_u1s2(_bltzl)
I_u1u2s3(_bne)
I_u2s3u1(_cache)
+I_u1u2(_cfc1)
+I_u2u1(_cfcmsa)
+I_u1u2(_ctc1)
+I_u2u1(_ctcmsa)
I_u1u2u3(_dmfc0)
I_u1u2u3(_dmtc0)
I_u2u1s3(_daddiu)
I_u3u1u2(_daddu)
+I_u1(_di);
I_u1u2(_divu)
I_u2u1u3(_dsll)
I_u2u1u3(_dsll32)
@@ -301,6 +307,8 @@ I_u1(_mfhi)
I_u1(_mflo)
I_u1u2u3(_mtc0)
I_u1u2u3(_mthc0)
+I_u1(_mthi)
+I_u1(_mtlo)
I_u3u1u2(_mul)
I_u2u1u3(_ori)
I_u3u1u2(_or)
diff --git a/arch/mips/pci/pci.c b/arch/mips/pci/pci.c
index f1b11f0dea2d..b4c02f29663e 100644
--- a/arch/mips/pci/pci.c
+++ b/arch/mips/pci/pci.c
@@ -112,7 +112,14 @@ static void pcibios_scanbus(struct pci_controller *hose)
need_domain_info = 1;
}
- if (!pci_has_flag(PCI_PROBE_ONLY)) {
+ /*
+ * We insert PCI resources into the iomem_resource and
+ * ioport_resource trees in either pci_bus_claim_resources()
+ * or pci_bus_assign_resources().
+ */
+ if (pci_has_flag(PCI_PROBE_ONLY)) {
+ pci_bus_claim_resources(bus);
+ } else {
pci_bus_size_bridges(bus);
pci_bus_assign_resources(bus);
}
@@ -319,6 +326,16 @@ void pcibios_fixup_bus(struct pci_bus *bus)
EXPORT_SYMBOL(PCIBIOS_MIN_IO);
EXPORT_SYMBOL(PCIBIOS_MIN_MEM);
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+ const struct resource *rsrc, resource_size_t *start,
+ resource_size_t *end)
+{
+ phys_addr_t size = resource_size(rsrc);
+
+ *start = fixup_bigphys_addr(rsrc->start, size);
+ *end = rsrc->start + size;
+}
+
int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
enum pci_mmap_state mmap_state, int write_combine)
{
diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile
index 4cd612a6e272..1a2a6e8dc40d 100644
--- a/arch/powerpc/boot/Makefile
+++ b/arch/powerpc/boot/Makefile
@@ -43,7 +43,7 @@ ifeq ($(call cc-option-yn, -fstack-protector),y)
BOOTCFLAGS += -fno-stack-protector
endif
-BOOTCFLAGS += -I$(obj) -I$(srctree)/$(obj)
+BOOTCFLAGS += -I$(objtree)/$(obj) -I$(srctree)/$(obj)
DTC_FLAGS ?= -p 1024
diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
new file mode 100644
index 000000000000..88b4901ac4ee
--- /dev/null
+++ b/arch/powerpc/include/asm/hmi.h
@@ -0,0 +1,45 @@
+/*
+ * Hypervisor Maintenance Interrupt header file.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#ifndef __ASM_PPC64_HMI_H__
+#define __ASM_PPC64_HMI_H__
+
+#ifdef CONFIG_PPC_BOOK3S_64
+
+#define CORE_TB_RESYNC_REQ_BIT 63
+#define MAX_SUBCORE_PER_CORE 4
+
+/*
+ * sibling_subcore_state structure is used to co-ordinate all threads
+ * during HMI to avoid TB corruption. This structure is allocated once
+ * per each core and shared by all threads on that core.
+ */
+struct sibling_subcore_state {
+ unsigned long flags;
+ u8 in_guest[MAX_SUBCORE_PER_CORE];
+};
+
+extern void wait_for_subcore_guest_exit(void);
+extern void wait_for_tb_resync(void);
+#else
+static inline void wait_for_subcore_guest_exit(void) { }
+static inline void wait_for_tb_resync(void) { }
+#endif
+#endif /* __ASM_PPC64_HMI_H__ */
diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
index ad171e979ab0..148303e7771f 100644
--- a/arch/powerpc/include/asm/paca.h
+++ b/arch/powerpc/include/asm/paca.h
@@ -26,6 +26,7 @@
#include <asm/kvm_book3s_asm.h>
#endif
#include <asm/accounting.h>
+#include <asm/hmi.h>
register struct paca_struct *local_paca asm("r13");
@@ -182,6 +183,11 @@ struct paca_struct {
*/
u16 in_mce;
u8 hmi_event_available; /* HMI event is available */
+ /*
+ * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
+ * more details
+ */
+ struct sibling_subcore_state *sibling_subcore_state;
#endif
/* Stuff for accurate time accounting */
diff --git a/arch/powerpc/include/asm/pci.h b/arch/powerpc/include/asm/pci.h
index a6f3ac0d4602..e9bd6cf0212f 100644
--- a/arch/powerpc/include/asm/pci.h
+++ b/arch/powerpc/include/asm/pci.h
@@ -136,9 +136,6 @@ extern pgprot_t pci_phys_mem_access_prot(struct file *file,
pgprot_t prot);
#define HAVE_ARCH_PCI_RESOURCE_TO_USER
-extern void pci_resource_to_user(const struct pci_dev *dev, int bar,
- const struct resource *rsrc,
- resource_size_t *start, resource_size_t *end);
extern resource_size_t pcibios_io_space_offset(struct pci_controller *hose);
extern void pcibios_setup_bus_devices(struct pci_bus *bus);
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index fe4c075bcf50..b2027a5cf508 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/
obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o
obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o
-obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o
+obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o
obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o
obj-$(CONFIG_PPC64) += vdso64/
obj-$(CONFIG_ALTIVEC) += vecemu.o
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 6200e4925d26..694def6c9d61 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -671,6 +671,8 @@ BEGIN_FTR_SECTION
beq h_doorbell_common
cmpwi r3,0xea0
beq h_virt_irq_common
+ cmpwi r3,0xe60
+ beq hmi_exception_common
FTR_SECTION_ELSE
cmpwi r3,0xa00
beq doorbell_super_common
@@ -1172,7 +1174,7 @@ fwnmi_data_area:
.globl hmi_exception_early
hmi_exception_early:
- EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0xe60)
+ EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST, 0xe62)
mr r10,r1 /* Save r1 */
ld r1,PACAEMERGSP(r13) /* Use emergency stack */
subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */
diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kernel/hmi.c
new file mode 100644
index 000000000000..e3f738eb1cac
--- /dev/null
+++ b/arch/powerpc/kernel/hmi.c
@@ -0,0 +1,56 @@
+/*
+ * Hypervisor Maintenance Interrupt (HMI) handling.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.
+ *
+ * Copyright 2015 IBM Corporation
+ * Author: Mahesh Salgaonkar <mahesh@linux.vnet.ibm.com>
+ */
+
+#undef DEBUG
+
+#include <linux/types.h>
+#include <linux/compiler.h>
+#include <asm/paca.h>
+#include <asm/hmi.h>
+
+void wait_for_subcore_guest_exit(void)
+{
+ int i;
+
+ /*
+ * NULL bitmap pointer indicates that KVM module hasn't
+ * been loaded yet and hence no guests are running.
+ * If no KVM is in use, no need to co-ordinate among threads
+ * as all of them will always be in host and no one is going
+ * to modify TB other than the opal hmi handler.
+ * Hence, just return from here.
+ */
+ if (!local_paca->sibling_subcore_state)
+ return;
+
+ for (i = 0; i < MAX_SUBCORE_PER_CORE; i++)
+ while (local_paca->sibling_subcore_state->in_guest[i])
+ cpu_relax();
+}
+
+void wait_for_tb_resync(void)
+{
+ if (!local_paca->sibling_subcore_state)
+ return;
+
+ while (test_bit(CORE_TB_RESYNC_REQ_BIT,
+ &local_paca->sibling_subcore_state->flags))
+ cpu_relax();
+}
diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S
index 335eb6cedae5..8a56a51fc0cb 100644
--- a/arch/powerpc/kernel/idle_book3s.S
+++ b/arch/powerpc/kernel/idle_book3s.S
@@ -336,7 +336,9 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \
ld r2,PACATOC(r13); \
ld r1,PACAR1(r13); \
std r3,ORIG_GPR3(r1); /* Save original r3 */ \
- bl opal_rm_handle_hmi; \
+ li r3,0; /* NULL argument */ \
+ bl hmi_exception_realmode; \
+ nop; \
ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \
20: nop;
diff --git a/arch/powerpc/kernel/pci-common.c b/arch/powerpc/kernel/pci-common.c
index f93942b4b6a6..a5c0153ede37 100644
--- a/arch/powerpc/kernel/pci-common.c
+++ b/arch/powerpc/kernel/pci-common.c
@@ -412,36 +412,6 @@ static struct resource *__pci_mmap_make_offset(struct pci_dev *dev,
}
/*
- * Set vm_page_prot of VMA, as appropriate for this architecture, for a pci
- * device mapping.
- */
-static pgprot_t __pci_mmap_set_pgprot(struct pci_dev *dev, struct resource *rp,
- pgprot_t protection,
- enum pci_mmap_state mmap_state,
- int write_combine)
-{
-
- /* Write combine is always 0 on non-memory space mappings. On
- * memory space, if the user didn't pass 1, we check for a
- * "prefetchable" resource. This is a bit hackish, but we use
- * this to workaround the inability of /sysfs to provide a write
- * combine bit
- */
- if (mmap_state != pci_mmap_mem)
- write_combine = 0;
- else if (write_combine == 0) {
- if (rp->flags & IORESOURCE_PREFETCH)
- write_combine = 1;
- }
-
- /* XXX would be nice to have a way to ask for write-through */
- if (write_combine)
- return pgprot_noncached_wc(protection);
- else
- return pgprot_noncached(protection);
-}
-
-/*
* This one is used by /dev/mem and fbdev who have no clue about the
* PCI device, it tries to find the PCI device first and calls the
* above routine
@@ -514,9 +484,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
return -EINVAL;
vma->vm_pgoff = offset >> PAGE_SHIFT;
- vma->vm_page_prot = __pci_mmap_set_pgprot(dev, rp,
- vma->vm_page_prot,
- mmap_state, write_combine);
+ if (write_combine)
+ vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot);
+ else
+ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
ret = remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
vma->vm_end - vma->vm_start, vma->vm_page_prot);
@@ -666,39 +637,25 @@ void pci_resource_to_user(const struct pci_dev *dev, int bar,
const struct resource *rsrc,
resource_size_t *start, resource_size_t *end)
{
- struct pci_controller *hose = pci_bus_to_host(dev->bus);
- resource_size_t offset = 0;
+ struct pci_bus_region region;
- if (hose == NULL)
+ if (rsrc->flags & IORESOURCE_IO) {
+ pcibios_resource_to_bus(dev->bus, &region,
+ (struct resource *) rsrc);
+ *start = region.start;
+ *end = region.end;
return;
+ }
- if (rsrc->flags & IORESOURCE_IO)
- offset = (unsigned long)hose->io_base_virt - _IO_BASE;
-
- /* We pass a fully fixed up address to userland for MMIO instead of
- * a BAR value because X is lame and expects to be able to use that
- * to pass to /dev/mem !
- *
- * That means that we'll have potentially 64 bits values where some
- * userland apps only expect 32 (like X itself since it thinks only
- * Sparc has 64 bits MMIO) but if we don't do that, we break it on
- * 32 bits CHRPs :-(
- *
- * Hopefully, the sysfs insterface is immune to that gunk. Once X
- * has been fixed (and the fix spread enough), we can re-enable the
- * 2 lines below and pass down a BAR value to userland. In that case
- * we'll also have to re-enable the matching code in
- * __pci_mmap_make_offset().
+ /* We pass a CPU physical address to userland for MMIO instead of a
+ * BAR value because X is lame and expects to be able to use that
+ * to pass to /dev/mem!
*
- * BenH.
+ * That means we may have 64-bit values where some apps only expect
+ * 32 (like X itself since it thinks only Sparc has 64-bit MMIO).
*/
-#if 0
- else if (rsrc->flags & IORESOURCE_MEM)
- offset = hose->pci_mem_offset;
-#endif
-
- *start = rsrc->start - offset;
- *end = rsrc->end - offset;
+ *start = rsrc->start;
+ *end = rsrc->end;
}
/**
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index f7e2f2e318bd..2cb589264cb7 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -61,6 +61,7 @@
#include <asm/tm.h>
#include <asm/debug.h>
#include <asm/asm-prototypes.h>
+#include <asm/hmi.h>
#include <sysdev/fsl_pci.h>
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
@@ -308,9 +309,13 @@ long hmi_exception_realmode(struct pt_regs *regs)
{
__this_cpu_inc(irq_stat.hmi_exceptions);
+ wait_for_subcore_guest_exit();
+
if (ppc_md.hmi_exception_early)
ppc_md.hmi_exception_early(regs);
+ wait_for_tb_resync();
+
return 0;
}
diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
index eba0bea6e032..1f9e5529e692 100644
--- a/arch/powerpc/kvm/Makefile
+++ b/arch/powerpc/kvm/Makefile
@@ -20,7 +20,7 @@ common-objs-y += powerpc.o emulate.o emulate_loadstore.o
obj-$(CONFIG_KVM_EXIT_TIMING) += timing.o
obj-$(CONFIG_KVM_BOOK3S_HANDLER) += book3s_exports.o
-AFLAGS_booke_interrupts.o := -I$(obj)
+AFLAGS_booke_interrupts.o := -I$(objtree)/$(obj)
kvm-e500-objs := \
$(common-objs-y) \
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e20beae5ca7a..2fd5580c8f6e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -52,6 +52,7 @@
#include <asm/switch_to.h>
#include <asm/smp.h>
#include <asm/dbell.h>
+#include <asm/hmi.h>
#include <linux/gfp.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
@@ -2522,7 +2523,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
spin_unlock(&pvc->lock);
- kvm_guest_enter();
+ guest_enter();
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
@@ -2570,7 +2571,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
- kvm_guest_exit();
+ guest_exit();
for (sub = 0; sub < core_info.n_subcores; ++sub)
list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
@@ -3401,6 +3402,38 @@ static struct kvmppc_ops kvm_ops_hv = {
.hcall_implemented = kvmppc_hcall_impl_hv,
};
+static int kvm_init_subcore_bitmap(void)
+{
+ int i, j;
+ int nr_cores = cpu_nr_cores();
+ struct sibling_subcore_state *sibling_subcore_state;
+
+ for (i = 0; i < nr_cores; i++) {
+ int first_cpu = i * threads_per_core;
+ int node = cpu_to_node(first_cpu);
+
+ /* Ignore if it is already allocated. */
+ if (paca[first_cpu].sibling_subcore_state)
+ continue;
+
+ sibling_subcore_state =
+ kmalloc_node(sizeof(struct sibling_subcore_state),
+ GFP_KERNEL, node);
+ if (!sibling_subcore_state)
+ return -ENOMEM;
+
+ memset(sibling_subcore_state, 0,
+ sizeof(struct sibling_subcore_state));
+
+ for (j = 0; j < threads_per_core; j++) {
+ int cpu = first_cpu + j;
+
+ paca[cpu].sibling_subcore_state = sibling_subcore_state;
+ }
+ }
+ return 0;
+}
+
static int kvmppc_book3s_init_hv(void)
{
int r;
@@ -3411,6 +3444,10 @@ static int kvmppc_book3s_init_hv(void)
if (r < 0)
return -ENODEV;
+ r = kvm_init_subcore_bitmap();
+ if (r)
+ return r;
+
kvm_ops_hv.owner = THIS_MODULE;
kvmppc_hv_ops = &kvm_ops_hv;
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index 93b5f5c9b445..0fa70a9618d7 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -13,6 +13,9 @@
#include <linux/kernel.h>
#include <asm/opal.h>
#include <asm/mce.h>
+#include <asm/machdep.h>
+#include <asm/cputhreads.h>
+#include <asm/hmi.h>
/* SRR1 bits for machine check on POWER7 */
#define SRR1_MC_LDSTERR (1ul << (63-42))
@@ -140,3 +143,176 @@ long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
{
return kvmppc_realmode_mc_power7(vcpu);
}
+
+/* Check if dynamic split is in force and return subcore size accordingly. */
+static inline int kvmppc_cur_subcore_size(void)
+{
+ if (local_paca->kvm_hstate.kvm_split_mode)
+ return local_paca->kvm_hstate.kvm_split_mode->subcore_size;
+
+ return threads_per_subcore;
+}
+
+void kvmppc_subcore_enter_guest(void)
+{
+ int thread_id, subcore_id;
+
+ thread_id = cpu_thread_in_core(local_paca->paca_index);
+ subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+ local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
+}
+
+void kvmppc_subcore_exit_guest(void)
+{
+ int thread_id, subcore_id;
+
+ thread_id = cpu_thread_in_core(local_paca->paca_index);
+ subcore_id = thread_id / kvmppc_cur_subcore_size();
+
+ local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
+}
+
+static bool kvmppc_tb_resync_required(void)
+{
+ if (test_and_set_bit(CORE_TB_RESYNC_REQ_BIT,
+ &local_paca->sibling_subcore_state->flags))
+ return false;
+
+ return true;
+}
+
+static void kvmppc_tb_resync_done(void)
+{
+ clear_bit(CORE_TB_RESYNC_REQ_BIT,
+ &local_paca->sibling_subcore_state->flags);
+}
+
+/*
+ * kvmppc_realmode_hmi_handler() is called only by primary thread during
+ * guest exit path.
+ *
+ * There are multiple reasons why HMI could occur, one of them is
+ * Timebase (TB) error. If this HMI is due to TB error, then TB would
+ * have been in stopped state. The opal hmi handler Will fix it and
+ * restore the TB value with host timebase value. For HMI caused due
+ * to non-TB errors, opal hmi handler will not touch/restore TB register
+ * and hence there won't be any change in TB value.
+ *
+ * Since we are not sure about the cause of this HMI, we can't be sure
+ * about the content of TB register whether it holds guest or host timebase
+ * value. Hence the idea is to resync the TB on every HMI, so that we
+ * know about the exact state of the TB value. Resync TB call will
+ * restore TB to host timebase.
+ *
+ * Things to consider:
+ * - On TB error, HMI interrupt is reported on all the threads of the core
+ * that has encountered TB error irrespective of split-core mode.
+ * - The very first thread on the core that get chance to fix TB error
+ * would rsync the TB with local chipTOD value.
+ * - The resync TB is a core level action i.e. it will sync all the TBs
+ * in that core independent of split-core mode. This means if we trigger
+ * TB sync from a thread from one subcore, it would affect TB values of
+ * sibling subcores of the same core.
+ *
+ * All threads need to co-ordinate before making opal hmi handler.
+ * All threads will use sibling_subcore_state->in_guest[] (shared by all
+ * threads in the core) in paca which holds information about whether
+ * sibling subcores are in Guest mode or host mode. The in_guest[] array
+ * is of size MAX_SUBCORE_PER_CORE=4, indexed using subcore id to set/unset
+ * subcore status. Only primary threads from each subcore is responsible
+ * to set/unset its designated array element while entering/exiting the
+ * guset.
+ *
+ * After invoking opal hmi handler call, one of the thread (of entire core)
+ * will need to resync the TB. Bit 63 from subcore state bitmap flags
+ * (sibling_subcore_state->flags) will be used to co-ordinate between
+ * primary threads to decide who takes up the responsibility.
+ *
+ * This is what we do:
+ * - Primary thread from each subcore tries to set resync required bit[63]
+ * of paca->sibling_subcore_state->flags.
+ * - The first primary thread that is able to set the flag takes the
+ * responsibility of TB resync. (Let us call it as thread leader)
+ * - All other threads which are in host will call
+ * wait_for_subcore_guest_exit() and wait for in_guest[0-3] from
+ * paca->sibling_subcore_state to get cleared.
+ * - All the primary thread will clear its subcore status from subcore
+ * state in_guest[] array respectively.
+ * - Once all primary threads clear in_guest[0-3], all of them will invoke
+ * opal hmi handler.
+ * - Now all threads will wait for TB resync to complete by invoking
+ * wait_for_tb_resync() except the thread leader.
+ * - Thread leader will do a TB resync by invoking opal_resync_timebase()
+ * call and the it will clear the resync required bit.
+ * - All other threads will now come out of resync wait loop and proceed
+ * with individual execution.
+ * - On return of this function, primary thread will signal all
+ * secondary threads to proceed.
+ * - All secondary threads will eventually call opal hmi handler on
+ * their exit path.
+ */
+
+long kvmppc_realmode_hmi_handler(void)
+{
+ int ptid = local_paca->kvm_hstate.ptid;
+ bool resync_req;
+
+ /* This is only called on primary thread. */
+ BUG_ON(ptid != 0);
+ __this_cpu_inc(irq_stat.hmi_exceptions);
+
+ /*
+ * By now primary thread has already completed guest->host
+ * partition switch but haven't signaled secondaries yet.
+ * All the secondary threads on this subcore is waiting
+ * for primary thread to signal them to go ahead.
+ *
+ * For threads from subcore which isn't in guest, they all will
+ * wait until all other subcores on this core exit the guest.
+ *
+ * Now set the resync required bit. If you are the first to
+ * set this bit then kvmppc_tb_resync_required() function will
+ * return true. For rest all other subcores
+ * kvmppc_tb_resync_required() will return false.
+ *
+ * If resync_req == true, then this thread is responsible to
+ * initiate TB resync after hmi handler has completed.
+ * All other threads on this core will wait until this thread
+ * clears the resync required bit flag.
+ */
+ resync_req = kvmppc_tb_resync_required();
+
+ /* Reset the subcore status to indicate it has exited guest */
+ kvmppc_subcore_exit_guest();
+
+ /*
+ * Wait for other subcores on this core to exit the guest.
+ * All the primary threads and threads from subcore that are
+ * not in guest will wait here until all subcores are out
+ * of guest context.
+ */
+ wait_for_subcore_guest_exit();
+
+ /*
+ * At this point we are sure that primary threads from each
+ * subcore on this core have completed guest->host partition
+ * switch. Now it is safe to call HMI handler.
+ */
+ if (ppc_md.hmi_exception_early)
+ ppc_md.hmi_exception_early(NULL);
+
+ /*
+ * Check if this thread is responsible to resync TB.
+ * All other threads will wait until this thread completes the
+ * TB resync.
+ */
+ if (resync_req) {
+ opal_resync_timebase();
+ /* Reset TB resync req bit */
+ kvmppc_tb_resync_done();
+ } else {
+ wait_for_tb_resync();
+ }
+ return 0;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 86f0cae37a85..975655573844 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -29,6 +29,7 @@
#include <asm/kvm_book3s_asm.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/tm.h>
+#include <asm/opal.h>
#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
@@ -373,6 +374,18 @@ kvm_secondary_got_guest:
lwsync
std r0, HSTATE_KVM_VCORE(r13)
+ /*
+ * All secondaries exiting guest will fall through this path.
+ * Before proceeding, just check for HMI interrupt and
+ * invoke opal hmi handler. By now we are sure that the
+ * primary thread on this core/subcore has already made partition
+ * switch/TB resync and we are good to call opal hmi handler.
+ */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ bne kvm_no_guest
+
+ li r3,0 /* NULL argument */
+ bl hmi_exception_realmode
/*
* At this point we have finished executing in the guest.
* We need to wait for hwthread_req to become zero, since
@@ -428,6 +441,22 @@ kvm_no_guest:
*/
kvm_unsplit_nap:
/*
+ * When secondaries are napping in kvm_unsplit_nap() with
+ * hwthread_req = 1, HMI goes ignored even though subcores are
+ * already exited the guest. Hence HMI keeps waking up secondaries
+ * from nap in a loop and secondaries always go back to nap since
+ * no vcore is assigned to them. This makes impossible for primary
+ * thread to get hold of secondary threads resulting into a soft
+ * lockup in KVM path.
+ *
+ * Let us check if HMI is pending and handle it before we go to nap.
+ */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ bne 55f
+ li r3, 0 /* NULL argument */
+ bl hmi_exception_realmode
+55:
+ /*
* Ensure that secondary doesn't nap when it has
* its vcore pointer set.
*/
@@ -601,6 +630,11 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ /* Mark the subcore state as inside guest */
+ bl kvmppc_subcore_enter_guest
+ nop
+ ld r5, HSTATE_KVM_VCORE(r13)
+ ld r4, HSTATE_KVM_VCPU(r13)
li r0,1
stb r0,VCORE_IN_GUEST(r5) /* signal secondaries to continue */
@@ -655,112 +689,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
- b skip_tm
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
-
- /* Turn on TM/FP/VSX/VMX so we can restore them. */
- mfmsr r5
- li r6, MSR_TM >> 32
- sldi r6, r6, 32
- or r5, r5, r6
- ori r5, r5, MSR_FP
- oris r5, r5, (MSR_VEC | MSR_VSX)@h
- mtmsrd r5
-
- /*
- * The user may change these outside of a transaction, so they must
- * always be context switched.
- */
- ld r5, VCPU_TFHAR(r4)
- ld r6, VCPU_TFIAR(r4)
- ld r7, VCPU_TEXASR(r4)
- mtspr SPRN_TFHAR, r5
- mtspr SPRN_TFIAR, r6
- mtspr SPRN_TEXASR, r7
-
- ld r5, VCPU_MSR(r4)
- rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
- beq skip_tm /* TM not active in guest */
-
- /* Make sure the failure summary is set, otherwise we'll program check
- * when we trechkpt. It's possible that this might have been not set
- * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
- * host.
- */
- oris r7, r7, (TEXASR_FS)@h
- mtspr SPRN_TEXASR, r7
-
- /*
- * We need to load up the checkpointed state for the guest.
- * We need to do this early as it will blow away any GPRs, VSRs and
- * some SPRs.
- */
-
- mr r31, r4
- addi r3, r31, VCPU_FPRS_TM
- bl load_fp_state
- addi r3, r31, VCPU_VRS_TM
- bl load_vr_state
- mr r4, r31
- lwz r7, VCPU_VRSAVE_TM(r4)
- mtspr SPRN_VRSAVE, r7
-
- ld r5, VCPU_LR_TM(r4)
- lwz r6, VCPU_CR_TM(r4)
- ld r7, VCPU_CTR_TM(r4)
- ld r8, VCPU_AMR_TM(r4)
- ld r9, VCPU_TAR_TM(r4)
- mtlr r5
- mtcr r6
- mtctr r7
- mtspr SPRN_AMR, r8
- mtspr SPRN_TAR, r9
-
- /*
- * Load up PPR and DSCR values but don't put them in the actual SPRs
- * till the last moment to avoid running with userspace PPR and DSCR for
- * too long.
- */
- ld r29, VCPU_DSCR_TM(r4)
- ld r30, VCPU_PPR_TM(r4)
-
- std r2, PACATMSCRATCH(r13) /* Save TOC */
-
- /* Clear the MSR RI since r1, r13 are all going to be foobar. */
- li r5, 0
- mtmsrd r5, 1
-
- /* Load GPRs r0-r28 */
- reg = 0
- .rept 29
- ld reg, VCPU_GPRS_TM(reg)(r31)
- reg = reg + 1
- .endr
-
- mtspr SPRN_DSCR, r29
- mtspr SPRN_PPR, r30
-
- /* Load final GPRs */
- ld 29, VCPU_GPRS_TM(29)(r31)
- ld 30, VCPU_GPRS_TM(30)(r31)
- ld 31, VCPU_GPRS_TM(31)(r31)
-
- /* TM checkpointed state is now setup. All GPRs are now volatile. */
- TRECHKPT
-
- /* Now let's get back the state we need. */
- HMT_MEDIUM
- GET_PACA(r13)
- ld r29, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r29
- ld r4, HSTATE_KVM_VCPU(r13)
- ld r1, HSTATE_HOST_R1(r13)
- ld r2, PACATMSCRATCH(r13)
-
- /* Set the MSR RI since we have our registers back. */
- li r5, MSR_RI
- mtmsrd r5, 1
-skip_tm:
+ bl kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* Load guest PMU registers */
@@ -841,12 +771,6 @@ BEGIN_FTR_SECTION
/* Skip next section on POWER7 */
b 8f
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
- /* Turn on TM so we can access TFHAR/TFIAR/TEXASR */
- mfmsr r8
- li r0, 1
- rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r8
-
/* Load up POWER8-specific registers */
ld r5, VCPU_IAMR(r4)
lwz r6, VCPU_PSPB(r4)
@@ -1436,106 +1360,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
BEGIN_FTR_SECTION
- b 2f
-END_FTR_SECTION_IFCLR(CPU_FTR_TM)
- /* Turn on TM. */
- mfmsr r8
- li r0, 1
- rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
- mtmsrd r8
-
- ld r5, VCPU_MSR(r9)
- rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
- beq 1f /* TM not active in guest. */
-
- li r3, TM_CAUSE_KVM_RESCHED
-
- /* Clear the MSR RI since r1, r13 are all going to be foobar. */
- li r5, 0
- mtmsrd r5, 1
-
- /* All GPRs are volatile at this point. */
- TRECLAIM(R3)
-
- /* Temporarily store r13 and r9 so we have some regs to play with */
- SET_SCRATCH0(r13)
- GET_PACA(r13)
- std r9, PACATMSCRATCH(r13)
- ld r9, HSTATE_KVM_VCPU(r13)
-
- /* Get a few more GPRs free. */
- std r29, VCPU_GPRS_TM(29)(r9)
- std r30, VCPU_GPRS_TM(30)(r9)
- std r31, VCPU_GPRS_TM(31)(r9)
-
- /* Save away PPR and DSCR soon so don't run with user values. */
- mfspr r31, SPRN_PPR
- HMT_MEDIUM
- mfspr r30, SPRN_DSCR
- ld r29, HSTATE_DSCR(r13)
- mtspr SPRN_DSCR, r29
-
- /* Save all but r9, r13 & r29-r31 */
- reg = 0
- .rept 29
- .if (reg != 9) && (reg != 13)
- std reg, VCPU_GPRS_TM(reg)(r9)
- .endif
- reg = reg + 1
- .endr
- /* ... now save r13 */
- GET_SCRATCH0(r4)
- std r4, VCPU_GPRS_TM(13)(r9)
- /* ... and save r9 */
- ld r4, PACATMSCRATCH(r13)
- std r4, VCPU_GPRS_TM(9)(r9)
-
- /* Reload stack pointer and TOC. */
- ld r1, HSTATE_HOST_R1(r13)
- ld r2, PACATOC(r13)
-
- /* Set MSR RI now we have r1 and r13 back. */
- li r5, MSR_RI
- mtmsrd r5, 1
-
- /* Save away checkpinted SPRs. */
- std r31, VCPU_PPR_TM(r9)
- std r30, VCPU_DSCR_TM(r9)
- mflr r5
- mfcr r6
- mfctr r7
- mfspr r8, SPRN_AMR
- mfspr r10, SPRN_TAR
- std r5, VCPU_LR_TM(r9)
- stw r6, VCPU_CR_TM(r9)
- std r7, VCPU_CTR_TM(r9)
- std r8, VCPU_AMR_TM(r9)
- std r10, VCPU_TAR_TM(r9)
-
- /* Restore r12 as trap number. */
- lwz r12, VCPU_TRAP(r9)
-
- /* Save FP/VSX. */
- addi r3, r9, VCPU_FPRS_TM
- bl store_fp_state
- addi r3, r9, VCPU_VRS_TM
- bl store_vr_state
- mfspr r6, SPRN_VRSAVE
- stw r6, VCPU_VRSAVE_TM(r9)
-1:
- /*
- * We need to save these SPRs after the treclaim so that the software
- * error code is recorded correctly in the TEXASR. Also the user may
- * change these outside of a transaction, so they must always be
- * context switched.
- */
- mfspr r5, SPRN_TFHAR
- mfspr r6, SPRN_TFIAR
- mfspr r7, SPRN_TEXASR
- std r5, VCPU_TFHAR(r9)
- std r6, VCPU_TFIAR(r9)
- std r7, VCPU_TEXASR(r9)
-2:
+ bl kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
#endif
/* Increment yield count if they have a VPA */
@@ -1683,6 +1509,23 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ /* If HMI, call kvmppc_realmode_hmi_handler() */
+ cmpwi r12, BOOK3S_INTERRUPT_HMI
+ bne 27f
+ bl kvmppc_realmode_hmi_handler
+ nop
+ li r12, BOOK3S_INTERRUPT_HMI
+ /*
+ * At this point kvmppc_realmode_hmi_handler would have resync-ed
+ * the TB. Hence it is not required to subtract guest timebase
+ * offset from timebase. So, skip it.
+ *
+ * Also, do not call kvmppc_subcore_exit_guest() because it has
+ * been invoked as part of kvmppc_realmode_hmi_handler().
+ */
+ b 30f
+
+27:
/* Subtract timebase offset from timebase */
ld r8,VCORE_TB_OFFSET(r5)
cmpdi r8,0
@@ -1698,8 +1541,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addis r8,r8,0x100 /* if so, increment upper 40 bits */
mtspr SPRN_TBU40,r8
+17: bl kvmppc_subcore_exit_guest
+ nop
+30: ld r5,HSTATE_KVM_VCORE(r13)
+ ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
+
/* Reset PCR */
-17: ld r0, VCORE_PCR(r5)
+ ld r0, VCORE_PCR(r5)
cmpdi r0, 0
beq 18f
li r0, 0
@@ -2245,6 +2093,13 @@ _GLOBAL(kvmppc_h_cede) /* r3 = vcpu pointer, r11 = msr, r13 = paca */
/* save FP state */
bl kvmppc_save_fp
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+ ld r9, HSTATE_KVM_VCPU(r13)
+ bl kvmppc_save_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
/*
* Set DEC to the smaller of DEC and HDEC, so that we wake
* no later than the end of our timeslice (HDEC interrupts
@@ -2321,6 +2176,12 @@ kvm_end_cede:
bl kvmhv_accumulate_time
#endif
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+BEGIN_FTR_SECTION
+ bl kvmppc_restore_tm
+END_FTR_SECTION_IFSET(CPU_FTR_TM)
+#endif
+
/* load up FP state */
bl kvmppc_load_fp
@@ -2461,6 +2322,8 @@ BEGIN_FTR_SECTION
cmpwi r6, 3 /* hypervisor doorbell? */
beq 3f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
+ cmpwi r6, 0xa /* Hypervisor maintenance ? */
+ beq 4f
li r3, 1 /* anything else, return 1 */
0: blr
@@ -2482,6 +2345,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, -1
blr
+ /* Woken up due to Hypervisor maintenance interrupt */
+4: li r12, BOOK3S_INTERRUPT_HMI
+ li r3, 1
+ blr
+
/*
* Determine what sort of external interrupt is pending (if any).
* Returns:
@@ -2631,6 +2499,239 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
mr r4,r31
blr
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+/*
+ * Save transactional state and TM-related registers.
+ * Called with r9 pointing to the vcpu struct.
+ * This can modify all checkpointed registers, but
+ * restores r1, r2 and r9 (vcpu pointer) before exit.
+ */
+kvmppc_save_tm:
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+
+ /* Turn on TM. */
+ mfmsr r8
+ li r0, 1
+ rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
+ mtmsrd r8
+
+ ld r5, VCPU_MSR(r9)
+ rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+ beq 1f /* TM not active in guest. */
+
+ std r1, HSTATE_HOST_R1(r13)
+ li r3, TM_CAUSE_KVM_RESCHED
+
+ /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+ li r5, 0
+ mtmsrd r5, 1
+
+ /* All GPRs are volatile at this point. */
+ TRECLAIM(R3)
+
+ /* Temporarily store r13 and r9 so we have some regs to play with */
+ SET_SCRATCH0(r13)
+ GET_PACA(r13)
+ std r9, PACATMSCRATCH(r13)
+ ld r9, HSTATE_KVM_VCPU(r13)
+
+ /* Get a few more GPRs free. */
+ std r29, VCPU_GPRS_TM(29)(r9)
+ std r30, VCPU_GPRS_TM(30)(r9)
+ std r31, VCPU_GPRS_TM(31)(r9)
+
+ /* Save away PPR and DSCR soon so don't run with user values. */
+ mfspr r31, SPRN_PPR
+ HMT_MEDIUM
+ mfspr r30, SPRN_DSCR
+ ld r29, HSTATE_DSCR(r13)
+ mtspr SPRN_DSCR, r29
+
+ /* Save all but r9, r13 & r29-r31 */
+ reg = 0
+ .rept 29
+ .if (reg != 9) && (reg != 13)
+ std reg, VCPU_GPRS_TM(reg)(r9)
+ .endif
+ reg = reg + 1
+ .endr
+ /* ... now save r13 */
+ GET_SCRATCH0(r4)
+ std r4, VCPU_GPRS_TM(13)(r9)
+ /* ... and save r9 */
+ ld r4, PACATMSCRATCH(r13)
+ std r4, VCPU_GPRS_TM(9)(r9)
+
+ /* Reload stack pointer and TOC. */
+ ld r1, HSTATE_HOST_R1(r13)
+ ld r2, PACATOC(r13)
+
+ /* Set MSR RI now we have r1 and r13 back. */
+ li r5, MSR_RI
+ mtmsrd r5, 1
+
+ /* Save away checkpinted SPRs. */
+ std r31, VCPU_PPR_TM(r9)
+ std r30, VCPU_DSCR_TM(r9)
+ mflr r5
+ mfcr r6
+ mfctr r7
+ mfspr r8, SPRN_AMR
+ mfspr r10, SPRN_TAR
+ std r5, VCPU_LR_TM(r9)
+ stw r6, VCPU_CR_TM(r9)
+ std r7, VCPU_CTR_TM(r9)
+ std r8, VCPU_AMR_TM(r9)
+ std r10, VCPU_TAR_TM(r9)
+
+ /* Restore r12 as trap number. */
+ lwz r12, VCPU_TRAP(r9)
+
+ /* Save FP/VSX. */
+ addi r3, r9, VCPU_FPRS_TM
+ bl store_fp_state
+ addi r3, r9, VCPU_VRS_TM
+ bl store_vr_state
+ mfspr r6, SPRN_VRSAVE
+ stw r6, VCPU_VRSAVE_TM(r9)
+1:
+ /*
+ * We need to save these SPRs after the treclaim so that the software
+ * error code is recorded correctly in the TEXASR. Also the user may
+ * change these outside of a transaction, so they must always be
+ * context switched.
+ */
+ mfspr r5, SPRN_TFHAR
+ mfspr r6, SPRN_TFIAR
+ mfspr r7, SPRN_TEXASR
+ std r5, VCPU_TFHAR(r9)
+ std r6, VCPU_TFIAR(r9)
+ std r7, VCPU_TEXASR(r9)
+
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
+ blr
+
+/*
+ * Restore transactional state and TM-related registers.
+ * Called with r4 pointing to the vcpu struct.
+ * This potentially modifies all checkpointed registers.
+ * It restores r1, r2, r4 from the PACA.
+ */
+kvmppc_restore_tm:
+ mflr r0
+ std r0, PPC_LR_STKOFF(r1)
+
+ /* Turn on TM/FP/VSX/VMX so we can restore them. */
+ mfmsr r5
+ li r6, MSR_TM >> 32
+ sldi r6, r6, 32
+ or r5, r5, r6
+ ori r5, r5, MSR_FP
+ oris r5, r5, (MSR_VEC | MSR_VSX)@h
+ mtmsrd r5
+
+ /*
+ * The user may change these outside of a transaction, so they must
+ * always be context switched.
+ */
+ ld r5, VCPU_TFHAR(r4)
+ ld r6, VCPU_TFIAR(r4)
+ ld r7, VCPU_TEXASR(r4)
+ mtspr SPRN_TFHAR, r5
+ mtspr SPRN_TFIAR, r6
+ mtspr SPRN_TEXASR, r7
+
+ ld r5, VCPU_MSR(r4)
+ rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
+ beqlr /* TM not active in guest */
+ std r1, HSTATE_HOST_R1(r13)
+
+ /* Make sure the failure summary is set, otherwise we'll program check
+ * when we trechkpt. It's possible that this might have been not set
+ * on a kvmppc_set_one_reg() call but we shouldn't let this crash the
+ * host.
+ */
+ oris r7, r7, (TEXASR_FS)@h
+ mtspr SPRN_TEXASR, r7
+
+ /*
+ * We need to load up the checkpointed state for the guest.
+ * We need to do this early as it will blow away any GPRs, VSRs and
+ * some SPRs.
+ */
+
+ mr r31, r4
+ addi r3, r31, VCPU_FPRS_TM
+ bl load_fp_state
+ addi r3, r31, VCPU_VRS_TM
+ bl load_vr_state
+ mr r4, r31
+ lwz r7, VCPU_VRSAVE_TM(r4)
+ mtspr SPRN_VRSAVE, r7
+
+ ld r5, VCPU_LR_TM(r4)
+ lwz r6, VCPU_CR_TM(r4)
+ ld r7, VCPU_CTR_TM(r4)
+ ld r8, VCPU_AMR_TM(r4)
+ ld r9, VCPU_TAR_TM(r4)
+ mtlr r5
+ mtcr r6
+ mtctr r7
+ mtspr SPRN_AMR, r8
+ mtspr SPRN_TAR, r9
+
+ /*
+ * Load up PPR and DSCR values but don't put them in the actual SPRs
+ * till the last moment to avoid running with userspace PPR and DSCR for
+ * too long.
+ */
+ ld r29, VCPU_DSCR_TM(r4)
+ ld r30, VCPU_PPR_TM(r4)
+
+ std r2, PACATMSCRATCH(r13) /* Save TOC */
+
+ /* Clear the MSR RI since r1, r13 are all going to be foobar. */
+ li r5, 0
+ mtmsrd r5, 1
+
+ /* Load GPRs r0-r28 */
+ reg = 0
+ .rept 29
+ ld reg, VCPU_GPRS_TM(reg)(r31)
+ reg = reg + 1
+ .endr
+
+ mtspr SPRN_DSCR, r29
+ mtspr SPRN_PPR, r30
+
+ /* Load final GPRs */
+ ld 29, VCPU_GPRS_TM(29)(r31)
+ ld 30, VCPU_GPRS_TM(30)(r31)
+ ld 31, VCPU_GPRS_TM(31)(r31)
+
+ /* TM checkpointed state is now setup. All GPRs are now volatile. */
+ TRECHKPT
+
+ /* Now let's get back the state we need. */
+ HMT_MEDIUM
+ GET_PACA(r13)
+ ld r29, HSTATE_DSCR(r13)
+ mtspr SPRN_DSCR, r29
+ ld r4, HSTATE_KVM_VCPU(r13)
+ ld r1, HSTATE_HOST_R1(r13)
+ ld r2, PACATMSCRATCH(r13)
+
+ /* Set the MSR RI since we have our registers back. */
+ li r5, MSR_RI
+ mtmsrd r5, 1
+
+ ld r0, PPC_LR_STKOFF(r1)
+ mtlr r0
+ blr
+#endif
+
/*
* We come here if we get any exception or interrupt while we are
* executing host real mode code while in guest MMU context.
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index c4f7d6b86b9e..e76f79a45988 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -914,7 +914,7 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
/* We get here with MSR.EE=1 */
trace_kvm_exit(exit_nr, vcpu);
- kvm_guest_exit();
+ guest_exit();
switch (exit_nr) {
case BOOK3S_INTERRUPT_INST_STORAGE:
@@ -1049,7 +1049,17 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
int emul;
program_interrupt:
- flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ /*
+ * shadow_srr1 only contains valid flags if we came here via
+ * a program exception. The other exceptions (emulation assist,
+ * FP unavailable, etc.) do not provide flags in SRR1, so use
+ * an illegal-instruction exception when injecting a program
+ * interrupt into the guest.
+ */
+ if (exit_nr == BOOK3S_INTERRUPT_PROGRAM)
+ flags = vcpu->arch.shadow_srr1 & 0x1f0000ull;
+ else
+ flags = SRR1_PROGILL;
emul = kvmppc_get_last_inst(vcpu, INST_GENERIC, &last_inst);
if (emul != EMULATE_DONE) {
@@ -1531,7 +1541,7 @@ static int kvmppc_vcpu_run_pr(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
kvmppc_clear_debug(vcpu);
- /* No need for kvm_guest_exit. It's done in handle_exit.
+ /* No need for guest_exit. It's done in handle_exit.
We also get here with interrupts enabled. */
/* Make sure we save the guest FPU/Altivec/VSX state */
diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
index 4afae695899a..02b4672f7347 100644
--- a/arch/powerpc/kvm/booke.c
+++ b/arch/powerpc/kvm/booke.c
@@ -776,7 +776,7 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
ret = __kvmppc_vcpu_run(kvm_run, vcpu);
- /* No need for kvm_guest_exit. It's done in handle_exit.
+ /* No need for guest_exit. It's done in handle_exit.
We also get here with interrupts enabled. */
/* Switch back to user space debug context */
@@ -1012,7 +1012,7 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
trace_kvm_exit(exit_nr, vcpu);
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
diff --git a/arch/powerpc/kvm/emulate.c b/arch/powerpc/kvm/emulate.c
index 5cc2e7af3a7b..b379146de55b 100644
--- a/arch/powerpc/kvm/emulate.c
+++ b/arch/powerpc/kvm/emulate.c
@@ -302,7 +302,6 @@ int kvmppc_emulate_instruction(struct kvm_run *run, struct kvm_vcpu *vcpu)
advance = 0;
printk(KERN_ERR "Couldn't emulate instruction 0x%08x "
"(op %d xop %d)\n", inst, get_op(inst), get_xop(inst));
- kvmppc_core_queue_program(vcpu, 0);
}
}
diff --git a/arch/powerpc/kvm/mpic.c b/arch/powerpc/kvm/mpic.c
index 6249cdc834d1..ed38f8114118 100644
--- a/arch/powerpc/kvm/mpic.c
+++ b/arch/powerpc/kvm/mpic.c
@@ -1823,7 +1823,8 @@ int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
return 0;
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 02416fea7653..6ce40dd6fe51 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -119,7 +119,7 @@ int kvmppc_prepare_to_enter(struct kvm_vcpu *vcpu)
continue;
}
- __kvm_guest_enter();
+ guest_enter_irqoff();
return 1;
}
@@ -588,6 +588,10 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = 1;
break;
#endif
+ case KVM_CAP_PPC_HTM:
+ r = cpu_has_feature(CPU_FTR_TM_COMP) &&
+ is_kvmppc_hv_enabled(kvm);
+ break;
default:
r = 0;
break;
diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S
index cf928bba4d9a..3d29d40eb0e9 100644
--- a/arch/powerpc/platforms/powernv/opal-wrappers.S
+++ b/arch/powerpc/platforms/powernv/opal-wrappers.S
@@ -64,7 +64,6 @@ END_FTR_SECTION(0, 1); \
OPAL_BRANCH(opal_tracepoint_entry) \
mfcr r12; \
stw r12,8(r1); \
- std r1,PACAR1(r13); \
li r11,0; \
mfmsr r12; \
ori r11,r11,MSR_EE; \
@@ -127,7 +126,6 @@ opal_tracepoint_entry:
mfcr r12
std r11,16(r1)
stw r12,8(r1)
- std r1,PACAR1(r13)
li r11,0
mfmsr r12
ori r11,r11,MSR_EE
diff --git a/arch/s390/boot/compressed/Makefile b/arch/s390/boot/compressed/Makefile
index 13723c39e58e..33ba697c782d 100644
--- a/arch/s390/boot/compressed/Makefile
+++ b/arch/s390/boot/compressed/Makefile
@@ -33,10 +33,10 @@ quiet_cmd_sizes = GEN $@
$(obj)/sizes.h: vmlinux
$(call if_changed,sizes)
-AFLAGS_head.o += -I$(obj)
+AFLAGS_head.o += -I$(objtree)/$(obj)
$(obj)/head.o: $(obj)/sizes.h
-CFLAGS_misc.o += -I$(obj)
+CFLAGS_misc.o += -I$(objtree)/$(obj)
$(obj)/misc.o: $(obj)/sizes.h
OBJCOPYFLAGS_vmlinux.bin := -R .comment -S
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 67d43a0eabb4..28f03ca60100 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -19,29 +19,10 @@
#include <asm/ebcdic.h>
#include "hypfs.h"
-#define LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
-#define CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */
#define TMP_SIZE 64 /* size of temporary buffers */
#define DBFS_D204_HDR_VERSION 0
-/* diag 204 subcodes */
-enum diag204_sc {
- SUBC_STIB4 = 4,
- SUBC_RSI = 5,
- SUBC_STIB6 = 6,
- SUBC_STIB7 = 7
-};
-
-/* The two available diag 204 data formats */
-enum diag204_format {
- INFO_SIMPLE = 0,
- INFO_EXT = 0x00010000
-};
-
-/* bit is set in flags, when physical cpu info is included in diag 204 data */
-#define LPAR_PHYS_FLG 0x80
-
static char *diag224_cpu_names; /* diag 224 name table */
static enum diag204_sc diag204_store_sc; /* used subcode for store */
static enum diag204_format diag204_info_type; /* used diag 204 data format */
@@ -53,7 +34,7 @@ static int diag204_buf_pages; /* number of pages for diag204 data */
static struct dentry *dbfs_d204_file;
/*
- * DIAG 204 data structures and member access functions.
+ * DIAG 204 member access functions.
*
* Since we have two different diag 204 data formats for old and new s390
* machines, we do not access the structs directly, but use getter functions for
@@ -62,304 +43,173 @@ static struct dentry *dbfs_d204_file;
/* Time information block */
-struct info_blk_hdr {
- __u8 npar;
- __u8 flags;
- __u16 tslice;
- __u16 phys_cpus;
- __u16 this_part;
- __u64 curtod;
-} __attribute__ ((packed));
-
-struct x_info_blk_hdr {
- __u8 npar;
- __u8 flags;
- __u16 tslice;
- __u16 phys_cpus;
- __u16 this_part;
- __u64 curtod1;
- __u64 curtod2;
- char reserved[40];
-} __attribute__ ((packed));
-
static inline int info_blk_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct info_blk_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_info_blk_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_info_blk_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_info_blk_hdr);
}
static inline __u8 info_blk_hdr__npar(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->npar;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->npar;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->npar;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->npar;
}
static inline __u8 info_blk_hdr__flags(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->flags;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->flags;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->flags;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->flags;
}
static inline __u16 info_blk_hdr__pcpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct info_blk_hdr *)hdr)->phys_cpus;
- else /* INFO_EXT */
- return ((struct x_info_blk_hdr *)hdr)->phys_cpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_info_blk_hdr *)hdr)->phys_cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_info_blk_hdr *)hdr)->phys_cpus;
}
/* Partition header */
-struct part_hdr {
- __u8 pn;
- __u8 cpus;
- char reserved[6];
- char part_name[LPAR_NAME_LEN];
-} __attribute__ ((packed));
-
-struct x_part_hdr {
- __u8 pn;
- __u8 cpus;
- __u8 rcpus;
- __u8 pflag;
- __u32 mlu;
- char part_name[LPAR_NAME_LEN];
- char lpc_name[8];
- char os_name[8];
- __u64 online_cs;
- __u64 online_es;
- __u8 upid;
- char reserved1[3];
- __u32 group_mlu;
- char group_name[8];
- char reserved2[32];
-} __attribute__ ((packed));
-
static inline int part_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct part_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_part_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_part_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_part_hdr);
}
static inline __u8 part_hdr__rcpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct part_hdr *)hdr)->cpus;
- else /* INFO_EXT */
- return ((struct x_part_hdr *)hdr)->rcpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_part_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_part_hdr *)hdr)->rcpus;
}
static inline void part_hdr__part_name(enum diag204_format type, void *hdr,
char *name)
{
- if (type == INFO_SIMPLE)
- memcpy(name, ((struct part_hdr *)hdr)->part_name,
- LPAR_NAME_LEN);
- else /* INFO_EXT */
- memcpy(name, ((struct x_part_hdr *)hdr)->part_name,
- LPAR_NAME_LEN);
- EBCASC(name, LPAR_NAME_LEN);
- name[LPAR_NAME_LEN] = 0;
+ if (type == DIAG204_INFO_SIMPLE)
+ memcpy(name, ((struct diag204_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ else /* DIAG204_INFO_EXT */
+ memcpy(name, ((struct diag204_x_part_hdr *)hdr)->part_name,
+ DIAG204_LPAR_NAME_LEN);
+ EBCASC(name, DIAG204_LPAR_NAME_LEN);
+ name[DIAG204_LPAR_NAME_LEN] = 0;
strim(name);
}
-struct cpu_info {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- __u8 cflag;
- __u16 weight;
- __u64 acc_time;
- __u64 lp_time;
-} __attribute__ ((packed));
-
-struct x_cpu_info {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- __u8 cflag;
- __u16 weight;
- __u64 acc_time;
- __u64 lp_time;
- __u16 min_weight;
- __u16 cur_weight;
- __u16 max_weight;
- char reseved2[2];
- __u64 online_time;
- __u64 wait_time;
- __u32 pma_weight;
- __u32 polar_weight;
- char reserved3[40];
-} __attribute__ ((packed));
-
/* CPU info block */
static inline int cpu_info__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct cpu_info);
- else /* INFO_EXT */
- return sizeof(struct x_cpu_info);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_cpu_info);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_cpu_info);
}
static inline __u8 cpu_info__ctidx(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->ctidx;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->ctidx;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->ctidx;
}
static inline __u16 cpu_info__cpu_addr(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->cpu_addr;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->cpu_addr;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->cpu_addr;
}
static inline __u64 cpu_info__acc_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->acc_time;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->acc_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->acc_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->acc_time;
}
static inline __u64 cpu_info__lp_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct cpu_info *)hdr)->lp_time;
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->lp_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_cpu_info *)hdr)->lp_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->lp_time;
}
static inline __u64 cpu_info__online_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
+ if (type == DIAG204_INFO_SIMPLE)
return 0; /* online_time not available in simple info */
- else /* INFO_EXT */
- return ((struct x_cpu_info *)hdr)->online_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_cpu_info *)hdr)->online_time;
}
/* Physical header */
-struct phys_hdr {
- char reserved1[1];
- __u8 cpus;
- char reserved2[6];
- char mgm_name[8];
-} __attribute__ ((packed));
-
-struct x_phys_hdr {
- char reserved1[1];
- __u8 cpus;
- char reserved2[6];
- char mgm_name[8];
- char reserved3[80];
-} __attribute__ ((packed));
-
static inline int phys_hdr__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct phys_hdr);
- else /* INFO_EXT */
- return sizeof(struct x_phys_hdr);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_hdr);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_hdr);
}
static inline __u8 phys_hdr__cpus(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_hdr *)hdr)->cpus;
- else /* INFO_EXT */
- return ((struct x_phys_hdr *)hdr)->cpus;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_hdr *)hdr)->cpus;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_hdr *)hdr)->cpus;
}
/* Physical CPU info block */
-struct phys_cpu {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- char reserved2[3];
- __u64 mgm_time;
- char reserved3[8];
-} __attribute__ ((packed));
-
-struct x_phys_cpu {
- __u16 cpu_addr;
- char reserved1[2];
- __u8 ctidx;
- char reserved2[3];
- __u64 mgm_time;
- char reserved3[80];
-} __attribute__ ((packed));
-
static inline int phys_cpu__size(enum diag204_format type)
{
- if (type == INFO_SIMPLE)
- return sizeof(struct phys_cpu);
- else /* INFO_EXT */
- return sizeof(struct x_phys_cpu);
+ if (type == DIAG204_INFO_SIMPLE)
+ return sizeof(struct diag204_phys_cpu);
+ else /* DIAG204_INFO_EXT */
+ return sizeof(struct diag204_x_phys_cpu);
}
static inline __u16 phys_cpu__cpu_addr(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->cpu_addr;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->cpu_addr;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->cpu_addr;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->cpu_addr;
}
static inline __u64 phys_cpu__mgm_time(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->mgm_time;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->mgm_time;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->mgm_time;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->mgm_time;
}
static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
{
- if (type == INFO_SIMPLE)
- return ((struct phys_cpu *)hdr)->ctidx;
- else /* INFO_EXT */
- return ((struct x_phys_cpu *)hdr)->ctidx;
+ if (type == DIAG204_INFO_SIMPLE)
+ return ((struct diag204_phys_cpu *)hdr)->ctidx;
+ else /* DIAG204_INFO_EXT */
+ return ((struct diag204_x_phys_cpu *)hdr)->ctidx;
}
/* Diagnose 204 functions */
-
-static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
-{
- register unsigned long _subcode asm("0") = *subcode;
- register unsigned long _size asm("1") = size;
-
- asm volatile(
- " diag %2,%0,0x204\n"
- "0: nopr %%r7\n"
- EX_TABLE(0b,0b)
- : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
- *subcode = _subcode;
- return _size;
-}
-
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
-{
- diag_stat_inc(DIAG_STAT_X204);
- size = __diag204(&subcode, size, addr);
- if (subcode)
- return -1;
- return size;
-}
-
/*
* For the old diag subcode 4 with simple data format we have to use real
* memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -411,12 +261,12 @@ static void *diag204_get_buffer(enum diag204_format fmt, int *pages)
*pages = diag204_buf_pages;
return diag204_buf;
}
- if (fmt == INFO_SIMPLE) {
+ if (fmt == DIAG204_INFO_SIMPLE) {
*pages = 1;
return diag204_alloc_rbuf();
- } else {/* INFO_EXT */
- *pages = diag204((unsigned long)SUBC_RSI |
- (unsigned long)INFO_EXT, 0, NULL);
+ } else {/* DIAG204_INFO_EXT */
+ *pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
if (*pages <= 0)
return ERR_PTR(-ENOSYS);
else
@@ -443,18 +293,18 @@ static int diag204_probe(void)
void *buf;
int pages, rc;
- buf = diag204_get_buffer(INFO_EXT, &pages);
+ buf = diag204_get_buffer(DIAG204_INFO_EXT, &pages);
if (!IS_ERR(buf)) {
- if (diag204((unsigned long)SUBC_STIB7 |
- (unsigned long)INFO_EXT, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB7;
- diag204_info_type = INFO_EXT;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB7;
+ diag204_info_type = DIAG204_INFO_EXT;
goto out;
}
- if (diag204((unsigned long)SUBC_STIB6 |
- (unsigned long)INFO_EXT, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB6;
- diag204_info_type = INFO_EXT;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB6 |
+ (unsigned long)DIAG204_INFO_EXT, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB6;
+ diag204_info_type = DIAG204_INFO_EXT;
goto out;
}
diag204_free_buffer();
@@ -462,15 +312,15 @@ static int diag204_probe(void)
/* subcodes 6 and 7 failed, now try subcode 4 */
- buf = diag204_get_buffer(INFO_SIMPLE, &pages);
+ buf = diag204_get_buffer(DIAG204_INFO_SIMPLE, &pages);
if (IS_ERR(buf)) {
rc = PTR_ERR(buf);
goto fail_alloc;
}
- if (diag204((unsigned long)SUBC_STIB4 |
- (unsigned long)INFO_SIMPLE, pages, buf) >= 0) {
- diag204_store_sc = SUBC_STIB4;
- diag204_info_type = INFO_SIMPLE;
+ if (diag204((unsigned long)DIAG204_SUBC_STIB4 |
+ (unsigned long)DIAG204_INFO_SIMPLE, pages, buf) >= 0) {
+ diag204_store_sc = DIAG204_SUBC_STIB4;
+ diag204_info_type = DIAG204_INFO_SIMPLE;
goto out;
} else {
rc = -ENOSYS;
@@ -510,20 +360,6 @@ out:
/* Diagnose 224 functions */
-static int diag224(void *ptr)
-{
- int rc = -EOPNOTSUPP;
-
- diag_stat_inc(DIAG_STAT_X224);
- asm volatile(
- " diag %1,%2,0x224\n"
- "0: lhi %0,0x0\n"
- "1:\n"
- EX_TABLE(0b,1b)
- : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
- return rc;
-}
-
static int diag224_get_name_table(void)
{
/* memory must be below 2GB */
@@ -545,9 +381,9 @@ static void diag224_delete_name_table(void)
static int diag224_idx2name(int index, char *name)
{
- memcpy(name, diag224_cpu_names + ((index + 1) * CPU_NAME_LEN),
- CPU_NAME_LEN);
- name[CPU_NAME_LEN] = 0;
+ memcpy(name, diag224_cpu_names + ((index + 1) * DIAG204_CPU_NAME_LEN),
+ DIAG204_CPU_NAME_LEN);
+ name[DIAG204_CPU_NAME_LEN] = 0;
strim(name);
return 0;
}
@@ -603,7 +439,7 @@ __init int hypfs_diag_init(void)
pr_err("The hardware system does not support hypfs\n");
return -ENODATA;
}
- if (diag204_info_type == INFO_EXT) {
+ if (diag204_info_type == DIAG204_INFO_EXT) {
rc = hypfs_dbfs_create_file(&dbfs_file_d204);
if (rc)
return rc;
@@ -651,7 +487,7 @@ static int hypfs_create_cpu_files(struct dentry *cpus_dir, void *cpu_info)
cpu_info__lp_time(diag204_info_type, cpu_info));
if (IS_ERR(rc))
return PTR_ERR(rc);
- if (diag204_info_type == INFO_EXT) {
+ if (diag204_info_type == DIAG204_INFO_EXT) {
rc = hypfs_create_u64(cpu_dir, "onlinetime",
cpu_info__online_time(diag204_info_type,
cpu_info));
@@ -667,12 +503,12 @@ static void *hypfs_create_lpar_files(struct dentry *systems_dir, void *part_hdr)
{
struct dentry *cpus_dir;
struct dentry *lpar_dir;
- char lpar_name[LPAR_NAME_LEN + 1];
+ char lpar_name[DIAG204_LPAR_NAME_LEN + 1];
void *cpu_info;
int i;
part_hdr__part_name(diag204_info_type, part_hdr, lpar_name);
- lpar_name[LPAR_NAME_LEN] = 0;
+ lpar_name[DIAG204_LPAR_NAME_LEN] = 0;
lpar_dir = hypfs_mkdir(systems_dir, lpar_name);
if (IS_ERR(lpar_dir))
return lpar_dir;
@@ -755,7 +591,8 @@ int hypfs_diag_create_files(struct dentry *root)
goto err_out;
}
}
- if (info_blk_hdr__flags(diag204_info_type, time_hdr) & LPAR_PHYS_FLG) {
+ if (info_blk_hdr__flags(diag204_info_type, time_hdr) &
+ DIAG204_LPAR_PHYS_FLG) {
ptr = hypfs_create_phys_files(root, part_hdr);
if (IS_ERR(ptr)) {
rc = PTR_ERR(ptr);
diff --git a/arch/s390/include/asm/cpacf.h b/arch/s390/include/asm/cpacf.h
index 1a82cf26ee11..d28621de8e0b 100644
--- a/arch/s390/include/asm/cpacf.h
+++ b/arch/s390/include/asm/cpacf.h
@@ -20,6 +20,9 @@
#define CPACF_KMC 0xb92f /* MSA */
#define CPACF_KIMD 0xb93e /* MSA */
#define CPACF_KLMD 0xb93f /* MSA */
+#define CPACF_PCKMO 0xb928 /* MSA3 */
+#define CPACF_KMF 0xb92a /* MSA4 */
+#define CPACF_KMO 0xb92b /* MSA4 */
#define CPACF_PCC 0xb92c /* MSA4 */
#define CPACF_KMCTR 0xb92d /* MSA4 */
#define CPACF_PPNO 0xb93c /* MSA5 */
@@ -136,6 +139,7 @@ static inline void __cpacf_query(unsigned int opcode, unsigned char *status)
register unsigned long r1 asm("1") = (unsigned long) status;
asm volatile(
+ " spm 0\n" /* pckmo doesn't change the cc */
/* Parameter registers are ignored, but may not be 0 */
"0: .insn rrf,%[opc] << 16,2,2,2,0\n"
" brc 1,0b\n" /* handle partial completion */
@@ -157,6 +161,12 @@ static inline int cpacf_query(unsigned int opcode, unsigned int func)
if (!test_facility(17)) /* check for MSA */
return 0;
break;
+ case CPACF_PCKMO:
+ if (!test_facility(76)) /* check for MSA3 */
+ return 0;
+ break;
+ case CPACF_KMF:
+ case CPACF_KMO:
case CPACF_PCC:
case CPACF_KMCTR:
if (!test_facility(77)) /* check for MSA4 */
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 86cae09e076a..8acf482162ed 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -78,4 +78,153 @@ struct diag210 {
extern int diag210(struct diag210 *addr);
+/* bit is set in flags, when physical cpu info is included in diag 204 data */
+#define DIAG204_LPAR_PHYS_FLG 0x80
+#define DIAG204_LPAR_NAME_LEN 8 /* lpar name len in diag 204 data */
+#define DIAG204_CPU_NAME_LEN 16 /* type name len of cpus in diag224 name table */
+
+/* diag 204 subcodes */
+enum diag204_sc {
+ DIAG204_SUBC_STIB4 = 4,
+ DIAG204_SUBC_RSI = 5,
+ DIAG204_SUBC_STIB6 = 6,
+ DIAG204_SUBC_STIB7 = 7
+};
+
+/* The two available diag 204 data formats */
+enum diag204_format {
+ DIAG204_INFO_SIMPLE = 0,
+ DIAG204_INFO_EXT = 0x00010000
+};
+
+enum diag204_cpu_flags {
+ DIAG204_CPU_ONLINE = 0x20,
+ DIAG204_CPU_CAPPED = 0x40,
+};
+
+struct diag204_info_blk_hdr {
+ __u8 npar;
+ __u8 flags;
+ __u16 tslice;
+ __u16 phys_cpus;
+ __u16 this_part;
+ __u64 curtod;
+} __packed;
+
+struct diag204_x_info_blk_hdr {
+ __u8 npar;
+ __u8 flags;
+ __u16 tslice;
+ __u16 phys_cpus;
+ __u16 this_part;
+ __u64 curtod1;
+ __u64 curtod2;
+ char reserved[40];
+} __packed;
+
+struct diag204_part_hdr {
+ __u8 pn;
+ __u8 cpus;
+ char reserved[6];
+ char part_name[DIAG204_LPAR_NAME_LEN];
+} __packed;
+
+struct diag204_x_part_hdr {
+ __u8 pn;
+ __u8 cpus;
+ __u8 rcpus;
+ __u8 pflag;
+ __u32 mlu;
+ char part_name[DIAG204_LPAR_NAME_LEN];
+ char lpc_name[8];
+ char os_name[8];
+ __u64 online_cs;
+ __u64 online_es;
+ __u8 upid;
+ __u8 reserved:3;
+ __u8 mtid:5;
+ char reserved1[2];
+ __u32 group_mlu;
+ char group_name[8];
+ char hardware_group_name[8];
+ char reserved2[24];
+} __packed;
+
+struct diag204_cpu_info {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ __u8 cflag;
+ __u16 weight;
+ __u64 acc_time;
+ __u64 lp_time;
+} __packed;
+
+struct diag204_x_cpu_info {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ __u8 cflag;
+ __u16 weight;
+ __u64 acc_time;
+ __u64 lp_time;
+ __u16 min_weight;
+ __u16 cur_weight;
+ __u16 max_weight;
+ char reseved2[2];
+ __u64 online_time;
+ __u64 wait_time;
+ __u32 pma_weight;
+ __u32 polar_weight;
+ __u32 cpu_type_cap;
+ __u32 group_cpu_type_cap;
+ char reserved3[32];
+} __packed;
+
+struct diag204_phys_hdr {
+ char reserved1[1];
+ __u8 cpus;
+ char reserved2[6];
+ char mgm_name[8];
+} __packed;
+
+struct diag204_x_phys_hdr {
+ char reserved1[1];
+ __u8 cpus;
+ char reserved2[6];
+ char mgm_name[8];
+ char reserved3[80];
+} __packed;
+
+struct diag204_phys_cpu {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ char reserved2[3];
+ __u64 mgm_time;
+ char reserved3[8];
+} __packed;
+
+struct diag204_x_phys_cpu {
+ __u16 cpu_addr;
+ char reserved1[2];
+ __u8 ctidx;
+ char reserved2[1];
+ __u16 weight;
+ __u64 mgm_time;
+ char reserved3[80];
+} __packed;
+
+struct diag204_x_part_block {
+ struct diag204_x_part_hdr hdr;
+ struct diag204_x_cpu_info cpus[];
+} __packed;
+
+struct diag204_x_phys_block {
+ struct diag204_x_phys_hdr hdr;
+ struct diag204_x_phys_cpu cpus[];
+} __packed;
+
+int diag204(unsigned long subcode, unsigned long size, void *addr);
+int diag224(void *ptr);
#endif /* _ASM_S390_DIAG_H */
diff --git a/arch/s390/include/asm/gmap.h b/arch/s390/include/asm/gmap.h
index d054c1b07a3c..741ddba0bf11 100644
--- a/arch/s390/include/asm/gmap.h
+++ b/arch/s390/include/asm/gmap.h
@@ -10,14 +10,25 @@
/**
* struct gmap_struct - guest address space
+ * @list: list head for the mm->context gmap list
* @crst_list: list of all crst tables used in the guest address space
* @mm: pointer to the parent mm_struct
* @guest_to_host: radix tree with guest to host address translation
* @host_to_guest: radix tree with pointer to segment table entries
* @guest_table_lock: spinlock to protect all entries in the guest page table
+ * @ref_count: reference counter for the gmap structure
* @table: pointer to the page directory
* @asce: address space control element for gmap page table
* @pfault_enabled: defines if pfaults are applicable for the guest
+ * @host_to_rmap: radix tree with gmap_rmap lists
+ * @children: list of shadow gmap structures
+ * @pt_list: list of all page tables used in the shadow guest address space
+ * @shadow_lock: spinlock to protect the shadow gmap list
+ * @parent: pointer to the parent gmap for shadow guest address spaces
+ * @orig_asce: ASCE for which the shadow page table has been created
+ * @edat_level: edat level to be used for the shadow translation
+ * @removed: flag to indicate if a shadow guest address space has been removed
+ * @initialized: flag to indicate if a shadow guest address space can be used
*/
struct gmap {
struct list_head list;
@@ -26,26 +37,64 @@ struct gmap {
struct radix_tree_root guest_to_host;
struct radix_tree_root host_to_guest;
spinlock_t guest_table_lock;
+ atomic_t ref_count;
unsigned long *table;
unsigned long asce;
unsigned long asce_end;
void *private;
bool pfault_enabled;
+ /* Additional data for shadow guest address spaces */
+ struct radix_tree_root host_to_rmap;
+ struct list_head children;
+ struct list_head pt_list;
+ spinlock_t shadow_lock;
+ struct gmap *parent;
+ unsigned long orig_asce;
+ int edat_level;
+ bool removed;
+ bool initialized;
};
/**
+ * struct gmap_rmap - reverse mapping for shadow page table entries
+ * @next: pointer to next rmap in the list
+ * @raddr: virtual rmap address in the shadow guest address space
+ */
+struct gmap_rmap {
+ struct gmap_rmap *next;
+ unsigned long raddr;
+};
+
+#define gmap_for_each_rmap(pos, head) \
+ for (pos = (head); pos; pos = pos->next)
+
+#define gmap_for_each_rmap_safe(pos, n, head) \
+ for (pos = (head); n = pos ? pos->next : NULL, pos; pos = n)
+
+/**
* struct gmap_notifier - notify function block for page invalidation
* @notifier_call: address of callback function
*/
struct gmap_notifier {
struct list_head list;
- void (*notifier_call)(struct gmap *gmap, unsigned long gaddr);
+ struct rcu_head rcu;
+ void (*notifier_call)(struct gmap *gmap, unsigned long start,
+ unsigned long end);
};
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit);
-void gmap_free(struct gmap *gmap);
+static inline int gmap_is_shadow(struct gmap *gmap)
+{
+ return !!gmap->parent;
+}
+
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit);
+void gmap_remove(struct gmap *gmap);
+struct gmap *gmap_get(struct gmap *gmap);
+void gmap_put(struct gmap *gmap);
+
void gmap_enable(struct gmap *gmap);
void gmap_disable(struct gmap *gmap);
+struct gmap *gmap_get_enabled(void);
int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long to, unsigned long len);
int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len);
@@ -57,8 +106,29 @@ void gmap_discard(struct gmap *, unsigned long from, unsigned long to);
void __gmap_zap(struct gmap *, unsigned long gaddr);
void gmap_unlink(struct mm_struct *, unsigned long *table, unsigned long vmaddr);
-void gmap_register_ipte_notifier(struct gmap_notifier *);
-void gmap_unregister_ipte_notifier(struct gmap_notifier *);
-int gmap_ipte_notify(struct gmap *, unsigned long start, unsigned long len);
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val);
+
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level);
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level);
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake);
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake);
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake);
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake);
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection, int *fake);
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte);
+
+void gmap_register_pte_notifier(struct gmap_notifier *);
+void gmap_unregister_pte_notifier(struct gmap_notifier *);
+void gmap_pte_notify(struct mm_struct *, unsigned long addr, pte_t *,
+ unsigned long bits);
+
+int gmap_mprotect_notify(struct gmap *, unsigned long start,
+ unsigned long len, int prot);
#endif /* _ASM_S390_GMAP_H */
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index ac82e8eb936d..8e5daf7a76ce 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -43,6 +43,7 @@
/* s390-specific vcpu->requests bit members */
#define KVM_REQ_ENABLE_IBS 8
#define KVM_REQ_DISABLE_IBS 9
+#define KVM_REQ_ICPT_OPEREXC 10
#define SIGP_CTRL_C 0x80
#define SIGP_CTRL_SCN_MASK 0x3f
@@ -145,7 +146,7 @@ struct kvm_s390_sie_block {
__u64 cputm; /* 0x0028 */
__u64 ckc; /* 0x0030 */
__u64 epoch; /* 0x0038 */
- __u8 reserved40[4]; /* 0x0040 */
+ __u32 svcc; /* 0x0040 */
#define LCTL_CR0 0x8000
#define LCTL_CR6 0x0200
#define LCTL_CR9 0x0040
@@ -154,6 +155,7 @@ struct kvm_s390_sie_block {
#define LCTL_CR14 0x0002
__u16 lctl; /* 0x0044 */
__s16 icpua; /* 0x0046 */
+#define ICTL_OPEREXC 0x80000000
#define ICTL_PINT 0x20000000
#define ICTL_LPSW 0x00400000
#define ICTL_STCTL 0x00040000
@@ -166,6 +168,9 @@ struct kvm_s390_sie_block {
#define ICPT_INST 0x04
#define ICPT_PROGI 0x08
#define ICPT_INSTPROGI 0x0C
+#define ICPT_EXTINT 0x14
+#define ICPT_VALIDITY 0x20
+#define ICPT_STOP 0x28
#define ICPT_OPEREXC 0x2C
#define ICPT_PARTEXEC 0x38
#define ICPT_IOINST 0x40
@@ -185,7 +190,9 @@ struct kvm_s390_sie_block {
__u32 scaol; /* 0x0064 */
__u8 reserved68[4]; /* 0x0068 */
__u32 todpr; /* 0x006c */
- __u8 reserved70[32]; /* 0x0070 */
+ __u8 reserved70[16]; /* 0x0070 */
+ __u64 mso; /* 0x0080 */
+ __u64 msl; /* 0x0088 */
psw_t gpsw; /* 0x0090 */
__u64 gg14; /* 0x00a0 */
__u64 gg15; /* 0x00a8 */
@@ -223,7 +230,7 @@ struct kvm_s390_sie_block {
__u8 reserved1e6[2]; /* 0x01e6 */
__u64 itdba; /* 0x01e8 */
__u64 riccbd; /* 0x01f0 */
- __u8 reserved1f8[8]; /* 0x01f8 */
+ __u64 gvrd; /* 0x01f8 */
} __attribute__((packed));
struct kvm_s390_itdb {
@@ -256,6 +263,7 @@ struct kvm_vcpu_stat {
u32 instruction_stctg;
u32 exit_program_interruption;
u32 exit_instr_and_program;
+ u32 exit_operation_exception;
u32 deliver_external_call;
u32 deliver_emergency_signal;
u32 deliver_service_signal;
@@ -278,7 +286,9 @@ struct kvm_vcpu_stat {
u32 instruction_stsi;
u32 instruction_stfl;
u32 instruction_tprot;
+ u32 instruction_sie;
u32 instruction_essa;
+ u32 instruction_sthyi;
u32 instruction_sigp_sense;
u32 instruction_sigp_sense_running;
u32 instruction_sigp_external_call;
@@ -541,12 +551,16 @@ struct kvm_guestdbg_info_arch {
struct kvm_vcpu_arch {
struct kvm_s390_sie_block *sie_block;
+ /* if vsie is active, currently executed shadow sie control block */
+ struct kvm_s390_sie_block *vsie_block;
unsigned int host_acrs[NUM_ACRS];
struct fpu host_fpregs;
struct kvm_s390_local_interrupt local_int;
struct hrtimer ckc_timer;
struct kvm_s390_pgm_info pgm;
struct gmap *gmap;
+ /* backup location for the currently enabled gmap when scheduled out */
+ struct gmap *enabled_gmap;
struct kvm_guestdbg_info_arch guestdbg;
unsigned long pfault_token;
unsigned long pfault_select;
@@ -631,6 +645,14 @@ struct sie_page2 {
u8 reserved900[0x1000 - 0x900]; /* 0x0900 */
} __packed;
+struct kvm_s390_vsie {
+ struct mutex mutex;
+ struct radix_tree_root addr_to_page;
+ int page_count;
+ int next;
+ struct page *pages[KVM_MAX_VCPUS];
+};
+
struct kvm_arch{
void *sca;
int use_esca;
@@ -646,15 +668,20 @@ struct kvm_arch{
int user_cpu_state_ctrl;
int user_sigp;
int user_stsi;
+ int user_instr0;
struct s390_io_adapter *adapters[MAX_S390_IO_ADAPTERS];
wait_queue_head_t ipte_wq;
int ipte_lock_count;
struct mutex ipte_mutex;
+ struct ratelimit_state sthyi_limit;
spinlock_t start_stop_lock;
struct sie_page2 *sie_page2;
struct kvm_s390_cpu_model model;
struct kvm_s390_crypto crypto;
+ struct kvm_s390_vsie vsie;
u64 epoch;
+ /* subset of available cpu features enabled by user space */
+ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
};
#define KVM_HVA_ERR_BAD (-1UL)
diff --git a/arch/s390/include/asm/mmu.h b/arch/s390/include/asm/mmu.h
index 18226437a832..6d39329c894b 100644
--- a/arch/s390/include/asm/mmu.h
+++ b/arch/s390/include/asm/mmu.h
@@ -8,8 +8,9 @@ typedef struct {
cpumask_t cpu_attach_mask;
atomic_t flush_count;
unsigned int flush_mm;
- spinlock_t list_lock;
+ spinlock_t pgtable_lock;
struct list_head pgtable_list;
+ spinlock_t gmap_lock;
struct list_head gmap_list;
unsigned long asce;
unsigned long asce_limit;
@@ -22,9 +23,11 @@ typedef struct {
unsigned int use_skey:1;
} mm_context_t;
-#define INIT_MM_CONTEXT(name) \
- .context.list_lock = __SPIN_LOCK_UNLOCKED(name.context.list_lock), \
- .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+#define INIT_MM_CONTEXT(name) \
+ .context.pgtable_lock = \
+ __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \
+ .context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
+ .context.gmap_lock = __SPIN_LOCK_UNLOCKED(name.context.gmap_lock), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
static inline int tprot(unsigned long addr)
diff --git a/arch/s390/include/asm/mmu_context.h b/arch/s390/include/asm/mmu_context.h
index f77c638bf397..c6a088c91aee 100644
--- a/arch/s390/include/asm/mmu_context.h
+++ b/arch/s390/include/asm/mmu_context.h
@@ -15,8 +15,9 @@
static inline int init_new_context(struct task_struct *tsk,
struct mm_struct *mm)
{
- spin_lock_init(&mm->context.list_lock);
+ spin_lock_init(&mm->context.pgtable_lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
+ spin_lock_init(&mm->context.gmap_lock);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h
index b2146c4119b2..69b8a41fca84 100644
--- a/arch/s390/include/asm/page.h
+++ b/arch/s390/include/asm/page.h
@@ -111,13 +111,14 @@ static inline unsigned char page_get_storage_key(unsigned long addr)
static inline int page_reset_referenced(unsigned long addr)
{
- unsigned int ipm;
+ int cc;
asm volatile(
" rrbe 0,%1\n"
" ipm %0\n"
- : "=d" (ipm) : "a" (addr) : "cc");
- return !!(ipm & 0x20000000);
+ " srl %0,28\n"
+ : "=d" (cc) : "a" (addr) : "cc");
+ return cc;
}
/* Bits int the storage key */
@@ -148,6 +149,8 @@ static inline int devmem_is_allowed(unsigned long pfn)
#define virt_to_page(kaddr) pfn_to_page(__pa(kaddr) >> PAGE_SHIFT)
#define page_to_phys(page) (page_to_pfn(page) << PAGE_SHIFT)
#define virt_addr_valid(kaddr) pfn_valid(__pa(kaddr) >> PAGE_SHIFT)
+#define pfn_to_virt(pfn) __va((pfn) << PAGE_SHIFT)
+#define page_to_virt(page) pfn_to_virt(page_to_pfn(page))
#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | \
VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC)
diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h
index da34cb6b1f3b..f4eb9843eed4 100644
--- a/arch/s390/include/asm/pgalloc.h
+++ b/arch/s390/include/asm/pgalloc.h
@@ -19,8 +19,10 @@ unsigned long *crst_table_alloc(struct mm_struct *);
void crst_table_free(struct mm_struct *, unsigned long *);
unsigned long *page_table_alloc(struct mm_struct *);
+struct page *page_table_alloc_pgste(struct mm_struct *mm);
void page_table_free(struct mm_struct *, unsigned long *);
void page_table_free_rcu(struct mmu_gather *, unsigned long *, unsigned long);
+void page_table_free_pgste(struct page *page);
extern int page_table_allocate_pgste;
static inline void clear_table(unsigned long *s, unsigned long val, size_t n)
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index 48d383af078f..72c7f60bfe83 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -277,6 +277,7 @@ static inline int is_module_addr(void *addr)
/* Bits in the region table entry */
#define _REGION_ENTRY_ORIGIN ~0xfffUL/* region/segment table origin */
#define _REGION_ENTRY_PROTECT 0x200 /* region protection bit */
+#define _REGION_ENTRY_OFFSET 0xc0 /* region table offset */
#define _REGION_ENTRY_INVALID 0x20 /* invalid region table entry */
#define _REGION_ENTRY_TYPE_MASK 0x0c /* region/segment table type mask */
#define _REGION_ENTRY_TYPE_R1 0x0c /* region first table type */
@@ -364,6 +365,7 @@ static inline int is_module_addr(void *addr)
#define PGSTE_GC_BIT 0x0002000000000000UL
#define PGSTE_UC_BIT 0x0000800000000000UL /* user dirty (migration) */
#define PGSTE_IN_BIT 0x0000400000000000UL /* IPTE notify bit */
+#define PGSTE_VSIE_BIT 0x0000200000000000UL /* ref'd in a shadow table */
/* Guest Page State used for virtualization */
#define _PGSTE_GPS_ZERO 0x0000000080000000UL
@@ -1002,15 +1004,26 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
void ptep_set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t entry);
void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
-void ptep_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+void ptep_notify(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, unsigned long bits);
+int ptep_force_prot(struct mm_struct *mm, unsigned long gaddr,
+ pte_t *ptep, int prot, unsigned long bit);
void ptep_zap_unused(struct mm_struct *mm, unsigned long addr,
pte_t *ptep , int reset);
void ptep_zap_key(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte);
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep);
bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long address);
int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
unsigned char key, bool nq);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr);
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc);
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr);
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key);
/*
* Certain architectures need to do special things when PTEs
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 09529202ea77..03323175de30 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -112,6 +112,8 @@ struct thread_struct {
unsigned long ksp; /* kernel stack pointer */
mm_segment_t mm_segment;
unsigned long gmap_addr; /* address of last gmap fault. */
+ unsigned int gmap_write_flag; /* gmap fault write indication */
+ unsigned int gmap_int_code; /* int code of last gmap fault */
unsigned int gmap_pfault; /* signal of a pending guest pfault */
struct per_regs per_user; /* User specified PER registers */
struct per_event per_event; /* Cause of the last PER trap */
diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h
index e4f6f73afe2f..2ad9c204b1a2 100644
--- a/arch/s390/include/asm/sclp.h
+++ b/arch/s390/include/asm/sclp.h
@@ -32,12 +32,19 @@ struct sclp_core_entry {
u8 reserved0;
u8 : 4;
u8 sief2 : 1;
- u8 : 3;
- u8 : 3;
+ u8 skey : 1;
+ u8 : 2;
+ u8 : 2;
+ u8 gpere : 1;
u8 siif : 1;
u8 sigpif : 1;
u8 : 3;
- u8 reserved2[10];
+ u8 reserved2[3];
+ u8 : 2;
+ u8 ib : 1;
+ u8 cei : 1;
+ u8 : 4;
+ u8 reserved3[6];
u8 type;
u8 reserved1;
} __attribute__((packed));
@@ -59,6 +66,15 @@ struct sclp_info {
unsigned char has_hvs : 1;
unsigned char has_esca : 1;
unsigned char has_sief2 : 1;
+ unsigned char has_64bscao : 1;
+ unsigned char has_gpere : 1;
+ unsigned char has_cmma : 1;
+ unsigned char has_gsls : 1;
+ unsigned char has_ib : 1;
+ unsigned char has_cei : 1;
+ unsigned char has_pfmfi : 1;
+ unsigned char has_ibs : 1;
+ unsigned char has_skey : 1;
unsigned int ibc;
unsigned int mtid;
unsigned int mtid_cp;
@@ -101,5 +117,6 @@ int memcpy_hsa_kernel(void *dest, unsigned long src, size_t count);
int memcpy_hsa_user(void __user *dest, unsigned long src, size_t count);
void sclp_early_detect(void);
void _sclp_print_early(const char *);
+void sclp_ocf_cpc_name_copy(char *dst);
#endif /* _ASM_S390_SCLP_H */
diff --git a/arch/s390/include/uapi/asm/kvm.h b/arch/s390/include/uapi/asm/kvm.h
index 3b8e99ef9d58..a2ffec4139ad 100644
--- a/arch/s390/include/uapi/asm/kvm.h
+++ b/arch/s390/include/uapi/asm/kvm.h
@@ -93,6 +93,47 @@ struct kvm_s390_vm_cpu_machine {
__u64 fac_list[256];
};
+#define KVM_S390_VM_CPU_PROCESSOR_FEAT 2
+#define KVM_S390_VM_CPU_MACHINE_FEAT 3
+
+#define KVM_S390_VM_CPU_FEAT_NR_BITS 1024
+#define KVM_S390_VM_CPU_FEAT_ESOP 0
+#define KVM_S390_VM_CPU_FEAT_SIEF2 1
+#define KVM_S390_VM_CPU_FEAT_64BSCAO 2
+#define KVM_S390_VM_CPU_FEAT_SIIF 3
+#define KVM_S390_VM_CPU_FEAT_GPERE 4
+#define KVM_S390_VM_CPU_FEAT_GSLS 5
+#define KVM_S390_VM_CPU_FEAT_IB 6
+#define KVM_S390_VM_CPU_FEAT_CEI 7
+#define KVM_S390_VM_CPU_FEAT_IBS 8
+#define KVM_S390_VM_CPU_FEAT_SKEY 9
+#define KVM_S390_VM_CPU_FEAT_CMMA 10
+#define KVM_S390_VM_CPU_FEAT_PFMFI 11
+#define KVM_S390_VM_CPU_FEAT_SIGPIF 12
+struct kvm_s390_vm_cpu_feat {
+ __u64 feat[16];
+};
+
+#define KVM_S390_VM_CPU_PROCESSOR_SUBFUNC 4
+#define KVM_S390_VM_CPU_MACHINE_SUBFUNC 5
+/* for "test bit" instructions MSB 0 bit ordering, for "query" raw blocks */
+struct kvm_s390_vm_cpu_subfunc {
+ __u8 plo[32]; /* always */
+ __u8 ptff[16]; /* with TOD-clock steering */
+ __u8 kmac[16]; /* with MSA */
+ __u8 kmc[16]; /* with MSA */
+ __u8 km[16]; /* with MSA */
+ __u8 kimd[16]; /* with MSA */
+ __u8 klmd[16]; /* with MSA */
+ __u8 pckmo[16]; /* with MSA3 */
+ __u8 kmctr[16]; /* with MSA4 */
+ __u8 kmf[16]; /* with MSA4 */
+ __u8 kmo[16]; /* with MSA4 */
+ __u8 pcc[16]; /* with MSA4 */
+ __u8 ppno[16]; /* with MSA5 */
+ __u8 reserved[1824];
+};
+
/* kvm attributes for crypto */
#define KVM_S390_VM_CRYPTO_ENABLE_AES_KW 0
#define KVM_S390_VM_CRYPTO_ENABLE_DEA_KW 1
diff --git a/arch/s390/include/uapi/asm/sie.h b/arch/s390/include/uapi/asm/sie.h
index 8fb5d4a6dd25..3ac634368939 100644
--- a/arch/s390/include/uapi/asm/sie.h
+++ b/arch/s390/include/uapi/asm/sie.h
@@ -140,6 +140,7 @@
exit_code_ipa0(0xB2, 0x4c, "TAR"), \
exit_code_ipa0(0xB2, 0x50, "CSP"), \
exit_code_ipa0(0xB2, 0x54, "MVPG"), \
+ exit_code_ipa0(0xB2, 0x56, "STHYI"), \
exit_code_ipa0(0xB2, 0x58, "BSG"), \
exit_code_ipa0(0xB2, 0x5a, "BSA"), \
exit_code_ipa0(0xB2, 0x5f, "CHSC"), \
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 48b37b8357e6..a97354c8c667 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -162,6 +162,30 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
}
EXPORT_SYMBOL(diag14);
+static inline int __diag204(unsigned long *subcode, unsigned long size, void *addr)
+{
+ register unsigned long _subcode asm("0") = *subcode;
+ register unsigned long _size asm("1") = size;
+
+ asm volatile(
+ " diag %2,%0,0x204\n"
+ "0: nopr %%r7\n"
+ EX_TABLE(0b,0b)
+ : "+d" (_subcode), "+d" (_size) : "d" (addr) : "memory");
+ *subcode = _subcode;
+ return _size;
+}
+
+int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+ diag_stat_inc(DIAG_STAT_X204);
+ size = __diag204(&subcode, size, addr);
+ if (subcode)
+ return -1;
+ return size;
+}
+EXPORT_SYMBOL(diag204);
+
/*
* Diagnose 210: Get information about a virtual device
*/
@@ -196,3 +220,18 @@ int diag210(struct diag210 *addr)
return ccode;
}
EXPORT_SYMBOL(diag210);
+
+int diag224(void *ptr)
+{
+ int rc = -EOPNOTSUPP;
+
+ diag_stat_inc(DIAG_STAT_X224);
+ asm volatile(
+ " diag %1,%2,0x224\n"
+ "0: lhi %0,0x0\n"
+ "1:\n"
+ EX_TABLE(0b,1b)
+ : "+d" (rc) :"d" (0), "d" (ptr) : "memory");
+ return rc;
+}
+EXPORT_SYMBOL(diag224);
diff --git a/arch/s390/kvm/Makefile b/arch/s390/kvm/Makefile
index d42fa38c2429..09a9e6dfc09f 100644
--- a/arch/s390/kvm/Makefile
+++ b/arch/s390/kvm/Makefile
@@ -12,6 +12,6 @@ common-objs = $(KVM)/kvm_main.o $(KVM)/eventfd.o $(KVM)/async_pf.o $(KVM)/irqch
ccflags-y := -Ivirt/kvm -Iarch/s390/kvm
kvm-objs := $(common-objs) kvm-s390.o intercept.o interrupt.o priv.o sigp.o
-kvm-objs += diag.o gaccess.o guestdbg.o
+kvm-objs += diag.o gaccess.o guestdbg.o sthyi.o vsie.o
obj-$(CONFIG_KVM) += kvm.o
diff --git a/arch/s390/kvm/diag.c b/arch/s390/kvm/diag.c
index 1ea4095b67d7..ce865bd4f81d 100644
--- a/arch/s390/kvm/diag.c
+++ b/arch/s390/kvm/diag.c
@@ -212,6 +212,11 @@ static int __diag_virtio_hypercall(struct kvm_vcpu *vcpu)
(vcpu->run->s.regs.gprs[1] != KVM_S390_VIRTIO_CCW_NOTIFY))
return -EOPNOTSUPP;
+ VCPU_EVENT(vcpu, 4, "diag 0x500 schid 0x%8.8x queue 0x%x cookie 0x%llx",
+ (u32) vcpu->run->s.regs.gprs[2],
+ (u32) vcpu->run->s.regs.gprs[3],
+ vcpu->run->s.regs.gprs[4]);
+
/*
* The layout is as follows:
* - gpr 2 contains the subchannel id (passed as addr)
diff --git a/arch/s390/kvm/gaccess.c b/arch/s390/kvm/gaccess.c
index 66938d283b77..54200208bf24 100644
--- a/arch/s390/kvm/gaccess.c
+++ b/arch/s390/kvm/gaccess.c
@@ -8,6 +8,7 @@
#include <linux/vmalloc.h>
#include <linux/err.h>
#include <asm/pgtable.h>
+#include <asm/gmap.h>
#include "kvm-s390.h"
#include "gaccess.h"
#include <asm/switch_to.h>
@@ -476,18 +477,73 @@ enum {
FSI_FETCH = 2 /* Exception was due to fetch operation */
};
-static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
- ar_t ar, enum gacc_mode mode)
+enum prot_type {
+ PROT_TYPE_LA = 0,
+ PROT_TYPE_KEYC = 1,
+ PROT_TYPE_ALC = 2,
+ PROT_TYPE_DAT = 3,
+};
+
+static int trans_exc(struct kvm_vcpu *vcpu, int code, unsigned long gva,
+ ar_t ar, enum gacc_mode mode, enum prot_type prot)
{
- int rc;
- struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- struct trans_exc_code_bits *tec_bits;
+ struct trans_exc_code_bits *tec;
memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
- tec_bits->as = psw.as;
+ pgm->code = code;
+ tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
+
+ switch (code) {
+ case PGM_ASCE_TYPE:
+ case PGM_PAGE_TRANSLATION:
+ case PGM_REGION_FIRST_TRANS:
+ case PGM_REGION_SECOND_TRANS:
+ case PGM_REGION_THIRD_TRANS:
+ case PGM_SEGMENT_TRANSLATION:
+ /*
+ * op_access_id only applies to MOVE_PAGE -> set bit 61
+ * exc_access_id has to be set to 0 for some instructions. Both
+ * cases have to be handled by the caller. We can always store
+ * exc_access_id, as it is undefined for non-ar cases.
+ */
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* FALL THROUGH */
+ case PGM_ALEN_TRANSLATION:
+ case PGM_ALE_SEQUENCE:
+ case PGM_ASTE_VALIDITY:
+ case PGM_ASTE_SEQUENCE:
+ case PGM_EXTENDED_AUTHORITY:
+ pgm->exc_access_id = ar;
+ break;
+ case PGM_PROTECTION:
+ switch (prot) {
+ case PROT_TYPE_ALC:
+ tec->b60 = 1;
+ /* FALL THROUGH */
+ case PROT_TYPE_DAT:
+ tec->b61 = 1;
+ tec->addr = gva >> PAGE_SHIFT;
+ tec->fsi = mode == GACC_STORE ? FSI_STORE : FSI_FETCH;
+ tec->as = psw_bits(vcpu->arch.sie_block->gpsw).as;
+ /* exc_access_id is undefined for most cases */
+ pgm->exc_access_id = ar;
+ break;
+ default: /* LA and KEYC set b61 to 0, other params undefined */
+ break;
+ }
+ break;
+ }
+ return code;
+}
+
+static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
+ unsigned long ga, ar_t ar, enum gacc_mode mode)
+{
+ int rc;
+ struct psw_bits psw = psw_bits(vcpu->arch.sie_block->gpsw);
if (!psw.t) {
asce->val = 0;
@@ -510,21 +566,8 @@ static int get_vcpu_asce(struct kvm_vcpu *vcpu, union asce *asce,
return 0;
case PSW_AS_ACCREG:
rc = ar_translation(vcpu, asce, ar, mode);
- switch (rc) {
- case PGM_ALEN_TRANSLATION:
- case PGM_ALE_SEQUENCE:
- case PGM_ASTE_VALIDITY:
- case PGM_ASTE_SEQUENCE:
- case PGM_EXTENDED_AUTHORITY:
- vcpu->arch.pgm.exc_access_id = ar;
- break;
- case PGM_PROTECTION:
- tec_bits->b60 = 1;
- tec_bits->b61 = 1;
- break;
- }
if (rc > 0)
- pgm->code = rc;
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_ALC);
return rc;
}
return 0;
@@ -729,40 +772,31 @@ static int low_address_protection_enabled(struct kvm_vcpu *vcpu,
return 1;
}
-static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga,
+static int guest_page_range(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar,
unsigned long *pages, unsigned long nr_pages,
const union asce asce, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
- int lap_enabled, rc;
+ int lap_enabled, rc = 0;
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
lap_enabled = low_address_protection_enabled(vcpu, asce);
while (nr_pages) {
ga = kvm_s390_logical_to_effective(vcpu, ga);
- tec_bits->addr = ga >> PAGE_SHIFT;
- if (mode == GACC_STORE && lap_enabled && is_low_address(ga)) {
- pgm->code = PGM_PROTECTION;
- return pgm->code;
- }
+ if (mode == GACC_STORE && lap_enabled && is_low_address(ga))
+ return trans_exc(vcpu, PGM_PROTECTION, ga, ar, mode,
+ PROT_TYPE_LA);
ga &= PAGE_MASK;
if (psw_bits(*psw).t) {
rc = guest_translate(vcpu, ga, pages, asce, mode);
if (rc < 0)
return rc;
- if (rc == PGM_PROTECTION)
- tec_bits->b61 = 1;
- if (rc)
- pgm->code = rc;
} else {
*pages = kvm_s390_real_to_abs(vcpu, ga);
if (kvm_is_error_gpa(vcpu->kvm, *pages))
- pgm->code = PGM_ADDRESSING;
+ rc = PGM_ADDRESSING;
}
- if (pgm->code)
- return pgm->code;
+ if (rc)
+ return trans_exc(vcpu, rc, ga, ar, mode, PROT_TYPE_DAT);
ga += PAGE_SIZE;
pages++;
nr_pages--;
@@ -783,7 +817,8 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
if (!len)
return 0;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
+ ga = kvm_s390_logical_to_effective(vcpu, ga);
+ rc = get_vcpu_asce(vcpu, &asce, ga, ar, mode);
if (rc)
return rc;
nr_pages = (((ga & ~PAGE_MASK) + len - 1) >> PAGE_SHIFT) + 1;
@@ -795,7 +830,7 @@ int access_guest(struct kvm_vcpu *vcpu, unsigned long ga, ar_t ar, void *data,
need_ipte_lock = psw_bits(*psw).t && !asce.r;
if (need_ipte_lock)
ipte_lock(vcpu);
- rc = guest_page_range(vcpu, ga, pages, nr_pages, asce, mode);
+ rc = guest_page_range(vcpu, ga, ar, pages, nr_pages, asce, mode);
for (idx = 0; idx < nr_pages && !rc; idx++) {
gpa = *(pages + idx) + (ga & ~PAGE_MASK);
_len = min(PAGE_SIZE - (gpa & ~PAGE_MASK), len);
@@ -846,37 +881,28 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
int guest_translate_address(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
unsigned long *gpa, enum gacc_mode mode)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec;
union asce asce;
int rc;
gva = kvm_s390_logical_to_effective(vcpu, gva);
- tec = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- rc = get_vcpu_asce(vcpu, &asce, ar, mode);
- tec->addr = gva >> PAGE_SHIFT;
+ rc = get_vcpu_asce(vcpu, &asce, gva, ar, mode);
if (rc)
return rc;
if (is_low_address(gva) && low_address_protection_enabled(vcpu, asce)) {
- if (mode == GACC_STORE) {
- rc = pgm->code = PGM_PROTECTION;
- return rc;
- }
+ if (mode == GACC_STORE)
+ return trans_exc(vcpu, PGM_PROTECTION, gva, 0,
+ mode, PROT_TYPE_LA);
}
if (psw_bits(*psw).t && !asce.r) { /* Use DAT? */
rc = guest_translate(vcpu, gva, gpa, asce, mode);
- if (rc > 0) {
- if (rc == PGM_PROTECTION)
- tec->b61 = 1;
- pgm->code = rc;
- }
+ if (rc > 0)
+ return trans_exc(vcpu, rc, gva, 0, mode, PROT_TYPE_DAT);
} else {
- rc = 0;
*gpa = kvm_s390_real_to_abs(vcpu, gva);
if (kvm_is_error_gpa(vcpu->kvm, *gpa))
- rc = pgm->code = PGM_ADDRESSING;
+ return trans_exc(vcpu, rc, gva, PGM_ADDRESSING, mode, 0);
}
return rc;
@@ -915,20 +941,247 @@ int check_gva_range(struct kvm_vcpu *vcpu, unsigned long gva, ar_t ar,
*/
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra)
{
- struct kvm_s390_pgm_info *pgm = &vcpu->arch.pgm;
- psw_t *psw = &vcpu->arch.sie_block->gpsw;
- struct trans_exc_code_bits *tec_bits;
union ctlreg0 ctlreg0 = {.val = vcpu->arch.sie_block->gcr[0]};
if (!ctlreg0.lap || !is_low_address(gra))
return 0;
+ return trans_exc(vcpu, PGM_PROTECTION, gra, 0, GACC_STORE, PROT_TYPE_LA);
+}
- memset(pgm, 0, sizeof(*pgm));
- tec_bits = (struct trans_exc_code_bits *)&pgm->trans_exc_code;
- tec_bits->fsi = FSI_STORE;
- tec_bits->as = psw_bits(*psw).as;
- tec_bits->addr = gra >> PAGE_SHIFT;
- pgm->code = PGM_PROTECTION;
+/**
+ * kvm_s390_shadow_tables - walk the guest page table and create shadow tables
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: pointer to the page table address result
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ */
+static int kvm_s390_shadow_tables(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ struct gmap *parent;
+ union asce asce;
+ union vaddress vaddr;
+ unsigned long ptr;
+ int rc;
+
+ *fake = 0;
+ *dat_protection = 0;
+ parent = sg->parent;
+ vaddr.addr = saddr;
+ asce.val = sg->orig_asce;
+ ptr = asce.origin * 4096;
+ if (asce.r) {
+ *fake = 1;
+ asce.dt = ASCE_TYPE_REGION1;
+ }
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1:
+ if (vaddr.rfx01 > asce.tl && !asce.r)
+ return PGM_REGION_FIRST_TRANS;
+ break;
+ case ASCE_TYPE_REGION2:
+ if (vaddr.rfx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rsx01 > asce.tl)
+ return PGM_REGION_SECOND_TRANS;
+ break;
+ case ASCE_TYPE_REGION3:
+ if (vaddr.rfx || vaddr.rsx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.rtx01 > asce.tl)
+ return PGM_REGION_THIRD_TRANS;
+ break;
+ case ASCE_TYPE_SEGMENT:
+ if (vaddr.rfx || vaddr.rsx || vaddr.rtx)
+ return PGM_ASCE_TYPE;
+ if (vaddr.sx01 > asce.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ break;
+ }
+
+ switch (asce.dt) {
+ case ASCE_TYPE_REGION1: {
+ union region1_table_entry rfte;
- return pgm->code;
+ if (*fake) {
+ /* offset in 16EB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rsx << 53UL);
+ rfte.val = ptr;
+ goto shadow_r2t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rfx * 8, &rfte.val);
+ if (rc)
+ return rc;
+ if (rfte.i)
+ return PGM_REGION_FIRST_TRANS;
+ if (rfte.tt != TABLE_TYPE_REGION1)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rsx01 < rfte.tf || vaddr.rsx01 > rfte.tl)
+ return PGM_REGION_SECOND_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rfte.p;
+ ptr = rfte.rto << 12UL;
+shadow_r2t:
+ rc = gmap_shadow_r2t(sg, saddr, rfte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION2: {
+ union region2_table_entry rste;
+
+ if (*fake) {
+ /* offset in 8PB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.rtx << 42UL);
+ rste.val = ptr;
+ goto shadow_r3t;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rsx * 8, &rste.val);
+ if (rc)
+ return rc;
+ if (rste.i)
+ return PGM_REGION_SECOND_TRANS;
+ if (rste.tt != TABLE_TYPE_REGION2)
+ return PGM_TRANSLATION_SPEC;
+ if (vaddr.rtx01 < rste.tf || vaddr.rtx01 > rste.tl)
+ return PGM_REGION_THIRD_TRANS;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rste.p;
+ ptr = rste.rto << 12UL;
+shadow_r3t:
+ rste.p |= *dat_protection;
+ rc = gmap_shadow_r3t(sg, saddr, rste.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_REGION3: {
+ union region3_table_entry rtte;
+
+ if (*fake) {
+ /* offset in 4TB guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 31UL);
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.rtx * 8, &rtte.val);
+ if (rc)
+ return rc;
+ if (rtte.i)
+ return PGM_REGION_THIRD_TRANS;
+ if (rtte.tt != TABLE_TYPE_REGION3)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.cr && asce.p && sg->edat_level >= 2)
+ return PGM_TRANSLATION_SPEC;
+ if (rtte.fc && sg->edat_level >= 2) {
+ *dat_protection |= rtte.fc0.p;
+ *fake = 1;
+ ptr = rtte.fc1.rfaa << 31UL;
+ rtte.val = ptr;
+ goto shadow_sgt;
+ }
+ if (vaddr.sx01 < rtte.fc0.tf || vaddr.sx01 > rtte.fc0.tl)
+ return PGM_SEGMENT_TRANSLATION;
+ if (sg->edat_level >= 1)
+ *dat_protection |= rtte.fc0.p;
+ ptr = rtte.fc0.sto << 12UL;
+shadow_sgt:
+ rtte.fc0.p |= *dat_protection;
+ rc = gmap_shadow_sgt(sg, saddr, rtte.val, *fake);
+ if (rc)
+ return rc;
+ /* fallthrough */
+ }
+ case ASCE_TYPE_SEGMENT: {
+ union segment_table_entry ste;
+
+ if (*fake) {
+ /* offset in 2G guest memory block */
+ ptr = ptr + ((unsigned long) vaddr.sx << 20UL);
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ rc = gmap_read_table(parent, ptr + vaddr.sx * 8, &ste.val);
+ if (rc)
+ return rc;
+ if (ste.i)
+ return PGM_SEGMENT_TRANSLATION;
+ if (ste.tt != TABLE_TYPE_SEGMENT)
+ return PGM_TRANSLATION_SPEC;
+ if (ste.cs && asce.p)
+ return PGM_TRANSLATION_SPEC;
+ *dat_protection |= ste.fc0.p;
+ if (ste.fc && sg->edat_level >= 1) {
+ *fake = 1;
+ ptr = ste.fc1.sfaa << 20UL;
+ ste.val = ptr;
+ goto shadow_pgt;
+ }
+ ptr = ste.fc0.pto << 11UL;
+shadow_pgt:
+ ste.fc0.p |= *dat_protection;
+ rc = gmap_shadow_pgt(sg, saddr, ste.val, *fake);
+ if (rc)
+ return rc;
+ }
+ }
+ /* Return the parent address of the page table */
+ *pgt = ptr;
+ return 0;
+}
+
+/**
+ * kvm_s390_shadow_fault - handle fault on a shadow page table
+ * @vcpu: virtual cpu
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ *
+ * Returns: - 0 if the shadow fault was successfully resolved
+ * - > 0 (pgm exception code) on exceptions while faulting
+ * - -EAGAIN if the caller can retry immediately
+ * - -EFAULT when accessing invalid guest addresses
+ * - -ENOMEM if out of memory
+ */
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *sg,
+ unsigned long saddr)
+{
+ union vaddress vaddr;
+ union page_table_entry pte;
+ unsigned long pgt;
+ int dat_protection, fake;
+ int rc;
+
+ down_read(&sg->mm->mmap_sem);
+ /*
+ * We don't want any guest-2 tables to change - so the parent
+ * tables/pointers we read stay valid - unshadowing is however
+ * always possible - only guest_table_lock protects us.
+ */
+ ipte_lock(vcpu);
+
+ rc = gmap_shadow_pgt_lookup(sg, saddr, &pgt, &dat_protection, &fake);
+ if (rc)
+ rc = kvm_s390_shadow_tables(sg, saddr, &pgt, &dat_protection,
+ &fake);
+
+ vaddr.addr = saddr;
+ if (fake) {
+ /* offset in 1MB guest memory block */
+ pte.val = pgt + ((unsigned long) vaddr.px << 12UL);
+ goto shadow_page;
+ }
+ if (!rc)
+ rc = gmap_read_table(sg->parent, pgt + vaddr.px * 8, &pte.val);
+ if (!rc && pte.i)
+ rc = PGM_PAGE_TRANSLATION;
+ if (!rc && (pte.z || (pte.co && sg->edat_level < 1)))
+ rc = PGM_TRANSLATION_SPEC;
+shadow_page:
+ pte.p |= dat_protection;
+ if (!rc)
+ rc = gmap_shadow_page(sg, saddr, __pte(pte.val));
+ ipte_unlock(vcpu);
+ up_read(&sg->mm->mmap_sem);
+ return rc;
}
diff --git a/arch/s390/kvm/gaccess.h b/arch/s390/kvm/gaccess.h
index df0a79dd8159..8756569ad938 100644
--- a/arch/s390/kvm/gaccess.h
+++ b/arch/s390/kvm/gaccess.h
@@ -361,4 +361,7 @@ void ipte_unlock(struct kvm_vcpu *vcpu);
int ipte_lock_held(struct kvm_vcpu *vcpu);
int kvm_s390_check_low_addr_prot_real(struct kvm_vcpu *vcpu, unsigned long gra);
+int kvm_s390_shadow_fault(struct kvm_vcpu *vcpu, struct gmap *shadow,
+ unsigned long saddr);
+
#endif /* __KVM_S390_GACCESS_H */
diff --git a/arch/s390/kvm/guestdbg.c b/arch/s390/kvm/guestdbg.c
index e8c6843b9600..31a05330d11c 100644
--- a/arch/s390/kvm/guestdbg.c
+++ b/arch/s390/kvm/guestdbg.c
@@ -439,6 +439,23 @@ exit_required:
#define guest_per_enabled(vcpu) \
(vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PER)
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu)
+{
+ const u8 ilen = kvm_s390_get_ilen(vcpu);
+ struct kvm_s390_pgm_info pgm_info = {
+ .code = PGM_PER,
+ .per_code = PER_EVENT_IFETCH >> 24,
+ .per_address = __rewind_psw(vcpu->arch.sie_block->gpsw, ilen),
+ };
+
+ /*
+ * The PSW points to the next instruction, therefore the intercepted
+ * instruction generated a PER i-fetch event. PER address therefore
+ * points at the previous PSW address (could be an EXECUTE function).
+ */
+ return kvm_s390_inject_prog_irq(vcpu, &pgm_info);
+}
+
static void filter_guest_per_event(struct kvm_vcpu *vcpu)
{
u32 perc = vcpu->arch.sie_block->perc << 24;
@@ -465,7 +482,7 @@ static void filter_guest_per_event(struct kvm_vcpu *vcpu)
guest_perc &= ~PER_EVENT_IFETCH;
/* All other PER events will be given to the guest */
- /* TODO: Check alterated address/address space */
+ /* TODO: Check altered address/address space */
vcpu->arch.sie_block->perc = guest_perc >> 24;
diff --git a/arch/s390/kvm/intercept.c b/arch/s390/kvm/intercept.c
index 252157181302..dfd0ca2638fa 100644
--- a/arch/s390/kvm/intercept.c
+++ b/arch/s390/kvm/intercept.c
@@ -351,8 +351,26 @@ static int handle_partial_execution(struct kvm_vcpu *vcpu)
return -EOPNOTSUPP;
}
+static int handle_operexc(struct kvm_vcpu *vcpu)
+{
+ vcpu->stat.exit_operation_exception++;
+ trace_kvm_s390_handle_operexc(vcpu, vcpu->arch.sie_block->ipa,
+ vcpu->arch.sie_block->ipb);
+
+ if (vcpu->arch.sie_block->ipa == 0xb256 &&
+ test_kvm_facility(vcpu->kvm, 74))
+ return handle_sthyi(vcpu);
+
+ if (vcpu->arch.sie_block->ipa == 0 && vcpu->kvm->arch.user_instr0)
+ return -EOPNOTSUPP;
+
+ return kvm_s390_inject_program_int(vcpu, PGM_OPERATION);
+}
+
int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
{
+ int rc, per_rc = 0;
+
if (kvm_is_ucontrol(vcpu->kvm))
return -EOPNOTSUPP;
@@ -361,7 +379,8 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
case 0x18:
return handle_noop(vcpu);
case 0x04:
- return handle_instruction(vcpu);
+ rc = handle_instruction(vcpu);
+ break;
case 0x08:
return handle_prog(vcpu);
case 0x14:
@@ -372,9 +391,19 @@ int kvm_handle_sie_intercept(struct kvm_vcpu *vcpu)
return handle_validity(vcpu);
case 0x28:
return handle_stop(vcpu);
+ case 0x2c:
+ rc = handle_operexc(vcpu);
+ break;
case 0x38:
- return handle_partial_execution(vcpu);
+ rc = handle_partial_execution(vcpu);
+ break;
default:
return -EOPNOTSUPP;
}
+
+ /* process PER, also if the instrution is processed in user space */
+ if (vcpu->arch.sie_block->icptstatus & 0x02 &&
+ (!rc || rc == -EOPNOTSUPP))
+ per_rc = kvm_s390_handle_per_ifetch_icpt(vcpu);
+ return per_rc ? per_rc : rc;
}
diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c
index 5a80af740d3e..24524c0f3ef8 100644
--- a/arch/s390/kvm/interrupt.c
+++ b/arch/s390/kvm/interrupt.c
@@ -28,9 +28,6 @@
#include "gaccess.h"
#include "trace-s390.h"
-#define IOINT_SCHID_MASK 0x0000ffff
-#define IOINT_SSID_MASK 0x00030000
-#define IOINT_CSSID_MASK 0x03fc0000
#define PFAULT_INIT 0x0600
#define PFAULT_DONE 0x0680
#define VIRTIO_PARAM 0x0d00
@@ -821,7 +818,14 @@ static int __must_check __deliver_io(struct kvm_vcpu *vcpu,
struct kvm_s390_interrupt_info,
list);
if (inti) {
- VCPU_EVENT(vcpu, 4, "deliver: I/O 0x%llx", inti->type);
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VCPU_EVENT(vcpu, 4, "%s", "deliver: I/O (AI)");
+ else
+ VCPU_EVENT(vcpu, 4, "deliver: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
+
vcpu->stat.deliver_io_int++;
trace_kvm_s390_deliver_interrupt(vcpu->vcpu_id,
inti->type,
@@ -991,6 +995,11 @@ void kvm_s390_vcpu_wakeup(struct kvm_vcpu *vcpu)
swake_up(&vcpu->wq);
vcpu->stat.halt_wakeup++;
}
+ /*
+ * The VCPU might not be sleeping but is executing the VSIE. Let's
+ * kick it, so it leaves the SIE to process the request.
+ */
+ kvm_s390_vsie_kick(vcpu);
}
enum hrtimer_restart kvm_s390_idle_wakeup(struct hrtimer *timer)
@@ -1415,6 +1424,13 @@ static int __inject_io(struct kvm *kvm, struct kvm_s390_interrupt_info *inti)
}
fi->counters[FIRQ_CNTR_IO] += 1;
+ if (inti->type & KVM_S390_INT_IO_AI_MASK)
+ VM_EVENT(kvm, 4, "%s", "inject: I/O (AI)");
+ else
+ VM_EVENT(kvm, 4, "inject: I/O %x ss %x schid %04x",
+ inti->io.subchannel_id >> 8,
+ inti->io.subchannel_id >> 1 & 0x3,
+ inti->io.subchannel_nr);
isc = int_word_to_isc(inti->io.io_int_word);
list = &fi->lists[FIRQ_LIST_IO_ISC_0 + isc];
list_add_tail(&inti->list, list);
@@ -1531,13 +1547,6 @@ int kvm_s390_inject_vm(struct kvm *kvm,
inti->mchk.mcic = s390int->parm64;
break;
case KVM_S390_INT_IO_MIN...KVM_S390_INT_IO_MAX:
- if (inti->type & KVM_S390_INT_IO_AI_MASK)
- VM_EVENT(kvm, 5, "%s", "inject: I/O (AI)");
- else
- VM_EVENT(kvm, 5, "inject: I/O css %x ss %x schid %04x",
- s390int->type & IOINT_CSSID_MASK,
- s390int->type & IOINT_SSID_MASK,
- s390int->type & IOINT_SCHID_MASK);
inti->io.subchannel_id = s390int->parm >> 16;
inti->io.subchannel_nr = s390int->parm & 0x0000ffffu;
inti->io.io_int_parm = s390int->parm64 >> 32;
@@ -2237,7 +2246,8 @@ static int set_adapter_int(struct kvm_kernel_irq_routing_entry *e,
return ret;
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int ret;
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 6f5c344cd785..3f3ae4865d57 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -21,11 +21,13 @@
#include <linux/init.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
+#include <linux/mman.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/slab.h>
#include <linux/timer.h>
#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
#include <asm/asm-offsets.h>
#include <asm/lowcore.h>
#include <asm/stp.h>
@@ -35,6 +37,8 @@
#include <asm/switch_to.h>
#include <asm/isc.h>
#include <asm/sclp.h>
+#include <asm/cpacf.h>
+#include <asm/timex.h>
#include "kvm-s390.h"
#include "gaccess.h"
@@ -64,6 +68,7 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "exit_pei", VCPU_STAT(exit_pei) },
{ "exit_program_interruption", VCPU_STAT(exit_program_interruption) },
{ "exit_instr_and_program_int", VCPU_STAT(exit_instr_and_program) },
+ { "exit_operation_exception", VCPU_STAT(exit_operation_exception) },
{ "halt_successful_poll", VCPU_STAT(halt_successful_poll) },
{ "halt_attempted_poll", VCPU_STAT(halt_attempted_poll) },
{ "halt_poll_invalid", VCPU_STAT(halt_poll_invalid) },
@@ -94,6 +99,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "instruction_stsi", VCPU_STAT(instruction_stsi) },
{ "instruction_stfl", VCPU_STAT(instruction_stfl) },
{ "instruction_tprot", VCPU_STAT(instruction_tprot) },
+ { "instruction_sthyi", VCPU_STAT(instruction_sthyi) },
+ { "instruction_sie", VCPU_STAT(instruction_sie) },
{ "instruction_sigp_sense", VCPU_STAT(instruction_sigp_sense) },
{ "instruction_sigp_sense_running", VCPU_STAT(instruction_sigp_sense_running) },
{ "instruction_sigp_external_call", VCPU_STAT(instruction_sigp_external_call) },
@@ -119,6 +126,11 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ NULL }
};
+/* allow nested virtualization in KVM (if enabled by user space) */
+static int nested;
+module_param(nested, int, S_IRUGO);
+MODULE_PARM_DESC(nested, "Nested virtualization support");
+
/* upper facilities limit for kvm */
unsigned long kvm_s390_fac_list_mask[16] = {
0xffe6000000000000UL,
@@ -131,7 +143,13 @@ unsigned long kvm_s390_fac_list_mask_size(void)
return ARRAY_SIZE(kvm_s390_fac_list_mask);
}
+/* available cpu features supported by kvm */
+static DECLARE_BITMAP(kvm_s390_available_cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS);
+/* available subfunctions indicated via query / "test bit" */
+static struct kvm_s390_vm_cpu_subfunc kvm_s390_available_subfunc;
+
static struct gmap_notifier gmap_notifier;
+static struct gmap_notifier vsie_gmap_notifier;
debug_info_t *kvm_s390_dbf;
/* Section: not file related */
@@ -141,7 +159,8 @@ int kvm_arch_hardware_enable(void)
return 0;
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address);
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
/*
* This callback is executed during stop_machine(). All CPUs are therefore
@@ -163,6 +182,8 @@ static int kvm_clock_sync(struct notifier_block *notifier, unsigned long val,
vcpu->arch.sie_block->epoch -= *delta;
if (vcpu->arch.cputm_enabled)
vcpu->arch.cputm_start += *delta;
+ if (vcpu->arch.vsie_block)
+ vcpu->arch.vsie_block->epoch -= *delta;
}
}
return NOTIFY_OK;
@@ -175,7 +196,9 @@ static struct notifier_block kvm_clock_notifier = {
int kvm_arch_hardware_setup(void)
{
gmap_notifier.notifier_call = kvm_gmap_notifier;
- gmap_register_ipte_notifier(&gmap_notifier);
+ gmap_register_pte_notifier(&gmap_notifier);
+ vsie_gmap_notifier.notifier_call = kvm_s390_vsie_gmap_notifier;
+ gmap_register_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_register(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
return 0;
@@ -183,11 +206,109 @@ int kvm_arch_hardware_setup(void)
void kvm_arch_hardware_unsetup(void)
{
- gmap_unregister_ipte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&gmap_notifier);
+ gmap_unregister_pte_notifier(&vsie_gmap_notifier);
atomic_notifier_chain_unregister(&s390_epoch_delta_notifier,
&kvm_clock_notifier);
}
+static void allow_cpu_feat(unsigned long nr)
+{
+ set_bit_inv(nr, kvm_s390_available_cpu_feat);
+}
+
+static inline int plo_test_bit(unsigned char nr)
+{
+ register unsigned long r0 asm("0") = (unsigned long) nr | 0x100;
+ int cc = 3; /* subfunction not available */
+
+ asm volatile(
+ /* Parameter registers are ignored for "test bit" */
+ " plo 0,0,0,0(0)\n"
+ " ipm %0\n"
+ " srl %0,28\n"
+ : "=d" (cc)
+ : "d" (r0)
+ : "cc");
+ return cc == 0;
+}
+
+static void kvm_s390_cpu_feat_init(void)
+{
+ int i;
+
+ for (i = 0; i < 256; ++i) {
+ if (plo_test_bit(i))
+ kvm_s390_available_subfunc.plo[i >> 3] |= 0x80 >> (i & 7);
+ }
+
+ if (test_facility(28)) /* TOD-clock steering */
+ ptff(kvm_s390_available_subfunc.ptff,
+ sizeof(kvm_s390_available_subfunc.ptff),
+ PTFF_QAF);
+
+ if (test_facility(17)) { /* MSA */
+ __cpacf_query(CPACF_KMAC, kvm_s390_available_subfunc.kmac);
+ __cpacf_query(CPACF_KMC, kvm_s390_available_subfunc.kmc);
+ __cpacf_query(CPACF_KM, kvm_s390_available_subfunc.km);
+ __cpacf_query(CPACF_KIMD, kvm_s390_available_subfunc.kimd);
+ __cpacf_query(CPACF_KLMD, kvm_s390_available_subfunc.klmd);
+ }
+ if (test_facility(76)) /* MSA3 */
+ __cpacf_query(CPACF_PCKMO, kvm_s390_available_subfunc.pckmo);
+ if (test_facility(77)) { /* MSA4 */
+ __cpacf_query(CPACF_KMCTR, kvm_s390_available_subfunc.kmctr);
+ __cpacf_query(CPACF_KMF, kvm_s390_available_subfunc.kmf);
+ __cpacf_query(CPACF_KMO, kvm_s390_available_subfunc.kmo);
+ __cpacf_query(CPACF_PCC, kvm_s390_available_subfunc.pcc);
+ }
+ if (test_facility(57)) /* MSA5 */
+ __cpacf_query(CPACF_PPNO, kvm_s390_available_subfunc.ppno);
+
+ if (MACHINE_HAS_ESOP)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_ESOP);
+ /*
+ * We need SIE support, ESOP (PROT_READ protection for gmap_shadow),
+ * 64bit SCAO (SCA passthrough) and IDTE (for gmap_shadow unshadowing).
+ */
+ if (!sclp.has_sief2 || !MACHINE_HAS_ESOP || !sclp.has_64bscao ||
+ !test_facility(3) || !nested)
+ return;
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIEF2);
+ if (sclp.has_64bscao)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_64BSCAO);
+ if (sclp.has_siif)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_SIIF);
+ if (sclp.has_gpere)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GPERE);
+ if (sclp.has_gsls)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_GSLS);
+ if (sclp.has_ib)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IB);
+ if (sclp.has_cei)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_CEI);
+ if (sclp.has_ibs)
+ allow_cpu_feat(KVM_S390_VM_CPU_FEAT_IBS);
+ /*
+ * KVM_S390_VM_CPU_FEAT_SKEY: Wrong shadow of PTE.I bits will make
+ * all skey handling functions read/set the skey from the PGSTE
+ * instead of the real storage key.
+ *
+ * KVM_S390_VM_CPU_FEAT_CMMA: Wrong shadow of PTE.I bits will make
+ * pages being detected as preserved although they are resident.
+ *
+ * KVM_S390_VM_CPU_FEAT_PFMFI: Wrong shadow of PTE.I bits will
+ * have the same effect as for KVM_S390_VM_CPU_FEAT_SKEY.
+ *
+ * For KVM_S390_VM_CPU_FEAT_SKEY, KVM_S390_VM_CPU_FEAT_CMMA and
+ * KVM_S390_VM_CPU_FEAT_PFMFI, all PTE.I and PGSTE bits have to be
+ * correctly shadowed. We can do that for the PGSTE but not for PTE.I.
+ *
+ * KVM_S390_VM_CPU_FEAT_SIGPIF: Wrong SCB addresses in the SCA. We
+ * cannot easily shadow the SCA because of the ipte lock.
+ */
+}
+
int kvm_arch_init(void *opaque)
{
kvm_s390_dbf = debug_register("kvm-trace", 32, 1, 7 * sizeof(long));
@@ -199,6 +320,8 @@ int kvm_arch_init(void *opaque)
return -ENOMEM;
}
+ kvm_s390_cpu_feat_init();
+
/* Register floating interrupt controller interface. */
return kvm_register_device_ops(&kvm_flic_ops, KVM_DEV_TYPE_FLIC);
}
@@ -244,6 +367,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_S390_USER_STSI:
case KVM_CAP_S390_SKEYS:
case KVM_CAP_S390_IRQ_STATE:
+ case KVM_CAP_S390_USER_INSTR0:
r = 1;
break;
case KVM_CAP_S390_MEM_OP:
@@ -251,8 +375,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
case KVM_CAP_NR_VCPUS:
case KVM_CAP_MAX_VCPUS:
- r = sclp.has_esca ? KVM_S390_ESCA_CPU_SLOTS
- : KVM_S390_BSCA_CPU_SLOTS;
+ r = KVM_S390_BSCA_CPU_SLOTS;
+ if (sclp.has_esca && sclp.has_64bscao)
+ r = KVM_S390_ESCA_CPU_SLOTS;
break;
case KVM_CAP_NR_MEMSLOTS:
r = KVM_USER_MEM_SLOTS;
@@ -335,6 +460,16 @@ out:
return r;
}
+static void icpt_operexc_on_all_vcpus(struct kvm *kvm)
+{
+ unsigned int i;
+ struct kvm_vcpu *vcpu;
+
+ kvm_for_each_vcpu(i, vcpu, kvm) {
+ kvm_s390_sync_request(KVM_REQ_ICPT_OPEREXC, vcpu);
+ }
+}
+
static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
{
int r;
@@ -355,7 +490,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
break;
case KVM_CAP_S390_VECTOR_REGISTERS:
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (MACHINE_HAS_VX) {
set_kvm_facility(kvm->arch.model.fac_mask, 129);
@@ -370,7 +505,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
case KVM_CAP_S390_RI:
r = -EINVAL;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
r = -EBUSY;
} else if (test_facility(64)) {
set_kvm_facility(kvm->arch.model.fac_mask, 64);
@@ -386,6 +521,12 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, struct kvm_enable_cap *cap)
kvm->arch.user_stsi = 1;
r = 0;
break;
+ case KVM_CAP_S390_USER_INSTR0:
+ VM_EVENT(kvm, 3, "%s", "ENABLE: CAP_S390_USER_INSTR0");
+ kvm->arch.user_instr0 = 1;
+ icpt_operexc_on_all_vcpus(kvm);
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -418,21 +559,23 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
unsigned int idx;
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
- /* enable CMMA only for z10 and later (EDAT_1) */
- ret = -EINVAL;
- if (!MACHINE_IS_LPAR || !MACHINE_HAS_EDAT1)
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
break;
ret = -EBUSY;
VM_EVENT(kvm, 3, "%s", "ENABLE: CMMA support");
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
+ if (!kvm->created_vcpus) {
kvm->arch.use_cmma = 1;
ret = 0;
}
mutex_unlock(&kvm->lock);
break;
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = -ENXIO;
+ if (!sclp.has_cmma)
+ break;
ret = -EINVAL;
if (!kvm->arch.use_cmma)
break;
@@ -461,20 +604,20 @@ static int kvm_s390_set_mem_control(struct kvm *kvm, struct kvm_device_attr *att
if (!new_limit)
return -EINVAL;
- /* gmap_alloc takes last usable address */
+ /* gmap_create takes last usable address */
if (new_limit != KVM_S390_NO_MEM_LIMIT)
new_limit -= 1;
ret = -EBUSY;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) == 0) {
- /* gmap_alloc will round the limit up */
- struct gmap *new = gmap_alloc(current->mm, new_limit);
+ if (!kvm->created_vcpus) {
+ /* gmap_create will round the limit up */
+ struct gmap *new = gmap_create(current->mm, new_limit);
if (!new) {
ret = -ENOMEM;
} else {
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
new->private = kvm;
kvm->arch.gmap = new;
ret = 0;
@@ -644,7 +787,7 @@ static int kvm_s390_set_processor(struct kvm *kvm, struct kvm_device_attr *attr)
int ret = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus)) {
+ if (kvm->created_vcpus) {
ret = -EBUSY;
goto out;
}
@@ -676,6 +819,39 @@ out:
return ret;
}
+static int kvm_s390_set_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+ int ret = -EBUSY;
+
+ if (copy_from_user(&data, (void __user *)attr->addr, sizeof(data)))
+ return -EFAULT;
+ if (!bitmap_subset((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS))
+ return -EINVAL;
+
+ mutex_lock(&kvm->lock);
+ if (!atomic_read(&kvm->online_vcpus)) {
+ bitmap_copy(kvm->arch.cpu_feat, (unsigned long *) data.feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ ret = 0;
+ }
+ mutex_unlock(&kvm->lock);
+ return ret;
+}
+
+static int kvm_s390_set_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once supported by kernel + hw, we have to store the subfunctions
+ * in kvm->arch and remember that user space configured them.
+ */
+ return -ENXIO;
+}
+
static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -684,6 +860,12 @@ static int kvm_s390_set_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_PROCESSOR:
ret = kvm_s390_set_processor(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_set_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_set_processor_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -732,6 +914,50 @@ out:
return ret;
}
+static int kvm_s390_get_processor_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat, kvm->arch.cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_machine_feat(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ struct kvm_s390_vm_cpu_feat data;
+
+ bitmap_copy((unsigned long *) data.feat,
+ kvm_s390_available_cpu_feat,
+ KVM_S390_VM_CPU_FEAT_NR_BITS);
+ if (copy_to_user((void __user *)attr->addr, &data, sizeof(data)))
+ return -EFAULT;
+ return 0;
+}
+
+static int kvm_s390_get_processor_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ /*
+ * Once we can actually configure subfunctions (kernel + hw support),
+ * we have to check if they were already set by user space, if so copy
+ * them from kvm->arch.
+ */
+ return -ENXIO;
+}
+
+static int kvm_s390_get_machine_subfunc(struct kvm *kvm,
+ struct kvm_device_attr *attr)
+{
+ if (copy_to_user((void __user *)attr->addr, &kvm_s390_available_subfunc,
+ sizeof(struct kvm_s390_vm_cpu_subfunc)))
+ return -EFAULT;
+ return 0;
+}
static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
{
int ret = -ENXIO;
@@ -743,6 +969,18 @@ static int kvm_s390_get_cpu_model(struct kvm *kvm, struct kvm_device_attr *attr)
case KVM_S390_VM_CPU_MACHINE:
ret = kvm_s390_get_machine(kvm, attr);
break;
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ ret = kvm_s390_get_processor_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ ret = kvm_s390_get_machine_feat(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
+ ret = kvm_s390_get_processor_subfunc(kvm, attr);
+ break;
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
+ ret = kvm_s390_get_machine_subfunc(kvm, attr);
+ break;
}
return ret;
}
@@ -803,6 +1041,8 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_MEM_ENABLE_CMMA:
case KVM_S390_VM_MEM_CLR_CMMA:
+ ret = sclp.has_cmma ? 0 : -ENXIO;
+ break;
case KVM_S390_VM_MEM_LIMIT_SIZE:
ret = 0;
break;
@@ -826,8 +1066,13 @@ static int kvm_s390_vm_has_attr(struct kvm *kvm, struct kvm_device_attr *attr)
switch (attr->attr) {
case KVM_S390_VM_CPU_PROCESSOR:
case KVM_S390_VM_CPU_MACHINE:
+ case KVM_S390_VM_CPU_PROCESSOR_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_FEAT:
+ case KVM_S390_VM_CPU_MACHINE_SUBFUNC:
ret = 0;
break;
+ /* configuring subfunctions is not supported yet */
+ case KVM_S390_VM_CPU_PROCESSOR_SUBFUNC:
default:
ret = -ENXIO;
break;
@@ -858,7 +1103,6 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
{
uint8_t *keys;
uint64_t hva;
- unsigned long curkey;
int i, r = 0;
if (args->flags != 0)
@@ -879,26 +1123,27 @@ static long kvm_s390_get_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (!keys)
return -ENOMEM;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
- curkey = get_guest_storage_key(current->mm, hva);
- if (IS_ERR_VALUE(curkey)) {
- r = curkey;
- goto out;
- }
- keys[i] = curkey;
+ r = get_guest_storage_key(current->mm, hva, &keys[i]);
+ if (r)
+ break;
+ }
+ up_read(&current->mm->mmap_sem);
+
+ if (!r) {
+ r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
+ sizeof(uint8_t) * args->count);
+ if (r)
+ r = -EFAULT;
}
- r = copy_to_user((uint8_t __user *)args->skeydata_addr, keys,
- sizeof(uint8_t) * args->count);
- if (r)
- r = -EFAULT;
-out:
kvfree(keys);
return r;
}
@@ -935,24 +1180,25 @@ static long kvm_s390_set_skeys(struct kvm *kvm, struct kvm_s390_skeys *args)
if (r)
goto out;
+ down_read(&current->mm->mmap_sem);
for (i = 0; i < args->count; i++) {
hva = gfn_to_hva(kvm, args->start_gfn + i);
if (kvm_is_error_hva(hva)) {
r = -EFAULT;
- goto out;
+ break;
}
/* Lowest order bit is reserved */
if (keys[i] & 0x01) {
r = -EINVAL;
- goto out;
+ break;
}
- r = set_guest_storage_key(current->mm, hva,
- (unsigned long)keys[i], 0);
+ r = set_guest_storage_key(current->mm, hva, keys[i], 0);
if (r)
- goto out;
+ break;
}
+ up_read(&current->mm->mmap_sem);
out:
kvfree(keys);
return r;
@@ -1129,6 +1375,7 @@ static void sca_dispose(struct kvm *kvm)
int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
{
+ gfp_t alloc_flags = GFP_KERNEL;
int i, rc;
char debug_name[16];
static unsigned long sca_offset;
@@ -1150,9 +1397,13 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
rc = -ENOMEM;
+ ratelimit_state_init(&kvm->arch.sthyi_limit, 5 * HZ, 500);
+
kvm->arch.use_esca = 0; /* start with basic SCA */
+ if (!sclp.has_64bscao)
+ alloc_flags |= GFP_DMA;
rwlock_init(&kvm->arch.sca_lock);
- kvm->arch.sca = (struct bsca_block *) get_zeroed_page(GFP_KERNEL);
+ kvm->arch.sca = (struct bsca_block *) get_zeroed_page(alloc_flags);
if (!kvm->arch.sca)
goto out_err;
spin_lock(&kvm_lock);
@@ -1189,6 +1440,9 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
memcpy(kvm->arch.model.fac_list, kvm->arch.model.fac_mask,
S390_ARCH_FAC_LIST_SIZE_BYTE);
+ set_kvm_facility(kvm->arch.model.fac_mask, 74);
+ set_kvm_facility(kvm->arch.model.fac_list, 74);
+
kvm->arch.model.cpuid = kvm_s390_get_initial_cpuid();
kvm->arch.model.ibc = sclp.ibc & 0x0fff;
@@ -1212,7 +1466,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
else
kvm->arch.mem_limit = min_t(unsigned long, TASK_MAX_SIZE,
sclp.hamax + 1);
- kvm->arch.gmap = gmap_alloc(current->mm, kvm->arch.mem_limit - 1);
+ kvm->arch.gmap = gmap_create(current->mm, kvm->arch.mem_limit - 1);
if (!kvm->arch.gmap)
goto out_err;
kvm->arch.gmap->private = kvm;
@@ -1224,6 +1478,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
kvm->arch.epoch = 0;
spin_lock_init(&kvm->arch.start_stop_lock);
+ kvm_s390_vsie_init(kvm);
KVM_EVENT(3, "vm 0x%pK created by pid %u", kvm, current->pid);
return 0;
@@ -1245,7 +1500,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
sca_del_vcpu(vcpu);
if (kvm_is_ucontrol(vcpu->kvm))
- gmap_free(vcpu->arch.gmap);
+ gmap_remove(vcpu->arch.gmap);
if (vcpu->kvm->arch.use_cmma)
kvm_s390_vcpu_unsetup_cmma(vcpu);
@@ -1278,16 +1533,17 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
debug_unregister(kvm->arch.dbf);
free_page((unsigned long)kvm->arch.sie_page2);
if (!kvm_is_ucontrol(kvm))
- gmap_free(kvm->arch.gmap);
+ gmap_remove(kvm->arch.gmap);
kvm_s390_destroy_adapters(kvm);
kvm_s390_clear_float_irqs(kvm);
+ kvm_s390_vsie_destroy(kvm);
KVM_EVENT(3, "vm 0x%pK destroyed", kvm);
}
/* Section: vcpu related */
static int __kvm_ucontrol_vcpu_init(struct kvm_vcpu *vcpu)
{
- vcpu->arch.gmap = gmap_alloc(current->mm, -1UL);
+ vcpu->arch.gmap = gmap_create(current->mm, -1UL);
if (!vcpu->arch.gmap)
return -ENOMEM;
vcpu->arch.gmap->private = vcpu->kvm;
@@ -1396,7 +1652,7 @@ static int sca_can_add_vcpu(struct kvm *kvm, unsigned int id)
if (id < KVM_S390_BSCA_CPU_SLOTS)
return true;
- if (!sclp.has_esca)
+ if (!sclp.has_esca || !sclp.has_64bscao)
return false;
mutex_lock(&kvm->lock);
@@ -1537,7 +1793,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
save_access_regs(vcpu->arch.host_acrs);
restore_access_regs(vcpu->run->s.regs.acrs);
- gmap_enable(vcpu->arch.gmap);
+ gmap_enable(vcpu->arch.enabled_gmap);
atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__start_cpu_timer_accounting(vcpu);
@@ -1550,7 +1806,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu))
__stop_cpu_timer_accounting(vcpu);
atomic_andnot(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags);
- gmap_disable(vcpu->arch.gmap);
+ vcpu->arch.enabled_gmap = gmap_get_enabled();
+ gmap_disable(vcpu->arch.enabled_gmap);
/* Save guest register state */
save_fpu_regs();
@@ -1599,7 +1856,10 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu)
vcpu->arch.gmap = vcpu->kvm->arch.gmap;
sca_add_vcpu(vcpu);
}
-
+ if (test_kvm_facility(vcpu->kvm, 74) || vcpu->kvm->arch.user_instr0)
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ /* make vcpu_load load the right gmap on the first trigger */
+ vcpu->arch.enabled_gmap = vcpu->arch.gmap;
}
static void kvm_s390_vcpu_crypto_setup(struct kvm_vcpu *vcpu)
@@ -1658,15 +1918,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu)
kvm_s390_vcpu_setup_model(vcpu);
- vcpu->arch.sie_block->ecb = 0x02;
+ /* pgste_set_pte has special handling for !MACHINE_HAS_ESOP */
+ if (MACHINE_HAS_ESOP)
+ vcpu->arch.sie_block->ecb |= 0x02;
if (test_kvm_facility(vcpu->kvm, 9))
vcpu->arch.sie_block->ecb |= 0x04;
- if (test_kvm_facility(vcpu->kvm, 50) && test_kvm_facility(vcpu->kvm, 73))
+ if (test_kvm_facility(vcpu->kvm, 73))
vcpu->arch.sie_block->ecb |= 0x10;
- if (test_kvm_facility(vcpu->kvm, 8))
+ if (test_kvm_facility(vcpu->kvm, 8) && sclp.has_pfmfi)
vcpu->arch.sie_block->ecb2 |= 0x08;
- vcpu->arch.sie_block->eca = 0xC1002000U;
+ vcpu->arch.sie_block->eca = 0x1002000U;
+ if (sclp.has_cei)
+ vcpu->arch.sie_block->eca |= 0x80000000U;
+ if (sclp.has_ib)
+ vcpu->arch.sie_block->eca |= 0x40000000U;
if (sclp.has_siif)
vcpu->arch.sie_block->eca |= 1;
if (sclp.has_sigpif)
@@ -1716,6 +1982,10 @@ struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
vcpu->arch.sie_block = &sie_page->sie_block;
vcpu->arch.sie_block->itdba = (unsigned long) &sie_page->itdb;
+ /* the real guest size will always be smaller than msl */
+ vcpu->arch.sie_block->mso = 0;
+ vcpu->arch.sie_block->msl = sclp.hamax;
+
vcpu->arch.sie_block->icpua = id;
spin_lock_init(&vcpu->arch.local_int.lock);
vcpu->arch.local_int.float_int = &kvm->arch.float_int;
@@ -1784,16 +2054,25 @@ void kvm_s390_sync_request(int req, struct kvm_vcpu *vcpu)
kvm_s390_vcpu_request(vcpu);
}
-static void kvm_gmap_notifier(struct gmap *gmap, unsigned long address)
+static void kvm_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
{
- int i;
struct kvm *kvm = gmap->private;
struct kvm_vcpu *vcpu;
+ unsigned long prefix;
+ int i;
+ if (gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
kvm_for_each_vcpu(i, vcpu, kvm) {
/* match against both prefix pages */
- if (kvm_s390_get_prefix(vcpu) == (address & ~0x1000UL)) {
- VCPU_EVENT(vcpu, 2, "gmap notifier for %lx", address);
+ prefix = kvm_s390_get_prefix(vcpu);
+ if (prefix <= end && start <= prefix + 2*PAGE_SIZE - 1) {
+ VCPU_EVENT(vcpu, 2, "gmap notifier for %lx-%lx",
+ start, end);
kvm_s390_sync_request(KVM_REQ_MMU_RELOAD, vcpu);
}
}
@@ -2002,6 +2281,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
if (dbg->control & ~VALID_GUESTDBG_FLAGS)
return -EINVAL;
+ if (!sclp.has_gpere)
+ return -EINVAL;
if (dbg->control & KVM_GUESTDBG_ENABLE) {
vcpu->guest_debug = dbg->control;
@@ -2070,16 +2351,16 @@ retry:
return 0;
/*
* We use MMU_RELOAD just to re-arm the ipte notifier for the
- * guest prefix page. gmap_ipte_notify will wait on the ptl lock.
+ * guest prefix page. gmap_mprotect_notify will wait on the ptl lock.
* This ensures that the ipte instruction for this request has
* already finished. We might race against a second unmapper that
* wants to set the blocking bit. Lets just retry the request loop.
*/
if (kvm_check_request(KVM_REQ_MMU_RELOAD, vcpu)) {
int rc;
- rc = gmap_ipte_notify(vcpu->arch.gmap,
- kvm_s390_get_prefix(vcpu),
- PAGE_SIZE * 2);
+ rc = gmap_mprotect_notify(vcpu->arch.gmap,
+ kvm_s390_get_prefix(vcpu),
+ PAGE_SIZE * 2, PROT_WRITE);
if (rc)
return rc;
goto retry;
@@ -2108,6 +2389,11 @@ retry:
goto retry;
}
+ if (kvm_check_request(KVM_REQ_ICPT_OPEREXC, vcpu)) {
+ vcpu->arch.sie_block->ictl |= ICTL_OPEREXC;
+ goto retry;
+ }
+
/* nothing to do, just clear the request */
clear_bit(KVM_REQ_UNHALT, &vcpu->requests);
@@ -2362,14 +2648,14 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
* guest_enter and guest_exit should be no uaccess.
*/
local_irq_disable();
- __kvm_guest_enter();
+ guest_enter_irqoff();
__disable_cpu_timer_accounting(vcpu);
local_irq_enable();
exit_reason = sie64a(vcpu->arch.sie_block,
vcpu->run->s.regs.gprs);
local_irq_disable();
__enable_cpu_timer_accounting(vcpu);
- __kvm_guest_exit();
+ guest_exit_irqoff();
local_irq_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -2598,6 +2884,8 @@ static void __disable_ibs_on_all_vcpus(struct kvm *kvm)
static void __enable_ibs_on_vcpu(struct kvm_vcpu *vcpu)
{
+ if (!sclp.has_ibs)
+ return;
kvm_check_request(KVM_REQ_DISABLE_IBS, vcpu);
kvm_s390_sync_request(KVM_REQ_ENABLE_IBS, vcpu);
}
diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h
index 8621ab00ec8e..b8432862a817 100644
--- a/arch/s390/kvm/kvm-s390.h
+++ b/arch/s390/kvm/kvm-s390.h
@@ -56,7 +56,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu)
static inline int is_vcpu_idle(struct kvm_vcpu *vcpu)
{
- return atomic_read(&vcpu->arch.sie_block->cpuflags) & CPUSTAT_WAIT;
+ return test_bit(vcpu->vcpu_id, vcpu->arch.local_int.float_int->idle_mask);
}
static inline int kvm_is_ucontrol(struct kvm *kvm)
@@ -175,6 +175,12 @@ static inline int set_kvm_facility(u64 *fac_list, unsigned long nr)
return 0;
}
+static inline int test_kvm_cpu_feat(struct kvm *kvm, unsigned long nr)
+{
+ WARN_ON_ONCE(nr >= KVM_S390_VM_CPU_FEAT_NR_BITS);
+ return test_bit_inv(nr, kvm->arch.cpu_feat);
+}
+
/* are cpu states controlled by user space */
static inline int kvm_s390_user_cpu_state_ctrl(struct kvm *kvm)
{
@@ -232,6 +238,8 @@ static inline void kvm_s390_forward_psw(struct kvm_vcpu *vcpu, int ilen)
}
static inline void kvm_s390_retry_instr(struct kvm_vcpu *vcpu)
{
+ /* don't inject PER events if we re-execute the instruction */
+ vcpu->arch.sie_block->icptstatus &= ~0x02;
kvm_s390_rewind_psw(vcpu, kvm_s390_get_ilen(vcpu));
}
@@ -246,10 +254,21 @@ int kvm_s390_handle_stctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_lctl(struct kvm_vcpu *vcpu);
int kvm_s390_handle_eb(struct kvm_vcpu *vcpu);
+/* implemented in vsie.c */
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu);
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end);
+void kvm_s390_vsie_init(struct kvm *kvm);
+void kvm_s390_vsie_destroy(struct kvm *kvm);
+
/* implemented in sigp.c */
int kvm_s390_handle_sigp(struct kvm_vcpu *vcpu);
int kvm_s390_handle_sigp_pei(struct kvm_vcpu *vcpu);
+/* implemented in sthyi.c */
+int handle_sthyi(struct kvm_vcpu *vcpu);
+
/* implemented in kvm-s390.c */
void kvm_s390_set_tod_clock(struct kvm *kvm, u64 tod);
long kvm_arch_fault_in_page(struct kvm_vcpu *vcpu, gpa_t gpa, int writable);
@@ -360,6 +379,7 @@ int kvm_s390_import_bp_data(struct kvm_vcpu *vcpu,
struct kvm_guest_debug *dbg);
void kvm_s390_clear_bp_data(struct kvm_vcpu *vcpu);
void kvm_s390_prepare_debug_exit(struct kvm_vcpu *vcpu);
+int kvm_s390_handle_per_ifetch_icpt(struct kvm_vcpu *vcpu);
void kvm_s390_handle_per_event(struct kvm_vcpu *vcpu);
/* support for Basic/Extended SCA handling */
diff --git a/arch/s390/kvm/priv.c b/arch/s390/kvm/priv.c
index 95916fa7c670..46160388e996 100644
--- a/arch/s390/kvm/priv.c
+++ b/arch/s390/kvm/priv.c
@@ -27,6 +27,7 @@
#include <asm/io.h>
#include <asm/ptrace.h>
#include <asm/compat.h>
+#include <asm/sclp.h>
#include "gaccess.h"
#include "kvm-s390.h"
#include "trace.h"
@@ -152,30 +153,166 @@ static int handle_store_cpu_address(struct kvm_vcpu *vcpu)
static int __skey_check_enable(struct kvm_vcpu *vcpu)
{
int rc = 0;
+
+ trace_kvm_s390_skey_related_inst(vcpu);
if (!(vcpu->arch.sie_block->ictl & (ICTL_ISKE | ICTL_SSKE | ICTL_RRBE)))
return rc;
rc = s390_enable_skey();
- VCPU_EVENT(vcpu, 3, "%s", "enabling storage keys for guest");
- trace_kvm_s390_skey_related_inst(vcpu);
- vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
+ VCPU_EVENT(vcpu, 3, "enabling storage keys for guest: %d", rc);
+ if (!rc)
+ vcpu->arch.sie_block->ictl &= ~(ICTL_ISKE | ICTL_SSKE | ICTL_RRBE);
return rc;
}
-
-static int handle_skey(struct kvm_vcpu *vcpu)
+static int try_handle_skey(struct kvm_vcpu *vcpu)
{
- int rc = __skey_check_enable(vcpu);
+ int rc;
+ vcpu->stat.instruction_storage_key++;
+ rc = __skey_check_enable(vcpu);
if (rc)
return rc;
- vcpu->stat.instruction_storage_key++;
-
+ if (sclp.has_skey) {
+ /* with storage-key facility, SIE interprets it for us */
+ kvm_s390_retry_instr(vcpu);
+ VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+ return -EAGAIN;
+ }
if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+ return 0;
+}
- kvm_s390_retry_instr(vcpu);
- VCPU_EVENT(vcpu, 4, "%s", "retrying storage key operation");
+static int handle_iske(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ unsigned char key;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = get_guest_storage_key(current->mm, addr, &key);
+ up_read(&current->mm->mmap_sem);
+ if (rc)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff;
+ vcpu->run->s.regs.gprs[reg1] |= key;
+ return 0;
+}
+
+static int handle_rrbe(struct kvm_vcpu *vcpu)
+{
+ unsigned long addr;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ addr = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ addr = kvm_s390_logical_to_effective(vcpu, addr);
+ addr = kvm_s390_real_to_abs(vcpu, addr);
+ addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(addr));
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = reset_guest_reference_bit(current->mm, addr);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ kvm_s390_set_psw_cc(vcpu, rc);
+ return 0;
+}
+
+#define SSKE_NQ 0x8
+#define SSKE_MR 0x4
+#define SSKE_MC 0x2
+#define SSKE_MB 0x1
+static int handle_sske(struct kvm_vcpu *vcpu)
+{
+ unsigned char m3 = vcpu->arch.sie_block->ipb >> 28;
+ unsigned long start, end;
+ unsigned char key, oldkey;
+ int reg1, reg2;
+ int rc;
+
+ rc = try_handle_skey(vcpu);
+ if (rc)
+ return rc != -EAGAIN ? rc : 0;
+
+ if (!test_kvm_facility(vcpu->kvm, 8))
+ m3 &= ~SSKE_MB;
+ if (!test_kvm_facility(vcpu->kvm, 10))
+ m3 &= ~(SSKE_MC | SSKE_MR);
+ if (!test_kvm_facility(vcpu->kvm, 14))
+ m3 &= ~SSKE_NQ;
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+
+ key = vcpu->run->s.regs.gprs[reg1] & 0xfe;
+ start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
+ start = kvm_s390_logical_to_effective(vcpu, start);
+ if (m3 & SSKE_MB) {
+ /* start already designates an absolute address */
+ end = (start + (1UL << 20)) & ~((1UL << 20) - 1);
+ } else {
+ start = kvm_s390_real_to_abs(vcpu, start);
+ end = start + PAGE_SIZE;
+ }
+
+ while (start != end) {
+ unsigned long addr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
+
+ if (kvm_is_error_hva(addr))
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, addr, key, &oldkey,
+ m3 & SSKE_NQ, m3 & SSKE_MR,
+ m3 & SSKE_MC);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
+ return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ start += PAGE_SIZE;
+ };
+
+ if (m3 & (SSKE_MC | SSKE_MR)) {
+ if (m3 & SSKE_MB) {
+ /* skey in reg1 is unpredictable */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ } else {
+ kvm_s390_set_psw_cc(vcpu, rc);
+ vcpu->run->s.regs.gprs[reg1] &= ~0xff00UL;
+ vcpu->run->s.regs.gprs[reg1] |= (u64) oldkey << 8;
+ }
+ }
+ if (m3 & SSKE_MB) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT)
+ vcpu->run->s.regs.gprs[reg2] &= ~PAGE_MASK;
+ else
+ vcpu->run->s.regs.gprs[reg2] &= ~0xfffff000UL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
return 0;
}
@@ -582,10 +719,11 @@ static const intercept_handler_t b2_handlers[256] = {
[0x10] = handle_set_prefix,
[0x11] = handle_store_prefix,
[0x12] = handle_store_cpu_address,
+ [0x14] = kvm_s390_handle_vsie,
[0x21] = handle_ipte_interlock,
- [0x29] = handle_skey,
- [0x2a] = handle_skey,
- [0x2b] = handle_skey,
+ [0x29] = handle_iske,
+ [0x2a] = handle_rrbe,
+ [0x2b] = handle_sske,
[0x2c] = handle_test_block,
[0x30] = handle_io_inst,
[0x31] = handle_io_inst,
@@ -654,8 +792,10 @@ static int handle_epsw(struct kvm_vcpu *vcpu)
static int handle_pfmf(struct kvm_vcpu *vcpu)
{
+ bool mr = false, mc = false, nq;
int reg1, reg2;
unsigned long start, end;
+ unsigned char key;
vcpu->stat.instruction_pfmf++;
@@ -675,15 +815,27 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
!test_kvm_facility(vcpu->kvm, 14))
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
- /* No support for conditional-SSKE */
- if (vcpu->run->s.regs.gprs[reg1] & (PFMF_MR | PFMF_MC))
- return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+ /* Only provide conditional-SSKE support if enabled for the guest */
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_SK &&
+ test_kvm_facility(vcpu->kvm, 10)) {
+ mr = vcpu->run->s.regs.gprs[reg1] & PFMF_MR;
+ mc = vcpu->run->s.regs.gprs[reg1] & PFMF_MC;
+ }
+ nq = vcpu->run->s.regs.gprs[reg1] & PFMF_NQ;
+ key = vcpu->run->s.regs.gprs[reg1] & PFMF_KEY;
start = vcpu->run->s.regs.gprs[reg2] & PAGE_MASK;
start = kvm_s390_logical_to_effective(vcpu, start);
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
+ if (kvm_s390_check_low_addr_prot_real(vcpu, start))
+ return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
+ }
+
switch (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
case 0x00000000:
+ /* only 4k frames specify a real address */
+ start = kvm_s390_real_to_abs(vcpu, start);
end = (start + (1UL << 12)) & ~((1UL << 12) - 1);
break;
case 0x00001000:
@@ -701,20 +853,11 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_CF) {
- if (kvm_s390_check_low_addr_prot_real(vcpu, start))
- return kvm_s390_inject_prog_irq(vcpu, &vcpu->arch.pgm);
- }
-
- while (start < end) {
- unsigned long useraddr, abs_addr;
+ while (start != end) {
+ unsigned long useraddr;
/* Translate guest address to host address */
- if ((vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) == 0)
- abs_addr = kvm_s390_real_to_abs(vcpu, start);
- else
- abs_addr = start;
- useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(abs_addr));
+ useraddr = gfn_to_hva(vcpu->kvm, gpa_to_gfn(start));
if (kvm_is_error_hva(useraddr))
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
@@ -728,16 +871,25 @@ static int handle_pfmf(struct kvm_vcpu *vcpu)
if (rc)
return rc;
- if (set_guest_storage_key(current->mm, useraddr,
- vcpu->run->s.regs.gprs[reg1] & PFMF_KEY,
- vcpu->run->s.regs.gprs[reg1] & PFMF_NQ))
+ down_read(&current->mm->mmap_sem);
+ rc = cond_set_guest_storage_key(current->mm, useraddr,
+ key, NULL, nq, mr, mc);
+ up_read(&current->mm->mmap_sem);
+ if (rc < 0)
return kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
}
start += PAGE_SIZE;
}
- if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC)
- vcpu->run->s.regs.gprs[reg2] = end;
+ if (vcpu->run->s.regs.gprs[reg1] & PFMF_FSC) {
+ if (psw_bits(vcpu->arch.sie_block->gpsw).eaba == PSW_AMODE_64BIT) {
+ vcpu->run->s.regs.gprs[reg2] = end;
+ } else {
+ vcpu->run->s.regs.gprs[reg2] &= ~0xffffffffUL;
+ end = kvm_s390_logical_to_effective(vcpu, end);
+ vcpu->run->s.regs.gprs[reg2] |= end;
+ }
+ }
return 0;
}
@@ -1033,7 +1185,15 @@ static int handle_sckpf(struct kvm_vcpu *vcpu)
return 0;
}
+static int handle_ptff(struct kvm_vcpu *vcpu)
+{
+ /* we don't emulate any control instructions yet */
+ kvm_s390_set_psw_cc(vcpu, 3);
+ return 0;
+}
+
static const intercept_handler_t x01_handlers[256] = {
+ [0x04] = handle_ptff,
[0x07] = handle_sckpf,
};
diff --git a/arch/s390/kvm/sigp.c b/arch/s390/kvm/sigp.c
index 28ea0cab1f1b..1a252f537081 100644
--- a/arch/s390/kvm/sigp.c
+++ b/arch/s390/kvm/sigp.c
@@ -77,18 +77,18 @@ static int __sigp_conditional_emergency(struct kvm_vcpu *vcpu,
const u64 psw_int_mask = PSW_MASK_IO | PSW_MASK_EXT;
u16 p_asn, s_asn;
psw_t *psw;
- u32 flags;
+ bool idle;
- flags = atomic_read(&dst_vcpu->arch.sie_block->cpuflags);
+ idle = is_vcpu_idle(vcpu);
psw = &dst_vcpu->arch.sie_block->gpsw;
p_asn = dst_vcpu->arch.sie_block->gcr[4] & 0xffff; /* Primary ASN */
s_asn = dst_vcpu->arch.sie_block->gcr[3] & 0xffff; /* Secondary ASN */
/* Inject the emergency signal? */
- if (!(flags & CPUSTAT_STOPPED)
+ if (!is_vcpu_stopped(vcpu)
|| (psw->mask & psw_int_mask) != psw_int_mask
- || ((flags & CPUSTAT_WAIT) && psw->addr != 0)
- || (!(flags & CPUSTAT_WAIT) && (asn == p_asn || asn == s_asn))) {
+ || (idle && psw->addr != 0)
+ || (!idle && (asn == p_asn || asn == s_asn))) {
return __inject_sigp_emergency(vcpu, dst_vcpu);
} else {
*reg &= 0xffffffff00000000UL;
diff --git a/arch/s390/kvm/sthyi.c b/arch/s390/kvm/sthyi.c
new file mode 100644
index 000000000000..bd98b7d25200
--- /dev/null
+++ b/arch/s390/kvm/sthyi.c
@@ -0,0 +1,471 @@
+/*
+ * store hypervisor information instruction emulation functions.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Copyright IBM Corp. 2016
+ * Author(s): Janosch Frank <frankja@linux.vnet.ibm.com>
+ */
+#include <linux/kvm_host.h>
+#include <linux/errno.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <linux/ratelimit.h>
+
+#include <asm/kvm_host.h>
+#include <asm/asm-offsets.h>
+#include <asm/sclp.h>
+#include <asm/diag.h>
+#include <asm/sysinfo.h>
+#include <asm/ebcdic.h>
+
+#include "kvm-s390.h"
+#include "gaccess.h"
+#include "trace.h"
+
+#define DED_WEIGHT 0xffff
+/*
+ * CP and IFL as EBCDIC strings, SP/0x40 determines the end of string
+ * as they are justified with spaces.
+ */
+#define CP 0xc3d7404040404040UL
+#define IFL 0xc9c6d34040404040UL
+
+enum hdr_flags {
+ HDR_NOT_LPAR = 0x10,
+ HDR_STACK_INCM = 0x20,
+ HDR_STSI_UNAV = 0x40,
+ HDR_PERF_UNAV = 0x80,
+};
+
+enum mac_validity {
+ MAC_NAME_VLD = 0x20,
+ MAC_ID_VLD = 0x40,
+ MAC_CNT_VLD = 0x80,
+};
+
+enum par_flag {
+ PAR_MT_EN = 0x80,
+};
+
+enum par_validity {
+ PAR_GRP_VLD = 0x08,
+ PAR_ID_VLD = 0x10,
+ PAR_ABS_VLD = 0x20,
+ PAR_WGHT_VLD = 0x40,
+ PAR_PCNT_VLD = 0x80,
+};
+
+struct hdr_sctn {
+ u8 infhflg1;
+ u8 infhflg2; /* reserved */
+ u8 infhval1; /* reserved */
+ u8 infhval2; /* reserved */
+ u8 reserved[3];
+ u8 infhygct;
+ u16 infhtotl;
+ u16 infhdln;
+ u16 infmoff;
+ u16 infmlen;
+ u16 infpoff;
+ u16 infplen;
+ u16 infhoff1;
+ u16 infhlen1;
+ u16 infgoff1;
+ u16 infglen1;
+ u16 infhoff2;
+ u16 infhlen2;
+ u16 infgoff2;
+ u16 infglen2;
+ u16 infhoff3;
+ u16 infhlen3;
+ u16 infgoff3;
+ u16 infglen3;
+ u8 reserved2[4];
+} __packed;
+
+struct mac_sctn {
+ u8 infmflg1; /* reserved */
+ u8 infmflg2; /* reserved */
+ u8 infmval1;
+ u8 infmval2; /* reserved */
+ u16 infmscps;
+ u16 infmdcps;
+ u16 infmsifl;
+ u16 infmdifl;
+ char infmname[8];
+ char infmtype[4];
+ char infmmanu[16];
+ char infmseq[16];
+ char infmpman[4];
+ u8 reserved[4];
+} __packed;
+
+struct par_sctn {
+ u8 infpflg1;
+ u8 infpflg2; /* reserved */
+ u8 infpval1;
+ u8 infpval2; /* reserved */
+ u16 infppnum;
+ u16 infpscps;
+ u16 infpdcps;
+ u16 infpsifl;
+ u16 infpdifl;
+ u16 reserved;
+ char infppnam[8];
+ u32 infpwbcp;
+ u32 infpabcp;
+ u32 infpwbif;
+ u32 infpabif;
+ char infplgnm[8];
+ u32 infplgcp;
+ u32 infplgif;
+} __packed;
+
+struct sthyi_sctns {
+ struct hdr_sctn hdr;
+ struct mac_sctn mac;
+ struct par_sctn par;
+} __packed;
+
+struct cpu_inf {
+ u64 lpar_cap;
+ u64 lpar_grp_cap;
+ u64 lpar_weight;
+ u64 all_weight;
+ int cpu_num_ded;
+ int cpu_num_shd;
+};
+
+struct lpar_cpu_inf {
+ struct cpu_inf cp;
+ struct cpu_inf ifl;
+};
+
+static inline u64 cpu_id(u8 ctidx, void *diag224_buf)
+{
+ return *((u64 *)(diag224_buf + (ctidx + 1) * DIAG204_CPU_NAME_LEN));
+}
+
+/*
+ * Scales the cpu capping from the lpar range to the one expected in
+ * sthyi data.
+ *
+ * diag204 reports a cap in hundredths of processor units.
+ * z/VM's range for one core is 0 - 0x10000.
+ */
+static u32 scale_cap(u32 in)
+{
+ return (0x10000 * in) / 100;
+}
+
+static void fill_hdr(struct sthyi_sctns *sctns)
+{
+ sctns->hdr.infhdln = sizeof(sctns->hdr);
+ sctns->hdr.infmoff = sizeof(sctns->hdr);
+ sctns->hdr.infmlen = sizeof(sctns->mac);
+ sctns->hdr.infplen = sizeof(sctns->par);
+ sctns->hdr.infpoff = sctns->hdr.infhdln + sctns->hdr.infmlen;
+ sctns->hdr.infhtotl = sctns->hdr.infpoff + sctns->hdr.infplen;
+}
+
+static void fill_stsi_mac(struct sthyi_sctns *sctns,
+ struct sysinfo_1_1_1 *sysinfo)
+{
+ if (stsi(sysinfo, 1, 1, 1))
+ return;
+
+ sclp_ocf_cpc_name_copy(sctns->mac.infmname);
+
+ memcpy(sctns->mac.infmtype, sysinfo->type, sizeof(sctns->mac.infmtype));
+ memcpy(sctns->mac.infmmanu, sysinfo->manufacturer, sizeof(sctns->mac.infmmanu));
+ memcpy(sctns->mac.infmpman, sysinfo->plant, sizeof(sctns->mac.infmpman));
+ memcpy(sctns->mac.infmseq, sysinfo->sequence, sizeof(sctns->mac.infmseq));
+
+ sctns->mac.infmval1 |= MAC_ID_VLD | MAC_NAME_VLD;
+}
+
+static void fill_stsi_par(struct sthyi_sctns *sctns,
+ struct sysinfo_2_2_2 *sysinfo)
+{
+ if (stsi(sysinfo, 2, 2, 2))
+ return;
+
+ sctns->par.infppnum = sysinfo->lpar_number;
+ memcpy(sctns->par.infppnam, sysinfo->name, sizeof(sctns->par.infppnam));
+
+ sctns->par.infpval1 |= PAR_ID_VLD;
+}
+
+static void fill_stsi(struct sthyi_sctns *sctns)
+{
+ void *sysinfo;
+
+ /* Errors are handled through the validity bits in the response. */
+ sysinfo = (void *)__get_free_page(GFP_KERNEL);
+ if (!sysinfo)
+ return;
+
+ fill_stsi_mac(sctns, sysinfo);
+ fill_stsi_par(sctns, sysinfo);
+
+ free_pages((unsigned long)sysinfo, 0);
+}
+
+static void fill_diag_mac(struct sthyi_sctns *sctns,
+ struct diag204_x_phys_block *block,
+ void *diag224_buf)
+{
+ int i;
+
+ for (i = 0; i < block->hdr.cpus; i++) {
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdcps++;
+ else
+ sctns->mac.infmscps++;
+ break;
+ case IFL:
+ if (block->cpus[i].weight == DED_WEIGHT)
+ sctns->mac.infmdifl++;
+ else
+ sctns->mac.infmsifl++;
+ break;
+ }
+ }
+ sctns->mac.infmval1 |= MAC_CNT_VLD;
+}
+
+/* Returns a pointer to the the next partition block. */
+static struct diag204_x_part_block *lpar_cpu_inf(struct lpar_cpu_inf *part_inf,
+ bool this_lpar,
+ void *diag224_buf,
+ struct diag204_x_part_block *block)
+{
+ int i, capped = 0, weight_cp = 0, weight_ifl = 0;
+ struct cpu_inf *cpu_inf;
+
+ for (i = 0; i < block->hdr.rcpus; i++) {
+ if (!(block->cpus[i].cflag & DIAG204_CPU_ONLINE))
+ continue;
+
+ switch (cpu_id(block->cpus[i].ctidx, diag224_buf)) {
+ case CP:
+ cpu_inf = &part_inf->cp;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_cp |= block->cpus[i].cur_weight;
+ break;
+ case IFL:
+ cpu_inf = &part_inf->ifl;
+ if (block->cpus[i].cur_weight < DED_WEIGHT)
+ weight_ifl |= block->cpus[i].cur_weight;
+ break;
+ default:
+ continue;
+ }
+
+ if (!this_lpar)
+ continue;
+
+ capped |= block->cpus[i].cflag & DIAG204_CPU_CAPPED;
+ cpu_inf->lpar_cap |= block->cpus[i].cpu_type_cap;
+ cpu_inf->lpar_grp_cap |= block->cpus[i].group_cpu_type_cap;
+
+ if (block->cpus[i].weight == DED_WEIGHT)
+ cpu_inf->cpu_num_ded += 1;
+ else
+ cpu_inf->cpu_num_shd += 1;
+ }
+
+ if (this_lpar && capped) {
+ part_inf->cp.lpar_weight = weight_cp;
+ part_inf->ifl.lpar_weight = weight_ifl;
+ }
+ part_inf->cp.all_weight += weight_cp;
+ part_inf->ifl.all_weight += weight_ifl;
+ return (struct diag204_x_part_block *)&block->cpus[i];
+}
+
+static void fill_diag(struct sthyi_sctns *sctns)
+{
+ int i, r, pages;
+ bool this_lpar;
+ void *diag204_buf;
+ void *diag224_buf = NULL;
+ struct diag204_x_info_blk_hdr *ti_hdr;
+ struct diag204_x_part_block *part_block;
+ struct diag204_x_phys_block *phys_block;
+ struct lpar_cpu_inf lpar_inf = {};
+
+ /* Errors are handled through the validity bits in the response. */
+ pages = diag204((unsigned long)DIAG204_SUBC_RSI |
+ (unsigned long)DIAG204_INFO_EXT, 0, NULL);
+ if (pages <= 0)
+ return;
+
+ diag204_buf = vmalloc(PAGE_SIZE * pages);
+ if (!diag204_buf)
+ return;
+
+ r = diag204((unsigned long)DIAG204_SUBC_STIB7 |
+ (unsigned long)DIAG204_INFO_EXT, pages, diag204_buf);
+ if (r < 0)
+ goto out;
+
+ diag224_buf = kmalloc(PAGE_SIZE, GFP_KERNEL | GFP_DMA);
+ if (!diag224_buf || diag224(diag224_buf))
+ goto out;
+
+ ti_hdr = diag204_buf;
+ part_block = diag204_buf + sizeof(*ti_hdr);
+
+ for (i = 0; i < ti_hdr->npar; i++) {
+ /*
+ * For the calling lpar we also need to get the cpu
+ * caps and weights. The time information block header
+ * specifies the offset to the partition block of the
+ * caller lpar, so we know when we process its data.
+ */
+ this_lpar = (void *)part_block - diag204_buf == ti_hdr->this_part;
+ part_block = lpar_cpu_inf(&lpar_inf, this_lpar, diag224_buf,
+ part_block);
+ }
+
+ phys_block = (struct diag204_x_phys_block *)part_block;
+ part_block = diag204_buf + ti_hdr->this_part;
+ if (part_block->hdr.mtid)
+ sctns->par.infpflg1 = PAR_MT_EN;
+
+ sctns->par.infpval1 |= PAR_GRP_VLD;
+ sctns->par.infplgcp = scale_cap(lpar_inf.cp.lpar_grp_cap);
+ sctns->par.infplgif = scale_cap(lpar_inf.ifl.lpar_grp_cap);
+ memcpy(sctns->par.infplgnm, part_block->hdr.hardware_group_name,
+ sizeof(sctns->par.infplgnm));
+
+ sctns->par.infpscps = lpar_inf.cp.cpu_num_shd;
+ sctns->par.infpdcps = lpar_inf.cp.cpu_num_ded;
+ sctns->par.infpsifl = lpar_inf.ifl.cpu_num_shd;
+ sctns->par.infpdifl = lpar_inf.ifl.cpu_num_ded;
+ sctns->par.infpval1 |= PAR_PCNT_VLD;
+
+ sctns->par.infpabcp = scale_cap(lpar_inf.cp.lpar_cap);
+ sctns->par.infpabif = scale_cap(lpar_inf.ifl.lpar_cap);
+ sctns->par.infpval1 |= PAR_ABS_VLD;
+
+ /*
+ * Everything below needs global performance data to be
+ * meaningful.
+ */
+ if (!(ti_hdr->flags & DIAG204_LPAR_PHYS_FLG)) {
+ sctns->hdr.infhflg1 |= HDR_PERF_UNAV;
+ goto out;
+ }
+
+ fill_diag_mac(sctns, phys_block, diag224_buf);
+
+ if (lpar_inf.cp.lpar_weight) {
+ sctns->par.infpwbcp = sctns->mac.infmscps * 0x10000 *
+ lpar_inf.cp.lpar_weight / lpar_inf.cp.all_weight;
+ }
+
+ if (lpar_inf.ifl.lpar_weight) {
+ sctns->par.infpwbif = sctns->mac.infmsifl * 0x10000 *
+ lpar_inf.ifl.lpar_weight / lpar_inf.ifl.all_weight;
+ }
+ sctns->par.infpval1 |= PAR_WGHT_VLD;
+
+out:
+ kfree(diag224_buf);
+ vfree(diag204_buf);
+}
+
+static int sthyi(u64 vaddr)
+{
+ register u64 code asm("0") = 0;
+ register u64 addr asm("2") = vaddr;
+ int cc;
+
+ asm volatile(
+ ".insn rre,0xB2560000,%[code],%[addr]\n"
+ "ipm %[cc]\n"
+ "srl %[cc],28\n"
+ : [cc] "=d" (cc)
+ : [code] "d" (code), [addr] "a" (addr)
+ : "memory", "cc");
+ return cc;
+}
+
+int handle_sthyi(struct kvm_vcpu *vcpu)
+{
+ int reg1, reg2, r = 0;
+ u64 code, addr, cc = 0;
+ struct sthyi_sctns *sctns = NULL;
+
+ /*
+ * STHYI requires extensive locking in the higher hypervisors
+ * and is very computational/memory expensive. Therefore we
+ * ratelimit the executions per VM.
+ */
+ if (!__ratelimit(&vcpu->kvm->arch.sthyi_limit)) {
+ kvm_s390_retry_instr(vcpu);
+ return 0;
+ }
+
+ kvm_s390_get_regs_rre(vcpu, &reg1, &reg2);
+ code = vcpu->run->s.regs.gprs[reg1];
+ addr = vcpu->run->s.regs.gprs[reg2];
+
+ vcpu->stat.instruction_sthyi++;
+ VCPU_EVENT(vcpu, 3, "STHYI: fc: %llu addr: 0x%016llx", code, addr);
+ trace_kvm_s390_handle_sthyi(vcpu, code, addr);
+
+ if (reg1 == reg2 || reg1 & 1 || reg2 & 1 || addr & ~PAGE_MASK)
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (code & 0xffff) {
+ cc = 3;
+ goto out;
+ }
+
+ /*
+ * If the page has not yet been faulted in, we want to do that
+ * now and not after all the expensive calculations.
+ */
+ r = write_guest(vcpu, addr, reg2, &cc, 1);
+ if (r)
+ return kvm_s390_inject_prog_cond(vcpu, r);
+
+ sctns = (void *)get_zeroed_page(GFP_KERNEL);
+ if (!sctns)
+ return -ENOMEM;
+
+ /*
+ * If we are a guest, we don't want to emulate an emulated
+ * instruction. We ask the hypervisor to provide the data.
+ */
+ if (test_facility(74)) {
+ cc = sthyi((u64)sctns);
+ goto out;
+ }
+
+ fill_hdr(sctns);
+ fill_stsi(sctns);
+ fill_diag(sctns);
+
+out:
+ if (!cc) {
+ r = write_guest(vcpu, addr, reg2, sctns, PAGE_SIZE);
+ if (r) {
+ free_page((unsigned long)sctns);
+ return kvm_s390_inject_prog_cond(vcpu, r);
+ }
+ }
+
+ free_page((unsigned long)sctns);
+ vcpu->run->s.regs.gprs[reg2 + 1] = cc ? 4 : 0;
+ kvm_s390_set_psw_cc(vcpu, cc);
+ return r;
+}
diff --git a/arch/s390/kvm/trace.h b/arch/s390/kvm/trace.h
index 916834d7a73a..4fc9d4e5be89 100644
--- a/arch/s390/kvm/trace.h
+++ b/arch/s390/kvm/trace.h
@@ -41,7 +41,7 @@ TRACE_EVENT(kvm_s390_skey_related_inst,
TP_fast_assign(
VCPU_ASSIGN_COMMON
),
- VCPU_TP_PRINTK("%s", "first instruction related to skeys on vcpu")
+ VCPU_TP_PRINTK("%s", "storage key related instruction")
);
TRACE_EVENT(kvm_s390_major_guest_pfault,
@@ -185,8 +185,10 @@ TRACE_EVENT(kvm_s390_intercept_prog,
__entry->code = code;
),
- VCPU_TP_PRINTK("intercepted program interruption %04x",
- __entry->code)
+ VCPU_TP_PRINTK("intercepted program interruption %04x (%s)",
+ __entry->code,
+ __print_symbolic(__entry->code,
+ icpt_prog_codes))
);
/*
@@ -412,6 +414,47 @@ TRACE_EVENT(kvm_s390_handle_stsi,
__entry->addr)
);
+TRACE_EVENT(kvm_s390_handle_operexc,
+ TP_PROTO(VCPU_PROTO_COMMON, __u16 ipa, __u32 ipb),
+ TP_ARGS(VCPU_ARGS_COMMON, ipa, ipb),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(__u64, instruction)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->instruction = ((__u64)ipa << 48) |
+ ((__u64)ipb << 16);
+ ),
+
+ VCPU_TP_PRINTK("operation exception on instruction %016llx (%s)",
+ __entry->instruction,
+ __print_symbolic(icpt_insn_decoder(__entry->instruction),
+ icpt_insn_codes))
+ );
+
+TRACE_EVENT(kvm_s390_handle_sthyi,
+ TP_PROTO(VCPU_PROTO_COMMON, u64 code, u64 addr),
+ TP_ARGS(VCPU_ARGS_COMMON, code, addr),
+
+ TP_STRUCT__entry(
+ VCPU_FIELD_COMMON
+ __field(u64, code)
+ __field(u64, addr)
+ ),
+
+ TP_fast_assign(
+ VCPU_ASSIGN_COMMON
+ __entry->code = code;
+ __entry->addr = addr;
+ ),
+
+ VCPU_TP_PRINTK("STHYI fc: %llu addr: %016llx",
+ __entry->code, __entry->addr)
+ );
+
#endif /* _TRACE_KVM_H */
/* This part must be outside protection */
diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c
new file mode 100644
index 000000000000..c106488b4137
--- /dev/null
+++ b/arch/s390/kvm/vsie.c
@@ -0,0 +1,1091 @@
+/*
+ * kvm nested virtualization support for s390x
+ *
+ * Copyright IBM Corp. 2016
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License (version 2 only)
+ * as published by the Free Software Foundation.
+ *
+ * Author(s): David Hildenbrand <dahi@linux.vnet.ibm.com>
+ */
+#include <linux/vmalloc.h>
+#include <linux/kvm_host.h>
+#include <linux/bug.h>
+#include <linux/list.h>
+#include <linux/bitmap.h>
+#include <asm/gmap.h>
+#include <asm/mmu_context.h>
+#include <asm/sclp.h>
+#include <asm/nmi.h>
+#include <asm/dis.h>
+#include "kvm-s390.h"
+#include "gaccess.h"
+
+struct vsie_page {
+ struct kvm_s390_sie_block scb_s; /* 0x0000 */
+ /* the pinned originial scb */
+ struct kvm_s390_sie_block *scb_o; /* 0x0200 */
+ /* the shadow gmap in use by the vsie_page */
+ struct gmap *gmap; /* 0x0208 */
+ /* address of the last reported fault to guest2 */
+ unsigned long fault_addr; /* 0x0210 */
+ __u8 reserved[0x0700 - 0x0218]; /* 0x0218 */
+ struct kvm_s390_crypto_cb crycb; /* 0x0700 */
+ __u8 fac[S390_ARCH_FAC_LIST_SIZE_BYTE]; /* 0x0800 */
+} __packed;
+
+/* trigger a validity icpt for the given scb */
+static int set_validity_icpt(struct kvm_s390_sie_block *scb,
+ __u16 reason_code)
+{
+ scb->ipa = 0x1000;
+ scb->ipb = ((__u32) reason_code) << 16;
+ scb->icptcode = ICPT_VALIDITY;
+ return 1;
+}
+
+/* mark the prefix as unmapped, this will block the VSIE */
+static void prefix_unmapped(struct vsie_page *vsie_page)
+{
+ atomic_or(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* mark the prefix as unmapped and wait until the VSIE has been left */
+static void prefix_unmapped_sync(struct vsie_page *vsie_page)
+{
+ prefix_unmapped(vsie_page);
+ if (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &vsie_page->scb_s.cpuflags);
+ while (vsie_page->scb_s.prog0c & PROG_IN_SIE)
+ cpu_relax();
+}
+
+/* mark the prefix as mapped, this will allow the VSIE to run */
+static void prefix_mapped(struct vsie_page *vsie_page)
+{
+ atomic_andnot(PROG_REQUEST, &vsie_page->scb_s.prog20);
+}
+
+/* test if the prefix is mapped into the gmap shadow */
+static int prefix_is_mapped(struct vsie_page *vsie_page)
+{
+ return !(atomic_read(&vsie_page->scb_s.prog20) & PROG_REQUEST);
+}
+
+/* copy the updated intervention request bits into the shadow scb */
+static void update_intervention_requests(struct vsie_page *vsie_page)
+{
+ const int bits = CPUSTAT_STOP_INT | CPUSTAT_IO_INT | CPUSTAT_EXT_INT;
+ int cpuflags;
+
+ cpuflags = atomic_read(&vsie_page->scb_o->cpuflags);
+ atomic_andnot(bits, &vsie_page->scb_s.cpuflags);
+ atomic_or(cpuflags & bits, &vsie_page->scb_s.cpuflags);
+}
+
+/* shadow (filter and validate) the cpuflags */
+static int prepare_cpuflags(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int newflags, cpuflags = atomic_read(&scb_o->cpuflags);
+
+ /* we don't allow ESA/390 guests */
+ if (!(cpuflags & CPUSTAT_ZARCH))
+ return set_validity_icpt(scb_s, 0x0001U);
+
+ if (cpuflags & (CPUSTAT_RRF | CPUSTAT_MCDS))
+ return set_validity_icpt(scb_s, 0x0001U);
+ else if (cpuflags & (CPUSTAT_SLSV | CPUSTAT_SLSR))
+ return set_validity_icpt(scb_s, 0x0007U);
+
+ /* intervention requests will be set later */
+ newflags = CPUSTAT_ZARCH;
+ if (cpuflags & CPUSTAT_GED && test_kvm_facility(vcpu->kvm, 8))
+ newflags |= CPUSTAT_GED;
+ if (cpuflags & CPUSTAT_GED2 && test_kvm_facility(vcpu->kvm, 78)) {
+ if (cpuflags & CPUSTAT_GED)
+ return set_validity_icpt(scb_s, 0x0001U);
+ newflags |= CPUSTAT_GED2;
+ }
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GPERE))
+ newflags |= cpuflags & CPUSTAT_P;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_GSLS))
+ newflags |= cpuflags & CPUSTAT_SM;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IBS))
+ newflags |= cpuflags & CPUSTAT_IBS;
+
+ atomic_set(&scb_s->cpuflags, newflags);
+ return 0;
+}
+
+/*
+ * Create a shadow copy of the crycb block and setup key wrapping, if
+ * requested for guest 3 and enabled for guest 2.
+ *
+ * We only accept format-1 (no AP in g2), but convert it into format-2
+ * There is nothing to do for format-0.
+ *
+ * Returns: - 0 if shadowed or nothing to do
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_crycb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ u32 crycb_addr = scb_o->crycbd & 0x7ffffff8U;
+ unsigned long *b1, *b2;
+ u8 ecb3_flags;
+
+ scb_s->crycbd = 0;
+ if (!(scb_o->crycbd & vcpu->arch.sie_block->crycbd & CRYCB_FORMAT1))
+ return 0;
+ /* format-1 is supported with message-security-assist extension 3 */
+ if (!test_kvm_facility(vcpu->kvm, 76))
+ return 0;
+ /* we may only allow it if enabled for guest 2 */
+ ecb3_flags = scb_o->ecb3 & vcpu->arch.sie_block->ecb3 &
+ (ECB3_AES | ECB3_DEA);
+ if (!ecb3_flags)
+ return 0;
+
+ if ((crycb_addr & PAGE_MASK) != ((crycb_addr + 128) & PAGE_MASK))
+ return set_validity_icpt(scb_s, 0x003CU);
+ else if (!crycb_addr)
+ return set_validity_icpt(scb_s, 0x0039U);
+
+ /* copy only the wrapping keys */
+ if (read_guest_real(vcpu, crycb_addr + 72, &vsie_page->crycb, 56))
+ return set_validity_icpt(scb_s, 0x0035U);
+
+ scb_s->ecb3 |= ecb3_flags;
+ scb_s->crycbd = ((__u32)(__u64) &vsie_page->crycb) | CRYCB_FORMAT1 |
+ CRYCB_FORMAT2;
+
+ /* xor both blocks in one run */
+ b1 = (unsigned long *) vsie_page->crycb.dea_wrapping_key_mask;
+ b2 = (unsigned long *)
+ vcpu->kvm->arch.crypto.crycb->dea_wrapping_key_mask;
+ /* as 56%8 == 0, bitmap_xor won't overwrite any data */
+ bitmap_xor(b1, b1, b2, BITS_PER_BYTE * 56);
+ return 0;
+}
+
+/* shadow (round up/down) the ibc to avoid validity icpt */
+static void prepare_ibc(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ __u64 min_ibc = (sclp.ibc >> 16) & 0x0fffU;
+
+ scb_s->ibc = 0;
+ /* ibc installed in g2 and requested for g3 */
+ if (vcpu->kvm->arch.model.ibc && (scb_o->ibc & 0x0fffU)) {
+ scb_s->ibc = scb_o->ibc & 0x0fffU;
+ /* takte care of the minimum ibc level of the machine */
+ if (scb_s->ibc < min_ibc)
+ scb_s->ibc = min_ibc;
+ /* take care of the maximum ibc level set for the guest */
+ if (scb_s->ibc > vcpu->kvm->arch.model.ibc)
+ scb_s->ibc = vcpu->kvm->arch.model.ibc;
+ }
+}
+
+/* unshadow the scb, copying parameters back to the real scb */
+static void unshadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+
+ /* interception */
+ scb_o->icptcode = scb_s->icptcode;
+ scb_o->icptstatus = scb_s->icptstatus;
+ scb_o->ipa = scb_s->ipa;
+ scb_o->ipb = scb_s->ipb;
+ scb_o->gbea = scb_s->gbea;
+
+ /* timer */
+ scb_o->cputm = scb_s->cputm;
+ scb_o->ckc = scb_s->ckc;
+ scb_o->todpr = scb_s->todpr;
+
+ /* guest state */
+ scb_o->gpsw = scb_s->gpsw;
+ scb_o->gg14 = scb_s->gg14;
+ scb_o->gg15 = scb_s->gg15;
+ memcpy(scb_o->gcr, scb_s->gcr, 128);
+ scb_o->pp = scb_s->pp;
+
+ /* interrupt intercept */
+ switch (scb_s->icptcode) {
+ case ICPT_PROGI:
+ case ICPT_INSTPROGI:
+ case ICPT_EXTINT:
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xf0 - 0xc0);
+ break;
+ case ICPT_PARTEXEC:
+ /* MVPG only */
+ memcpy((void *)((u64)scb_o + 0xc0),
+ (void *)((u64)scb_s + 0xc0), 0xd0 - 0xc0);
+ break;
+ }
+
+ if (scb_s->ihcpu != 0xffffU)
+ scb_o->ihcpu = scb_s->ihcpu;
+}
+
+/*
+ * Setup the shadow scb by copying and checking the relevant parts of the g2
+ * provided scb.
+ *
+ * Returns: - 0 if the scb has been shadowed
+ * - > 0 if control has to be given to guest 2
+ */
+static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ bool had_tx = scb_s->ecb & 0x10U;
+ unsigned long new_mso = 0;
+ int rc;
+
+ /* make sure we don't have any leftovers when reusing the scb */
+ scb_s->icptcode = 0;
+ scb_s->eca = 0;
+ scb_s->ecb = 0;
+ scb_s->ecb2 = 0;
+ scb_s->ecb3 = 0;
+ scb_s->ecd = 0;
+ scb_s->fac = 0;
+
+ rc = prepare_cpuflags(vcpu, vsie_page);
+ if (rc)
+ goto out;
+
+ /* timer */
+ scb_s->cputm = scb_o->cputm;
+ scb_s->ckc = scb_o->ckc;
+ scb_s->todpr = scb_o->todpr;
+ scb_s->epoch = scb_o->epoch;
+
+ /* guest state */
+ scb_s->gpsw = scb_o->gpsw;
+ scb_s->gg14 = scb_o->gg14;
+ scb_s->gg15 = scb_o->gg15;
+ memcpy(scb_s->gcr, scb_o->gcr, 128);
+ scb_s->pp = scb_o->pp;
+
+ /* interception / execution handling */
+ scb_s->gbea = scb_o->gbea;
+ scb_s->lctl = scb_o->lctl;
+ scb_s->svcc = scb_o->svcc;
+ scb_s->ictl = scb_o->ictl;
+ /*
+ * SKEY handling functions can't deal with false setting of PTE invalid
+ * bits. Therefore we cannot provide interpretation and would later
+ * have to provide own emulation handlers.
+ */
+ scb_s->ictl |= ICTL_ISKE | ICTL_SSKE | ICTL_RRBE;
+ scb_s->icpua = scb_o->icpua;
+
+ if (!(atomic_read(&scb_s->cpuflags) & CPUSTAT_SM))
+ new_mso = scb_o->mso & 0xfffffffffff00000UL;
+ /* if the hva of the prefix changes, we have to remap the prefix */
+ if (scb_s->mso != new_mso || scb_s->prefix != scb_o->prefix)
+ prefix_unmapped(vsie_page);
+ /* SIE will do mso/msl validity and exception checks for us */
+ scb_s->msl = scb_o->msl & 0xfffffffffff00000UL;
+ scb_s->mso = new_mso;
+ scb_s->prefix = scb_o->prefix;
+
+ /* We have to definetly flush the tlb if this scb never ran */
+ if (scb_s->ihcpu != 0xffffU)
+ scb_s->ihcpu = scb_o->ihcpu;
+
+ /* MVPG and Protection Exception Interpretation are always available */
+ scb_s->eca |= scb_o->eca & 0x01002000U;
+ /* Host-protection-interruption introduced with ESOP */
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_ESOP))
+ scb_s->ecb |= scb_o->ecb & 0x02U;
+ /* transactional execution */
+ if (test_kvm_facility(vcpu->kvm, 73)) {
+ /* remap the prefix is tx is toggled on */
+ if ((scb_o->ecb & 0x10U) && !had_tx)
+ prefix_unmapped(vsie_page);
+ scb_s->ecb |= scb_o->ecb & 0x10U;
+ }
+ /* SIMD */
+ if (test_kvm_facility(vcpu->kvm, 129)) {
+ scb_s->eca |= scb_o->eca & 0x00020000U;
+ scb_s->ecd |= scb_o->ecd & 0x20000000U;
+ }
+ /* Run-time-Instrumentation */
+ if (test_kvm_facility(vcpu->kvm, 64))
+ scb_s->ecb3 |= scb_o->ecb3 & 0x01U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIIF))
+ scb_s->eca |= scb_o->eca & 0x00000001U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_IB))
+ scb_s->eca |= scb_o->eca & 0x40000000U;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_CEI))
+ scb_s->eca |= scb_o->eca & 0x80000000U;
+
+ prepare_ibc(vcpu, vsie_page);
+ rc = shadow_crycb(vcpu, vsie_page);
+out:
+ if (rc)
+ unshadow_scb(vcpu, vsie_page);
+ return rc;
+}
+
+void kvm_s390_vsie_gmap_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct kvm *kvm = gmap->private;
+ struct vsie_page *cur;
+ unsigned long prefix;
+ struct page *page;
+ int i;
+
+ if (!gmap_is_shadow(gmap))
+ return;
+ if (start >= 1UL << 31)
+ /* We are only interested in prefix pages */
+ return;
+
+ /*
+ * Only new shadow blocks are added to the list during runtime,
+ * therefore we can safely reference them all the time.
+ */
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = READ_ONCE(kvm->arch.vsie.pages[i]);
+ if (!page)
+ continue;
+ cur = page_to_virt(page);
+ if (READ_ONCE(cur->gmap) != gmap)
+ continue;
+ prefix = cur->scb_s.prefix << GUEST_PREFIX_SHIFT;
+ /* with mso/msl, the prefix lies at an offset */
+ prefix += cur->scb_s.mso;
+ if (prefix <= end && start <= prefix + 2 * PAGE_SIZE - 1)
+ prefix_unmapped_sync(cur);
+ }
+}
+
+/*
+ * Map the first prefix page and if tx is enabled also the second prefix page.
+ *
+ * The prefix will be protected, a gmap notifier will inform about unmaps.
+ * The shadow scb must not be executed until the prefix is remapped, this is
+ * guaranteed by properly handling PROG_REQUEST.
+ *
+ * Returns: - 0 on if successfully mapped or already mapped
+ * - > 0 if control has to be given to guest 2
+ * - -EAGAIN if the caller can retry immediately
+ * - -ENOMEM if out of memory
+ */
+static int map_prefix(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ u64 prefix = scb_s->prefix << GUEST_PREFIX_SHIFT;
+ int rc;
+
+ if (prefix_is_mapped(vsie_page))
+ return 0;
+
+ /* mark it as mapped so we can catch any concurrent unmappers */
+ prefix_mapped(vsie_page);
+
+ /* with mso/msl, the prefix lies at offset *mso* */
+ prefix += scb_s->mso;
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap, prefix);
+ if (!rc && (scb_s->ecb & 0x10U))
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ prefix + PAGE_SIZE);
+ /*
+ * We don't have to mprotect, we will be called for all unshadows.
+ * SIE will detect if protection applies and trigger a validity.
+ */
+ if (rc)
+ prefix_unmapped(vsie_page);
+ if (rc > 0 || rc == -EFAULT)
+ rc = set_validity_icpt(scb_s, 0x0037U);
+ return rc;
+}
+
+/*
+ * Pin the guest page given by gpa and set hpa to the pinned host address.
+ * Will always be pinned writable.
+ *
+ * Returns: - 0 on success
+ * - -EINVAL if the gpa is not valid guest storage
+ * - -ENOMEM if out of memory
+ */
+static int pin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t *hpa)
+{
+ struct page *page;
+ hva_t hva;
+ int rc;
+
+ hva = gfn_to_hva(kvm, gpa_to_gfn(gpa));
+ if (kvm_is_error_hva(hva))
+ return -EINVAL;
+ rc = get_user_pages_fast(hva, 1, 1, &page);
+ if (rc < 0)
+ return rc;
+ else if (rc != 1)
+ return -ENOMEM;
+ *hpa = (hpa_t) page_to_virt(page) + (gpa & ~PAGE_MASK);
+ return 0;
+}
+
+/* Unpins a page previously pinned via pin_guest_page, marking it as dirty. */
+static void unpin_guest_page(struct kvm *kvm, gpa_t gpa, hpa_t hpa)
+{
+ struct page *page;
+
+ page = virt_to_page(hpa);
+ set_page_dirty_lock(page);
+ put_page(page);
+ /* mark the page always as dirty for migration */
+ mark_page_dirty(kvm, gpa_to_gfn(gpa));
+}
+
+/* unpin all blocks previously pinned by pin_blocks(), marking them dirty */
+static void unpin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+
+ hpa = (u64) scb_s->scaoh << 32 | scb_s->scaol;
+ if (hpa) {
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->scaol = 0;
+ scb_s->scaoh = 0;
+ }
+
+ hpa = scb_s->itdba;
+ if (hpa) {
+ gpa = scb_o->itdba & ~0xffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->itdba = 0;
+ }
+
+ hpa = scb_s->gvrd;
+ if (hpa) {
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->gvrd = 0;
+ }
+
+ hpa = scb_s->riccbd;
+ if (hpa) {
+ gpa = scb_o->riccbd & ~0x3fUL;
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ scb_s->riccbd = 0;
+ }
+}
+
+/*
+ * Instead of shadowing some blocks, we can simply forward them because the
+ * addresses in the scb are 64 bit long.
+ *
+ * This works as long as the data lies in one page. If blocks ever exceed one
+ * page, we have to fall back to shadowing.
+ *
+ * As we reuse the sca, the vcpu pointers contained in it are invalid. We must
+ * therefore not enable any facilities that access these pointers (e.g. SIGPIF).
+ *
+ * Returns: - 0 if all blocks were pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_blocks(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ hpa_t hpa;
+ gpa_t gpa;
+ int rc = 0;
+
+ gpa = scb_o->scaol & ~0xfUL;
+ if (test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_64BSCAO))
+ gpa |= (u64) scb_o->scaoh << 32;
+ if (gpa) {
+ if (!(gpa & ~0x1fffUL))
+ rc = set_validity_icpt(scb_s, 0x0038U);
+ else if ((gpa & ~0x1fffUL) == kvm_s390_get_prefix(vcpu))
+ rc = set_validity_icpt(scb_s, 0x0011U);
+ else if ((gpa & PAGE_MASK) !=
+ ((gpa + sizeof(struct bsca_block) - 1) & PAGE_MASK))
+ rc = set_validity_icpt(scb_s, 0x003bU);
+ if (!rc) {
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0034U);
+ }
+ if (rc)
+ goto unpin;
+ scb_s->scaoh = (u32)((u64)hpa >> 32);
+ scb_s->scaol = (u32)(u64)hpa;
+ }
+
+ gpa = scb_o->itdba & ~0xffUL;
+ if (gpa && (scb_s->ecb & 0x10U)) {
+ if (!(gpa & ~0x1fffU)) {
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ goto unpin;
+ }
+ /* 256 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0080U);
+ if (rc)
+ goto unpin;
+ scb_s->itdba = hpa;
+ }
+
+ gpa = scb_o->gvrd & ~0x1ffUL;
+ if (gpa && (scb_s->eca & 0x00020000U) &&
+ !(scb_s->ecd & 0x20000000U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ goto unpin;
+ }
+ /*
+ * 512 bytes vector registers cannot cross page boundaries
+ * if this block gets bigger, we have to shadow it.
+ */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x1310U);
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+
+ gpa = scb_o->riccbd & ~0x3fUL;
+ if (gpa && (scb_s->ecb3 & 0x01U)) {
+ if (!(gpa & ~0x1fffUL)) {
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ goto unpin;
+ }
+ /* 64 bytes cannot cross page boundaries */
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL)
+ rc = set_validity_icpt(scb_s, 0x0043U);
+ /* Validity 0x0044 will be checked by SIE */
+ if (rc)
+ goto unpin;
+ scb_s->gvrd = hpa;
+ }
+ return 0;
+unpin:
+ unpin_blocks(vcpu, vsie_page);
+ return rc;
+}
+
+/* unpin the scb provided by guest 2, marking it as dirty */
+static void unpin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa = (hpa_t) vsie_page->scb_o;
+
+ if (hpa)
+ unpin_guest_page(vcpu->kvm, gpa, hpa);
+ vsie_page->scb_o = NULL;
+}
+
+/*
+ * Pin the scb at gpa provided by guest 2 at vsie_page->scb_o.
+ *
+ * Returns: - 0 if the scb was pinned.
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int pin_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page,
+ gpa_t gpa)
+{
+ hpa_t hpa;
+ int rc;
+
+ rc = pin_guest_page(vcpu->kvm, gpa, &hpa);
+ if (rc == -EINVAL) {
+ rc = kvm_s390_inject_program_int(vcpu, PGM_ADDRESSING);
+ if (!rc)
+ rc = 1;
+ }
+ if (!rc)
+ vsie_page->scb_o = (struct kvm_s390_sie_block *) hpa;
+ return rc;
+}
+
+/*
+ * Inject a fault into guest 2.
+ *
+ * Returns: - > 0 if control has to be given to guest 2
+ * < 0 if an error occurred during injection.
+ */
+static int inject_fault(struct kvm_vcpu *vcpu, __u16 code, __u64 vaddr,
+ bool write_flag)
+{
+ struct kvm_s390_pgm_info pgm = {
+ .code = code,
+ .trans_exc_code =
+ /* 0-51: virtual address */
+ (vaddr & 0xfffffffffffff000UL) |
+ /* 52-53: store / fetch */
+ (((unsigned int) !write_flag) + 1) << 10,
+ /* 62-63: asce id (alway primary == 0) */
+ .exc_access_id = 0, /* always primary */
+ .op_access_id = 0, /* not MVPG */
+ };
+ int rc;
+
+ if (code == PGM_PROTECTION)
+ pgm.trans_exc_code |= 0x4UL;
+
+ rc = kvm_s390_inject_prog_irq(vcpu, &pgm);
+ return rc ? rc : 1;
+}
+
+/*
+ * Handle a fault during vsie execution on a gmap shadow.
+ *
+ * Returns: - 0 if the fault was resolved
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int handle_fault(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ int rc;
+
+ if (current->thread.gmap_int_code == PGM_PROTECTION)
+ /* we can directly forward all protection exceptions */
+ return inject_fault(vcpu, PGM_PROTECTION,
+ current->thread.gmap_addr, 1);
+
+ rc = kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ current->thread.gmap_addr);
+ if (rc > 0) {
+ rc = inject_fault(vcpu, rc,
+ current->thread.gmap_addr,
+ current->thread.gmap_write_flag);
+ if (rc >= 0)
+ vsie_page->fault_addr = current->thread.gmap_addr;
+ }
+ return rc;
+}
+
+/*
+ * Retry the previous fault that required guest 2 intervention. This avoids
+ * one superfluous SIE re-entry and direct exit.
+ *
+ * Will ignore any errors. The next SIE fault will do proper fault handling.
+ */
+static void handle_last_fault(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ if (vsie_page->fault_addr)
+ kvm_s390_shadow_fault(vcpu, vsie_page->gmap,
+ vsie_page->fault_addr);
+ vsie_page->fault_addr = 0;
+}
+
+static inline void clear_vsie_icpt(struct vsie_page *vsie_page)
+{
+ vsie_page->scb_s.icptcode = 0;
+}
+
+/* rewind the psw and clear the vsie icpt, so we can retry execution */
+static void retry_vsie_icpt(struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int ilen = insn_length(scb_s->ipa >> 8);
+
+ /* take care of EXECUTE instructions */
+ if (scb_s->icptstatus & 1) {
+ ilen = (scb_s->icptstatus >> 4) & 0x6;
+ if (!ilen)
+ ilen = 4;
+ }
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, ilen);
+ clear_vsie_icpt(vsie_page);
+}
+
+/*
+ * Try to shadow + enable the guest 2 provided facility list.
+ * Retry instruction execution if enabled for and provided by guest 2.
+ *
+ * Returns: - 0 if handled (retry or guest 2 icpt)
+ * - > 0 if control has to be given to guest 2
+ */
+static int handle_stfle(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ __u32 fac = vsie_page->scb_o->fac & 0x7ffffff8U;
+
+ if (fac && test_kvm_facility(vcpu->kvm, 7)) {
+ retry_vsie_icpt(vsie_page);
+ if (read_guest_real(vcpu, fac, &vsie_page->fac,
+ sizeof(vsie_page->fac)))
+ return set_validity_icpt(scb_s, 0x1090U);
+ scb_s->fac = (__u32)(__u64) &vsie_page->fac;
+ }
+ return 0;
+}
+
+/*
+ * Run the vsie on a shadow scb and a shadow gmap, without any further
+ * sanity checks, handling SIE faults.
+ *
+ * Returns: - 0 everything went fine
+ * - > 0 if control has to be given to guest 2
+ * - < 0 if an error occurred
+ */
+static int do_vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ struct kvm_s390_sie_block *scb_o = vsie_page->scb_o;
+ int rc;
+
+ handle_last_fault(vcpu, vsie_page);
+
+ if (need_resched())
+ schedule();
+ if (test_cpu_flag(CIF_MCCK_PENDING))
+ s390_handle_mcck();
+
+ srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
+ local_irq_disable();
+ guest_enter_irqoff();
+ local_irq_enable();
+
+ rc = sie64a(scb_s, vcpu->run->s.regs.gprs);
+
+ local_irq_disable();
+ guest_exit_irqoff();
+ local_irq_enable();
+ vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
+
+ if (rc > 0)
+ rc = 0; /* we could still have an icpt */
+ else if (rc == -EFAULT)
+ return handle_fault(vcpu, vsie_page);
+
+ switch (scb_s->icptcode) {
+ case ICPT_INST:
+ if (scb_s->ipa == 0xb2b0)
+ rc = handle_stfle(vcpu, vsie_page);
+ break;
+ case ICPT_STOP:
+ /* stop not requested by g2 - must have been a kick */
+ if (!(atomic_read(&scb_o->cpuflags) & CPUSTAT_STOP_INT))
+ clear_vsie_icpt(vsie_page);
+ break;
+ case ICPT_VALIDITY:
+ if ((scb_s->ipa & 0xf000) != 0xf000)
+ scb_s->ipa += 0x1000;
+ break;
+ }
+ return rc;
+}
+
+static void release_gmap_shadow(struct vsie_page *vsie_page)
+{
+ if (vsie_page->gmap)
+ gmap_put(vsie_page->gmap);
+ WRITE_ONCE(vsie_page->gmap, NULL);
+ prefix_unmapped(vsie_page);
+}
+
+static int acquire_gmap_shadow(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ unsigned long asce;
+ union ctlreg0 cr0;
+ struct gmap *gmap;
+ int edat;
+
+ asce = vcpu->arch.sie_block->gcr[1];
+ cr0.val = vcpu->arch.sie_block->gcr[0];
+ edat = cr0.edat && test_kvm_facility(vcpu->kvm, 8);
+ edat += edat && test_kvm_facility(vcpu->kvm, 78);
+
+ /*
+ * ASCE or EDAT could have changed since last icpt, or the gmap
+ * we're holding has been unshadowed. If the gmap is still valid,
+ * we can safely reuse it.
+ */
+ if (vsie_page->gmap && gmap_shadow_valid(vsie_page->gmap, asce, edat))
+ return 0;
+
+ /* release the old shadow - if any, and mark the prefix as unmapped */
+ release_gmap_shadow(vsie_page);
+ gmap = gmap_shadow(vcpu->arch.gmap, asce, edat);
+ if (IS_ERR(gmap))
+ return PTR_ERR(gmap);
+ gmap->private = vcpu->kvm;
+ WRITE_ONCE(vsie_page->gmap, gmap);
+ return 0;
+}
+
+/*
+ * Register the shadow scb at the VCPU, e.g. for kicking out of vsie.
+ */
+static void register_shadow_scb(struct kvm_vcpu *vcpu,
+ struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+
+ WRITE_ONCE(vcpu->arch.vsie_block, &vsie_page->scb_s);
+ /*
+ * External calls have to lead to a kick of the vcpu and
+ * therefore the vsie -> Simulate Wait state.
+ */
+ atomic_or(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ /*
+ * We have to adjust the g3 epoch by the g2 epoch. The epoch will
+ * automatically be adjusted on tod clock changes via kvm_sync_clock.
+ */
+ preempt_disable();
+ scb_s->epoch += vcpu->kvm->arch.epoch;
+ preempt_enable();
+}
+
+/*
+ * Unregister a shadow scb from a VCPU.
+ */
+static void unregister_shadow_scb(struct kvm_vcpu *vcpu)
+{
+ atomic_andnot(CPUSTAT_WAIT, &vcpu->arch.sie_block->cpuflags);
+ WRITE_ONCE(vcpu->arch.vsie_block, NULL);
+}
+
+/*
+ * Run the vsie on a shadowed scb, managing the gmap shadow, handling
+ * prefix pages and faults.
+ *
+ * Returns: - 0 if no errors occurred
+ * - > 0 if control has to be given to guest 2
+ * - -ENOMEM if out of memory
+ */
+static int vsie_run(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page)
+{
+ struct kvm_s390_sie_block *scb_s = &vsie_page->scb_s;
+ int rc = 0;
+
+ while (1) {
+ rc = acquire_gmap_shadow(vcpu, vsie_page);
+ if (!rc)
+ rc = map_prefix(vcpu, vsie_page);
+ if (!rc) {
+ gmap_enable(vsie_page->gmap);
+ update_intervention_requests(vsie_page);
+ rc = do_vsie_run(vcpu, vsie_page);
+ gmap_enable(vcpu->arch.gmap);
+ }
+ atomic_andnot(PROG_BLOCK_SIE, &scb_s->prog20);
+
+ if (rc == -EAGAIN)
+ rc = 0;
+ if (rc || scb_s->icptcode || signal_pending(current) ||
+ kvm_s390_vcpu_has_irq(vcpu, 0))
+ break;
+ };
+
+ if (rc == -EFAULT) {
+ /*
+ * Addressing exceptions are always presentes as intercepts.
+ * As addressing exceptions are suppressing and our guest 3 PSW
+ * points at the responsible instruction, we have to
+ * forward the PSW and set the ilc. If we can't read guest 3
+ * instruction, we can use an arbitrary ilc. Let's always use
+ * ilen = 4 for now, so we can avoid reading in guest 3 virtual
+ * memory. (we could also fake the shadow so the hardware
+ * handles it).
+ */
+ scb_s->icptcode = ICPT_PROGI;
+ scb_s->iprcc = PGM_ADDRESSING;
+ scb_s->pgmilc = 4;
+ scb_s->gpsw.addr = __rewind_psw(scb_s->gpsw, 4);
+ }
+ return rc;
+}
+
+/*
+ * Get or create a vsie page for a scb address.
+ *
+ * Returns: - address of a vsie page (cached or new one)
+ * - NULL if the same scb address is already used by another VCPU
+ * - ERR_PTR(-ENOMEM) if out of memory
+ */
+static struct vsie_page *get_vsie_page(struct kvm *kvm, unsigned long addr)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int nr_vcpus;
+
+ rcu_read_lock();
+ page = radix_tree_lookup(&kvm->arch.vsie.addr_to_page, addr >> 9);
+ rcu_read_unlock();
+ if (page) {
+ if (page_ref_inc_return(page) == 2)
+ return page_to_virt(page);
+ page_ref_dec(page);
+ }
+
+ /*
+ * We want at least #online_vcpus shadows, so every VCPU can execute
+ * the VSIE in parallel.
+ */
+ nr_vcpus = atomic_read(&kvm->online_vcpus);
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ if (kvm->arch.vsie.page_count < nr_vcpus) {
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO | GFP_DMA);
+ if (!page) {
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return ERR_PTR(-ENOMEM);
+ }
+ page_ref_inc(page);
+ kvm->arch.vsie.pages[kvm->arch.vsie.page_count] = page;
+ kvm->arch.vsie.page_count++;
+ } else {
+ /* reuse an existing entry that belongs to nobody */
+ while (true) {
+ page = kvm->arch.vsie.pages[kvm->arch.vsie.next];
+ if (page_ref_inc_return(page) == 2)
+ break;
+ page_ref_dec(page);
+ kvm->arch.vsie.next++;
+ kvm->arch.vsie.next %= nr_vcpus;
+ }
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ }
+ page->index = addr;
+ /* double use of the same address */
+ if (radix_tree_insert(&kvm->arch.vsie.addr_to_page, addr >> 9, page)) {
+ page_ref_dec(page);
+ mutex_unlock(&kvm->arch.vsie.mutex);
+ return NULL;
+ }
+ mutex_unlock(&kvm->arch.vsie.mutex);
+
+ vsie_page = page_to_virt(page);
+ memset(&vsie_page->scb_s, 0, sizeof(struct kvm_s390_sie_block));
+ release_gmap_shadow(vsie_page);
+ vsie_page->fault_addr = 0;
+ vsie_page->scb_s.ihcpu = 0xffffU;
+ return vsie_page;
+}
+
+/* put a vsie page acquired via get_vsie_page */
+static void put_vsie_page(struct kvm *kvm, struct vsie_page *vsie_page)
+{
+ struct page *page = pfn_to_page(__pa(vsie_page) >> PAGE_SHIFT);
+
+ page_ref_dec(page);
+}
+
+int kvm_s390_handle_vsie(struct kvm_vcpu *vcpu)
+{
+ struct vsie_page *vsie_page;
+ unsigned long scb_addr;
+ int rc;
+
+ vcpu->stat.instruction_sie++;
+ if (!test_kvm_cpu_feat(vcpu->kvm, KVM_S390_VM_CPU_FEAT_SIEF2))
+ return -EOPNOTSUPP;
+ if (vcpu->arch.sie_block->gpsw.mask & PSW_MASK_PSTATE)
+ return kvm_s390_inject_program_int(vcpu, PGM_PRIVILEGED_OP);
+
+ BUILD_BUG_ON(sizeof(struct vsie_page) != 4096);
+ scb_addr = kvm_s390_get_base_disp_s(vcpu, NULL);
+
+ /* 512 byte alignment */
+ if (unlikely(scb_addr & 0x1ffUL))
+ return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
+
+ if (signal_pending(current) || kvm_s390_vcpu_has_irq(vcpu, 0))
+ return 0;
+
+ vsie_page = get_vsie_page(vcpu->kvm, scb_addr);
+ if (IS_ERR(vsie_page))
+ return PTR_ERR(vsie_page);
+ else if (!vsie_page)
+ /* double use of sie control block - simply do nothing */
+ return 0;
+
+ rc = pin_scb(vcpu, vsie_page, scb_addr);
+ if (rc)
+ goto out_put;
+ rc = shadow_scb(vcpu, vsie_page);
+ if (rc)
+ goto out_unpin_scb;
+ rc = pin_blocks(vcpu, vsie_page);
+ if (rc)
+ goto out_unshadow;
+ register_shadow_scb(vcpu, vsie_page);
+ rc = vsie_run(vcpu, vsie_page);
+ unregister_shadow_scb(vcpu);
+ unpin_blocks(vcpu, vsie_page);
+out_unshadow:
+ unshadow_scb(vcpu, vsie_page);
+out_unpin_scb:
+ unpin_scb(vcpu, vsie_page, scb_addr);
+out_put:
+ put_vsie_page(vcpu->kvm, vsie_page);
+
+ return rc < 0 ? rc : 0;
+}
+
+/* Init the vsie data structures. To be called when a vm is initialized. */
+void kvm_s390_vsie_init(struct kvm *kvm)
+{
+ mutex_init(&kvm->arch.vsie.mutex);
+ INIT_RADIX_TREE(&kvm->arch.vsie.addr_to_page, GFP_KERNEL);
+}
+
+/* Destroy the vsie data structures. To be called when a vm is destroyed. */
+void kvm_s390_vsie_destroy(struct kvm *kvm)
+{
+ struct vsie_page *vsie_page;
+ struct page *page;
+ int i;
+
+ mutex_lock(&kvm->arch.vsie.mutex);
+ for (i = 0; i < kvm->arch.vsie.page_count; i++) {
+ page = kvm->arch.vsie.pages[i];
+ kvm->arch.vsie.pages[i] = NULL;
+ vsie_page = page_to_virt(page);
+ release_gmap_shadow(vsie_page);
+ /* free the radix tree entry */
+ radix_tree_delete(&kvm->arch.vsie.addr_to_page, page->index >> 9);
+ __free_page(page);
+ }
+ kvm->arch.vsie.page_count = 0;
+ mutex_unlock(&kvm->arch.vsie.mutex);
+}
+
+void kvm_s390_vsie_kick(struct kvm_vcpu *vcpu)
+{
+ struct kvm_s390_sie_block *scb = READ_ONCE(vcpu->arch.vsie_block);
+
+ /*
+ * Even if the VCPU lets go of the shadow sie block reference, it is
+ * still valid in the cache. So we can safely kick it.
+ */
+ if (scb) {
+ atomic_or(PROG_BLOCK_SIE, &scb->prog20);
+ if (scb->prog0c & PROG_IN_SIE)
+ atomic_or(CPUSTAT_STOP_INT, &scb->cpuflags);
+ }
+}
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 25783dc3c813..a58bca62a93b 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -418,6 +418,8 @@ static inline int do_exception(struct pt_regs *regs, int access)
(struct gmap *) S390_lowcore.gmap : NULL;
if (gmap) {
current->thread.gmap_addr = address;
+ current->thread.gmap_write_flag = !!(flags & FAULT_FLAG_WRITE);
+ current->thread.gmap_int_code = regs->int_code & 0xffff;
address = __gmap_translate(gmap, address);
if (address == -EFAULT) {
fault = VM_FAULT_BADMAP;
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 063c721ec0dc..2ce6bb3bab32 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -20,14 +20,16 @@
#include <asm/gmap.h>
#include <asm/tlb.h>
+#define GMAP_SHADOW_FAKE_TABLE 1ULL
+
/**
- * gmap_alloc - allocate a guest address space
+ * gmap_alloc - allocate and initialize a guest address space
* @mm: pointer to the parent mm_struct
* @limit: maximum address of the gmap address space
*
* Returns a guest address space structure.
*/
-struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
+static struct gmap *gmap_alloc(unsigned long limit)
{
struct gmap *gmap;
struct page *page;
@@ -55,10 +57,14 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
if (!gmap)
goto out;
INIT_LIST_HEAD(&gmap->crst_list);
+ INIT_LIST_HEAD(&gmap->children);
+ INIT_LIST_HEAD(&gmap->pt_list);
INIT_RADIX_TREE(&gmap->guest_to_host, GFP_KERNEL);
INIT_RADIX_TREE(&gmap->host_to_guest, GFP_ATOMIC);
+ INIT_RADIX_TREE(&gmap->host_to_rmap, GFP_ATOMIC);
spin_lock_init(&gmap->guest_table_lock);
- gmap->mm = mm;
+ spin_lock_init(&gmap->shadow_lock);
+ atomic_set(&gmap->ref_count, 1);
page = alloc_pages(GFP_KERNEL, 2);
if (!page)
goto out_free;
@@ -70,9 +76,6 @@ struct gmap *gmap_alloc(struct mm_struct *mm, unsigned long limit)
gmap->asce = atype | _ASCE_TABLE_LENGTH |
_ASCE_USER_BITS | __pa(table);
gmap->asce_end = limit;
- down_write(&mm->mmap_sem);
- list_add(&gmap->list, &mm->context.gmap_list);
- up_write(&mm->mmap_sem);
return gmap;
out_free:
@@ -80,7 +83,28 @@ out_free:
out:
return NULL;
}
-EXPORT_SYMBOL_GPL(gmap_alloc);
+
+/**
+ * gmap_create - create a guest address space
+ * @mm: pointer to the parent mm_struct
+ * @limit: maximum size of the gmap address space
+ *
+ * Returns a guest address space structure.
+ */
+struct gmap *gmap_create(struct mm_struct *mm, unsigned long limit)
+{
+ struct gmap *gmap;
+
+ gmap = gmap_alloc(limit);
+ if (!gmap)
+ return NULL;
+ gmap->mm = mm;
+ spin_lock(&mm->context.gmap_lock);
+ list_add_rcu(&gmap->list, &mm->context.gmap_list);
+ spin_unlock(&mm->context.gmap_lock);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_create);
static void gmap_flush_tlb(struct gmap *gmap)
{
@@ -114,31 +138,117 @@ static void gmap_radix_tree_free(struct radix_tree_root *root)
} while (nr > 0);
}
+static void gmap_rmap_radix_tree_free(struct radix_tree_root *root)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ struct radix_tree_iter iter;
+ unsigned long indices[16];
+ unsigned long index;
+ void **slot;
+ int i, nr;
+
+ /* A radix tree is freed by deleting all of its entries */
+ index = 0;
+ do {
+ nr = 0;
+ radix_tree_for_each_slot(slot, root, &iter, index) {
+ indices[nr] = iter.index;
+ if (++nr == 16)
+ break;
+ }
+ for (i = 0; i < nr; i++) {
+ index = indices[i];
+ head = radix_tree_delete(root, index);
+ gmap_for_each_rmap_safe(rmap, rnext, head)
+ kfree(rmap);
+ }
+ } while (nr > 0);
+}
+
/**
* gmap_free - free a guest address space
* @gmap: pointer to the guest address space structure
+ *
+ * No locks required. There are no references to this gmap anymore.
*/
-void gmap_free(struct gmap *gmap)
+static void gmap_free(struct gmap *gmap)
{
struct page *page, *next;
- /* Flush tlb. */
- if (MACHINE_HAS_IDTE)
- __tlb_flush_idte(gmap->asce);
- else
- __tlb_flush_global();
-
+ /* Flush tlb of all gmaps (if not already done for shadows) */
+ if (!(gmap_is_shadow(gmap) && gmap->removed))
+ gmap_flush_tlb(gmap);
/* Free all segment & region tables. */
list_for_each_entry_safe(page, next, &gmap->crst_list, lru)
__free_pages(page, 2);
gmap_radix_tree_free(&gmap->guest_to_host);
gmap_radix_tree_free(&gmap->host_to_guest);
- down_write(&gmap->mm->mmap_sem);
- list_del(&gmap->list);
- up_write(&gmap->mm->mmap_sem);
+
+ /* Free additional data for a shadow gmap */
+ if (gmap_is_shadow(gmap)) {
+ /* Free all page tables. */
+ list_for_each_entry_safe(page, next, &gmap->pt_list, lru)
+ page_table_free_pgste(page);
+ gmap_rmap_radix_tree_free(&gmap->host_to_rmap);
+ /* Release reference to the parent */
+ gmap_put(gmap->parent);
+ }
+
kfree(gmap);
}
-EXPORT_SYMBOL_GPL(gmap_free);
+
+/**
+ * gmap_get - increase reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * Returns the gmap pointer
+ */
+struct gmap *gmap_get(struct gmap *gmap)
+{
+ atomic_inc(&gmap->ref_count);
+ return gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get);
+
+/**
+ * gmap_put - decrease reference counter for guest address space
+ * @gmap: pointer to the guest address space structure
+ *
+ * If the reference counter reaches zero the guest address space is freed.
+ */
+void gmap_put(struct gmap *gmap)
+{
+ if (atomic_dec_return(&gmap->ref_count) == 0)
+ gmap_free(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_put);
+
+/**
+ * gmap_remove - remove a guest address space but do not free it yet
+ * @gmap: pointer to the guest address space structure
+ */
+void gmap_remove(struct gmap *gmap)
+{
+ struct gmap *sg, *next;
+
+ /* Remove all shadow gmaps linked to this gmap */
+ if (!list_empty(&gmap->children)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next, &gmap->children, list) {
+ list_del(&sg->list);
+ gmap_put(sg);
+ }
+ spin_unlock(&gmap->shadow_lock);
+ }
+ /* Remove gmap from the pre-mm list */
+ spin_lock(&gmap->mm->context.gmap_lock);
+ list_del_rcu(&gmap->list);
+ spin_unlock(&gmap->mm->context.gmap_lock);
+ synchronize_rcu();
+ /* Put reference */
+ gmap_put(gmap);
+}
+EXPORT_SYMBOL_GPL(gmap_remove);
/**
* gmap_enable - switch primary space to the guest address space
@@ -160,6 +270,17 @@ void gmap_disable(struct gmap *gmap)
}
EXPORT_SYMBOL_GPL(gmap_disable);
+/**
+ * gmap_get_enabled - get a pointer to the currently enabled gmap
+ *
+ * Returns a pointer to the currently enabled gmap. 0 if none is enabled.
+ */
+struct gmap *gmap_get_enabled(void)
+{
+ return (struct gmap *) S390_lowcore.gmap;
+}
+EXPORT_SYMBOL_GPL(gmap_get_enabled);
+
/*
* gmap_alloc_table is assumed to be called with mmap_sem held
*/
@@ -175,7 +296,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
return -ENOMEM;
new = (unsigned long *) page_to_phys(page);
crst_table_init(new, init);
- spin_lock(&gmap->mm->page_table_lock);
+ spin_lock(&gmap->guest_table_lock);
if (*table & _REGION_ENTRY_INVALID) {
list_add(&page->lru, &gmap->crst_list);
*table = (unsigned long) new | _REGION_ENTRY_LENGTH |
@@ -183,7 +304,7 @@ static int gmap_alloc_table(struct gmap *gmap, unsigned long *table,
page->index = gaddr;
page = NULL;
}
- spin_unlock(&gmap->mm->page_table_lock);
+ spin_unlock(&gmap->guest_table_lock);
if (page)
__free_pages(page, 2);
return 0;
@@ -219,6 +340,7 @@ static int __gmap_unlink_by_vmaddr(struct gmap *gmap, unsigned long vmaddr)
unsigned long *entry;
int flush = 0;
+ BUG_ON(gmap_is_shadow(gmap));
spin_lock(&gmap->guest_table_lock);
entry = radix_tree_delete(&gmap->host_to_guest, vmaddr >> PMD_SHIFT);
if (entry) {
@@ -258,6 +380,7 @@ int gmap_unmap_segment(struct gmap *gmap, unsigned long to, unsigned long len)
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || to + len < to)
@@ -289,6 +412,7 @@ int gmap_map_segment(struct gmap *gmap, unsigned long from,
unsigned long off;
int flush;
+ BUG_ON(gmap_is_shadow(gmap));
if ((from | to | len) & (PMD_SIZE - 1))
return -EINVAL;
if (len == 0 || from + len < from || to + len < to ||
@@ -326,6 +450,8 @@ EXPORT_SYMBOL_GPL(gmap_map_segment);
* This function does not establish potentially missing page table entries.
* The mmap_sem of the mm that belongs to the address space must be held
* when this function gets called.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
{
@@ -333,6 +459,7 @@ unsigned long __gmap_translate(struct gmap *gmap, unsigned long gaddr)
vmaddr = (unsigned long)
radix_tree_lookup(&gmap->guest_to_host, gaddr >> PMD_SHIFT);
+ /* Note: guest_to_host is empty for a shadow gmap */
return vmaddr ? (vmaddr | (gaddr & ~PMD_MASK)) : -EFAULT;
}
EXPORT_SYMBOL_GPL(__gmap_translate);
@@ -369,11 +496,13 @@ void gmap_unlink(struct mm_struct *mm, unsigned long *table,
struct gmap *gmap;
int flush;
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
flush = __gmap_unlink_by_vmaddr(gmap, vmaddr);
if (flush)
gmap_flush_tlb(gmap);
}
+ rcu_read_unlock();
}
/**
@@ -397,6 +526,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr)
pmd_t *pmd;
int rc;
+ BUG_ON(gmap_is_shadow(gmap));
/* Create higher level tables in the gmap page table */
table = gmap->table;
if ((gmap->asce & _ASCE_TYPE_MASK) >= _ASCE_TYPE_REGION1) {
@@ -552,116 +682,1412 @@ static LIST_HEAD(gmap_notifier_list);
static DEFINE_SPINLOCK(gmap_notifier_lock);
/**
- * gmap_register_ipte_notifier - register a pte invalidation callback
+ * gmap_register_pte_notifier - register a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_register_ipte_notifier(struct gmap_notifier *nb)
+void gmap_register_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_add(&nb->list, &gmap_notifier_list);
+ list_add_rcu(&nb->list, &gmap_notifier_list);
spin_unlock(&gmap_notifier_lock);
}
-EXPORT_SYMBOL_GPL(gmap_register_ipte_notifier);
+EXPORT_SYMBOL_GPL(gmap_register_pte_notifier);
/**
- * gmap_unregister_ipte_notifier - remove a pte invalidation callback
+ * gmap_unregister_pte_notifier - remove a pte invalidation callback
* @nb: pointer to the gmap notifier block
*/
-void gmap_unregister_ipte_notifier(struct gmap_notifier *nb)
+void gmap_unregister_pte_notifier(struct gmap_notifier *nb)
{
spin_lock(&gmap_notifier_lock);
- list_del_init(&nb->list);
+ list_del_rcu(&nb->list);
spin_unlock(&gmap_notifier_lock);
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(gmap_unregister_pte_notifier);
+
+/**
+ * gmap_call_notifier - call all registered invalidation callbacks
+ * @gmap: pointer to guest mapping meta data structure
+ * @start: start virtual address in the guest address space
+ * @end: end virtual address in the guest address space
+ */
+static void gmap_call_notifier(struct gmap *gmap, unsigned long start,
+ unsigned long end)
+{
+ struct gmap_notifier *nb;
+
+ list_for_each_entry(nb, &gmap_notifier_list, list)
+ nb->notifier_call(gmap, start, end);
+}
+
+/**
+ * gmap_table_walk - walk the gmap page tables
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @level: page table level to stop at
+ *
+ * Returns a table entry pointer for the given guest address and @level
+ * @level=0 : returns a pointer to a page table table entry (or NULL)
+ * @level=1 : returns a pointer to a segment table entry (or NULL)
+ * @level=2 : returns a pointer to a region-3 table entry (or NULL)
+ * @level=3 : returns a pointer to a region-2 table entry (or NULL)
+ * @level=4 : returns a pointer to a region-1 table entry (or NULL)
+ *
+ * Returns NULL if the gmap page tables could not be walked to the
+ * requested level.
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static inline unsigned long *gmap_table_walk(struct gmap *gmap,
+ unsigned long gaddr, int level)
+{
+ unsigned long *table;
+
+ if ((gmap->asce & _ASCE_TYPE_MASK) + 4 < (level * 4))
+ return NULL;
+ if (gmap_is_shadow(gmap) && gmap->removed)
+ return NULL;
+ if (gaddr & (-1UL << (31 + ((gmap->asce & _ASCE_TYPE_MASK) >> 2)*11)))
+ return NULL;
+ table = gmap->table;
+ switch (gmap->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ table += (gaddr >> 53) & 0x7ff;
+ if (level == 4)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION2:
+ table += (gaddr >> 42) & 0x7ff;
+ if (level == 3)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_REGION3:
+ table += (gaddr >> 31) & 0x7ff;
+ if (level == 2)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _REGION_ENTRY_ORIGIN);
+ /* Fallthrough */
+ case _ASCE_TYPE_SEGMENT:
+ table += (gaddr >> 20) & 0x7ff;
+ if (level == 1)
+ break;
+ if (*table & _REGION_ENTRY_INVALID)
+ return NULL;
+ table = (unsigned long *)(*table & _SEGMENT_ENTRY_ORIGIN);
+ table += (gaddr >> 12) & 0xff;
+ }
+ return table;
+}
+
+/**
+ * gmap_pte_op_walk - walk the gmap page table, get the page table lock
+ * and return the pte pointer
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @ptl: pointer to the spinlock pointer
+ *
+ * Returns a pointer to the locked pte for a guest address, or NULL
+ *
+ * Note: Can also be called for shadow gmaps.
+ */
+static pte_t *gmap_pte_op_walk(struct gmap *gmap, unsigned long gaddr,
+ spinlock_t **ptl)
+{
+ unsigned long *table;
+
+ if (gmap_is_shadow(gmap))
+ spin_lock(&gmap->guest_table_lock);
+ /* Walk the gmap page table, lock and get pte pointer */
+ table = gmap_table_walk(gmap, gaddr, 1); /* get segment pointer */
+ if (!table || *table & _SEGMENT_ENTRY_INVALID) {
+ if (gmap_is_shadow(gmap))
+ spin_unlock(&gmap->guest_table_lock);
+ return NULL;
+ }
+ if (gmap_is_shadow(gmap)) {
+ *ptl = &gmap->guest_table_lock;
+ return pte_offset_map((pmd_t *) table, gaddr);
+ }
+ return pte_alloc_map_lock(gmap->mm, (pmd_t *) table, gaddr, ptl);
+}
+
+/**
+ * gmap_pte_op_fixup - force a page in and connect the gmap page table
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @vmaddr: address in the host process address space
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if the caller can retry __gmap_translate (might fail again),
+ * -ENOMEM if out of memory and -EFAULT if anything goes wrong while fixing
+ * up or connecting the gmap page table.
+ */
+static int gmap_pte_op_fixup(struct gmap *gmap, unsigned long gaddr,
+ unsigned long vmaddr, int prot)
+{
+ struct mm_struct *mm = gmap->mm;
+ unsigned int fault_flags;
+ bool unlocked = false;
+
+ BUG_ON(gmap_is_shadow(gmap));
+ fault_flags = (prot == PROT_WRITE) ? FAULT_FLAG_WRITE : 0;
+ if (fixup_user_fault(current, mm, vmaddr, fault_flags, &unlocked))
+ return -EFAULT;
+ if (unlocked)
+ /* lost mmap_sem, caller has to retry __gmap_translate */
+ return 0;
+ /* Connect the page tables */
+ return __gmap_link(gmap, gaddr, vmaddr);
}
-EXPORT_SYMBOL_GPL(gmap_unregister_ipte_notifier);
/**
- * gmap_ipte_notify - mark a range of ptes for invalidation notification
+ * gmap_pte_op_end - release the page table lock
+ * @ptl: pointer to the spinlock pointer
+ */
+static void gmap_pte_op_end(spinlock_t *ptl)
+{
+ spin_unlock(ptl);
+}
+
+/*
+ * gmap_protect_range - remove access rights to memory and set pgste bits
* @gmap: pointer to guest mapping meta data structure
* @gaddr: virtual address in the guest address space
* @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bits: pgste notification bits to set
*
- * Returns 0 if for each page in the given range a gmap mapping exists and
- * the invalidation notification could be set. If the gmap mapping is missing
- * for one or more pages -EFAULT is returned. If no memory could be allocated
- * -ENOMEM is returned. This function establishes missing page table entries.
+ * Returns 0 if successfully protected, -ENOMEM if out of memory and
+ * -EFAULT if gaddr is invalid (or mapping for shadows is missing).
+ *
+ * Called with sg->mm->mmap_sem in read.
+ *
+ * Note: Can also be called for shadow gmaps.
*/
-int gmap_ipte_notify(struct gmap *gmap, unsigned long gaddr, unsigned long len)
+static int gmap_protect_range(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot, unsigned long bits)
{
- unsigned long addr;
+ unsigned long vmaddr;
spinlock_t *ptl;
pte_t *ptep;
- bool unlocked;
- int rc = 0;
+ int rc;
+
+ while (len) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ rc = ptep_force_prot(gmap->mm, gaddr, ptep, prot, bits);
+ gmap_pte_op_end(ptl);
+ }
+ if (rc) {
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, prot);
+ if (rc)
+ return rc;
+ continue;
+ }
+ gaddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+/**
+ * gmap_mprotect_notify - change access rights for a range of ptes and
+ * call the notifier if any pte changes again
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @len: size of area
+ * @prot: indicates access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ *
+ * Returns 0 if for each page in the given range a gmap mapping exists,
+ * the new access rights could be set and the notifier could be armed.
+ * If the gmap mapping is missing for one or more pages -EFAULT is
+ * returned. If no memory could be allocated -ENOMEM is returned.
+ * This function establishes missing page table entries.
+ */
+int gmap_mprotect_notify(struct gmap *gmap, unsigned long gaddr,
+ unsigned long len, int prot)
+{
+ int rc;
- if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK))
+ if ((gaddr & ~PAGE_MASK) || (len & ~PAGE_MASK) || gmap_is_shadow(gmap))
+ return -EINVAL;
+ if (!MACHINE_HAS_ESOP && prot == PROT_READ)
return -EINVAL;
down_read(&gmap->mm->mmap_sem);
- while (len) {
- unlocked = false;
- /* Convert gmap address and connect the page tables */
- addr = __gmap_translate(gmap, gaddr);
- if (IS_ERR_VALUE(addr)) {
- rc = addr;
+ rc = gmap_protect_range(gmap, gaddr, len, prot, PGSTE_IN_BIT);
+ up_read(&gmap->mm->mmap_sem);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_mprotect_notify);
+
+/**
+ * gmap_read_table - get an unsigned long value from a guest page table using
+ * absolute addressing, without marking the page referenced.
+ * @gmap: pointer to guest mapping meta data structure
+ * @gaddr: virtual address in the guest address space
+ * @val: pointer to the unsigned long value to return
+ *
+ * Returns 0 if the value was read, -ENOMEM if out of memory and -EFAULT
+ * if reading using the virtual address failed.
+ *
+ * Called with gmap->mm->mmap_sem in read.
+ */
+int gmap_read_table(struct gmap *gmap, unsigned long gaddr, unsigned long *val)
+{
+ unsigned long address, vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep, pte;
+ int rc;
+
+ while (1) {
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(gmap, gaddr, &ptl);
+ if (ptep) {
+ pte = *ptep;
+ if (pte_present(pte) && (pte_val(pte) & _PAGE_READ)) {
+ address = pte_val(pte) & PAGE_MASK;
+ address += gaddr & ~PAGE_MASK;
+ *val = *(unsigned long *) address;
+ pte_val(*ptep) |= _PAGE_YOUNG;
+ /* Do *NOT* clear the _PAGE_INVALID bit! */
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ }
+ if (!rc)
+ break;
+ vmaddr = __gmap_translate(gmap, gaddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
break;
}
- /* Get the page mapped */
- if (fixup_user_fault(current, gmap->mm, addr, FAULT_FLAG_WRITE,
- &unlocked)) {
- rc = -EFAULT;
+ rc = gmap_pte_op_fixup(gmap, gaddr, vmaddr, PROT_READ);
+ if (rc)
break;
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_read_table);
+
+/**
+ * gmap_insert_rmap - add a rmap to the host_to_rmap radix tree
+ * @sg: pointer to the shadow guest address space structure
+ * @vmaddr: vm address associated with the rmap
+ * @rmap: pointer to the rmap structure
+ *
+ * Called with the sg->guest_table_lock
+ */
+static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr,
+ struct gmap_rmap *rmap)
+{
+ void **slot;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ slot = radix_tree_lookup_slot(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT);
+ if (slot) {
+ rmap->next = radix_tree_deref_slot_protected(slot,
+ &sg->guest_table_lock);
+ radix_tree_replace_slot(slot, rmap);
+ } else {
+ rmap->next = NULL;
+ radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT,
+ rmap);
+ }
+}
+
+/**
+ * gmap_protect_rmap - modify access rights to memory and create an rmap
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow gmap
+ * @paddr: address in the parent guest address space
+ * @len: length of the memory area to protect
+ * @prot: indicates access rights: none, read-only or read-write
+ *
+ * Returns 0 if successfully protected and the rmap was created, -ENOMEM
+ * if out of memory and -EFAULT if paddr is invalid.
+ */
+static int gmap_protect_rmap(struct gmap *sg, unsigned long raddr,
+ unsigned long paddr, unsigned long len, int prot)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr;
+ spinlock_t *ptl;
+ pte_t *ptep;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ while (len) {
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr))
+ return vmaddr;
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = raddr;
+ rc = radix_tree_preload(GFP_KERNEL);
+ if (rc) {
+ kfree(rmap);
+ return rc;
+ }
+ rc = -EAGAIN;
+ ptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (ptep) {
+ spin_lock(&sg->guest_table_lock);
+ rc = ptep_force_prot(parent->mm, paddr, ptep, prot,
+ PGSTE_VSIE_BIT);
+ if (!rc)
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
}
- /* While trying to map mmap_sem got unlocked. Let us retry */
- if (unlocked)
+ radix_tree_preload_end();
+ if (rc) {
+ kfree(rmap);
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ return rc;
continue;
- rc = __gmap_link(gmap, gaddr, addr);
+ }
+ paddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+ return 0;
+}
+
+#define _SHADOW_RMAP_MASK 0x7
+#define _SHADOW_RMAP_REGION1 0x5
+#define _SHADOW_RMAP_REGION2 0x4
+#define _SHADOW_RMAP_REGION3 0x3
+#define _SHADOW_RMAP_SEGMENT 0x2
+#define _SHADOW_RMAP_PGTABLE 0x1
+
+/**
+ * gmap_idte_one - invalidate a single region or segment table entry
+ * @asce: region or segment table *origin* + table-type bits
+ * @vaddr: virtual address to identify the table entry to flush
+ *
+ * The invalid bit of a single region or segment table entry is set
+ * and the associated TLB entries depending on the entry are flushed.
+ * The table-type of the @asce identifies the portion of the @vaddr
+ * that is used as the invalidation index.
+ */
+static inline void gmap_idte_one(unsigned long asce, unsigned long vaddr)
+{
+ asm volatile(
+ " .insn rrf,0xb98e0000,%0,%1,0,0"
+ : : "a" (asce), "a" (vaddr) : "cc", "memory");
+}
+
+/**
+ * gmap_unshadow_page - remove a page from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_page(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ table = gmap_table_walk(sg, raddr, 0); /* get page table pointer */
+ if (!table || *table & _PAGE_INVALID)
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 12) - 1);
+ ptep_unshadow_pte(sg->mm, raddr, (pte_t *) table);
+}
+
+/**
+ * __gmap_unshadow_pgt - remove all entries from a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @pgt: pointer to the start of a shadow page table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *pgt)
+{
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ for (i = 0; i < 256; i++, raddr += 1UL << 12)
+ pgt[i] = _PAGE_INVALID;
+}
+
+/**
+ * gmap_unshadow_pgt - remove a shadow page table from a segment entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_pgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long sto, *ste, *pgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ ste = gmap_table_walk(sg, raddr, 1); /* get segment pointer */
+ if (!ste || !(*ste & _SEGMENT_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 20) - 1);
+ sto = (unsigned long) (ste - ((raddr >> 20) & 0x7ff));
+ gmap_idte_one(sto | _ASCE_TYPE_SEGMENT, raddr);
+ pgt = (unsigned long *)(*ste & _SEGMENT_ENTRY_ORIGIN);
+ *ste = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+}
+
+/**
+ * __gmap_unshadow_sgt - remove all entries from a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @sgt: pointer to the start of a shadow segment table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr,
+ unsigned long *sgt)
+{
+ unsigned long asce, *pgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) sgt | _ASCE_TYPE_SEGMENT;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 20) {
+ if (!(sgt[i] & _SEGMENT_ENTRY_ORIGIN))
+ continue;
+ pgt = (unsigned long *)(sgt[i] & _REGION_ENTRY_ORIGIN);
+ sgt[i] = _SEGMENT_ENTRY_EMPTY;
+ __gmap_unshadow_pgt(sg, raddr, pgt);
+ /* Free page table */
+ page = pfn_to_page(__pa(pgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ page_table_free_pgste(page);
+ }
+}
+
+/**
+ * gmap_unshadow_sgt - remove a shadow segment table from a region-3 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void gmap_unshadow_sgt(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r3o, *r3e, *sgt;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r3e = gmap_table_walk(sg, raddr, 2); /* get region-3 pointer */
+ if (!r3e || !(*r3e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 31) - 1);
+ r3o = (unsigned long) (r3e - ((raddr >> 31) & 0x7ff));
+ gmap_idte_one(r3o | _ASCE_TYPE_REGION3, raddr);
+ sgt = (unsigned long *)(*r3e & _REGION_ENTRY_ORIGIN);
+ *r3e = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r3t - remove all entries from a shadow region-3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: address in the shadow guest address space
+ * @r3t: pointer to the start of a shadow region-3 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r3t)
+{
+ unsigned long asce, *sgt;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r3t | _ASCE_TYPE_REGION3;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 31) {
+ if (!(r3t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ sgt = (unsigned long *)(r3t[i] & _REGION_ENTRY_ORIGIN);
+ r3t[i] = _REGION3_ENTRY_EMPTY;
+ __gmap_unshadow_sgt(sg, raddr, sgt);
+ /* Free segment table */
+ page = pfn_to_page(__pa(sgt) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r3t - remove a shadow region-3 table from a region-2 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r3t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r2o, *r2e, *r3t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r2e = gmap_table_walk(sg, raddr, 3); /* get region-2 pointer */
+ if (!r2e || !(*r2e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 42) - 1);
+ r2o = (unsigned long) (r2e - ((raddr >> 42) & 0x7ff));
+ gmap_idte_one(r2o | _ASCE_TYPE_REGION2, raddr);
+ r3t = (unsigned long *)(*r2e & _REGION_ENTRY_ORIGIN);
+ *r2e = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r2t - remove all entries from a shadow region-2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r2t: pointer to the start of a shadow region-2 table
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void __gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r2t)
+{
+ unsigned long asce, *r3t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r2t | _ASCE_TYPE_REGION2;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 42) {
+ if (!(r2t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r3t = (unsigned long *)(r2t[i] & _REGION_ENTRY_ORIGIN);
+ r2t[i] = _REGION2_ENTRY_EMPTY;
+ __gmap_unshadow_r3t(sg, raddr, r3t);
+ /* Free region 3 table */
+ page = pfn_to_page(__pa(r3t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow_r2t - remove a shadow region-2 table from a region-1 entry
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ *
+ * Called with the sg->guest_table_lock
+ */
+static void gmap_unshadow_r2t(struct gmap *sg, unsigned long raddr)
+{
+ unsigned long r1o, *r1e, *r2t;
+ struct page *page;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ r1e = gmap_table_walk(sg, raddr, 4); /* get region-1 pointer */
+ if (!r1e || !(*r1e & _REGION_ENTRY_ORIGIN))
+ return;
+ gmap_call_notifier(sg, raddr, raddr + (1UL << 53) - 1);
+ r1o = (unsigned long) (r1e - ((raddr >> 53) & 0x7ff));
+ gmap_idte_one(r1o | _ASCE_TYPE_REGION1, raddr);
+ r2t = (unsigned long *)(*r1e & _REGION_ENTRY_ORIGIN);
+ *r1e = _REGION1_ENTRY_EMPTY;
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+}
+
+/**
+ * __gmap_unshadow_r1t - remove all entries from a shadow region-1 table
+ * @sg: pointer to the shadow guest address space structure
+ * @raddr: rmap address in the shadow guest address space
+ * @r1t: pointer to the start of a shadow region-1 table
+ *
+ * Called with the shadow->guest_table_lock
+ */
+static void __gmap_unshadow_r1t(struct gmap *sg, unsigned long raddr,
+ unsigned long *r1t)
+{
+ unsigned long asce, *r2t;
+ struct page *page;
+ int i;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ asce = (unsigned long) r1t | _ASCE_TYPE_REGION1;
+ for (i = 0; i < 2048; i++, raddr += 1UL << 53) {
+ if (!(r1t[i] & _REGION_ENTRY_ORIGIN))
+ continue;
+ r2t = (unsigned long *)(r1t[i] & _REGION_ENTRY_ORIGIN);
+ __gmap_unshadow_r2t(sg, raddr, r2t);
+ /* Clear entry and flush translation r1t -> r2t */
+ gmap_idte_one(asce, raddr);
+ r1t[i] = _REGION1_ENTRY_EMPTY;
+ /* Free region 2 table */
+ page = pfn_to_page(__pa(r2t) >> PAGE_SHIFT);
+ list_del(&page->lru);
+ __free_pages(page, 2);
+ }
+}
+
+/**
+ * gmap_unshadow - remove a shadow page table completely
+ * @sg: pointer to the shadow guest address space structure
+ *
+ * Called with sg->guest_table_lock
+ */
+static void gmap_unshadow(struct gmap *sg)
+{
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ if (sg->removed)
+ return;
+ sg->removed = 1;
+ gmap_call_notifier(sg, 0, -1UL);
+ gmap_flush_tlb(sg);
+ table = (unsigned long *)(sg->asce & _ASCE_ORIGIN);
+ switch (sg->asce & _ASCE_TYPE_MASK) {
+ case _ASCE_TYPE_REGION1:
+ __gmap_unshadow_r1t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION2:
+ __gmap_unshadow_r2t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_REGION3:
+ __gmap_unshadow_r3t(sg, 0, table);
+ break;
+ case _ASCE_TYPE_SEGMENT:
+ __gmap_unshadow_sgt(sg, 0, table);
+ break;
+ }
+}
+
+/**
+ * gmap_find_shadow - find a specific asce in the list of shadow tables
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns the pointer to a gmap if a shadow table with the given asce is
+ * already available, ERR_PTR(-EAGAIN) if another one is just being created,
+ * otherwise NULL
+ */
+static struct gmap *gmap_find_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg;
+
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce != asce || sg->edat_level != edat_level ||
+ sg->removed)
+ continue;
+ if (!sg->initialized)
+ return ERR_PTR(-EAGAIN);
+ atomic_inc(&sg->ref_count);
+ return sg;
+ }
+ return NULL;
+}
+
+/**
+ * gmap_shadow_valid - check if a shadow guest address space matches the
+ * given properties and is still valid
+ * @sg: pointer to the shadow guest address space structure
+ * @asce: ASCE for which the shadow table is requested
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * Returns 1 if the gmap shadow is still valid and matches the given
+ * properties, the caller can continue using it. Returns 0 otherwise, the
+ * caller has to request a new shadow gmap in this case.
+ *
+ */
+int gmap_shadow_valid(struct gmap *sg, unsigned long asce, int edat_level)
+{
+ if (sg->removed)
+ return 0;
+ return sg->orig_asce == asce && sg->edat_level == edat_level;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_valid);
+
+/**
+ * gmap_shadow - create/find a shadow guest address space
+ * @parent: pointer to the parent gmap
+ * @asce: ASCE for which the shadow table is created
+ * @edat_level: edat level to be used for the shadow translation
+ *
+ * The pages of the top level page table referred by the asce parameter
+ * will be set to read-only and marked in the PGSTEs of the kvm process.
+ * The shadow table will be removed automatically on any change to the
+ * PTE mapping for the source table.
+ *
+ * Returns a guest address space structure, ERR_PTR(-ENOMEM) if out of memory,
+ * ERR_PTR(-EAGAIN) if the caller has to retry and ERR_PTR(-EFAULT) if the
+ * parent gmap table could not be protected.
+ */
+struct gmap *gmap_shadow(struct gmap *parent, unsigned long asce,
+ int edat_level)
+{
+ struct gmap *sg, *new;
+ unsigned long limit;
+ int rc;
+
+ BUG_ON(gmap_is_shadow(parent));
+ spin_lock(&parent->shadow_lock);
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ spin_unlock(&parent->shadow_lock);
+ if (sg)
+ return sg;
+ /* Create a new shadow gmap */
+ limit = -1UL >> (33 - (((asce & _ASCE_TYPE_MASK) >> 2) * 11));
+ if (asce & _ASCE_REAL_SPACE)
+ limit = -1UL;
+ new = gmap_alloc(limit);
+ if (!new)
+ return ERR_PTR(-ENOMEM);
+ new->mm = parent->mm;
+ new->parent = gmap_get(parent);
+ new->orig_asce = asce;
+ new->edat_level = edat_level;
+ new->initialized = false;
+ spin_lock(&parent->shadow_lock);
+ /* Recheck if another CPU created the same shadow */
+ sg = gmap_find_shadow(parent, asce, edat_level);
+ if (sg) {
+ spin_unlock(&parent->shadow_lock);
+ gmap_free(new);
+ return sg;
+ }
+ if (asce & _ASCE_REAL_SPACE) {
+ /* only allow one real-space gmap shadow */
+ list_for_each_entry(sg, &parent->children, list) {
+ if (sg->orig_asce & _ASCE_REAL_SPACE) {
+ spin_lock(&sg->guest_table_lock);
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ break;
+ }
+ }
+ }
+ atomic_set(&new->ref_count, 2);
+ list_add(&new->list, &parent->children);
+ if (asce & _ASCE_REAL_SPACE) {
+ /* nothing to protect, return right away */
+ new->initialized = true;
+ spin_unlock(&parent->shadow_lock);
+ return new;
+ }
+ spin_unlock(&parent->shadow_lock);
+ /* protect after insertion, so it will get properly invalidated */
+ down_read(&parent->mm->mmap_sem);
+ rc = gmap_protect_range(parent, asce & _ASCE_ORIGIN,
+ ((asce & _ASCE_TABLE_LENGTH) + 1) * 4096,
+ PROT_READ, PGSTE_VSIE_BIT);
+ up_read(&parent->mm->mmap_sem);
+ spin_lock(&parent->shadow_lock);
+ new->initialized = true;
+ if (rc) {
+ list_del(&new->list);
+ gmap_free(new);
+ new = ERR_PTR(rc);
+ }
+ spin_unlock(&parent->shadow_lock);
+ return new;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow);
+
+/**
+ * gmap_shadow_r2t - create an empty shadow region 2 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r2t: parent gmap address of the region 2 table to get shadowed
+ * @fake: r2t references contiguous guest memory block, not a r2t
+ *
+ * The r2t parameter specifies the address of the source table. The
+ * four pages of the source table are made read-only in the parent gmap
+ * address space. A write to the source table area @r2t will automatically
+ * remove the shadow r2 table and all of its decendents.
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r2t(struct gmap *sg, unsigned long saddr, unsigned long r2t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r2t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r2t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r2t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 4); /* get region-1 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_r2t, _REGION2_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r2t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R1 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r2t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r2t read-only in parent gmap page table */
+ raddr = (saddr & 0xffe0000000000000UL) | _SHADOW_RMAP_REGION1;
+ origin = r2t & _REGION_ENTRY_ORIGIN;
+ offset = ((r2t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r2t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 4);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r2t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r2t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r2t);
+
+/**
+ * gmap_shadow_r3t - create a shadow region 3 table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @r3t: parent gmap address of the region 3 table to get shadowed
+ * @fake: r3t references contiguous guest memory block, not a r3t
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_r3t(struct gmap *sg, unsigned long saddr, unsigned long r3t,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_r3t, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ /* Allocate a shadow region second table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = r3t & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_r3t = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 3); /* get region-2 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ }
+ crst_table_init(s_r3t, _REGION3_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_r3t | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R2 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= (r3t & _REGION_ENTRY_PROTECT);
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make r3t read-only in parent gmap page table */
+ raddr = (saddr & 0xfffffc0000000000UL) | _SHADOW_RMAP_REGION2;
+ origin = r3t & _REGION_ENTRY_ORIGIN;
+ offset = ((r3t & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((r3t & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 3);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_r3t)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_r3t(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_r3t);
+
+/**
+ * gmap_shadow_sgt - create a shadow segment table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @sgt: parent gmap address of the segment table to get shadowed
+ * @fake: sgt references contiguous guest memory block, not a sgt
+ *
+ * Returns: 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_sgt(struct gmap *sg, unsigned long saddr, unsigned long sgt,
+ int fake)
+{
+ unsigned long raddr, origin, offset, len;
+ unsigned long *s_sgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (sgt & _REGION3_ENTRY_LARGE));
+ /* Allocate a shadow segment table */
+ page = alloc_pages(GFP_KERNEL, 2);
+ if (!page)
+ return -ENOMEM;
+ page->index = sgt & _REGION_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_sgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow region second table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 2); /* get region-3 pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _REGION_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _REGION_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ crst_table_init(s_sgt, _SEGMENT_ENTRY_EMPTY);
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_sgt | _REGION_ENTRY_LENGTH |
+ _REGION_ENTRY_TYPE_R3 | _REGION_ENTRY_INVALID;
+ if (sg->edat_level >= 1)
+ *table |= sgt & _REGION_ENTRY_PROTECT;
+ list_add(&page->lru, &sg->crst_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_REGION_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make sgt read-only in parent gmap page table */
+ raddr = (saddr & 0xffffffff80000000UL) | _SHADOW_RMAP_REGION3;
+ origin = sgt & _REGION_ENTRY_ORIGIN;
+ offset = ((sgt & _REGION_ENTRY_OFFSET) >> 6) * 4096;
+ len = ((sgt & _REGION_ENTRY_LENGTH) + 1) * 4096 - offset;
+ rc = gmap_protect_rmap(sg, raddr, origin + offset, len, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 2);
+ if (!table || (*table & _REGION_ENTRY_ORIGIN) !=
+ (unsigned long) s_sgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_REGION_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_sgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ __free_pages(page, 2);
+ return rc;
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_sgt);
+
+/**
+ * gmap_shadow_lookup_pgtable - find a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: the address in the shadow aguest address space
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @dat_protection: if the pgtable is marked as protected by dat
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if the shadow page table was found and -EAGAIN if the page
+ * table was not found.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_pgt_lookup(struct gmap *sg, unsigned long saddr,
+ unsigned long *pgt, int *dat_protection,
+ int *fake)
+{
+ unsigned long *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (table && !(*table & _SEGMENT_ENTRY_INVALID)) {
+ /* Shadow page tables are full pages (pte+pgste) */
+ page = pfn_to_page(*table >> PAGE_SHIFT);
+ *pgt = page->index & ~GMAP_SHADOW_FAKE_TABLE;
+ *dat_protection = !!(*table & _SEGMENT_ENTRY_PROTECT);
+ *fake = !!(page->index & GMAP_SHADOW_FAKE_TABLE);
+ rc = 0;
+ } else {
+ rc = -EAGAIN;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt_lookup);
+
+/**
+ * gmap_shadow_pgt - instantiate a shadow page table
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pgt: parent gmap address of the page table to get shadowed
+ * @fake: pgt references contiguous guest memory block, not a pgtable
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory,
+ * -EFAULT if an address in the parent gmap could not be resolved and
+ *
+ * Called with gmap->mm->mmap_sem in read
+ */
+int gmap_shadow_pgt(struct gmap *sg, unsigned long saddr, unsigned long pgt,
+ int fake)
+{
+ unsigned long raddr, origin;
+ unsigned long *s_pgt, *table;
+ struct page *page;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg) || (pgt & _SEGMENT_ENTRY_LARGE));
+ /* Allocate a shadow page table */
+ page = page_table_alloc_pgste(sg->mm);
+ if (!page)
+ return -ENOMEM;
+ page->index = pgt & _SEGMENT_ENTRY_ORIGIN;
+ if (fake)
+ page->index |= GMAP_SHADOW_FAKE_TABLE;
+ s_pgt = (unsigned long *) page_to_phys(page);
+ /* Install shadow page table */
+ spin_lock(&sg->guest_table_lock);
+ table = gmap_table_walk(sg, saddr, 1); /* get segment pointer */
+ if (!table) {
+ rc = -EAGAIN; /* Race with unshadow */
+ goto out_free;
+ }
+ if (!(*table & _SEGMENT_ENTRY_INVALID)) {
+ rc = 0; /* Already established */
+ goto out_free;
+ } else if (*table & _SEGMENT_ENTRY_ORIGIN) {
+ rc = -EAGAIN; /* Race with shadow */
+ goto out_free;
+ }
+ /* mark as invalid as long as the parent table is not protected */
+ *table = (unsigned long) s_pgt | _SEGMENT_ENTRY |
+ (pgt & _SEGMENT_ENTRY_PROTECT) | _SEGMENT_ENTRY_INVALID;
+ list_add(&page->lru, &sg->pt_list);
+ if (fake) {
+ /* nothing to protect for fake tables */
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ spin_unlock(&sg->guest_table_lock);
+ return 0;
+ }
+ spin_unlock(&sg->guest_table_lock);
+ /* Make pgt read-only in parent gmap page table (not the pgste) */
+ raddr = (saddr & 0xfffffffffff00000UL) | _SHADOW_RMAP_SEGMENT;
+ origin = pgt & _SEGMENT_ENTRY_ORIGIN & PAGE_MASK;
+ rc = gmap_protect_rmap(sg, raddr, origin, PAGE_SIZE, PROT_READ);
+ spin_lock(&sg->guest_table_lock);
+ if (!rc) {
+ table = gmap_table_walk(sg, saddr, 1);
+ if (!table || (*table & _SEGMENT_ENTRY_ORIGIN) !=
+ (unsigned long) s_pgt)
+ rc = -EAGAIN; /* Race with unshadow */
+ else
+ *table &= ~_SEGMENT_ENTRY_INVALID;
+ } else {
+ gmap_unshadow_pgt(sg, raddr);
+ }
+ spin_unlock(&sg->guest_table_lock);
+ return rc;
+out_free:
+ spin_unlock(&sg->guest_table_lock);
+ page_table_free_pgste(page);
+ return rc;
+
+}
+EXPORT_SYMBOL_GPL(gmap_shadow_pgt);
+
+/**
+ * gmap_shadow_page - create a shadow page mapping
+ * @sg: pointer to the shadow guest address space structure
+ * @saddr: faulting address in the shadow gmap
+ * @pte: pte in parent gmap address space to get shadowed
+ *
+ * Returns 0 if successfully shadowed or already shadowed, -EAGAIN if the
+ * shadow table structure is incomplete, -ENOMEM if out of memory and
+ * -EFAULT if an address in the parent gmap could not be resolved.
+ *
+ * Called with sg->mm->mmap_sem in read.
+ */
+int gmap_shadow_page(struct gmap *sg, unsigned long saddr, pte_t pte)
+{
+ struct gmap *parent;
+ struct gmap_rmap *rmap;
+ unsigned long vmaddr, paddr;
+ spinlock_t *ptl;
+ pte_t *sptep, *tptep;
+ int prot;
+ int rc;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ parent = sg->parent;
+ prot = (pte_val(pte) & _PAGE_PROTECT) ? PROT_READ : PROT_WRITE;
+
+ rmap = kzalloc(sizeof(*rmap), GFP_KERNEL);
+ if (!rmap)
+ return -ENOMEM;
+ rmap->raddr = (saddr & PAGE_MASK) | _SHADOW_RMAP_PGTABLE;
+
+ while (1) {
+ paddr = pte_val(pte) & PAGE_MASK;
+ vmaddr = __gmap_translate(parent, paddr);
+ if (IS_ERR_VALUE(vmaddr)) {
+ rc = vmaddr;
+ break;
+ }
+ rc = radix_tree_preload(GFP_KERNEL);
if (rc)
break;
- /* Walk the process page table, lock and get pte pointer */
- ptep = get_locked_pte(gmap->mm, addr, &ptl);
- VM_BUG_ON(!ptep);
- /* Set notification bit in the pgste of the pte */
- if ((pte_val(*ptep) & (_PAGE_INVALID | _PAGE_PROTECT)) == 0) {
- ptep_set_notify(gmap->mm, addr, ptep);
- gaddr += PAGE_SIZE;
- len -= PAGE_SIZE;
+ rc = -EAGAIN;
+ sptep = gmap_pte_op_walk(parent, paddr, &ptl);
+ if (sptep) {
+ spin_lock(&sg->guest_table_lock);
+ /* Get page table pointer */
+ tptep = (pte_t *) gmap_table_walk(sg, saddr, 0);
+ if (!tptep) {
+ spin_unlock(&sg->guest_table_lock);
+ gmap_pte_op_end(ptl);
+ radix_tree_preload_end();
+ break;
+ }
+ rc = ptep_shadow_pte(sg->mm, saddr, sptep, tptep, pte);
+ if (rc > 0) {
+ /* Success and a new mapping */
+ gmap_insert_rmap(sg, vmaddr, rmap);
+ rmap = NULL;
+ rc = 0;
+ }
+ gmap_pte_op_end(ptl);
+ spin_unlock(&sg->guest_table_lock);
}
- pte_unmap_unlock(ptep, ptl);
+ radix_tree_preload_end();
+ if (!rc)
+ break;
+ rc = gmap_pte_op_fixup(parent, paddr, vmaddr, prot);
+ if (rc)
+ break;
}
- up_read(&gmap->mm->mmap_sem);
+ kfree(rmap);
return rc;
}
-EXPORT_SYMBOL_GPL(gmap_ipte_notify);
+EXPORT_SYMBOL_GPL(gmap_shadow_page);
+
+/**
+ * gmap_shadow_notify - handle notifications for shadow gmap
+ *
+ * Called with sg->parent->shadow_lock.
+ */
+static void gmap_shadow_notify(struct gmap *sg, unsigned long vmaddr,
+ unsigned long offset, pte_t *pte)
+{
+ struct gmap_rmap *rmap, *rnext, *head;
+ unsigned long gaddr, start, end, bits, raddr;
+ unsigned long *table;
+
+ BUG_ON(!gmap_is_shadow(sg));
+ spin_lock(&sg->parent->guest_table_lock);
+ table = radix_tree_lookup(&sg->parent->host_to_guest,
+ vmaddr >> PMD_SHIFT);
+ gaddr = table ? __gmap_segment_gaddr(table) + offset : 0;
+ spin_unlock(&sg->parent->guest_table_lock);
+ if (!table)
+ return;
+
+ spin_lock(&sg->guest_table_lock);
+ if (sg->removed) {
+ spin_unlock(&sg->guest_table_lock);
+ return;
+ }
+ /* Check for top level table */
+ start = sg->orig_asce & _ASCE_ORIGIN;
+ end = start + ((sg->orig_asce & _ASCE_TABLE_LENGTH) + 1) * 4096;
+ if (!(sg->orig_asce & _ASCE_REAL_SPACE) && gaddr >= start &&
+ gaddr < end) {
+ /* The complete shadow table has to go */
+ gmap_unshadow(sg);
+ spin_unlock(&sg->guest_table_lock);
+ list_del(&sg->list);
+ gmap_put(sg);
+ return;
+ }
+ /* Remove the page table tree from on specific entry */
+ head = radix_tree_delete(&sg->host_to_rmap, vmaddr >> 12);
+ gmap_for_each_rmap_safe(rmap, rnext, head) {
+ bits = rmap->raddr & _SHADOW_RMAP_MASK;
+ raddr = rmap->raddr ^ bits;
+ switch (bits) {
+ case _SHADOW_RMAP_REGION1:
+ gmap_unshadow_r2t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION2:
+ gmap_unshadow_r3t(sg, raddr);
+ break;
+ case _SHADOW_RMAP_REGION3:
+ gmap_unshadow_sgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_SEGMENT:
+ gmap_unshadow_pgt(sg, raddr);
+ break;
+ case _SHADOW_RMAP_PGTABLE:
+ gmap_unshadow_page(sg, raddr);
+ break;
+ }
+ kfree(rmap);
+ }
+ spin_unlock(&sg->guest_table_lock);
+}
/**
* ptep_notify - call all invalidation callbacks for a specific pte.
* @mm: pointer to the process mm_struct
* @addr: virtual address in the process address space
* @pte: pointer to the page table entry
+ * @bits: bits from the pgste that caused the notify call
*
* This function is assumed to be called with the page table lock held
* for the pte to notify.
*/
-void ptep_notify(struct mm_struct *mm, unsigned long vmaddr, pte_t *pte)
+void ptep_notify(struct mm_struct *mm, unsigned long vmaddr,
+ pte_t *pte, unsigned long bits)
{
unsigned long offset, gaddr;
unsigned long *table;
- struct gmap_notifier *nb;
- struct gmap *gmap;
+ struct gmap *gmap, *sg, *next;
offset = ((unsigned long) pte) & (255 * sizeof(pte_t));
offset = offset * (4096 / sizeof(pte_t));
- spin_lock(&gmap_notifier_lock);
- list_for_each_entry(gmap, &mm->context.gmap_list, list) {
+ rcu_read_lock();
+ list_for_each_entry_rcu(gmap, &mm->context.gmap_list, list) {
+ if (!list_empty(&gmap->children) && (bits & PGSTE_VSIE_BIT)) {
+ spin_lock(&gmap->shadow_lock);
+ list_for_each_entry_safe(sg, next,
+ &gmap->children, list)
+ gmap_shadow_notify(sg, vmaddr, offset, pte);
+ spin_unlock(&gmap->shadow_lock);
+ }
+ if (!(bits & PGSTE_IN_BIT))
+ continue;
+ spin_lock(&gmap->guest_table_lock);
table = radix_tree_lookup(&gmap->host_to_guest,
vmaddr >> PMD_SHIFT);
- if (!table)
- continue;
- gaddr = __gmap_segment_gaddr(table) + offset;
- list_for_each_entry(nb, &gmap_notifier_list, list)
- nb->notifier_call(gmap, gaddr);
+ if (table)
+ gaddr = __gmap_segment_gaddr(table) + offset;
+ spin_unlock(&gmap->guest_table_lock);
+ if (table)
+ gmap_call_notifier(gmap, gaddr, gaddr + PAGE_SIZE - 1);
}
- spin_unlock(&gmap_notifier_lock);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(ptep_notify);
diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index e2565d2d0c32..995f78532cc2 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -137,6 +137,29 @@ static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
return new;
}
+#ifdef CONFIG_PGSTE
+
+struct page *page_table_alloc_pgste(struct mm_struct *mm)
+{
+ struct page *page;
+ unsigned long *table;
+
+ page = alloc_page(GFP_KERNEL|__GFP_REPEAT);
+ if (page) {
+ table = (unsigned long *) page_to_phys(page);
+ clear_table(table, _PAGE_INVALID, PAGE_SIZE/2);
+ clear_table(table + PTRS_PER_PTE, 0, PAGE_SIZE/2);
+ }
+ return page;
+}
+
+void page_table_free_pgste(struct page *page)
+{
+ __free_page(page);
+}
+
+#endif /* CONFIG_PGSTE */
+
/*
* page table entry allocation/free routines.
*/
@@ -149,7 +172,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Try to get a fragment of a 4K page as a 2K page table */
if (!mm_alloc_pgste(mm)) {
table = NULL;
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
if (!list_empty(&mm->context.pgtable_list)) {
page = list_first_entry(&mm->context.pgtable_list,
struct page, lru);
@@ -164,7 +187,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
list_del(&page->lru);
}
}
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (table)
return table;
}
@@ -187,9 +210,9 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
/* Return the first 2K fragment of the page */
atomic_set(&page->_mapcount, 1);
clear_table(table, _PAGE_INVALID, PAGE_SIZE);
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
list_add(&page->lru, &mm->context.pgtable_list);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
}
return table;
}
@@ -203,13 +226,13 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 1U << bit);
if (mask & 3)
list_add(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
if (mask != 0)
return;
}
@@ -235,13 +258,13 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
return;
}
bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
- spin_lock_bh(&mm->context.list_lock);
+ spin_lock_bh(&mm->context.pgtable_lock);
mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit);
if (mask & 3)
list_add_tail(&page->lru, &mm->context.pgtable_list);
else
list_del(&page->lru);
- spin_unlock_bh(&mm->context.list_lock);
+ spin_unlock_bh(&mm->context.pgtable_lock);
table = (unsigned long *) (__pa(table) | (1U << bit));
tlb_remove_table(tlb, table);
}
diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c
index b98d1a152d46..5f092015aaa7 100644
--- a/arch/s390/mm/pgtable.c
+++ b/arch/s390/mm/pgtable.c
@@ -174,14 +174,17 @@ static inline pgste_t pgste_set_pte(pte_t *ptep, pgste_t pgste, pte_t entry)
return pgste;
}
-static inline pgste_t pgste_ipte_notify(struct mm_struct *mm,
- unsigned long addr,
- pte_t *ptep, pgste_t pgste)
+static inline pgste_t pgste_pte_notify(struct mm_struct *mm,
+ unsigned long addr,
+ pte_t *ptep, pgste_t pgste)
{
#ifdef CONFIG_PGSTE
- if (pgste_val(pgste) & PGSTE_IN_BIT) {
- pgste_val(pgste) &= ~PGSTE_IN_BIT;
- ptep_notify(mm, addr, ptep);
+ unsigned long bits;
+
+ bits = pgste_val(pgste) & (PGSTE_IN_BIT | PGSTE_VSIE_BIT);
+ if (bits) {
+ pgste_val(pgste) ^= bits;
+ ptep_notify(mm, addr, ptep, bits);
}
#endif
return pgste;
@@ -194,7 +197,7 @@ static inline pgste_t ptep_xchg_start(struct mm_struct *mm,
if (mm_has_pgste(mm)) {
pgste = pgste_get_lock(ptep);
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
}
return pgste;
}
@@ -459,6 +462,90 @@ void ptep_set_notify(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
preempt_enable();
}
+/**
+ * ptep_force_prot - change access rights of a locked pte
+ * @mm: pointer to the process mm_struct
+ * @addr: virtual address in the guest address space
+ * @ptep: pointer to the page table entry
+ * @prot: indicates guest access rights: PROT_NONE, PROT_READ or PROT_WRITE
+ * @bit: pgste bit to set (e.g. for notification)
+ *
+ * Returns 0 if the access rights were changed and -EAGAIN if the current
+ * and requested access rights are incompatible.
+ */
+int ptep_force_prot(struct mm_struct *mm, unsigned long addr,
+ pte_t *ptep, int prot, unsigned long bit)
+{
+ pte_t entry;
+ pgste_t pgste;
+ int pte_i, pte_p;
+
+ pgste = pgste_get_lock(ptep);
+ entry = *ptep;
+ /* Check pte entry after all locks have been acquired */
+ pte_i = pte_val(entry) & _PAGE_INVALID;
+ pte_p = pte_val(entry) & _PAGE_PROTECT;
+ if ((pte_i && (prot != PROT_NONE)) ||
+ (pte_p && (prot & PROT_WRITE))) {
+ pgste_set_unlock(ptep, pgste);
+ return -EAGAIN;
+ }
+ /* Change access rights and set pgste bit */
+ if (prot == PROT_NONE && !pte_i) {
+ ptep_flush_direct(mm, addr, ptep);
+ pgste = pgste_update_all(entry, pgste, mm);
+ pte_val(entry) |= _PAGE_INVALID;
+ }
+ if (prot == PROT_READ && !pte_p) {
+ ptep_flush_direct(mm, addr, ptep);
+ pte_val(entry) &= ~_PAGE_INVALID;
+ pte_val(entry) |= _PAGE_PROTECT;
+ }
+ pgste_val(pgste) |= bit;
+ pgste = pgste_set_pte(ptep, pgste, entry);
+ pgste_set_unlock(ptep, pgste);
+ return 0;
+}
+
+int ptep_shadow_pte(struct mm_struct *mm, unsigned long saddr,
+ pte_t *sptep, pte_t *tptep, pte_t pte)
+{
+ pgste_t spgste, tpgste;
+ pte_t spte, tpte;
+ int rc = -EAGAIN;
+
+ if (!(pte_val(*tptep) & _PAGE_INVALID))
+ return 0; /* already shadowed */
+ spgste = pgste_get_lock(sptep);
+ spte = *sptep;
+ if (!(pte_val(spte) & _PAGE_INVALID) &&
+ !((pte_val(spte) & _PAGE_PROTECT) &&
+ !(pte_val(pte) & _PAGE_PROTECT))) {
+ pgste_val(spgste) |= PGSTE_VSIE_BIT;
+ tpgste = pgste_get_lock(tptep);
+ pte_val(tpte) = (pte_val(spte) & PAGE_MASK) |
+ (pte_val(pte) & _PAGE_PROTECT);
+ /* don't touch the storage key - it belongs to parent pgste */
+ tpgste = pgste_set_pte(tptep, tpgste, tpte);
+ pgste_set_unlock(tptep, tpgste);
+ rc = 1;
+ }
+ pgste_set_unlock(sptep, spgste);
+ return rc;
+}
+
+void ptep_unshadow_pte(struct mm_struct *mm, unsigned long saddr, pte_t *ptep)
+{
+ pgste_t pgste;
+
+ pgste = pgste_get_lock(ptep);
+ /* notifier is called by the caller */
+ ptep_flush_direct(mm, saddr, ptep);
+ /* don't touch the storage key - it belongs to parent pgste */
+ pgste = pgste_set_pte(ptep, pgste, __pte(_PAGE_INVALID));
+ pgste_set_unlock(ptep, pgste);
+}
+
static void ptep_zap_swap_entry(struct mm_struct *mm, swp_entry_t entry)
{
if (!non_swap_entry(entry))
@@ -532,7 +619,7 @@ bool test_and_clear_guest_dirty(struct mm_struct *mm, unsigned long addr)
pgste_val(pgste) &= ~PGSTE_UC_BIT;
pte = *ptep;
if (dirty && (pte_val(pte) & _PAGE_PRESENT)) {
- pgste = pgste_ipte_notify(mm, addr, ptep, pgste);
+ pgste = pgste_pte_notify(mm, addr, ptep, pgste);
__ptep_ipte(addr, ptep);
if (MACHINE_HAS_ESOP || !(pte_val(pte) & _PAGE_WRITE))
pte_val(pte) |= _PAGE_PROTECT;
@@ -555,12 +642,9 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_t old, new;
pte_t *ptep;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
new = old = pgste_get_lock(ptep);
pgste_val(new) &= ~(PGSTE_GR_BIT | PGSTE_GC_BIT |
@@ -587,45 +671,100 @@ int set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
pgste_set_unlock(ptep, new);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
return 0;
}
EXPORT_SYMBOL(set_guest_storage_key);
-unsigned char get_guest_storage_key(struct mm_struct *mm, unsigned long addr)
+/**
+ * Conditionally set a guest storage key (handling csske).
+ * oldkey will be updated when either mr or mc is set and a pointer is given.
+ *
+ * Returns 0 if a guests storage key update wasn't necessary, 1 if the guest
+ * storage key was updated and -EFAULT on access errors.
+ */
+int cond_set_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char key, unsigned char *oldkey,
+ bool nq, bool mr, bool mc)
+{
+ unsigned char tmp, mask = _PAGE_ACC_BITS | _PAGE_FP_BIT;
+ int rc;
+
+ /* we can drop the pgste lock between getting and setting the key */
+ if (mr | mc) {
+ rc = get_guest_storage_key(current->mm, addr, &tmp);
+ if (rc)
+ return rc;
+ if (oldkey)
+ *oldkey = tmp;
+ if (!mr)
+ mask |= _PAGE_REFERENCED;
+ if (!mc)
+ mask |= _PAGE_CHANGED;
+ if (!((tmp ^ key) & mask))
+ return 0;
+ }
+ rc = set_guest_storage_key(current->mm, addr, key, nq);
+ return rc < 0 ? rc : 1;
+}
+EXPORT_SYMBOL(cond_set_guest_storage_key);
+
+/**
+ * Reset a guest reference bit (rrbe), returning the reference and changed bit.
+ *
+ * Returns < 0 in case of error, otherwise the cc to be reported to the guest.
+ */
+int reset_guest_reference_bit(struct mm_struct *mm, unsigned long addr)
{
- unsigned char key;
spinlock_t *ptl;
- pgste_t pgste;
+ pgste_t old, new;
pte_t *ptep;
+ int cc = 0;
- down_read(&mm->mmap_sem);
ptep = get_locked_pte(mm, addr, &ptl);
- if (unlikely(!ptep)) {
- up_read(&mm->mmap_sem);
+ if (unlikely(!ptep))
return -EFAULT;
- }
- pgste = pgste_get_lock(ptep);
- if (pte_val(*ptep) & _PAGE_INVALID) {
- key = (pgste_val(pgste) & PGSTE_ACC_BITS) >> 56;
- key |= (pgste_val(pgste) & PGSTE_FP_BIT) >> 56;
- key |= (pgste_val(pgste) & PGSTE_GR_BIT) >> 48;
- key |= (pgste_val(pgste) & PGSTE_GC_BIT) >> 48;
- } else {
- key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ new = old = pgste_get_lock(ptep);
+ /* Reset guest reference bit only */
+ pgste_val(new) &= ~PGSTE_GR_BIT;
- /* Reflect guest's logical view, not physical */
- if (pgste_val(pgste) & PGSTE_GR_BIT)
- key |= _PAGE_REFERENCED;
- if (pgste_val(pgste) & PGSTE_GC_BIT)
- key |= _PAGE_CHANGED;
+ if (!(pte_val(*ptep) & _PAGE_INVALID)) {
+ cc = page_reset_referenced(pte_val(*ptep) & PAGE_MASK);
+ /* Merge real referenced bit into host-set */
+ pgste_val(new) |= ((unsigned long) cc << 53) & PGSTE_HR_BIT;
}
+ /* Reflect guest's logical view, not physical */
+ cc |= (pgste_val(old) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 49;
+ /* Changing the guest storage key is considered a change of the page */
+ if ((pgste_val(new) ^ pgste_val(old)) & PGSTE_GR_BIT)
+ pgste_val(new) |= PGSTE_UC_BIT;
+
+ pgste_set_unlock(ptep, new);
+ pte_unmap_unlock(ptep, ptl);
+ return 0;
+}
+EXPORT_SYMBOL(reset_guest_reference_bit);
+
+int get_guest_storage_key(struct mm_struct *mm, unsigned long addr,
+ unsigned char *key)
+{
+ spinlock_t *ptl;
+ pgste_t pgste;
+ pte_t *ptep;
+ ptep = get_locked_pte(mm, addr, &ptl);
+ if (unlikely(!ptep))
+ return -EFAULT;
+
+ pgste = pgste_get_lock(ptep);
+ *key = (pgste_val(pgste) & (PGSTE_ACC_BITS | PGSTE_FP_BIT)) >> 56;
+ if (!(pte_val(*ptep) & _PAGE_INVALID))
+ *key = page_get_storage_key(pte_val(*ptep) & PAGE_MASK);
+ /* Reflect guest's logical view, not physical */
+ *key |= (pgste_val(pgste) & (PGSTE_GR_BIT | PGSTE_GC_BIT)) >> 48;
pgste_set_unlock(ptep, pgste);
pte_unmap_unlock(ptep, ptl);
- up_read(&mm->mmap_sem);
- return key;
+ return 0;
}
EXPORT_SYMBOL(get_guest_storage_key);
#endif
diff --git a/arch/sparc/include/asm/pci_64.h b/arch/sparc/include/asm/pci_64.h
index 022d16008a00..2303635158f5 100644
--- a/arch/sparc/include/asm/pci_64.h
+++ b/arch/sparc/include/asm/pci_64.h
@@ -55,9 +55,6 @@ static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel)
}
#define HAVE_ARCH_PCI_RESOURCE_TO_USER
-void pci_resource_to_user(const struct pci_dev *dev, int bar,
- const struct resource *rsrc,
- resource_size_t *start, resource_size_t *end);
#endif /* __KERNEL__ */
#endif /* __SPARC64_PCI_H */
diff --git a/arch/sparc/kernel/pci.c b/arch/sparc/kernel/pci.c
index c2b202d763a1..9c1878f4fa9f 100644
--- a/arch/sparc/kernel/pci.c
+++ b/arch/sparc/kernel/pci.c
@@ -986,16 +986,18 @@ void pci_resource_to_user(const struct pci_dev *pdev, int bar,
const struct resource *rp, resource_size_t *start,
resource_size_t *end)
{
- struct pci_pbm_info *pbm = pdev->dev.archdata.host_controller;
- unsigned long offset;
-
- if (rp->flags & IORESOURCE_IO)
- offset = pbm->io_space.start;
- else
- offset = pbm->mem_space.start;
+ struct pci_bus_region region;
- *start = rp->start - offset;
- *end = rp->end - offset;
+ /*
+ * "User" addresses are shown in /sys/devices/pci.../.../resource
+ * and /proc/bus/pci/devices and used as mmap offsets for
+ * /proc/bus/pci/BB/DD.F files (see proc_bus_pci_mmap()).
+ *
+ * On sparc, these are PCI bus addresses, i.e., raw BAR values.
+ */
+ pcibios_resource_to_bus(pdev->bus, &region, (struct resource *) rp);
+ *start = region.start;
+ *end = region.end;
}
void pcibios_set_master(struct pci_dev *dev)
diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common
index cc0013475444..58650d098fb4 100644
--- a/arch/um/Kconfig.common
+++ b/arch/um/Kconfig.common
@@ -9,6 +9,7 @@ config UML
select GENERIC_CPU_DEVICES
select GENERIC_IO
select GENERIC_CLOCKEVENTS
+ select HAVE_GCC_PLUGINS
select TTY # Needed for line.c
config MMU
diff --git a/arch/um/Makefile b/arch/um/Makefile
index e3abe6f3156d..0ca46ededfc7 100644
--- a/arch/um/Makefile
+++ b/arch/um/Makefile
@@ -78,8 +78,8 @@ include $(ARCH_DIR)/Makefile-os-$(OS)
KBUILD_CPPFLAGS += -I$(srctree)/$(HOST_DIR)/include \
-I$(srctree)/$(HOST_DIR)/include/uapi \
- -I$(HOST_DIR)/include/generated \
- -I$(HOST_DIR)/include/generated/uapi
+ -I$(objtree)/$(HOST_DIR)/include/generated \
+ -I$(objtree)/$(HOST_DIR)/include/generated/uapi
# -Derrno=kernel_errno - This turns all kernel references to errno into
# kernel_errno to separate them from the libc errno. This allows -fno-common
diff --git a/arch/unicore32/kernel/pci.c b/arch/unicore32/kernel/pci.c
index d45fa5f3e9c4..62137d13c6f9 100644
--- a/arch/unicore32/kernel/pci.c
+++ b/arch/unicore32/kernel/pci.c
@@ -265,10 +265,8 @@ static int __init pci_common_init(void)
pci_fixup_irqs(pci_common_swizzle, pci_puv3_map_irq);
- if (!pci_has_flag(PCI_PROBE_ONLY)) {
- pci_bus_size_bridges(puv3_bus);
- pci_bus_assign_resources(puv3_bus);
- }
+ pci_bus_size_bridges(puv3_bus);
+ pci_bus_assign_resources(puv3_bus);
pci_bus_add_devices(puv3_bus);
return 0;
}
@@ -279,9 +277,6 @@ char * __init pcibios_setup(char *str)
if (!strcmp(str, "debug")) {
debug_pci = 1;
return NULL;
- } else if (!strcmp(str, "firmware")) {
- pci_add_flags(PCI_PROBE_ONLY);
- return NULL;
}
return str;
}
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 2fa55851d2a9..3a9add58d794 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -111,6 +111,7 @@ config X86
select HAVE_FUNCTION_GRAPH_FP_TEST
select HAVE_FUNCTION_GRAPH_TRACER
select HAVE_FUNCTION_TRACER
+ select HAVE_GCC_PLUGINS
select HAVE_GENERIC_DMA_COHERENT if X86_32
select HAVE_HW_BREAKPOINT
select HAVE_IDE
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index be8e688fa0d4..12ea8f8384f4 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -96,7 +96,7 @@ $(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
$(call if_changed,zoffset)
-AFLAGS_header.o += -I$(obj)
+AFLAGS_header.o += -I$(objtree)/$(obj)
$(obj)/header.o: $(obj)/zoffset.h
LDFLAGS_setup.elf := -T
diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 6ba89a1ab0e5..d5409660f5de 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -75,7 +75,7 @@ CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
-fno-omit-frame-pointer -foptimize-sibling-calls \
-DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
-$(vobjs): KBUILD_CFLAGS += $(CFL)
+$(vobjs): KBUILD_CFLAGS := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS)) $(CFL)
#
# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
@@ -145,6 +145,7 @@ KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS))
KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -fno-pic,$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 := $(filter-out -mfentry,$(KBUILD_CFLAGS_32))
+KBUILD_CFLAGS_32 := $(filter-out $(GCC_PLUGINS_CFLAGS),$(KBUILD_CFLAGS_32))
KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic
KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector)
KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls)
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 69e62862b622..33ae3a4d0159 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -35,8 +35,9 @@
#include <asm/asm.h>
#include <asm/kvm_page_track.h>
-#define KVM_MAX_VCPUS 255
-#define KVM_SOFT_MAX_VCPUS 160
+#define KVM_MAX_VCPUS 288
+#define KVM_SOFT_MAX_VCPUS 240
+#define KVM_MAX_VCPU_ID 1023
#define KVM_USER_MEM_SLOTS 509
/* memory slots that are not exposed to userspace */
#define KVM_PRIVATE_MEM_SLOTS 3
@@ -599,6 +600,7 @@ struct kvm_vcpu_arch {
u64 mcg_cap;
u64 mcg_status;
u64 mcg_ctl;
+ u64 mcg_ext_ctl;
u64 *mce_banks;
/* Cache MMIO info */
@@ -682,9 +684,12 @@ struct kvm_arch_memory_slot {
struct kvm_apic_map {
struct rcu_head rcu;
u8 mode;
- struct kvm_lapic *phys_map[256];
- /* first index is cluster id second is cpu id in a cluster */
- struct kvm_lapic *logical_map[16][16];
+ u32 max_apic_id;
+ union {
+ struct kvm_lapic *xapic_flat_map[8];
+ struct kvm_lapic *xapic_cluster_map[16][4];
+ };
+ struct kvm_lapic *phys_map[];
};
/* Hyper-V emulation context */
@@ -779,6 +784,9 @@ struct kvm_arch {
u32 ldr_mode;
struct page *avic_logical_id_table_page;
struct page *avic_physical_id_table_page;
+
+ bool x2apic_format;
+ bool x2apic_broadcast_quirk_disabled;
};
struct kvm_vm_stat {
@@ -1006,6 +1014,11 @@ struct kvm_x86_ops {
int (*update_pi_irte)(struct kvm *kvm, unsigned int host_irq,
uint32_t guest_irq, bool set);
void (*apicv_post_state_restore)(struct kvm_vcpu *vcpu);
+
+ int (*set_hv_timer)(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc);
+ void (*cancel_hv_timer)(struct kvm_vcpu *vcpu);
+
+ void (*setup_mce)(struct kvm_vcpu *vcpu);
};
struct kvm_arch_async_pf {
@@ -1026,7 +1039,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu);
void kvm_mmu_init_vm(struct kvm *kvm);
void kvm_mmu_uninit_vm(struct kvm *kvm);
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask);
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask);
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu);
void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
@@ -1077,6 +1090,10 @@ extern u32 kvm_max_guest_tsc_khz;
extern u8 kvm_tsc_scaling_ratio_frac_bits;
/* maximum allowed value of TSC scaling ratio */
extern u64 kvm_max_tsc_scaling_ratio;
+/* 1ull << kvm_tsc_scaling_ratio_frac_bits */
+extern u64 kvm_default_tsc_scaling_ratio;
+
+extern u64 kvm_mce_cap_supported;
enum emulation_result {
EMULATE_DONE, /* no further processing */
@@ -1352,7 +1369,7 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu);
bool kvm_intr_is_single_vcpu(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq);
static inline void kvm_arch_vcpu_blocking(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index d0fe23ec7e98..14824fc78f7e 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -193,7 +193,6 @@ struct __attribute__ ((__packed__)) vmcb {
struct vmcb_save_area save;
};
-#define SVM_CPUID_FEATURE_SHIFT 2
#define SVM_CPUID_FUNC 0x8000000a
#define SVM_VM_CR_SVM_DISABLE 4
diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h
index cce9ee68e335..0116b2ee9e64 100644
--- a/arch/x86/include/asm/virtext.h
+++ b/arch/x86/include/asm/virtext.h
@@ -83,23 +83,19 @@ static inline void cpu_emergency_vmxoff(void)
*/
static inline int cpu_has_svm(const char **msg)
{
- uint32_t eax, ebx, ecx, edx;
-
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) {
if (msg)
*msg = "not amd";
return 0;
}
- cpuid(0x80000000, &eax, &ebx, &ecx, &edx);
- if (eax < SVM_CPUID_FUNC) {
+ if (boot_cpu_data.extended_cpuid_level < SVM_CPUID_FUNC) {
if (msg)
*msg = "can't execute cpuid_8000000a";
return 0;
}
- cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
- if (!(ecx & (1 << SVM_CPUID_FEATURE_SHIFT))) {
+ if (!boot_cpu_has(X86_FEATURE_SVM)) {
if (msg)
*msg = "svm not available";
return 0;
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 639a6e34500c..ab8e32f7b9a8 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -32,7 +32,6 @@ config KVM
select HAVE_KVM_IRQ_BYPASS
select HAVE_KVM_IRQ_ROUTING
select HAVE_KVM_EVENTFD
- select KVM_APIC_ARCHITECTURE
select KVM_ASYNC_PF
select USER_RETURN_NOTIFIER
select KVM_MMIO
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index a4bf5b45d65a..5fb6c620180e 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -645,7 +645,6 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
.write = speaker_ioport_write,
};
-/* Caller must hold slots_lock */
struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
{
struct kvm_pit *pit;
@@ -690,6 +689,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
kvm_pit_set_reinject(pit, true);
+ mutex_lock(&kvm->slots_lock);
kvm_iodevice_init(&pit->dev, &pit_dev_ops);
ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS,
KVM_PIT_MEM_LENGTH, &pit->dev);
@@ -704,12 +704,14 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
if (ret < 0)
goto fail_register_speaker;
}
+ mutex_unlock(&kvm->slots_lock);
return pit;
fail_register_speaker:
kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
fail_register_pit:
+ mutex_unlock(&kvm->slots_lock);
kvm_pit_set_reinject(pit, false);
kthread_stop(pit->worker_task);
fail_kthread:
diff --git a/arch/x86/kvm/iommu.c b/arch/x86/kvm/iommu.c
index 95e0e6481f07..b181426f67b4 100644
--- a/arch/x86/kvm/iommu.c
+++ b/arch/x86/kvm/iommu.c
@@ -28,9 +28,7 @@
#include <linux/moduleparam.h>
#include <linux/pci.h>
#include <linux/stat.h>
-#include <linux/dmar.h>
#include <linux/iommu.h>
-#include <linux/intel-iommu.h>
#include "assigned-dev.h"
static bool allow_unsafe_assigned_interrupts;
diff --git a/arch/x86/kvm/irq_comm.c b/arch/x86/kvm/irq_comm.c
index dfb4c6476877..25810b144b58 100644
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -110,13 +110,17 @@ int kvm_irq_delivery_to_apic(struct kvm *kvm, struct kvm_lapic *src,
return r;
}
-void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
+void kvm_set_msi_irq(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e,
struct kvm_lapic_irq *irq)
{
- trace_kvm_msi_set_irq(e->msi.address_lo, e->msi.data);
+ trace_kvm_msi_set_irq(e->msi.address_lo | (kvm->arch.x2apic_format ?
+ (u64)e->msi.address_hi << 32 : 0),
+ e->msi.data);
irq->dest_id = (e->msi.address_lo &
MSI_ADDR_DEST_ID_MASK) >> MSI_ADDR_DEST_ID_SHIFT;
+ if (kvm->arch.x2apic_format)
+ irq->dest_id |= MSI_ADDR_EXT_DEST_ID(e->msi.address_hi);
irq->vector = (e->msi.data &
MSI_DATA_VECTOR_MASK) >> MSI_DATA_VECTOR_SHIFT;
irq->dest_mode = (1 << MSI_ADDR_DEST_MODE_SHIFT) & e->msi.address_lo;
@@ -129,15 +133,24 @@ void kvm_set_msi_irq(struct kvm_kernel_irq_routing_entry *e,
}
EXPORT_SYMBOL_GPL(kvm_set_msi_irq);
+static inline bool kvm_msi_route_invalid(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e)
+{
+ return kvm->arch.x2apic_format && (e->msi.address_hi & 0xff);
+}
+
int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
struct kvm *kvm, int irq_source_id, int level, bool line_status)
{
struct kvm_lapic_irq irq;
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
if (!level)
return -1;
- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);
return kvm_irq_delivery_to_apic(kvm, NULL, &irq, NULL);
}
@@ -153,7 +166,10 @@ int kvm_arch_set_irq_inatomic(struct kvm_kernel_irq_routing_entry *e,
if (unlikely(e->type != KVM_IRQ_ROUTING_MSI))
return -EWOULDBLOCK;
- kvm_set_msi_irq(e, &irq);
+ if (kvm_msi_route_invalid(kvm, e))
+ return -EINVAL;
+
+ kvm_set_msi_irq(kvm, e, &irq);
if (kvm_irq_delivery_to_apic_fast(kvm, NULL, &irq, &r, NULL))
return r;
@@ -248,7 +264,8 @@ static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
}
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
int r = -EINVAL;
@@ -285,6 +302,9 @@ int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
e->msi.address_lo = ue->u.msi.address_lo;
e->msi.address_hi = ue->u.msi.address_hi;
e->msi.data = ue->u.msi.data;
+
+ if (kvm_msi_route_invalid(kvm, e))
+ goto out;
break;
case KVM_IRQ_ROUTING_HV_SINT:
e->set = kvm_hv_set_sint;
@@ -388,21 +408,16 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu,
kvm->arch.nr_reserved_ioapic_pins);
for (i = 0; i < nr_ioapic_pins; ++i) {
hlist_for_each_entry(entry, &table->map[i], link) {
- u32 dest_id, dest_mode;
- bool level;
+ struct kvm_lapic_irq irq;
if (entry->type != KVM_IRQ_ROUTING_MSI)
continue;
- dest_id = (entry->msi.address_lo >> 12) & 0xff;
- dest_mode = (entry->msi.address_lo >> 2) & 0x1;
- level = entry->msi.data & MSI_DATA_TRIGGER_LEVEL;
- if (level && kvm_apic_match_dest(vcpu, NULL, 0,
- dest_id, dest_mode)) {
- u32 vector = entry->msi.data & 0xff;
-
- __set_bit(vector,
- ioapic_handled_vectors);
- }
+
+ kvm_set_msi_irq(vcpu->kvm, entry, &irq);
+
+ if (irq.level && kvm_apic_match_dest(vcpu, NULL, 0,
+ irq.dest_id, irq.dest_mode))
+ __set_bit(irq.vector, ioapic_handled_vectors);
}
}
srcu_read_unlock(&kvm->irq_srcu, idx);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 57549ed47ca5..730cf174090a 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -115,26 +115,43 @@ static inline int apic_enabled(struct kvm_lapic *apic)
(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
-/* The logical map is definitely wrong if we have multiple
- * modes at the same time. (Physical map is always right.)
- */
-static inline bool kvm_apic_logical_map_valid(struct kvm_apic_map *map)
-{
- return !(map->mode & (map->mode - 1));
+static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
+ u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
+ switch (map->mode) {
+ case KVM_APIC_MODE_X2APIC: {
+ u32 offset = (dest_id >> 16) * 16;
+ u32 max_apic_id = map->max_apic_id;
+
+ if (offset <= max_apic_id) {
+ u8 cluster_size = min(max_apic_id - offset + 1, 16U);
+
+ *cluster = &map->phys_map[offset];
+ *mask = dest_id & (0xffff >> (16 - cluster_size));
+ } else {
+ *mask = 0;
+ }
+
+ return true;
+ }
+ case KVM_APIC_MODE_XAPIC_FLAT:
+ *cluster = map->xapic_flat_map;
+ *mask = dest_id & 0xff;
+ return true;
+ case KVM_APIC_MODE_XAPIC_CLUSTER:
+ *cluster = map->xapic_cluster_map[dest_id >> 4];
+ *mask = dest_id & 0xf;
+ return true;
+ default:
+ /* Not optimized. */
+ return false;
+ }
}
-static inline void
-apic_logical_id(struct kvm_apic_map *map, u32 dest_id, u16 *cid, u16 *lid)
+static void kvm_apic_map_free(struct rcu_head *rcu)
{
- unsigned lid_bits;
+ struct kvm_apic_map *map = container_of(rcu, struct kvm_apic_map, rcu);
- BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_CLUSTER != 4);
- BUILD_BUG_ON(KVM_APIC_MODE_XAPIC_FLAT != 8);
- BUILD_BUG_ON(KVM_APIC_MODE_X2APIC != 16);
- lid_bits = map->mode;
-
- *cid = dest_id >> lid_bits;
- *lid = dest_id & ((1 << lid_bits) - 1);
+ kvfree(map);
}
static void recalculate_apic_map(struct kvm *kvm)
@@ -142,17 +159,26 @@ static void recalculate_apic_map(struct kvm *kvm)
struct kvm_apic_map *new, *old = NULL;
struct kvm_vcpu *vcpu;
int i;
-
- new = kzalloc(sizeof(struct kvm_apic_map), GFP_KERNEL);
+ u32 max_id = 255;
mutex_lock(&kvm->arch.apic_map_lock);
+ kvm_for_each_vcpu(i, vcpu, kvm)
+ if (kvm_apic_present(vcpu))
+ max_id = max(max_id, kvm_apic_id(vcpu->arch.apic));
+
+ new = kvm_kvzalloc(sizeof(struct kvm_apic_map) +
+ sizeof(struct kvm_lapic *) * ((u64)max_id + 1));
+
if (!new)
goto out;
+ new->max_apic_id = max_id;
+
kvm_for_each_vcpu(i, vcpu, kvm) {
struct kvm_lapic *apic = vcpu->arch.apic;
- u16 cid, lid;
+ struct kvm_lapic **cluster;
+ u16 mask;
u32 ldr, aid;
if (!kvm_apic_present(vcpu))
@@ -161,7 +187,7 @@ static void recalculate_apic_map(struct kvm *kvm)
aid = kvm_apic_id(apic);
ldr = kvm_lapic_get_reg(apic, APIC_LDR);
- if (aid < ARRAY_SIZE(new->phys_map))
+ if (aid <= new->max_apic_id)
new->phys_map[aid] = apic;
if (apic_x2apic_mode(apic)) {
@@ -174,13 +200,11 @@ static void recalculate_apic_map(struct kvm *kvm)
new->mode |= KVM_APIC_MODE_XAPIC_CLUSTER;
}
- if (!kvm_apic_logical_map_valid(new))
+ if (!kvm_apic_map_get_logical_dest(new, ldr, &cluster, &mask))
continue;
- apic_logical_id(new, ldr, &cid, &lid);
-
- if (lid && cid < ARRAY_SIZE(new->logical_map))
- new->logical_map[cid][ffs(lid) - 1] = apic;
+ if (mask)
+ cluster[ffs(mask) - 1] = apic;
}
out:
old = rcu_dereference_protected(kvm->arch.apic_map,
@@ -189,7 +213,7 @@ out:
mutex_unlock(&kvm->arch.apic_map_lock);
if (old)
- kfree_rcu(old, rcu);
+ call_rcu(&old->rcu, kvm_apic_map_free);
kvm_make_scan_ioapic_request(kvm);
}
@@ -210,7 +234,7 @@ static inline void apic_set_spiv(struct kvm_lapic *apic, u32 val)
}
}
-static inline void kvm_apic_set_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_xapic_id(struct kvm_lapic *apic, u8 id)
{
kvm_lapic_set_reg(apic, APIC_ID, id << 24);
recalculate_apic_map(apic->vcpu->kvm);
@@ -222,11 +246,11 @@ static inline void kvm_apic_set_ldr(struct kvm_lapic *apic, u32 id)
recalculate_apic_map(apic->vcpu->kvm);
}
-static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u8 id)
+static inline void kvm_apic_set_x2apic_id(struct kvm_lapic *apic, u32 id)
{
u32 ldr = ((id >> 4) << 16) | (1 << (id & 0xf));
- kvm_lapic_set_reg(apic, APIC_ID, id << 24);
+ kvm_lapic_set_reg(apic, APIC_ID, id);
kvm_lapic_set_reg(apic, APIC_LDR, ldr);
recalculate_apic_map(apic->vcpu->kvm);
}
@@ -599,17 +623,30 @@ static bool kvm_apic_match_logical_addr(struct kvm_lapic *apic, u32 mda)
}
}
-/* KVM APIC implementation has two quirks
- * - dest always begins at 0 while xAPIC MDA has offset 24,
- * - IOxAPIC messages have to be delivered (directly) to x2APIC.
+/* The KVM local APIC implementation has two quirks:
+ *
+ * - the xAPIC MDA stores the destination at bits 24-31, while this
+ * is not true of struct kvm_lapic_irq's dest_id field. This is
+ * just a quirk in the API and is not problematic.
+ *
+ * - in-kernel IOAPIC messages have to be delivered directly to
+ * x2APIC, because the kernel does not support interrupt remapping.
+ * In order to support broadcast without interrupt remapping, x2APIC
+ * rewrites the destination of non-IPI messages from APIC_BROADCAST
+ * to X2APIC_BROADCAST.
+ *
+ * The broadcast quirk can be disabled with KVM_CAP_X2APIC_API. This is
+ * important when userspace wants to use x2APIC-format MSIs, because
+ * APIC_BROADCAST (0xff) is a legal route for "cluster 0, CPUs 0-7".
*/
-static u32 kvm_apic_mda(unsigned int dest_id, struct kvm_lapic *source,
- struct kvm_lapic *target)
+static u32 kvm_apic_mda(struct kvm_vcpu *vcpu, unsigned int dest_id,
+ struct kvm_lapic *source, struct kvm_lapic *target)
{
bool ipi = source != NULL;
bool x2apic_mda = apic_x2apic_mode(ipi ? source : target);
- if (!ipi && dest_id == APIC_BROADCAST && x2apic_mda)
+ if (!vcpu->kvm->arch.x2apic_broadcast_quirk_disabled &&
+ !ipi && dest_id == APIC_BROADCAST && x2apic_mda)
return X2APIC_BROADCAST;
return x2apic_mda ? dest_id : SET_APIC_DEST_FIELD(dest_id);
@@ -619,7 +656,7 @@ bool kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
int short_hand, unsigned int dest, int dest_mode)
{
struct kvm_lapic *target = vcpu->arch.apic;
- u32 mda = kvm_apic_mda(dest, source, target);
+ u32 mda = kvm_apic_mda(vcpu, dest, source, target);
apic_debug("target %p, source %p, dest 0x%x, "
"dest_mode 0x%x, short_hand 0x%x\n",
@@ -671,102 +708,126 @@ static void kvm_apic_disabled_lapic_found(struct kvm *kvm)
}
}
-bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
- struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+static bool kvm_apic_is_broadcast_dest(struct kvm *kvm, struct kvm_lapic **src,
+ struct kvm_lapic_irq *irq, struct kvm_apic_map *map)
{
- struct kvm_apic_map *map;
- unsigned long bitmap = 1;
- struct kvm_lapic **dst;
- int i;
- bool ret, x2apic_ipi;
+ if (kvm->arch.x2apic_broadcast_quirk_disabled) {
+ if ((irq->dest_id == APIC_BROADCAST &&
+ map->mode != KVM_APIC_MODE_X2APIC))
+ return true;
+ if (irq->dest_id == X2APIC_BROADCAST)
+ return true;
+ } else {
+ bool x2apic_ipi = src && *src && apic_x2apic_mode(*src);
+ if (irq->dest_id == (x2apic_ipi ?
+ X2APIC_BROADCAST : APIC_BROADCAST))
+ return true;
+ }
- *r = -1;
+ return false;
+}
- if (irq->shorthand == APIC_DEST_SELF) {
- *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
- return true;
- }
+/* Return true if the interrupt can be handled by using *bitmap as index mask
+ * for valid destinations in *dst array.
+ * Return false if kvm_apic_map_get_dest_lapic did nothing useful.
+ * Note: we may have zero kvm_lapic destinations when we return true, which
+ * means that the interrupt should be dropped. In this case, *bitmap would be
+ * zero and *dst undefined.
+ */
+static inline bool kvm_apic_map_get_dest_lapic(struct kvm *kvm,
+ struct kvm_lapic **src, struct kvm_lapic_irq *irq,
+ struct kvm_apic_map *map, struct kvm_lapic ***dst,
+ unsigned long *bitmap)
+{
+ int i, lowest;
- if (irq->shorthand)
+ if (irq->shorthand == APIC_DEST_SELF && src) {
+ *dst = src;
+ *bitmap = 1;
+ return true;
+ } else if (irq->shorthand)
return false;
- x2apic_ipi = src && apic_x2apic_mode(src);
- if (irq->dest_id == (x2apic_ipi ? X2APIC_BROADCAST : APIC_BROADCAST))
+ if (!map || kvm_apic_is_broadcast_dest(kvm, src, irq, map))
return false;
- ret = true;
- rcu_read_lock();
- map = rcu_dereference(kvm->arch.apic_map);
-
- if (!map) {
- ret = false;
- goto out;
+ if (irq->dest_mode == APIC_DEST_PHYSICAL) {
+ if (irq->dest_id > map->max_apic_id) {
+ *bitmap = 0;
+ } else {
+ *dst = &map->phys_map[irq->dest_id];
+ *bitmap = 1;
+ }
+ return true;
}
- if (irq->dest_mode == APIC_DEST_PHYSICAL) {
- if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
- goto out;
+ *bitmap = 0;
+ if (!kvm_apic_map_get_logical_dest(map, irq->dest_id, dst,
+ (u16 *)bitmap))
+ return false;
- dst = &map->phys_map[irq->dest_id];
- } else {
- u16 cid;
+ if (!kvm_lowest_prio_delivery(irq))
+ return true;
- if (!kvm_apic_logical_map_valid(map)) {
- ret = false;
- goto out;
+ if (!kvm_vector_hashing_enabled()) {
+ lowest = -1;
+ for_each_set_bit(i, bitmap, 16) {
+ if (!(*dst)[i])
+ continue;
+ if (lowest < 0)
+ lowest = i;
+ else if (kvm_apic_compare_prio((*dst)[i]->vcpu,
+ (*dst)[lowest]->vcpu) < 0)
+ lowest = i;
}
+ } else {
+ if (!*bitmap)
+ return true;
- apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
+ lowest = kvm_vector_to_index(irq->vector, hweight16(*bitmap),
+ bitmap, 16);
- if (cid >= ARRAY_SIZE(map->logical_map))
- goto out;
+ if (!(*dst)[lowest]) {
+ kvm_apic_disabled_lapic_found(kvm);
+ *bitmap = 0;
+ return true;
+ }
+ }
- dst = map->logical_map[cid];
+ *bitmap = (lowest >= 0) ? 1 << lowest : 0;
- if (!kvm_lowest_prio_delivery(irq))
- goto set_irq;
+ return true;
+}
- if (!kvm_vector_hashing_enabled()) {
- int l = -1;
- for_each_set_bit(i, &bitmap, 16) {
- if (!dst[i])
- continue;
- if (l < 0)
- l = i;
- else if (kvm_apic_compare_prio(dst[i]->vcpu,
- dst[l]->vcpu) < 0)
- l = i;
- }
- bitmap = (l >= 0) ? 1 << l : 0;
- } else {
- int idx;
- unsigned int dest_vcpus;
+bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
+ struct kvm_lapic_irq *irq, int *r, struct dest_map *dest_map)
+{
+ struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
+ int i;
+ bool ret;
- dest_vcpus = hweight16(bitmap);
- if (dest_vcpus == 0)
- goto out;
+ *r = -1;
- idx = kvm_vector_to_index(irq->vector,
- dest_vcpus, &bitmap, 16);
+ if (irq->shorthand == APIC_DEST_SELF) {
+ *r = kvm_apic_set_irq(src->vcpu, irq, dest_map);
+ return true;
+ }
- if (!dst[idx]) {
- kvm_apic_disabled_lapic_found(kvm);
- goto out;
- }
+ rcu_read_lock();
+ map = rcu_dereference(kvm->arch.apic_map);
- bitmap = (idx >= 0) ? 1 << idx : 0;
+ ret = kvm_apic_map_get_dest_lapic(kvm, &src, irq, map, &dst, &bitmap);
+ if (ret)
+ for_each_set_bit(i, &bitmap, 16) {
+ if (!dst[i])
+ continue;
+ if (*r < 0)
+ *r = 0;
+ *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
}
- }
-set_irq:
- for_each_set_bit(i, &bitmap, 16) {
- if (!dst[i])
- continue;
- if (*r < 0)
- *r = 0;
- *r += kvm_apic_set_irq(dst[i]->vcpu, irq, dest_map);
- }
-out:
rcu_read_unlock();
return ret;
}
@@ -789,8 +850,9 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu)
{
struct kvm_apic_map *map;
+ unsigned long bitmap;
+ struct kvm_lapic **dst = NULL;
bool ret = false;
- struct kvm_lapic *dst = NULL;
if (irq->shorthand)
return false;
@@ -798,69 +860,16 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
rcu_read_lock();
map = rcu_dereference(kvm->arch.apic_map);
- if (!map)
- goto out;
-
- if (irq->dest_mode == APIC_DEST_PHYSICAL) {
- if (irq->dest_id == 0xFF)
- goto out;
-
- if (irq->dest_id >= ARRAY_SIZE(map->phys_map))
- goto out;
-
- dst = map->phys_map[irq->dest_id];
- if (dst && kvm_apic_present(dst->vcpu))
- *dest_vcpu = dst->vcpu;
- else
- goto out;
- } else {
- u16 cid;
- unsigned long bitmap = 1;
- int i, r = 0;
-
- if (!kvm_apic_logical_map_valid(map))
- goto out;
-
- apic_logical_id(map, irq->dest_id, &cid, (u16 *)&bitmap);
-
- if (cid >= ARRAY_SIZE(map->logical_map))
- goto out;
-
- if (kvm_vector_hashing_enabled() &&
- kvm_lowest_prio_delivery(irq)) {
- int idx;
- unsigned int dest_vcpus;
+ if (kvm_apic_map_get_dest_lapic(kvm, NULL, irq, map, &dst, &bitmap) &&
+ hweight16(bitmap) == 1) {
+ unsigned long i = find_first_bit(&bitmap, 16);
- dest_vcpus = hweight16(bitmap);
- if (dest_vcpus == 0)
- goto out;
-
- idx = kvm_vector_to_index(irq->vector, dest_vcpus,
- &bitmap, 16);
-
- dst = map->logical_map[cid][idx];
- if (!dst) {
- kvm_apic_disabled_lapic_found(kvm);
- goto out;
- }
-
- *dest_vcpu = dst->vcpu;
- } else {
- for_each_set_bit(i, &bitmap, 16) {
- dst = map->logical_map[cid][i];
- if (++r == 2)
- goto out;
- }
-
- if (dst && kvm_apic_present(dst->vcpu))
- *dest_vcpu = dst->vcpu;
- else
- goto out;
+ if (dst[i]) {
+ *dest_vcpu = dst[i]->vcpu;
+ ret = true;
}
}
- ret = true;
-out:
rcu_read_unlock();
return ret;
}
@@ -1127,12 +1136,6 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
return 0;
switch (offset) {
- case APIC_ID:
- if (apic_x2apic_mode(apic))
- val = kvm_apic_id(apic);
- else
- val = kvm_apic_id(apic) << 24;
- break;
case APIC_ARBPRI:
apic_debug("Access APIC ARBPRI register which is for P6\n");
break;
@@ -1314,6 +1317,108 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
nsec_to_cycles(vcpu, lapic_timer_advance_ns)));
}
+static void start_sw_tscdeadline(struct kvm_lapic *apic)
+{
+ u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
+ u64 ns = 0;
+ ktime_t expire;
+ struct kvm_vcpu *vcpu = apic->vcpu;
+ unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
+ unsigned long flags;
+ ktime_t now;
+
+ if (unlikely(!tscdeadline || !this_tsc_khz))
+ return;
+
+ local_irq_save(flags);
+
+ now = apic->lapic_timer.timer.base->get_time();
+ guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
+ if (likely(tscdeadline > guest_tsc)) {
+ ns = (tscdeadline - guest_tsc) * 1000000ULL;
+ do_div(ns, this_tsc_khz);
+ expire = ktime_add_ns(now, ns);
+ expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
+ hrtimer_start(&apic->lapic_timer.timer,
+ expire, HRTIMER_MODE_ABS_PINNED);
+ } else
+ apic_timer_expired(apic);
+
+ local_irq_restore(flags);
+}
+
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.apic->lapic_timer.hv_timer_in_use;
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use);
+
+static void cancel_hv_tscdeadline(struct kvm_lapic *apic)
+{
+ kvm_x86_ops->cancel_hv_timer(apic->vcpu);
+ apic->lapic_timer.hv_timer_in_use = false;
+}
+
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(!apic->lapic_timer.hv_timer_in_use);
+ WARN_ON(swait_active(&vcpu->wq));
+ cancel_hv_tscdeadline(apic);
+ apic_timer_expired(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer);
+
+static bool start_hv_tscdeadline(struct kvm_lapic *apic)
+{
+ u64 tscdeadline = apic->lapic_timer.tscdeadline;
+
+ if (atomic_read(&apic->lapic_timer.pending) ||
+ kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) {
+ if (apic->lapic_timer.hv_timer_in_use)
+ cancel_hv_tscdeadline(apic);
+ } else {
+ apic->lapic_timer.hv_timer_in_use = true;
+ hrtimer_cancel(&apic->lapic_timer.timer);
+
+ /* In case the sw timer triggered in the window */
+ if (atomic_read(&apic->lapic_timer.pending))
+ cancel_hv_tscdeadline(apic);
+ }
+ trace_kvm_hv_timer_state(apic->vcpu->vcpu_id,
+ apic->lapic_timer.hv_timer_in_use);
+ return apic->lapic_timer.hv_timer_in_use;
+}
+
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ WARN_ON(apic->lapic_timer.hv_timer_in_use);
+
+ if (apic_lvtt_tscdeadline(apic))
+ start_hv_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer);
+
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu)
+{
+ struct kvm_lapic *apic = vcpu->arch.apic;
+
+ /* Possibly the TSC deadline timer is not enabled yet */
+ if (!apic->lapic_timer.hv_timer_in_use)
+ return;
+
+ cancel_hv_tscdeadline(apic);
+
+ if (atomic_read(&apic->lapic_timer.pending))
+ return;
+
+ start_sw_tscdeadline(apic);
+}
+EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer);
+
static void start_apic_timer(struct kvm_lapic *apic)
{
ktime_t now;
@@ -1360,32 +1465,8 @@ static void start_apic_timer(struct kvm_lapic *apic)
ktime_to_ns(ktime_add_ns(now,
apic->lapic_timer.period)));
} else if (apic_lvtt_tscdeadline(apic)) {
- /* lapic timer in tsc deadline mode */
- u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline;
- u64 ns = 0;
- ktime_t expire;
- struct kvm_vcpu *vcpu = apic->vcpu;
- unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz;
- unsigned long flags;
-
- if (unlikely(!tscdeadline || !this_tsc_khz))
- return;
-
- local_irq_save(flags);
-
- now = apic->lapic_timer.timer.base->get_time();
- guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc());
- if (likely(tscdeadline > guest_tsc)) {
- ns = (tscdeadline - guest_tsc) * 1000000ULL;
- do_div(ns, this_tsc_khz);
- expire = ktime_add_ns(now, ns);
- expire = ktime_sub_ns(expire, lapic_timer_advance_ns);
- hrtimer_start(&apic->lapic_timer.timer,
- expire, HRTIMER_MODE_ABS_PINNED);
- } else
- apic_timer_expired(apic);
-
- local_irq_restore(flags);
+ if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic)))
+ start_sw_tscdeadline(apic);
}
}
@@ -1413,7 +1494,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
switch (reg) {
case APIC_ID: /* Local APIC ID */
if (!apic_x2apic_mode(apic))
- kvm_apic_set_id(apic, val >> 24);
+ kvm_apic_set_xapic_id(apic, val >> 24);
else
ret = 1;
break;
@@ -1674,9 +1755,10 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value)
/* update jump label if enable bit changes */
if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) {
- if (value & MSR_IA32_APICBASE_ENABLE)
+ if (value & MSR_IA32_APICBASE_ENABLE) {
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
static_key_slow_dec_deferred(&apic_hw_disabled);
- else
+ } else
static_key_slow_inc(&apic_hw_disabled.key);
recalculate_apic_map(vcpu->kvm);
}
@@ -1716,8 +1798,11 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
/* Stop the timer in case it's a reset to an active apic */
hrtimer_cancel(&apic->lapic_timer.timer);
- if (!init_event)
- kvm_apic_set_id(apic, vcpu->vcpu_id);
+ if (!init_event) {
+ kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE |
+ MSR_IA32_APICBASE_ENABLE);
+ kvm_apic_set_xapic_id(apic, vcpu->vcpu_id);
+ }
kvm_apic_set_version(apic->vcpu);
for (i = 0; i < KVM_APIC_LVT_NUM; i++)
@@ -1856,9 +1941,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
* thinking that APIC satet has changed.
*/
vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE;
- kvm_lapic_set_base(vcpu,
- APIC_DEFAULT_PHYS_BASE | MSR_IA32_APICBASE_ENABLE);
-
static_key_slow_inc(&apic_sw_disabled.key); /* sw disabled at reset */
kvm_lapic_reset(vcpu, false);
kvm_iodevice_init(&apic->dev, &apic_mmio_ops);
@@ -1938,17 +2020,48 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
return vector;
}
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s)
+static int kvm_apic_state_fixup(struct kvm_vcpu *vcpu,
+ struct kvm_lapic_state *s, bool set)
+{
+ if (apic_x2apic_mode(vcpu->arch.apic)) {
+ u32 *id = (u32 *)(s->regs + APIC_ID);
+
+ if (vcpu->kvm->arch.x2apic_format) {
+ if (*id != vcpu->vcpu_id)
+ return -EINVAL;
+ } else {
+ if (set)
+ *id >>= 24;
+ else
+ *id <<= 24;
+ }
+ }
+
+ return 0;
+}
+
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
+{
+ memcpy(s->regs, vcpu->arch.apic->regs, sizeof(*s));
+ return kvm_apic_state_fixup(vcpu, s, false);
+}
+
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
{
struct kvm_lapic *apic = vcpu->arch.apic;
+ int r;
+
kvm_lapic_set_base(vcpu, vcpu->arch.apic_base);
/* set SPIV separately to get count of SW disabled APICs right */
apic_set_spiv(apic, *((u32 *)(s->regs + APIC_SPIV)));
+
+ r = kvm_apic_state_fixup(vcpu, s, true);
+ if (r)
+ return r;
memcpy(vcpu->arch.apic->regs, s->regs, sizeof *s);
- /* call kvm_apic_set_id() to put apic into apic_map */
- kvm_apic_set_id(apic, kvm_apic_id(apic));
+
+ recalculate_apic_map(vcpu->kvm);
kvm_apic_set_version(vcpu);
apic_update_ppr(apic);
@@ -1974,6 +2087,8 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
kvm_rtc_eoi_tracking_restore_one(vcpu);
vcpu->arch.apic_arb_prio = 0;
+
+ return 0;
}
void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 891c6da7d4aa..f60d01c29d51 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -20,6 +20,7 @@ struct kvm_timer {
u64 tscdeadline;
u64 expired_tscdeadline;
atomic_t pending; /* accumulated triggered timers */
+ bool hv_timer_in_use;
};
struct kvm_lapic {
@@ -80,8 +81,8 @@ bool kvm_irq_delivery_to_apic_fast(struct kvm *kvm, struct kvm_lapic *src,
u64 kvm_get_apic_base(struct kvm_vcpu *vcpu);
int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info);
-void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu,
- struct kvm_lapic_state *s);
+int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
+int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s);
int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu);
u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu);
@@ -199,9 +200,15 @@ static inline int kvm_lapic_latched_init(struct kvm_vcpu *vcpu)
return lapic_in_kernel(vcpu) && test_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
}
-static inline int kvm_apic_id(struct kvm_lapic *apic)
+static inline u32 kvm_apic_id(struct kvm_lapic *apic)
{
- return (kvm_lapic_get_reg(apic, APIC_ID) >> 24) & 0xff;
+ /* To avoid a race between apic_base and following APIC_ID update when
+ * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
+ */
+ if (apic_x2apic_mode(apic))
+ return apic->vcpu->vcpu_id;
+
+ return kvm_lapic_get_reg(apic, APIC_ID) >> 24;
}
bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
@@ -212,4 +219,8 @@ bool kvm_intr_is_single_vcpu_fast(struct kvm *kvm, struct kvm_lapic_irq *irq,
struct kvm_vcpu **dest_vcpu);
int kvm_vector_to_index(u32 vector, u32 dest_vcpus,
const unsigned long *bitmap, u32 bitmap_size);
+void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu);
+void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu);
+bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu);
#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 745a5f445ae2..3d4cc8cc56a3 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -176,6 +176,7 @@ static u64 __read_mostly shadow_user_mask;
static u64 __read_mostly shadow_accessed_mask;
static u64 __read_mostly shadow_dirty_mask;
static u64 __read_mostly shadow_mmio_mask;
+static u64 __read_mostly shadow_present_mask;
static void mmu_spte_set(u64 *sptep, u64 spte);
static void mmu_free_roots(struct kvm_vcpu *vcpu);
@@ -283,13 +284,14 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
}
void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
- u64 dirty_mask, u64 nx_mask, u64 x_mask)
+ u64 dirty_mask, u64 nx_mask, u64 x_mask, u64 p_mask)
{
shadow_user_mask = user_mask;
shadow_accessed_mask = accessed_mask;
shadow_dirty_mask = dirty_mask;
shadow_nx_mask = nx_mask;
shadow_x_mask = x_mask;
+ shadow_present_mask = p_mask;
}
EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
@@ -305,7 +307,7 @@ static int is_nx(struct kvm_vcpu *vcpu)
static int is_shadow_present_pte(u64 pte)
{
- return pte & PT_PRESENT_MASK && !is_mmio_spte(pte);
+ return (pte & 0xFFFFFFFFull) && !is_mmio_spte(pte);
}
static int is_large_pte(u64 pte)
@@ -524,7 +526,7 @@ static void mmu_spte_set(u64 *sptep, u64 new_spte)
}
/* Rules for using mmu_spte_update:
- * Update the state bits, it means the mapped pfn is not changged.
+ * Update the state bits, it means the mapped pfn is not changed.
*
* Whenever we overwrite a writable spte with a read-only one we
* should flush remote TLBs. Otherwise rmap_write_protect
@@ -2246,10 +2248,9 @@ static void link_shadow_page(struct kvm_vcpu *vcpu, u64 *sptep,
{
u64 spte;
- BUILD_BUG_ON(VMX_EPT_READABLE_MASK != PT_PRESENT_MASK ||
- VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
+ BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
- spte = __pa(sp->spt) | PT_PRESENT_MASK | PT_WRITABLE_MASK |
+ spte = __pa(sp->spt) | shadow_present_mask | PT_WRITABLE_MASK |
shadow_user_mask | shadow_x_mask | shadow_accessed_mask;
mmu_spte_set(sptep, spte);
@@ -2516,13 +2517,19 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
gfn_t gfn, kvm_pfn_t pfn, bool speculative,
bool can_unsync, bool host_writable)
{
- u64 spte;
+ u64 spte = 0;
int ret = 0;
if (set_mmio_spte(vcpu, sptep, gfn, pfn, pte_access))
return 0;
- spte = PT_PRESENT_MASK;
+ /*
+ * For the EPT case, shadow_present_mask is 0 if hardware
+ * supports exec-only page table entries. In that case,
+ * ACC_USER_MASK and shadow_user_mask are used to represent
+ * read access. See FNAME(gpte_access) in paging_tmpl.h.
+ */
+ spte |= shadow_present_mask;
if (!speculative)
spte |= shadow_accessed_mask;
@@ -3190,7 +3197,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
MMU_WARN_ON(VALID_PAGE(root));
if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i);
- if (!is_present_gpte(pdptr)) {
+ if (!(pdptr & PT_PRESENT_MASK)) {
vcpu->arch.mmu.pae_root[i] = 0;
continue;
}
@@ -3915,9 +3922,7 @@ static void update_permission_bitmask(struct kvm_vcpu *vcpu,
* clearer.
*/
smap = cr4_smap && u && !uf && !ff;
- } else
- /* Not really needed: no U/S accesses on ept */
- u = 1;
+ }
fault = (ff && !x) || (uf && !u) || (wf && !w) ||
(smapf && smap);
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 66b33b96a31b..ddc56e91f2e4 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -93,11 +93,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
return kvm_mmu_load(vcpu);
}
-static inline int is_present_gpte(unsigned long pte)
-{
- return pte & PT_PRESENT_MASK;
-}
-
/*
* Currently, we have two sorts of write-protection, a) the first one
* write-protects guest page to sync the guest modification, b) another one is
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index bc019f70e0b6..a01105485315 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -131,7 +131,7 @@ static inline void FNAME(protect_clean_gpte)(unsigned *access, unsigned gpte)
static inline int FNAME(is_present_gpte)(unsigned long pte)
{
#if PTTYPE != PTTYPE_EPT
- return is_present_gpte(pte);
+ return pte & PT_PRESENT_MASK;
#else
return pte & 7;
#endif
@@ -181,13 +181,19 @@ no_present:
return true;
}
+/*
+ * For PTTYPE_EPT, a page table can be executable but not readable
+ * on supported processors. Therefore, set_spte does not automatically
+ * set bit 0 if execute only is supported. Here, we repurpose ACC_USER_MASK
+ * to signify readability since it isn't used in the EPT case
+ */
static inline unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, u64 gpte)
{
unsigned access;
#if PTTYPE == PTTYPE_EPT
access = ((gpte & VMX_EPT_WRITABLE_MASK) ? ACC_WRITE_MASK : 0) |
((gpte & VMX_EPT_EXECUTABLE_MASK) ? ACC_EXEC_MASK : 0) |
- ACC_USER_MASK;
+ ((gpte & VMX_EPT_READABLE_MASK) ? ACC_USER_MASK : 0);
#else
BUILD_BUG_ON(ACC_EXEC_MASK != PT_PRESENT_MASK);
BUILD_BUG_ON(ACC_EXEC_MASK != 1);
diff --git a/arch/x86/kvm/pmu_intel.c b/arch/x86/kvm/pmu_intel.c
index ab38af4f4947..9d4a8504a95a 100644
--- a/arch/x86/kvm/pmu_intel.c
+++ b/arch/x86/kvm/pmu_intel.c
@@ -93,7 +93,7 @@ static unsigned intel_find_fixed_event(int idx)
return intel_arch_events[fixed_pmc_events[idx]].event_type;
}
-/* check if a PMC is enabled by comparising it with globl_ctrl bits. */
+/* check if a PMC is enabled by comparing it with globl_ctrl bits. */
static bool intel_pmc_is_enabled(struct kvm_pmc *pmc)
{
struct kvm_pmu *pmu = pmc_to_pmu(pmc);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 16ef31b87452..af523d84d102 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1577,7 +1577,7 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
/*
- * Any change of EFLAGS.VM is accompained by a reload of SS
+ * Any change of EFLAGS.VM is accompanied by a reload of SS
* (caused by either a task switch or an inter-privilege IRET),
* so we do not need to update the CPL here.
*/
@@ -4940,6 +4940,12 @@ out:
static void svm_handle_external_intr(struct kvm_vcpu *vcpu)
{
local_irq_enable();
+ /*
+ * We must have an instruction with interrupts enabled, so
+ * the timer interrupt isn't delayed by the interrupt shadow.
+ */
+ asm("nop");
+ local_irq_disable();
}
static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 8de925031b5c..0a6cc6754ec5 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1348,6 +1348,21 @@ TRACE_EVENT(kvm_avic_unaccelerated_access,
__entry->vec)
);
+TRACE_EVENT(kvm_hv_timer_state,
+ TP_PROTO(unsigned int vcpu_id, unsigned int hv_timer_in_use),
+ TP_ARGS(vcpu_id, hv_timer_in_use),
+ TP_STRUCT__entry(
+ __field(unsigned int, vcpu_id)
+ __field(unsigned int, hv_timer_in_use)
+ ),
+ TP_fast_assign(
+ __entry->vcpu_id = vcpu_id;
+ __entry->hv_timer_in_use = hv_timer_in_use;
+ ),
+ TP_printk("vcpu_id %x hv_timer %x\n",
+ __entry->vcpu_id,
+ __entry->hv_timer_in_use)
+);
#endif /* _TRACE_KVM_H */
#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index df07a0a4611f..bc354f003ce1 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -110,6 +110,13 @@ module_param_named(pml, enable_pml, bool, S_IRUGO);
#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
+/* Guest_tsc -> host_tsc conversion requires 64-bit division. */
+static int __read_mostly cpu_preemption_timer_multi;
+static bool __read_mostly enable_preemption_timer = 1;
+#ifdef CONFIG_X86_64
+module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
+#endif
+
#define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE)
#define KVM_VM_CR0_ALWAYS_ON \
@@ -398,6 +405,12 @@ struct nested_vmx {
/* The host-usable pointer to the above */
struct page *current_vmcs12_page;
struct vmcs12 *current_vmcs12;
+ /*
+ * Cache of the guest's VMCS, existing outside of guest memory.
+ * Loaded from guest memory during VMPTRLD. Flushed to guest
+ * memory during VMXOFF, VMCLEAR, VMPTRLD.
+ */
+ struct vmcs12 *cached_vmcs12;
struct vmcs *current_shadow_vmcs;
/*
* Indicates if the shadow vmcs must be updated with the
@@ -421,7 +434,6 @@ struct nested_vmx {
struct pi_desc *pi_desc;
bool pi_pending;
u16 posted_intr_nv;
- u64 msr_ia32_feature_control;
struct hrtimer preemption_timer;
bool preemption_timer_expired;
@@ -597,11 +609,22 @@ struct vcpu_vmx {
#define PML_ENTITY_NUM 512
struct page *pml_pg;
+ /* apic deadline value in host tsc */
+ u64 hv_deadline_tsc;
+
u64 current_tsc_ratio;
bool guest_pkru_valid;
u32 guest_pkru;
u32 host_pkru;
+
+ /*
+ * Only bits masked by msr_ia32_feature_control_valid_bits can be set in
+ * msr_ia32_feature_control. FEATURE_CONTROL_LOCKED is always included
+ * in msr_ia32_feature_control_valid_bits.
+ */
+ u64 msr_ia32_feature_control;
+ u64 msr_ia32_feature_control_valid_bits;
};
enum segment_cache_field {
@@ -841,7 +864,7 @@ static inline short vmcs_field_to_offset(unsigned long field)
static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu)
{
- return to_vmx(vcpu)->nested.current_vmcs12;
+ return to_vmx(vcpu)->nested.cached_vmcs12;
}
static struct page *nested_get_page(struct kvm_vcpu *vcpu, gpa_t addr)
@@ -1056,6 +1079,58 @@ static inline bool cpu_has_vmx_virtual_intr_delivery(void)
SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY;
}
+/*
+ * Comment's format: document - errata name - stepping - processor name.
+ * Refer from
+ * https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
+ */
+static u32 vmx_preemption_cpu_tfms[] = {
+/* 323344.pdf - BA86 - D0 - Xeon 7500 Series */
+0x000206E6,
+/* 323056.pdf - AAX65 - C2 - Xeon L3406 */
+/* 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile */
+/* 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020652,
+/* 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 */
+0x00020655,
+/* 322373.pdf - AAO95 - B1 - Xeon 3400 Series */
+/* 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop */
+/*
+ * 320767.pdf - AAP86 - B1 -
+ * i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
+ */
+0x000106E5,
+/* 321333.pdf - AAM126 - C0 - Xeon 3500 */
+0x000106A0,
+/* 321333.pdf - AAM126 - C1 - Xeon 3500 */
+0x000106A1,
+/* 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop */
+0x000106A4,
+ /* 321333.pdf - AAM126 - D0 - Xeon 3500 */
+ /* 321324.pdf - AAK139 - D0 - Xeon 5500 */
+ /* 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop */
+0x000106A5,
+};
+
+static inline bool cpu_has_broken_vmx_preemption_timer(void)
+{
+ u32 eax = cpuid_eax(0x00000001), i;
+
+ /* Clear the reserved bits */
+ eax &= ~(0x3U << 14 | 0xfU << 28);
+ for (i = 0; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
+ if (eax == vmx_preemption_cpu_tfms[i])
+ return true;
+
+ return false;
+}
+
+static inline bool cpu_has_vmx_preemption_timer(void)
+{
+ return vmcs_config.pin_based_exec_ctrl &
+ PIN_BASED_VMX_PREEMPTION_TIMER;
+}
+
static inline bool cpu_has_vmx_posted_intr(void)
{
return IS_ENABLED(CONFIG_X86_LOCAL_APIC) &&
@@ -1603,6 +1678,11 @@ static __always_inline void vmcs_set_bits(unsigned long field, u32 mask)
__vmcs_writel(field, __vmcs_readl(field) | mask);
}
+static inline void vm_entry_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_entry_controls_shadow = vmcs_read32(VM_ENTRY_CONTROLS);
+}
+
static inline void vm_entry_controls_init(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VM_ENTRY_CONTROLS, val);
@@ -1631,6 +1711,11 @@ static inline void vm_entry_controls_clearbit(struct vcpu_vmx *vmx, u32 val)
vm_entry_controls_set(vmx, vm_entry_controls_get(vmx) & ~val);
}
+static inline void vm_exit_controls_reset_shadow(struct vcpu_vmx *vmx)
+{
+ vmx->vm_exit_controls_shadow = vmcs_read32(VM_EXIT_CONTROLS);
+}
+
static inline void vm_exit_controls_init(struct vcpu_vmx *vmx, u32 val)
{
vmcs_write32(VM_EXIT_CONTROLS, val);
@@ -2121,22 +2206,14 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
+ bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
if (!vmm_exclusive)
kvm_cpu_vmxon(phys_addr);
- else if (vmx->loaded_vmcs->cpu != cpu)
+ else if (!already_loaded)
loaded_vmcs_clear(vmx->loaded_vmcs);
- if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
- per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
- vmcs_load(vmx->loaded_vmcs->vmcs);
- }
-
- if (vmx->loaded_vmcs->cpu != cpu) {
- struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
- unsigned long sysenter_esp;
-
- kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
+ if (!already_loaded) {
local_irq_disable();
crash_disable_local_vmclear(cpu);
@@ -2151,6 +2228,18 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
&per_cpu(loaded_vmcss_on_cpu, cpu));
crash_enable_local_vmclear(cpu);
local_irq_enable();
+ }
+
+ if (per_cpu(current_vmcs, cpu) != vmx->loaded_vmcs->vmcs) {
+ per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
+ vmcs_load(vmx->loaded_vmcs->vmcs);
+ }
+
+ if (!already_loaded) {
+ struct desc_ptr *gdt = this_cpu_ptr(&host_gdt);
+ unsigned long sysenter_esp;
+
+ kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
/*
* Linux uses per-cpu TSS and GDT, so set these when switching
@@ -2716,6 +2805,9 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
vmx->nested.nested_vmx_ept_caps = VMX_EPT_PAGE_WALK_4_BIT |
VMX_EPTP_WB_BIT | VMX_EPT_2MB_PAGE_BIT |
VMX_EPT_INVEPT_BIT;
+ if (cpu_has_vmx_ept_execute_only())
+ vmx->nested.nested_vmx_ept_caps |=
+ VMX_EPT_EXECUTE_ONLY_BIT;
vmx->nested.nested_vmx_ept_caps &= vmx_capability.ept;
/*
* For nested guests, we don't do anything specific
@@ -2864,6 +2956,14 @@ static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
return 0;
}
+static inline bool vmx_feature_control_msr_valid(struct kvm_vcpu *vcpu,
+ uint64_t val)
+{
+ uint64_t valid_bits = to_vmx(vcpu)->msr_ia32_feature_control_valid_bits;
+
+ return !(val & ~valid_bits);
+}
+
/*
* Reads an msr value (of 'msr_index') into 'pdata'.
* Returns 0 on success, non-0 otherwise.
@@ -2905,10 +3005,15 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
return 1;
msr_info->data = vmcs_read64(GUEST_BNDCFGS);
break;
- case MSR_IA32_FEATURE_CONTROL:
- if (!nested_vmx_allowed(vcpu))
+ case MSR_IA32_MCG_EXT_CTL:
+ if (!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE))
return 1;
- msr_info->data = to_vmx(vcpu)->nested.msr_ia32_feature_control;
+ msr_info->data = vcpu->arch.mcg_ext_ctl;
+ break;
+ case MSR_IA32_FEATURE_CONTROL:
+ msr_info->data = to_vmx(vcpu)->msr_ia32_feature_control;
break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
@@ -2998,12 +3103,20 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
case MSR_IA32_TSC_ADJUST:
ret = kvm_set_msr_common(vcpu, msr_info);
break;
+ case MSR_IA32_MCG_EXT_CTL:
+ if ((!msr_info->host_initiated &&
+ !(to_vmx(vcpu)->msr_ia32_feature_control &
+ FEATURE_CONTROL_LMCE)) ||
+ (data & ~MCG_EXT_CTL_LMCE_EN))
+ return 1;
+ vcpu->arch.mcg_ext_ctl = data;
+ break;
case MSR_IA32_FEATURE_CONTROL:
- if (!nested_vmx_allowed(vcpu) ||
- (to_vmx(vcpu)->nested.msr_ia32_feature_control &
+ if (!vmx_feature_control_msr_valid(vcpu, data) ||
+ (to_vmx(vcpu)->msr_ia32_feature_control &
FEATURE_CONTROL_LOCKED && !msr_info->host_initiated))
return 1;
- vmx->nested.msr_ia32_feature_control = data;
+ vmx->msr_ia32_feature_control = data;
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
break;
@@ -3297,25 +3410,27 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
vmx_capability.ept, vmx_capability.vpid);
}
- min = VM_EXIT_SAVE_DEBUG_CONTROLS;
+ min = VM_EXIT_SAVE_DEBUG_CONTROLS | VM_EXIT_ACK_INTR_ON_EXIT;
#ifdef CONFIG_X86_64
min |= VM_EXIT_HOST_ADDR_SPACE_SIZE;
#endif
opt = VM_EXIT_SAVE_IA32_PAT | VM_EXIT_LOAD_IA32_PAT |
- VM_EXIT_ACK_INTR_ON_EXIT | VM_EXIT_CLEAR_BNDCFGS;
+ VM_EXIT_CLEAR_BNDCFGS;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_EXIT_CTLS,
&_vmexit_control) < 0)
return -EIO;
min = PIN_BASED_EXT_INTR_MASK | PIN_BASED_NMI_EXITING;
- opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR;
+ opt = PIN_BASED_VIRTUAL_NMIS | PIN_BASED_POSTED_INTR |
+ PIN_BASED_VMX_PREEMPTION_TIMER;
if (adjust_vmx_controls(min, opt, MSR_IA32_VMX_PINBASED_CTLS,
&_pin_based_exec_control) < 0)
return -EIO;
+ if (cpu_has_broken_vmx_preemption_timer())
+ _pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
if (!(_cpu_based_2nd_exec_control &
- SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) ||
- !(_vmexit_control & VM_EXIT_ACK_INTR_ON_EXIT))
+ SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
min = VM_ENTRY_LOAD_DEBUG_CONTROLS;
@@ -3364,7 +3479,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
/*
* Some cpus support VM_ENTRY_(LOAD|SAVE)_IA32_PERF_GLOBAL_CTRL
- * but due to arrata below it can't be used. Workaround is to use
+ * but due to errata below it can't be used. Workaround is to use
* msr load mechanism to switch IA32_PERF_GLOBAL_CTRL.
*
* VM Exit May Incorrectly Clear IA32_PERF_GLOBAL_CTRL [34:32]
@@ -4781,6 +4896,8 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
if (!kvm_vcpu_apicv_active(&vmx->vcpu))
pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
+ /* Enable the preemption timer dynamically */
+ pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
return pin_based_exec_ctrl;
}
@@ -4896,6 +5013,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
/* Control */
vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+ vmx->hv_deadline_tsc = -1;
vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, vmx_exec_control(vmx));
@@ -6016,12 +6134,14 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
trace_kvm_page_fault(gpa, exit_qualification);
- /* It is a write fault? */
- error_code = exit_qualification & PFERR_WRITE_MASK;
+ /* it is a read fault? */
+ error_code = (exit_qualification << 2) & PFERR_USER_MASK;
+ /* it is a write fault? */
+ error_code |= exit_qualification & PFERR_WRITE_MASK;
/* It is a fetch fault? */
error_code |= (exit_qualification << 2) & PFERR_FETCH_MASK;
/* ept page table is present? */
- error_code |= (exit_qualification >> 3) & PFERR_PRESENT_MASK;
+ error_code |= (exit_qualification & 0x38) != 0;
vcpu->arch.exit_qualification = exit_qualification;
@@ -6355,9 +6475,6 @@ static __init int hardware_setup(void)
for (msr = 0x800; msr <= 0x8ff; msr++)
vmx_disable_intercept_msr_read_x2apic(msr);
- /* According SDM, in x2apic mode, the whole id reg is used. But in
- * KVM, it only use the highest eight bits. Need to intercept it */
- vmx_enable_intercept_msr_read_x2apic(0x802);
/* TMCCT */
vmx_enable_intercept_msr_read_x2apic(0x839);
/* TPR */
@@ -6368,10 +6485,12 @@ static __init int hardware_setup(void)
vmx_disable_intercept_msr_write_x2apic(0x83f);
if (enable_ept) {
- kvm_mmu_set_mask_ptes(0ull,
+ kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK,
(enable_ept_ad_bits) ? VMX_EPT_ACCESS_BIT : 0ull,
(enable_ept_ad_bits) ? VMX_EPT_DIRTY_BIT : 0ull,
- 0ull, VMX_EPT_EXECUTABLE_MASK);
+ 0ull, VMX_EPT_EXECUTABLE_MASK,
+ cpu_has_vmx_ept_execute_only() ?
+ 0ull : VMX_EPT_READABLE_MASK);
ept_set_mmio_spte_mask();
kvm_enable_tdp();
} else
@@ -6393,8 +6512,21 @@ static __init int hardware_setup(void)
kvm_x86_ops->enable_log_dirty_pt_masked = NULL;
}
+ if (cpu_has_vmx_preemption_timer() && enable_preemption_timer) {
+ u64 vmx_msr;
+
+ rdmsrl(MSR_IA32_VMX_MISC, vmx_msr);
+ cpu_preemption_timer_multi =
+ vmx_msr & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
+ } else {
+ kvm_x86_ops->set_hv_timer = NULL;
+ kvm_x86_ops->cancel_hv_timer = NULL;
+ }
+
kvm_set_posted_intr_wakeup_handler(wakeup_handler);
+ kvm_mce_cap_supported |= MCG_LMCE_P;
+
return alloc_kvm_area();
out8:
@@ -6862,16 +6994,22 @@ static int handle_vmon(struct kvm_vcpu *vcpu)
return 1;
}
- if ((vmx->nested.msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
+ if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES)
!= VMXON_NEEDED_FEATURES) {
kvm_inject_gp(vcpu, 0);
return 1;
}
+ vmx->nested.cached_vmcs12 = kmalloc(VMCS12_SIZE, GFP_KERNEL);
+ if (!vmx->nested.cached_vmcs12)
+ return -ENOMEM;
+
if (enable_shadow_vmcs) {
shadow_vmcs = alloc_vmcs();
- if (!shadow_vmcs)
+ if (!shadow_vmcs) {
+ kfree(vmx->nested.cached_vmcs12);
return -ENOMEM;
+ }
/* mark vmcs as shadow */
shadow_vmcs->revision_id |= (1u << 31);
/* init shadow vmcs */
@@ -6942,6 +7080,11 @@ static inline void nested_release_vmcs12(struct vcpu_vmx *vmx)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
}
vmx->nested.posted_intr_nv = -1;
+
+ /* Flush VMCS12 to guest memory */
+ memcpy(vmx->nested.current_vmcs12, vmx->nested.cached_vmcs12,
+ VMCS12_SIZE);
+
kunmap(vmx->nested.current_vmcs12_page);
nested_release_page(vmx->nested.current_vmcs12_page);
vmx->nested.current_vmptr = -1ull;
@@ -6962,6 +7105,7 @@ static void free_nested(struct vcpu_vmx *vmx)
nested_release_vmcs12(vmx);
if (enable_shadow_vmcs)
free_vmcs(vmx->nested.current_shadow_vmcs);
+ kfree(vmx->nested.cached_vmcs12);
/* Unpin physical memory we referred to in current vmcs02 */
if (vmx->nested.apic_access_page) {
nested_release_page(vmx->nested.apic_access_page);
@@ -7365,6 +7509,13 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
vmx->nested.current_vmptr = vmptr;
vmx->nested.current_vmcs12 = new_vmcs12;
vmx->nested.current_vmcs12_page = page;
+ /*
+ * Load VMCS12 from guest memory since it is not already
+ * cached.
+ */
+ memcpy(vmx->nested.cached_vmcs12,
+ vmx->nested.current_vmcs12, VMCS12_SIZE);
+
if (enable_shadow_vmcs) {
vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
SECONDARY_EXEC_SHADOW_VMCS);
@@ -7560,6 +7711,12 @@ static int handle_pml_full(struct kvm_vcpu *vcpu)
return 1;
}
+static int handle_preemption_timer(struct kvm_vcpu *vcpu)
+{
+ kvm_lapic_expired_hv_timer(vcpu);
+ return 1;
+}
+
/*
* The exit handlers return 1 if the exit was handled fully and guest execution
* may resume. Otherwise they set the kvm_run parameter to indicate what needs
@@ -7610,6 +7767,7 @@ static int (*const kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
[EXIT_REASON_XSAVES] = handle_xsaves,
[EXIT_REASON_XRSTORS] = handle_xrstors,
[EXIT_REASON_PML_FULL] = handle_pml_full,
+ [EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
};
static const int kvm_vmx_max_exit_handlers =
@@ -7918,6 +8076,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
* the XSS exit bitmap in vmcs12.
*/
return nested_cpu_has2(vmcs12, SECONDARY_EXEC_XSAVES);
+ case EXIT_REASON_PREEMPTION_TIMER:
+ return false;
default:
return true;
}
@@ -8303,7 +8463,7 @@ static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu, hpa_t hpa)
* the next L2->L1 exit.
*/
if (!is_guest_mode(vcpu) ||
- !nested_cpu_has2(vmx->nested.current_vmcs12,
+ !nested_cpu_has2(get_vmcs12(&vmx->vcpu),
SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
vmcs_write64(APIC_ACCESS_ADDR, hpa);
}
@@ -8436,7 +8596,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
"push %[sp]\n\t"
#endif
"pushf\n\t"
- "orl $0x200, (%%" _ASM_SP ")\n\t"
__ASM_SIZE(push) " $%c[cs]\n\t"
"call *%[entry]\n\t"
:
@@ -8449,8 +8608,7 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu)
[ss]"i"(__KERNEL_DS),
[cs]"i"(__KERNEL_CS)
);
- } else
- local_irq_enable();
+ }
}
static bool vmx_has_high_real_mode_segbase(void)
@@ -8601,6 +8759,26 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
msrs[i].host);
}
+void vmx_arm_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl;
+ u32 delta_tsc;
+
+ if (vmx->hv_deadline_tsc == -1)
+ return;
+
+ tscl = rdtsc();
+ if (vmx->hv_deadline_tsc > tscl)
+ /* sure to be 32 bit only because checked on set_hv_timer */
+ delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
+ cpu_preemption_timer_multi);
+ else
+ delta_tsc = 0;
+
+ vmcs_write32(VMX_PREEMPTION_TIMER_VALUE, delta_tsc);
+}
+
static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -8650,6 +8828,8 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
atomic_switch_perf_msrs(vmx);
debugctlmsr = get_debugctlmsr();
+ vmx_arm_hv_timer(vcpu);
+
vmx->__launched = vmx->loaded_vmcs->launched;
asm(
/* Store host registers */
@@ -8940,6 +9120,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
vmx->nested.current_vmptr = -1ull;
vmx->nested.current_vmcs12 = NULL;
+ vmx->msr_ia32_feature_control_valid_bits = FEATURE_CONTROL_LOCKED;
+
return &vmx->vcpu;
free_vmcs:
@@ -9080,6 +9262,13 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
if (cpu_has_secondary_exec_ctrls())
vmcs_set_secondary_exec_control(secondary_exec_ctl);
+
+ if (nested_vmx_allowed(vcpu))
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;
}
static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
@@ -9636,9 +9825,14 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
vmcs_write64(VMCS_LINK_POINTER, -1ull);
exec_control = vmcs12->pin_based_vm_exec_control;
- exec_control |= vmcs_config.pin_based_exec_ctrl;
+
+ /* Preemption timer setting is only taken from vmcs01. */
exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ exec_control |= vmcs_config.pin_based_exec_ctrl;
+ if (vmx->hv_deadline_tsc == -1)
+ exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
+ /* Posted interrupts setting is only taken from vmcs12. */
if (nested_cpu_has_posted_intr(vmcs12)) {
/*
* Note that we use L0's vector here and in
@@ -10556,8 +10750,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
vmcs12->vm_exit_intr_error_code,
KVM_ISA_VMX);
- vm_entry_controls_init(vmx, vmcs_read32(VM_ENTRY_CONTROLS));
- vm_exit_controls_init(vmx, vmcs_read32(VM_EXIT_CONTROLS));
+ vm_entry_controls_reset_shadow(vmx);
+ vm_exit_controls_reset_shadow(vmx);
vmx_segment_cache_clear(vmx);
/* if no vmcs02 cache requested, remove the one we used */
@@ -10566,8 +10760,14 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
load_vmcs12_host_state(vcpu, vmcs12);
- /* Update TSC_OFFSET if TSC was changed while L2 ran */
+ /* Update any VMCS fields that might have changed while L2 ran */
vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+ if (vmx->hv_deadline_tsc == -1)
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ else
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
/* This is needed for same reason as it was needed in prepare_vmcs02 */
vmx->host_rsp = 0;
@@ -10647,6 +10847,64 @@ static int vmx_check_intercept(struct kvm_vcpu *vcpu,
return X86EMUL_CONTINUE;
}
+#ifdef CONFIG_X86_64
+/* (a << shift) / divisor, return 1 if overflow otherwise 0 */
+static inline int u64_shl_div_u64(u64 a, unsigned int shift,
+ u64 divisor, u64 *result)
+{
+ u64 low = a << shift, high = a >> (64 - shift);
+
+ /* To avoid the overflow on divq */
+ if (high >= divisor)
+ return 1;
+
+ /* Low hold the result, high hold rem which is discarded */
+ asm("divq %2\n\t" : "=a" (low), "=d" (high) :
+ "rm" (divisor), "0" (low), "1" (high));
+ *result = low;
+
+ return 0;
+}
+
+static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ u64 tscl = rdtsc();
+ u64 guest_tscl = kvm_read_l1_tsc(vcpu, tscl);
+ u64 delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
+
+ /* Convert to host delta tsc if tsc scaling is enabled */
+ if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio &&
+ u64_shl_div_u64(delta_tsc,
+ kvm_tsc_scaling_ratio_frac_bits,
+ vcpu->arch.tsc_scaling_ratio,
+ &delta_tsc))
+ return -ERANGE;
+
+ /*
+ * If the delta tsc can't fit in the 32 bit after the multi shift,
+ * we can't use the preemption timer.
+ * It's possible that it fits on later vmentries, but checking
+ * on every vmentry is costly so we just use an hrtimer.
+ */
+ if (delta_tsc >> (cpu_preemption_timer_multi + 32))
+ return -ERANGE;
+
+ vmx->hv_deadline_tsc = tscl + delta_tsc;
+ vmcs_set_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+ return 0;
+}
+
+static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
+{
+ struct vcpu_vmx *vmx = to_vmx(vcpu);
+ vmx->hv_deadline_tsc = -1;
+ vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL,
+ PIN_BASED_VMX_PREEMPTION_TIMER);
+}
+#endif
+
static void vmx_sched_in(struct kvm_vcpu *vcpu, int cpu)
{
if (ple_gap)
@@ -10691,7 +10949,7 @@ static void vmx_enable_log_dirty_pt_masked(struct kvm *kvm,
* this case, return 1, otherwise, return 0.
*
*/
-static int vmx_pre_block(struct kvm_vcpu *vcpu)
+static int pi_pre_block(struct kvm_vcpu *vcpu)
{
unsigned long flags;
unsigned int dest;
@@ -10758,7 +11016,18 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu)
return 0;
}
-static void vmx_post_block(struct kvm_vcpu *vcpu)
+static int vmx_pre_block(struct kvm_vcpu *vcpu)
+{
+ if (pi_pre_block(vcpu))
+ return 1;
+
+ if (kvm_lapic_hv_timer_in_use(vcpu))
+ kvm_lapic_switch_to_sw_timer(vcpu);
+
+ return 0;
+}
+
+static void pi_post_block(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
struct pi_desc old, new;
@@ -10800,6 +11069,14 @@ static void vmx_post_block(struct kvm_vcpu *vcpu)
}
}
+static void vmx_post_block(struct kvm_vcpu *vcpu)
+{
+ if (kvm_x86_ops->set_hv_timer)
+ kvm_lapic_switch_to_hv_timer(vcpu);
+
+ pi_post_block(vcpu);
+}
+
/*
* vmx_update_pi_irte - set IRTE for Posted-Interrupts
*
@@ -10844,7 +11121,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq,
* We will support full lowest-priority interrupt later.
*/
- kvm_set_msi_irq(e, &irq);
+ kvm_set_msi_irq(kvm, e, &irq);
if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu)) {
/*
* Make sure the IRTE is in remapped mode if
@@ -10889,6 +11166,16 @@ out:
return ret;
}
+static void vmx_setup_mce(struct kvm_vcpu *vcpu)
+{
+ if (vcpu->arch.mcg_cap & MCG_LMCE_P)
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |=
+ FEATURE_CONTROL_LMCE;
+ else
+ to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
+ ~FEATURE_CONTROL_LMCE;
+}
+
static struct kvm_x86_ops vmx_x86_ops = {
.cpu_has_kvm_support = cpu_has_kvm_support,
.disabled_by_bios = vmx_disabled_by_bios,
@@ -11013,6 +11300,13 @@ static struct kvm_x86_ops vmx_x86_ops = {
.pmu_ops = &intel_pmu_ops,
.update_pi_irte = vmx_update_pi_irte,
+
+#ifdef CONFIG_X86_64
+ .set_hv_timer = vmx_set_hv_timer,
+ .cancel_hv_timer = vmx_cancel_hv_timer,
+#endif
+
+ .setup_mce = vmx_setup_mce,
};
static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 9c496c7e8c00..19f9f9e05c2a 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -71,7 +71,8 @@
#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
-#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
+u64 __read_mostly kvm_mce_cap_supported = MCG_CTL_P | MCG_SER_P;
+EXPORT_SYMBOL_GPL(kvm_mce_cap_supported);
#define emul_to_vcpu(ctxt) \
container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
@@ -90,8 +91,12 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU
+#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
+ KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+
static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
+static void enter_smm(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
struct kvm_x86_ops *kvm_x86_ops __read_mostly;
@@ -114,7 +119,8 @@ u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits;
EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits);
u64 __read_mostly kvm_max_tsc_scaling_ratio;
EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio);
-static u64 __read_mostly kvm_default_tsc_scaling_ratio;
+u64 __read_mostly kvm_default_tsc_scaling_ratio;
+EXPORT_SYMBOL_GPL(kvm_default_tsc_scaling_ratio);
/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
@@ -538,7 +544,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
goto out;
}
for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
- if (is_present_gpte(pdpte[i]) &&
+ if ((pdpte[i] & PT_PRESENT_MASK) &&
(pdpte[i] &
vcpu->arch.mmu.guest_rsvd_check.rsvd_bits_mask[0][2])) {
ret = 0;
@@ -983,6 +989,7 @@ static u32 emulated_msrs[] = {
MSR_IA32_MISC_ENABLE,
MSR_IA32_MCG_STATUS,
MSR_IA32_MCG_CTL,
+ MSR_IA32_MCG_EXT_CTL,
MSR_IA32_SMBASE,
};
@@ -1162,7 +1169,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
int version;
int r;
struct pvclock_wall_clock wc;
- struct timespec boot;
+ struct timespec64 boot;
if (!wall_clock)
return;
@@ -1185,13 +1192,13 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
* wall clock specified here. guest system time equals host
* system time for us, thus we must fill in host boot time here.
*/
- getboottime(&boot);
+ getboottime64(&boot);
if (kvm->arch.kvmclock_offset) {
- struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
- boot = timespec_sub(boot, ts);
+ struct timespec64 ts = ns_to_timespec64(kvm->arch.kvmclock_offset);
+ boot = timespec64_sub(boot, ts);
}
- wc.sec = boot.tv_sec;
+ wc.sec = (u32)boot.tv_sec; /* overflow in 2106 guest time */
wc.nsec = boot.tv_nsec;
wc.version = version;
@@ -2616,6 +2623,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_TSC_CONTROL:
r = kvm_has_tsc_control;
break;
+ case KVM_CAP_X2APIC_API:
+ r = KVM_X2APIC_API_VALID_FLAGS;
+ break;
default:
r = 0;
break;
@@ -2678,11 +2688,9 @@ long kvm_arch_dev_ioctl(struct file *filp,
break;
}
case KVM_X86_GET_MCE_CAP_SUPPORTED: {
- u64 mce_cap;
-
- mce_cap = KVM_MCE_CAP_SUPPORTED;
r = -EFAULT;
- if (copy_to_user(argp, &mce_cap, sizeof mce_cap))
+ if (copy_to_user(argp, &kvm_mce_cap_supported,
+ sizeof(kvm_mce_cap_supported)))
goto out;
r = 0;
break;
@@ -2734,6 +2742,11 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
rdtsc() - vcpu->arch.last_host_tsc;
if (tsc_delta < 0)
mark_tsc_unstable("KVM discovered backwards TSC");
+
+ if (kvm_lapic_hv_timer_in_use(vcpu) &&
+ kvm_x86_ops->set_hv_timer(vcpu,
+ kvm_get_lapic_tscdeadline_msr(vcpu)))
+ kvm_lapic_switch_to_sw_timer(vcpu);
if (check_tsc_unstable()) {
u64 offset = kvm_compute_tsc_offset(vcpu,
vcpu->arch.last_guest_tsc);
@@ -2767,15 +2780,17 @@ static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
if (vcpu->arch.apicv_active)
kvm_x86_ops->sync_pir_to_irr(vcpu);
- memcpy(s->regs, vcpu->arch.apic->regs, sizeof *s);
-
- return 0;
+ return kvm_apic_get_state(vcpu, s);
}
static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
- kvm_apic_post_state_restore(vcpu, s);
+ int r;
+
+ r = kvm_apic_set_state(vcpu, s);
+ if (r)
+ return r;
update_cr8_intercept(vcpu);
return 0;
@@ -2860,7 +2875,7 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
r = -EINVAL;
if (!bank_num || bank_num >= KVM_MAX_MCE_BANKS)
goto out;
- if (mcg_cap & ~(KVM_MCE_CAP_SUPPORTED | 0xff | 0xff0000))
+ if (mcg_cap & ~(kvm_mce_cap_supported | 0xff | 0xff0000))
goto out;
r = 0;
vcpu->arch.mcg_cap = mcg_cap;
@@ -2870,6 +2885,9 @@ static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu,
/* Init IA32_MCi_CTL to all 1s */
for (bank = 0; bank < bank_num; bank++)
vcpu->arch.mce_banks[bank*4] = ~(u64)0;
+
+ if (kvm_x86_ops->setup_mce)
+ kvm_x86_ops->setup_mce(vcpu);
out:
return r;
}
@@ -3768,7 +3786,7 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = -EEXIST;
if (irqchip_in_kernel(kvm))
goto split_irqchip_unlock;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto split_irqchip_unlock;
r = kvm_setup_empty_irq_routing(kvm);
if (r)
@@ -3782,6 +3800,18 @@ split_irqchip_unlock:
mutex_unlock(&kvm->lock);
break;
}
+ case KVM_CAP_X2APIC_API:
+ r = -EINVAL;
+ if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS)
+ break;
+
+ if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS)
+ kvm->arch.x2apic_format = true;
+ if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)
+ kvm->arch.x2apic_broadcast_quirk_disabled = true;
+
+ r = 0;
+ break;
default:
r = -EINVAL;
break;
@@ -3833,7 +3863,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpic)
goto create_irqchip_unlock;
r = -EINVAL;
- if (atomic_read(&kvm->online_vcpus))
+ if (kvm->created_vcpus)
goto create_irqchip_unlock;
r = -ENOMEM;
vpic = kvm_create_pic(kvm);
@@ -3873,7 +3903,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
sizeof(struct kvm_pit_config)))
goto out;
create_pit:
- mutex_lock(&kvm->slots_lock);
+ mutex_lock(&kvm->lock);
r = -EEXIST;
if (kvm->arch.vpit)
goto create_pit_unlock;
@@ -3882,7 +3912,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
if (kvm->arch.vpit)
r = 0;
create_pit_unlock:
- mutex_unlock(&kvm->slots_lock);
+ mutex_unlock(&kvm->lock);
break;
case KVM_GET_IRQCHIP: {
/* 0: PIC master, 1: PIC slave, 2: IOAPIC */
@@ -3989,7 +4019,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
case KVM_SET_BOOT_CPU_ID:
r = 0;
mutex_lock(&kvm->lock);
- if (atomic_read(&kvm->online_vcpus) != 0)
+ if (kvm->created_vcpus)
r = -EBUSY;
else
kvm->arch.bsp_vcpu_id = arg;
@@ -5297,13 +5327,8 @@ static void kvm_smm_changed(struct kvm_vcpu *vcpu)
/* This is a good place to trace that we are exiting SMM. */
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, false);
- if (unlikely(vcpu->arch.smi_pending)) {
- kvm_make_request(KVM_REQ_SMI, vcpu);
- vcpu->arch.smi_pending = 0;
- } else {
- /* Process a latched INIT, if any. */
- kvm_make_request(KVM_REQ_EVENT, vcpu);
- }
+ /* Process a latched INIT or SMI, if any. */
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
}
kvm_mmu_reset_context(vcpu);
@@ -5849,8 +5874,8 @@ int kvm_arch_init(void *opaque)
kvm_x86_ops = ops;
kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
- PT_DIRTY_MASK, PT64_NX_MASK, 0);
-
+ PT_DIRTY_MASK, PT64_NX_MASK, 0,
+ PT_PRESENT_MASK);
kvm_timer_init();
perf_register_guest_info_callbacks(&kvm_guest_cbs);
@@ -6084,7 +6109,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
}
/* try to inject new event if pending */
- if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
+ if (vcpu->arch.smi_pending && !is_smm(vcpu)) {
+ vcpu->arch.smi_pending = false;
+ enter_smm(vcpu);
+ } else if (vcpu->arch.nmi_pending && kvm_x86_ops->nmi_allowed(vcpu)) {
--vcpu->arch.nmi_pending;
vcpu->arch.nmi_injected = true;
kvm_x86_ops->set_nmi(vcpu);
@@ -6107,6 +6135,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
kvm_x86_ops->set_irq(vcpu);
}
}
+
return 0;
}
@@ -6130,7 +6159,7 @@ static void process_nmi(struct kvm_vcpu *vcpu)
#define put_smstate(type, buf, offset, val) \
*(type *)((buf) + (offset) - 0x7e00) = val
-static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
+static u32 enter_smm_get_segment_flags(struct kvm_segment *seg)
{
u32 flags = 0;
flags |= seg->g << 23;
@@ -6144,7 +6173,7 @@ static u32 process_smi_get_segment_flags(struct kvm_segment *seg)
return flags;
}
-static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6159,11 +6188,11 @@ static void process_smi_save_seg_32(struct kvm_vcpu *vcpu, char *buf, int n)
put_smstate(u32, buf, offset + 8, seg.base);
put_smstate(u32, buf, offset + 4, seg.limit);
- put_smstate(u32, buf, offset, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, offset, enter_smm_get_segment_flags(&seg));
}
#ifdef CONFIG_X86_64
-static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
+static void enter_smm_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
{
struct kvm_segment seg;
int offset;
@@ -6172,7 +6201,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
kvm_get_segment(vcpu, &seg, n);
offset = 0x7e00 + n * 16;
- flags = process_smi_get_segment_flags(&seg) >> 8;
+ flags = enter_smm_get_segment_flags(&seg) >> 8;
put_smstate(u16, buf, offset, seg.selector);
put_smstate(u16, buf, offset + 2, flags);
put_smstate(u32, buf, offset + 4, seg.limit);
@@ -6180,7 +6209,7 @@ static void process_smi_save_seg_64(struct kvm_vcpu *vcpu, char *buf, int n)
}
#endif
-static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_32(struct kvm_vcpu *vcpu, char *buf)
{
struct desc_ptr dt;
struct kvm_segment seg;
@@ -6204,13 +6233,13 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7fc4, seg.selector);
put_smstate(u32, buf, 0x7f64, seg.base);
put_smstate(u32, buf, 0x7f60, seg.limit);
- put_smstate(u32, buf, 0x7f5c, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f5c, enter_smm_get_segment_flags(&seg));
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u32, buf, 0x7fc0, seg.selector);
put_smstate(u32, buf, 0x7f80, seg.base);
put_smstate(u32, buf, 0x7f7c, seg.limit);
- put_smstate(u32, buf, 0x7f78, process_smi_get_segment_flags(&seg));
+ put_smstate(u32, buf, 0x7f78, enter_smm_get_segment_flags(&seg));
kvm_x86_ops->get_gdt(vcpu, &dt);
put_smstate(u32, buf, 0x7f74, dt.address);
@@ -6221,7 +6250,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7f54, dt.size);
for (i = 0; i < 6; i++)
- process_smi_save_seg_32(vcpu, buf, i);
+ enter_smm_save_seg_32(vcpu, buf, i);
put_smstate(u32, buf, 0x7f14, kvm_read_cr4(vcpu));
@@ -6230,7 +6259,7 @@ static void process_smi_save_state_32(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u32, buf, 0x7ef8, vcpu->arch.smbase);
}
-static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
+static void enter_smm_save_state_64(struct kvm_vcpu *vcpu, char *buf)
{
#ifdef CONFIG_X86_64
struct desc_ptr dt;
@@ -6262,7 +6291,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_TR);
put_smstate(u16, buf, 0x7e90, seg.selector);
- put_smstate(u16, buf, 0x7e92, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e92, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e94, seg.limit);
put_smstate(u64, buf, 0x7e98, seg.base);
@@ -6272,7 +6301,7 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
kvm_get_segment(vcpu, &seg, VCPU_SREG_LDTR);
put_smstate(u16, buf, 0x7e70, seg.selector);
- put_smstate(u16, buf, 0x7e72, process_smi_get_segment_flags(&seg) >> 8);
+ put_smstate(u16, buf, 0x7e72, enter_smm_get_segment_flags(&seg) >> 8);
put_smstate(u32, buf, 0x7e74, seg.limit);
put_smstate(u64, buf, 0x7e78, seg.base);
@@ -6281,31 +6310,26 @@ static void process_smi_save_state_64(struct kvm_vcpu *vcpu, char *buf)
put_smstate(u64, buf, 0x7e68, dt.address);
for (i = 0; i < 6; i++)
- process_smi_save_seg_64(vcpu, buf, i);
+ enter_smm_save_seg_64(vcpu, buf, i);
#else
WARN_ON_ONCE(1);
#endif
}
-static void process_smi(struct kvm_vcpu *vcpu)
+static void enter_smm(struct kvm_vcpu *vcpu)
{
struct kvm_segment cs, ds;
struct desc_ptr dt;
char buf[512];
u32 cr0;
- if (is_smm(vcpu)) {
- vcpu->arch.smi_pending = true;
- return;
- }
-
trace_kvm_enter_smm(vcpu->vcpu_id, vcpu->arch.smbase, true);
vcpu->arch.hflags |= HF_SMM_MASK;
memset(buf, 0, 512);
if (guest_cpuid_has_longmode(vcpu))
- process_smi_save_state_64(vcpu, buf);
+ enter_smm_save_state_64(vcpu, buf);
else
- process_smi_save_state_32(vcpu, buf);
+ enter_smm_save_state_32(vcpu, buf);
kvm_vcpu_write_guest(vcpu, vcpu->arch.smbase + 0xfe00, buf, sizeof(buf));
@@ -6361,6 +6385,12 @@ static void process_smi(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
}
+static void process_smi(struct kvm_vcpu *vcpu)
+{
+ vcpu->arch.smi_pending = true;
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
+}
+
void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
@@ -6555,8 +6585,18 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (inject_pending_event(vcpu, req_int_win) != 0)
req_immediate_exit = true;
- /* enable NMI/IRQ window open exits if needed */
else {
+ /* Enable NMI/IRQ window open exits if needed.
+ *
+ * SMIs have two cases: 1) they can be nested, and
+ * then there is nothing to do here because RSM will
+ * cause a vmexit anyway; 2) or the SMI can be pending
+ * because inject_pending_event has completed the
+ * injection of an IRQ or NMI from the previous vmexit,
+ * and then we request an immediate exit to inject the SMI.
+ */
+ if (vcpu->arch.smi_pending && !is_smm(vcpu))
+ req_immediate_exit = true;
if (vcpu->arch.nmi_pending)
kvm_x86_ops->enable_nmi_window(vcpu);
if (kvm_cpu_has_injectable_intr(vcpu) || req_int_win)
@@ -6607,12 +6647,14 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
kvm_load_guest_xcr0(vcpu);
- if (req_immediate_exit)
+ if (req_immediate_exit) {
+ kvm_make_request(KVM_REQ_EVENT, vcpu);
smp_send_reschedule(vcpu->cpu);
+ }
trace_kvm_entry(vcpu->vcpu_id);
wait_lapic_expire(vcpu);
- __kvm_guest_enter();
+ guest_enter_irqoff();
if (unlikely(vcpu->arch.switch_db_regs)) {
set_debugreg(0, 7);
@@ -6663,16 +6705,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
++vcpu->stat.exits;
- /*
- * We must have an instruction between local_irq_enable() and
- * kvm_guest_exit(), so the timer interrupt isn't delayed by
- * the interrupt shadow. The stat.exits increment will do nicely.
- * But we need to prevent reordering, hence this barrier():
- */
- barrier();
-
- kvm_guest_exit();
+ guest_exit_irqoff();
+ local_irq_enable();
preempt_enable();
vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
@@ -7409,6 +7444,7 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
vcpu->arch.hflags = 0;
+ vcpu->arch.smi_pending = 0;
atomic_set(&vcpu->arch.nmi_queued, 0);
vcpu->arch.nmi_pending = 0;
vcpu->arch.nmi_injected = false;
@@ -7601,11 +7637,6 @@ bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu)
return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0;
}
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu)
-{
- return irqchip_in_kernel(vcpu->kvm) == lapic_in_kernel(vcpu);
-}
-
struct static_key kvm_no_apic_vcpu __read_mostly;
EXPORT_SYMBOL_GPL(kvm_no_apic_vcpu);
@@ -7872,7 +7903,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
kfree(kvm->arch.vpic);
kfree(kvm->arch.vioapic);
kvm_free_vcpus(kvm);
- kfree(rcu_dereference_check(kvm->arch.apic_map, 1));
+ kvfree(rcu_dereference_check(kvm->arch.apic_map, 1));
kvm_mmu_uninit_vm(kvm);
}
@@ -8380,7 +8411,7 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
/*
* When producer of consumer is unregistered, we change back to
* remapped mode, so we can re-use the current implementation
- * when the irq is masked/disabed or the consumer side (KVM
+ * when the irq is masked/disabled or the consumer side (KVM
* int this case doesn't want to receive the interrupts.
*/
ret = kvm_x86_ops->update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 0);
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 8196054fedb0..7b6a9d14c8c0 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -133,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev)
if (pci_probe & PCI_NOASSIGN_BARS) {
/*
* If the BIOS did not assign the BAR, zero out the
- * resource so the kernel doesn't attmept to assign
+ * resource so the kernel doesn't attempt to assign
* it later on in pci_assign_unassigned_resources
*/
for (bar = 0; bar <= PCI_STD_RESOURCE_END; bar++) {
diff --git a/arch/x86/pci/vmd.c b/arch/x86/pci/vmd.c
index 613cac7395c4..e88b4176260f 100644
--- a/arch/x86/pci/vmd.c
+++ b/arch/x86/pci/vmd.c
@@ -119,10 +119,11 @@ static void vmd_compose_msi_msg(struct irq_data *data, struct msi_msg *msg)
static void vmd_irq_enable(struct irq_data *data)
{
struct vmd_irq *vmdirq = data->chip_data;
+ unsigned long flags;
- raw_spin_lock(&list_lock);
+ raw_spin_lock_irqsave(&list_lock, flags);
list_add_tail_rcu(&vmdirq->node, &vmdirq->irq->irq_list);
- raw_spin_unlock(&list_lock);
+ raw_spin_unlock_irqrestore(&list_lock, flags);
data->chip->irq_unmask(data);
}
@@ -130,12 +131,14 @@ static void vmd_irq_enable(struct irq_data *data)
static void vmd_irq_disable(struct irq_data *data)
{
struct vmd_irq *vmdirq = data->chip_data;
+ unsigned long flags;
data->chip->irq_mask(data);
- raw_spin_lock(&list_lock);
+ raw_spin_lock_irqsave(&list_lock, flags);
list_del_rcu(&vmdirq->node);
- raw_spin_unlock(&list_lock);
+ INIT_LIST_HEAD_RCU(&vmdirq->node);
+ raw_spin_unlock_irqrestore(&list_lock, flags);
}
/*
@@ -166,16 +169,20 @@ static irq_hw_number_t vmd_get_hwirq(struct msi_domain_info *info,
* XXX: We can be even smarter selecting the best IRQ once we solve the
* affinity problem.
*/
-static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd)
+static struct vmd_irq_list *vmd_next_irq(struct vmd_dev *vmd, struct msi_desc *desc)
{
- int i, best = 0;
+ int i, best = 1;
+ unsigned long flags;
- raw_spin_lock(&list_lock);
+ if (!desc->msi_attrib.is_msix || vmd->msix_count == 1)
+ return &vmd->irqs[0];
+
+ raw_spin_lock_irqsave(&list_lock, flags);
for (i = 1; i < vmd->msix_count; i++)
if (vmd->irqs[i].count < vmd->irqs[best].count)
best = i;
vmd->irqs[best].count++;
- raw_spin_unlock(&list_lock);
+ raw_spin_unlock_irqrestore(&list_lock, flags);
return &vmd->irqs[best];
}
@@ -184,14 +191,15 @@ static int vmd_msi_init(struct irq_domain *domain, struct msi_domain_info *info,
unsigned int virq, irq_hw_number_t hwirq,
msi_alloc_info_t *arg)
{
- struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(arg->desc)->bus);
+ struct msi_desc *desc = arg->desc;
+ struct vmd_dev *vmd = vmd_from_bus(msi_desc_to_pci_dev(desc)->bus);
struct vmd_irq *vmdirq = kzalloc(sizeof(*vmdirq), GFP_KERNEL);
if (!vmdirq)
return -ENOMEM;
INIT_LIST_HEAD(&vmdirq->node);
- vmdirq->irq = vmd_next_irq(vmd);
+ vmdirq->irq = vmd_next_irq(vmd, desc);
vmdirq->virq = virq;
irq_domain_set_info(domain, virq, vmdirq->irq->vmd_vector, info->chip,
@@ -203,11 +211,12 @@ static void vmd_msi_free(struct irq_domain *domain,
struct msi_domain_info *info, unsigned int virq)
{
struct vmd_irq *vmdirq = irq_get_chip_data(virq);
+ unsigned long flags;
/* XXX: Potential optimization to rebalance */
- raw_spin_lock(&list_lock);
+ raw_spin_lock_irqsave(&list_lock, flags);
vmdirq->irq->count--;
- raw_spin_unlock(&list_lock);
+ raw_spin_unlock_irqrestore(&list_lock, flags);
kfree_rcu(vmdirq, rcu);
}
@@ -261,7 +270,7 @@ static struct device *to_vmd_dev(struct device *dev)
static struct dma_map_ops *vmd_dma_ops(struct device *dev)
{
- return to_vmd_dev(dev)->archdata.dma_ops;
+ return get_dma_ops(to_vmd_dev(dev));
}
static void *vmd_alloc(struct device *dev, size_t size, dma_addr_t *addr,
@@ -367,7 +376,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
{
struct dma_domain *domain = &vmd->dma_domain;
- if (vmd->dev->dev.archdata.dma_ops)
+ if (get_dma_ops(&vmd->dev->dev))
del_dma_domain(domain);
}
@@ -379,7 +388,7 @@ static void vmd_teardown_dma_ops(struct vmd_dev *vmd)
static void vmd_setup_dma_ops(struct vmd_dev *vmd)
{
- const struct dma_map_ops *source = vmd->dev->dev.archdata.dma_ops;
+ const struct dma_map_ops *source = get_dma_ops(&vmd->dev->dev);
struct dma_map_ops *dest = &vmd->dma_ops;
struct dma_domain *domain = &vmd->dma_domain;
@@ -594,7 +603,7 @@ static int vmd_enable_domain(struct vmd_dev *vmd)
sd->node = pcibus_to_node(vmd->dev->bus);
vmd->irq_domain = pci_msi_create_irq_domain(NULL, &vmd_msi_domain_info,
- NULL);
+ x86_vector_domain);
if (!vmd->irq_domain)
return -ENODEV;
diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile
index 12734a96df47..ac58c1616408 100644
--- a/arch/x86/purgatory/Makefile
+++ b/arch/x86/purgatory/Makefile
@@ -8,6 +8,8 @@ PURGATORY_OBJS = $(addprefix $(obj)/,$(purgatory-y))
LDFLAGS_purgatory.ro := -e purgatory_start -r --no-undefined -nostdlib -z nodefaultlib
targets += purgatory.ro
+KCOV_INSTRUMENT := n
+
# Default KBUILD_CFLAGS can have -pg option set when FTRACE is enabled. That
# in turn leaves some undefined symbols like __fentry__ in purgatory and not
# sure how to relocate those. Like kexec-tools, use custom flags.
diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile
index c556c5ae8de5..25012abc3409 100644
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -48,7 +48,7 @@ targets += realmode.lds
$(obj)/realmode.lds: $(obj)/pasyms.h
LDFLAGS_realmode.elf := --emit-relocs -T
-CPPFLAGS_realmode.lds += -P -C -I$(obj)
+CPPFLAGS_realmode.lds += -P -C -I$(objtree)/$(obj)
targets += realmode.elf
$(obj)/realmode.elf: $(obj)/realmode.lds $(REALMODE_OBJS) FORCE
diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig
index aebd944bdaa1..445ce28475b3 100644
--- a/drivers/acpi/Kconfig
+++ b/drivers/acpi/Kconfig
@@ -221,6 +221,9 @@ config ACPI_PROCESSOR_IDLE
bool
select CPU_IDLE
+config ACPI_MCFG
+ bool
+
config ACPI_CPPC_LIB
bool
depends on ACPI_PROCESSOR
diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile
index 35a6ccbe3025..5ae9d85c5159 100644
--- a/drivers/acpi/Makefile
+++ b/drivers/acpi/Makefile
@@ -40,6 +40,7 @@ acpi-$(CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC) += processor_pdc.o
acpi-y += ec.o
acpi-$(CONFIG_ACPI_DOCK) += dock.o
acpi-y += pci_root.o pci_link.o pci_irq.o
+obj-$(CONFIG_ACPI_MCFG) += pci_mcfg.o
acpi-y += acpi_lpss.o acpi_apd.o
acpi-y += acpi_platform.o
acpi-y += acpi_pnp.o
diff --git a/drivers/acpi/pci_mcfg.c b/drivers/acpi/pci_mcfg.c
new file mode 100644
index 000000000000..b5b376e081f5
--- /dev/null
+++ b/drivers/acpi/pci_mcfg.c
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2016 Broadcom
+ * Author: Jayachandran C <jchandra@broadcom.com>
+ * Copyright (C) 2016 Semihalf
+ * Author: Tomasz Nowicki <tn@semihalf.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation (the "GPL").
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 (GPLv2) for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 (GPLv2) along with this source code.
+ */
+
+#define pr_fmt(fmt) "ACPI: " fmt
+
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/pci-acpi.h>
+
+/* Structure to hold entries from the MCFG table */
+struct mcfg_entry {
+ struct list_head list;
+ phys_addr_t addr;
+ u16 segment;
+ u8 bus_start;
+ u8 bus_end;
+};
+
+/* List to save MCFG entries */
+static LIST_HEAD(pci_mcfg_list);
+
+phys_addr_t pci_mcfg_lookup(u16 seg, struct resource *bus_res)
+{
+ struct mcfg_entry *e;
+
+ /*
+ * We expect exact match, unless MCFG entry end bus covers more than
+ * specified by caller.
+ */
+ list_for_each_entry(e, &pci_mcfg_list, list) {
+ if (e->segment == seg && e->bus_start == bus_res->start &&
+ e->bus_end >= bus_res->end)
+ return e->addr;
+ }
+
+ return 0;
+}
+
+static __init int pci_mcfg_parse(struct acpi_table_header *header)
+{
+ struct acpi_table_mcfg *mcfg;
+ struct acpi_mcfg_allocation *mptr;
+ struct mcfg_entry *e, *arr;
+ int i, n;
+
+ if (header->length < sizeof(struct acpi_table_mcfg))
+ return -EINVAL;
+
+ n = (header->length - sizeof(struct acpi_table_mcfg)) /
+ sizeof(struct acpi_mcfg_allocation);
+ mcfg = (struct acpi_table_mcfg *)header;
+ mptr = (struct acpi_mcfg_allocation *) &mcfg[1];
+
+ arr = kcalloc(n, sizeof(*arr), GFP_KERNEL);
+ if (!arr)
+ return -ENOMEM;
+
+ for (i = 0, e = arr; i < n; i++, mptr++, e++) {
+ e->segment = mptr->pci_segment;
+ e->addr = mptr->address;
+ e->bus_start = mptr->start_bus_number;
+ e->bus_end = mptr->end_bus_number;
+ list_add(&e->list, &pci_mcfg_list);
+ }
+
+ pr_info("MCFG table detected, %d entries\n", n);
+ return 0;
+}
+
+/* Interface called by ACPI - parse and save MCFG table */
+void __init pci_mmcfg_late_init(void)
+{
+ int err = acpi_table_parse(ACPI_SIG_MCFG, pci_mcfg_parse);
+ if (err)
+ pr_err("Failed to parse MCFG (%d)\n", err);
+}
diff --git a/drivers/acpi/pci_root.c b/drivers/acpi/pci_root.c
index ae3fe4e64203..d144168d4ef9 100644
--- a/drivers/acpi/pci_root.c
+++ b/drivers/acpi/pci_root.c
@@ -720,6 +720,36 @@ next:
}
}
+static void acpi_pci_root_remap_iospace(struct resource_entry *entry)
+{
+#ifdef PCI_IOBASE
+ struct resource *res = entry->res;
+ resource_size_t cpu_addr = res->start;
+ resource_size_t pci_addr = cpu_addr - entry->offset;
+ resource_size_t length = resource_size(res);
+ unsigned long port;
+
+ if (pci_register_io_range(cpu_addr, length))
+ goto err;
+
+ port = pci_address_to_pio(cpu_addr);
+ if (port == (unsigned long)-1)
+ goto err;
+
+ res->start = port;
+ res->end = port + length - 1;
+ entry->offset = port - pci_addr;
+
+ if (pci_remap_iospace(res, cpu_addr) < 0)
+ goto err;
+
+ pr_info("Remapped I/O %pa to %pR\n", &cpu_addr, res);
+ return;
+err:
+ res->flags |= IORESOURCE_DISABLED;
+#endif
+}
+
int acpi_pci_probe_root_resources(struct acpi_pci_root_info *info)
{
int ret;
@@ -740,6 +770,9 @@ int acpi_pci_probe_root_resources(struct acpi_pci_root_info *info)
"no IO and memory resources present in _CRS\n");
else {
resource_list_for_each_entry_safe(entry, tmp, list) {
+ if (entry->res->flags & IORESOURCE_IO)
+ acpi_pci_root_remap_iospace(entry);
+
if (entry->res->flags & IORESOURCE_DISABLED)
resource_list_destroy_entry(entry);
else
@@ -811,6 +844,8 @@ static void acpi_pci_root_release_info(struct pci_host_bridge *bridge)
resource_list_for_each_entry(entry, &bridge->windows) {
res = entry->res;
+ if (res->flags & IORESOURCE_IO)
+ pci_unmap_iospace(res);
if (res->parent &&
(res->flags & (IORESOURCE_MEM | IORESOURCE_IO)))
release_resource(res);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 450662055d97..1a04af6d2421 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -1937,7 +1937,7 @@ static struct ceph_osd_request *rbd_osd_req_create(
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
obj_request->object_name))
goto fail;
@@ -1991,7 +1991,7 @@ rbd_osd_req_create_copyup(struct rbd_obj_request *obj_request)
osd_req->r_callback = rbd_osd_req_callback;
osd_req->r_priv = obj_request;
- osd_req->r_base_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ osd_req->r_base_oloc.pool = rbd_dev->layout.pool_id;
if (ceph_oid_aprintf(&osd_req->r_base_oid, GFP_NOIO, "%s",
obj_request->object_name))
goto fail;
@@ -3995,10 +3995,11 @@ static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
/* Initialize the layout used for all rbd requests */
- rbd_dev->layout.fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- rbd_dev->layout.fl_stripe_count = cpu_to_le32(1);
- rbd_dev->layout.fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
- rbd_dev->layout.fl_pg_pool = cpu_to_le32((u32) spec->pool_id);
+ rbd_dev->layout.stripe_unit = 1 << RBD_MAX_OBJ_ORDER;
+ rbd_dev->layout.stripe_count = 1;
+ rbd_dev->layout.object_size = 1 << RBD_MAX_OBJ_ORDER;
+ rbd_dev->layout.pool_id = spec->pool_id;
+ RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
/*
* If this is a mapping rbd_dev (as opposed to a parent one),
@@ -5187,7 +5188,7 @@ static int rbd_dev_header_name(struct rbd_device *rbd_dev)
rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
- rbd_dev->header_oloc.pool = ceph_file_layout_pg_pool(rbd_dev->layout);
+ rbd_dev->header_oloc.pool = rbd_dev->layout.pool_id;
if (rbd_dev->image_format == 1)
ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
spec->image_name, RBD_SUFFIX);
diff --git a/drivers/gpu/drm/drm_dp_helper.c b/drivers/gpu/drm/drm_dp_helper.c
index 8f11b8741e42..eae5ef963cb7 100644
--- a/drivers/gpu/drm/drm_dp_helper.c
+++ b/drivers/gpu/drm/drm_dp_helper.c
@@ -860,3 +860,35 @@ void drm_dp_aux_unregister(struct drm_dp_aux *aux)
i2c_del_adapter(&aux->ddc);
}
EXPORT_SYMBOL(drm_dp_aux_unregister);
+
+#define PSR_SETUP_TIME(x) [DP_PSR_SETUP_TIME_ ## x >> DP_PSR_SETUP_TIME_SHIFT] = (x)
+
+/**
+ * drm_dp_psr_setup_time() - PSR setup in time usec
+ * @psr_cap: PSR capabilities from DPCD
+ *
+ * Returns:
+ * PSR setup time for the panel in microseconds, negative
+ * error code on failure.
+ */
+int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE])
+{
+ static const u16 psr_setup_time_us[] = {
+ PSR_SETUP_TIME(330),
+ PSR_SETUP_TIME(275),
+ PSR_SETUP_TIME(165),
+ PSR_SETUP_TIME(110),
+ PSR_SETUP_TIME(55),
+ PSR_SETUP_TIME(0),
+ };
+ int i;
+
+ i = (psr_cap[1] & DP_PSR_SETUP_TIME_MASK) >> DP_PSR_SETUP_TIME_SHIFT;
+ if (i >= ARRAY_SIZE(psr_setup_time_us))
+ return -EINVAL;
+
+ return psr_setup_time_us[i];
+}
+EXPORT_SYMBOL(drm_dp_psr_setup_time);
+
+#undef PSR_SETUP_TIME
diff --git a/drivers/gpu/drm/i915/intel_drv.h b/drivers/gpu/drm/i915/intel_drv.h
index 3329fc6a95f4..cc937a19b1ba 100644
--- a/drivers/gpu/drm/i915/intel_drv.h
+++ b/drivers/gpu/drm/i915/intel_drv.h
@@ -1730,6 +1730,8 @@ bool intel_sdvo_init(struct drm_device *dev,
/* intel_sprite.c */
+int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
+ int usecs);
int intel_plane_init(struct drm_device *dev, enum pipe pipe, int plane);
int intel_sprite_set_colorkey(struct drm_device *dev, void *data,
struct drm_file *file_priv);
diff --git a/drivers/gpu/drm/i915/intel_psr.c b/drivers/gpu/drm/i915/intel_psr.c
index 68bd0bb34817..2b0d1baf15b3 100644
--- a/drivers/gpu/drm/i915/intel_psr.c
+++ b/drivers/gpu/drm/i915/intel_psr.c
@@ -327,6 +327,9 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp)
struct drm_i915_private *dev_priv = to_i915(dev);
struct drm_crtc *crtc = dig_port->base.base.crtc;
struct intel_crtc *intel_crtc = to_intel_crtc(crtc);
+ const struct drm_display_mode *adjusted_mode =
+ &intel_crtc->config->base.adjusted_mode;
+ int psr_setup_time;
lockdep_assert_held(&dev_priv->psr.lock);
WARN_ON(!drm_modeset_is_locked(&dev->mode_config.connection_mutex));
@@ -365,11 +368,25 @@ static bool intel_psr_match_conditions(struct intel_dp *intel_dp)
}
if (IS_HASWELL(dev) &&
- intel_crtc->config->base.adjusted_mode.flags & DRM_MODE_FLAG_INTERLACE) {
+ adjusted_mode->flags & DRM_MODE_FLAG_INTERLACE) {
DRM_DEBUG_KMS("PSR condition failed: Interlaced is Enabled\n");
return false;
}
+ psr_setup_time = drm_dp_psr_setup_time(intel_dp->psr_dpcd);
+ if (psr_setup_time < 0) {
+ DRM_DEBUG_KMS("PSR condition failed: Invalid PSR setup time (0x%02x)\n",
+ intel_dp->psr_dpcd[1]);
+ return false;
+ }
+
+ if (intel_usecs_to_scanlines(adjusted_mode, psr_setup_time) >
+ adjusted_mode->crtc_vtotal - adjusted_mode->crtc_vdisplay - 1) {
+ DRM_DEBUG_KMS("PSR condition failed: PSR setup time (%d us) too long\n",
+ psr_setup_time);
+ return false;
+ }
+
dev_priv->psr.source_ok = true;
return true;
}
diff --git a/drivers/gpu/drm/i915/intel_sprite.c b/drivers/gpu/drm/i915/intel_sprite.c
index 0de935ad01c2..7c08e4f29032 100644
--- a/drivers/gpu/drm/i915/intel_sprite.c
+++ b/drivers/gpu/drm/i915/intel_sprite.c
@@ -53,8 +53,8 @@ format_is_yuv(uint32_t format)
}
}
-static int usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
- int usecs)
+int intel_usecs_to_scanlines(const struct drm_display_mode *adjusted_mode,
+ int usecs)
{
/* paranoia */
if (!adjusted_mode->crtc_htotal)
@@ -91,7 +91,7 @@ void intel_pipe_update_start(struct intel_crtc *crtc)
vblank_start = DIV_ROUND_UP(vblank_start, 2);
/* FIXME needs to be calibrated sensibly */
- min = vblank_start - usecs_to_scanlines(adjusted_mode, 100);
+ min = vblank_start - intel_usecs_to_scanlines(adjusted_mode, 100);
max = vblank_start - 1;
local_irq_disable();
diff --git a/drivers/irqchip/Kconfig b/drivers/irqchip/Kconfig
index 5495a5ba8039..7f8728984f44 100644
--- a/drivers/irqchip/Kconfig
+++ b/drivers/irqchip/Kconfig
@@ -21,9 +21,9 @@ config ARM_GIC_MAX_NR
config ARM_GIC_V2M
bool
- depends on ARM_GIC
- depends on PCI && PCI_MSI
- select PCI_MSI_IRQ_DOMAIN
+ depends on PCI
+ select ARM_GIC
+ select PCI_MSI
config GIC_NON_BANKED
bool
@@ -37,7 +37,8 @@ config ARM_GIC_V3
config ARM_GIC_V3_ITS
bool
- select PCI_MSI_IRQ_DOMAIN
+ depends on PCI
+ depends on PCI_MSI
config ARM_NVIC
bool
@@ -62,13 +63,13 @@ config ARM_VIC_NR
config ARMADA_370_XP_IRQ
bool
select GENERIC_IRQ_CHIP
- select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+ select PCI_MSI if PCI
config ALPINE_MSI
bool
- depends on PCI && PCI_MSI
+ depends on PCI
+ select PCI_MSI
select GENERIC_IRQ_CHIP
- select PCI_MSI_IRQ_DOMAIN
config ATMEL_AIC_IRQ
bool
@@ -117,7 +118,6 @@ config HISILICON_IRQ_MBIGEN
bool
select ARM_GIC_V3
select ARM_GIC_V3_ITS
- select GENERIC_MSI_IRQ_DOMAIN
config IMGPDC_IRQ
bool
@@ -250,12 +250,10 @@ config IRQ_MXS
config MVEBU_ODMI
bool
- select GENERIC_MSI_IRQ_DOMAIN
config LS_SCFG_MSI
def_bool y if SOC_LS1021A || ARCH_LAYERSCAPE
depends on PCI && PCI_MSI
- select PCI_MSI_IRQ_DOMAIN
config PARTITION_PERCPU
bool
diff --git a/drivers/memory/Kconfig b/drivers/memory/Kconfig
index 133712346911..4b4c0c3c3d2f 100644
--- a/drivers/memory/Kconfig
+++ b/drivers/memory/Kconfig
@@ -115,7 +115,7 @@ config FSL_CORENET_CF
config FSL_IFC
bool
- depends on FSL_SOC
+ depends on FSL_SOC || ARCH_LAYERSCAPE
config JZ4780_NEMC
bool "Ingenic JZ4780 SoC NEMC driver"
diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c
index 904b4af5f142..1b182b117f9c 100644
--- a/drivers/memory/fsl_ifc.c
+++ b/drivers/memory/fsl_ifc.c
@@ -31,7 +31,9 @@
#include <linux/of_device.h>
#include <linux/platform_device.h>
#include <linux/fsl_ifc.h>
-#include <asm/prom.h>
+#include <linux/irqdomain.h>
+#include <linux/of_address.h>
+#include <linux/of_irq.h>
struct fsl_ifc_ctrl *fsl_ifc_ctrl_dev;
EXPORT_SYMBOL(fsl_ifc_ctrl_dev);
diff --git a/drivers/misc/genwqe/card_base.c b/drivers/misc/genwqe/card_base.c
index 4cf8f82cfca2..a70b853fa2c9 100644
--- a/drivers/misc/genwqe/card_base.c
+++ b/drivers/misc/genwqe/card_base.c
@@ -182,7 +182,7 @@ static void genwqe_dev_free(struct genwqe_dev *cd)
*/
static int genwqe_bus_reset(struct genwqe_dev *cd)
{
- int bars, rc = 0;
+ int rc = 0;
struct pci_dev *pci_dev = cd->pci_dev;
void __iomem *mmio;
@@ -193,8 +193,7 @@ static int genwqe_bus_reset(struct genwqe_dev *cd)
cd->mmio = NULL;
pci_iounmap(pci_dev, mmio);
- bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
- pci_release_selected_regions(pci_dev, bars);
+ pci_release_mem_regions(pci_dev);
/*
* Firmware/BIOS might change memory mapping during bus reset.
@@ -218,7 +217,7 @@ static int genwqe_bus_reset(struct genwqe_dev *cd)
GENWQE_INJECT_GFIR_FATAL |
GENWQE_INJECT_GFIR_INFO);
- rc = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name);
+ rc = pci_request_mem_regions(pci_dev, genwqe_driver_name);
if (rc) {
dev_err(&pci_dev->dev,
"[%s] err: request bars failed (%d)\n", __func__, rc);
@@ -1068,10 +1067,9 @@ static int genwqe_health_check_stop(struct genwqe_dev *cd)
*/
static int genwqe_pci_setup(struct genwqe_dev *cd)
{
- int err, bars;
+ int err;
struct pci_dev *pci_dev = cd->pci_dev;
- bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
err = pci_enable_device_mem(pci_dev);
if (err) {
dev_err(&pci_dev->dev,
@@ -1080,7 +1078,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
}
/* Reserve PCI I/O and memory resources */
- err = pci_request_selected_regions(pci_dev, bars, genwqe_driver_name);
+ err = pci_request_mem_regions(pci_dev, genwqe_driver_name);
if (err) {
dev_err(&pci_dev->dev,
"[%s] err: request bars failed (%d)\n", __func__, err);
@@ -1142,7 +1140,7 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
out_iounmap:
pci_iounmap(pci_dev, cd->mmio);
out_release_resources:
- pci_release_selected_regions(pci_dev, bars);
+ pci_release_mem_regions(pci_dev);
err_disable_device:
pci_disable_device(pci_dev);
err_out:
@@ -1154,14 +1152,12 @@ static int genwqe_pci_setup(struct genwqe_dev *cd)
*/
static void genwqe_pci_remove(struct genwqe_dev *cd)
{
- int bars;
struct pci_dev *pci_dev = cd->pci_dev;
if (cd->mmio)
pci_iounmap(pci_dev, cd->mmio);
- bars = pci_select_bars(pci_dev, IORESOURCE_MEM);
- pci_release_selected_regions(pci_dev, bars);
+ pci_release_mem_regions(pci_dev);
pci_disable_device(pci_dev);
}
diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c
index 9a1a6ffd16b8..94d3eb42c4d5 100644
--- a/drivers/mtd/chips/cfi_cmdset_0020.c
+++ b/drivers/mtd/chips/cfi_cmdset_0020.c
@@ -416,7 +416,7 @@ static int cfi_staa_read (struct mtd_info *mtd, loff_t from, size_t len, size_t
return ret;
}
-static inline int do_write_buffer(struct map_info *map, struct flchip *chip,
+static int do_write_buffer(struct map_info *map, struct flchip *chip,
unsigned long adr, const u_char *buf, int len)
{
struct cfi_private *cfi = map->fldrv_priv;
diff --git a/drivers/mtd/devices/Kconfig b/drivers/mtd/devices/Kconfig
index 64a248556d29..58329d2dacd1 100644
--- a/drivers/mtd/devices/Kconfig
+++ b/drivers/mtd/devices/Kconfig
@@ -113,12 +113,12 @@ config MTD_SST25L
if you want to specify device partitioning.
config MTD_BCM47XXSFLASH
- tristate "R/O support for serial flash on BCMA bus"
+ tristate "Support for serial flash on BCMA bus"
depends on BCMA_SFLASH && (MIPS || ARM)
help
BCMA bus can have various flash memories attached, they are
registered by bcma as platform devices. This enables driver for
- serial flash memories (only read-only mode is implemented).
+ serial flash memories.
config MTD_SLRAM
tristate "Uncached system RAM"
@@ -171,18 +171,6 @@ config MTDRAM_ERASE_SIZE
as a module, it is also possible to specify this as a parameter when
loading the module.
-#If not a module (I don't want to test it as a module)
-config MTDRAM_ABS_POS
- hex "SRAM Hexadecimal Absolute position or 0"
- depends on MTD_MTDRAM=y
- default "0"
- help
- If you have system RAM accessible by the CPU but not used by Linux
- in normal operation, you can give the physical address at which the
- available RAM starts, and the MTDRAM driver will use it instead of
- allocating space from Linux's available memory. Otherwise, leave
- this set to zero. Most people will want to leave this as zero.
-
config MTD_BLOCK2MTD
tristate "MTD using block device"
depends on BLOCK
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index 9d6854467651..9cf7fcd28034 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -73,14 +73,15 @@ static int m25p80_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
return spi_write(spi, flash->command, len + 1);
}
-static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
- size_t *retlen, const u_char *buf)
+static ssize_t m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
+ const u_char *buf)
{
struct m25p *flash = nor->priv;
struct spi_device *spi = flash->spi;
struct spi_transfer t[2] = {};
struct spi_message m;
int cmd_sz = m25p_cmdsz(nor);
+ ssize_t ret;
spi_message_init(&m);
@@ -98,9 +99,14 @@ static void m25p80_write(struct spi_nor *nor, loff_t to, size_t len,
t[1].len = len;
spi_message_add_tail(&t[1], &m);
- spi_sync(spi, &m);
+ ret = spi_sync(spi, &m);
+ if (ret)
+ return ret;
- *retlen += m.actual_length - cmd_sz;
+ ret = m.actual_length - cmd_sz;
+ if (ret < 0)
+ return -EIO;
+ return ret;
}
static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor)
@@ -119,21 +125,21 @@ static inline unsigned int m25p80_rx_nbits(struct spi_nor *nor)
* Read an address range from the nor chip. The address range
* may be any size provided it is within the physical boundaries.
*/
-static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
- size_t *retlen, u_char *buf)
+static ssize_t m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
+ u_char *buf)
{
struct m25p *flash = nor->priv;
struct spi_device *spi = flash->spi;
struct spi_transfer t[2];
struct spi_message m;
unsigned int dummy = nor->read_dummy;
+ ssize_t ret;
/* convert the dummy cycles to the number of bytes */
dummy /= 8;
if (spi_flash_read_supported(spi)) {
struct spi_flash_read_message msg;
- int ret;
memset(&msg, 0, sizeof(msg));
@@ -149,8 +155,9 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
msg.data_nbits = m25p80_rx_nbits(nor);
ret = spi_flash_read(spi, &msg);
- *retlen = msg.retlen;
- return ret;
+ if (ret < 0)
+ return ret;
+ return msg.retlen;
}
spi_message_init(&m);
@@ -165,13 +172,17 @@ static int m25p80_read(struct spi_nor *nor, loff_t from, size_t len,
t[1].rx_buf = buf;
t[1].rx_nbits = m25p80_rx_nbits(nor);
- t[1].len = len;
+ t[1].len = min(len, spi_max_transfer_size(spi));
spi_message_add_tail(&t[1], &m);
- spi_sync(spi, &m);
+ ret = spi_sync(spi, &m);
+ if (ret)
+ return ret;
- *retlen = m.actual_length - m25p_cmdsz(nor) - dummy;
- return 0;
+ ret = m.actual_length - m25p_cmdsz(nor) - dummy;
+ if (ret < 0)
+ return -EIO;
+ return ret;
}
/*
diff --git a/drivers/mtd/maps/physmap_of.c b/drivers/mtd/maps/physmap_of.c
index 22f3858c0364..3fad35942895 100644
--- a/drivers/mtd/maps/physmap_of.c
+++ b/drivers/mtd/maps/physmap_of.c
@@ -186,7 +186,7 @@ static int of_flash_probe(struct platform_device *dev)
* consists internally of 2 non-identical NOR chips on one die.
*/
p = of_get_property(dp, "reg", &count);
- if (count % reg_tuple_size != 0) {
+ if (!p || count % reg_tuple_size != 0) {
dev_err(&dev->dev, "Malformed reg property on %s\n",
dev->dev.of_node->full_name);
err = -EINVAL;
diff --git a/drivers/mtd/maps/pmcmsp-flash.c b/drivers/mtd/maps/pmcmsp-flash.c
index 744ca5cacc9b..f9fa3fad728e 100644
--- a/drivers/mtd/maps/pmcmsp-flash.c
+++ b/drivers/mtd/maps/pmcmsp-flash.c
@@ -75,15 +75,15 @@ static int __init init_msp_flash(void)
printk(KERN_NOTICE "Found %d PMC flash devices\n", fcnt);
- msp_flash = kmalloc(fcnt * sizeof(struct map_info *), GFP_KERNEL);
+ msp_flash = kcalloc(fcnt, sizeof(*msp_flash), GFP_KERNEL);
if (!msp_flash)
return -ENOMEM;
- msp_parts = kmalloc(fcnt * sizeof(struct mtd_partition *), GFP_KERNEL);
+ msp_parts = kcalloc(fcnt, sizeof(*msp_parts), GFP_KERNEL);
if (!msp_parts)
goto free_msp_flash;
- msp_maps = kcalloc(fcnt, sizeof(struct mtd_info), GFP_KERNEL);
+ msp_maps = kcalloc(fcnt, sizeof(*msp_maps), GFP_KERNEL);
if (!msp_maps)
goto free_msp_parts;
diff --git a/drivers/mtd/maps/sa1100-flash.c b/drivers/mtd/maps/sa1100-flash.c
index 142fc3d79463..784c6e1a0391 100644
--- a/drivers/mtd/maps/sa1100-flash.c
+++ b/drivers/mtd/maps/sa1100-flash.c
@@ -230,8 +230,10 @@ static struct sa_info *sa1100_setup_mtd(struct platform_device *pdev,
info->mtd = mtd_concat_create(cdev, info->num_subdev,
plat->name);
- if (info->mtd == NULL)
+ if (info->mtd == NULL) {
ret = -ENXIO;
+ goto err;
+ }
}
info->mtd->dev.parent = &pdev->dev;
diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig
index f05e0e9eb2f7..21ff58099f3b 100644
--- a/drivers/mtd/nand/Kconfig
+++ b/drivers/mtd/nand/Kconfig
@@ -438,7 +438,7 @@ config MTD_NAND_FSL_ELBC
config MTD_NAND_FSL_IFC
tristate "NAND support for Freescale IFC controller"
- depends on MTD_NAND && FSL_SOC
+ depends on MTD_NAND && (FSL_SOC || ARCH_LAYERSCAPE)
select FSL_IFC
select MEMORY
help
@@ -539,7 +539,6 @@ config MTD_NAND_FSMC
config MTD_NAND_XWAY
tristate "Support for NAND on Lantiq XWAY SoC"
depends on LANTIQ && SOC_TYPE_XWAY
- select MTD_NAND_PLATFORM
help
Enables support for NAND Flash chips on Lantiq XWAY SoCs. NAND is attached
to the External Bus Unit (EBU).
@@ -563,4 +562,11 @@ config MTD_NAND_QCOM
Enables support for NAND flash chips on SoCs containing the EBI2 NAND
controller. This controller is found on IPQ806x SoC.
+config MTD_NAND_MTK
+ tristate "Support for NAND controller on MTK SoCs"
+ depends on HAS_DMA
+ help
+ Enables support for NAND controller on MTK SoCs.
+ This controller is found on mt27xx, mt81xx, mt65xx SoCs.
+
endif # MTD_NAND
diff --git a/drivers/mtd/nand/Makefile b/drivers/mtd/nand/Makefile
index f55335373f7c..cafde6f3d957 100644
--- a/drivers/mtd/nand/Makefile
+++ b/drivers/mtd/nand/Makefile
@@ -57,5 +57,6 @@ obj-$(CONFIG_MTD_NAND_SUNXI) += sunxi_nand.o
obj-$(CONFIG_MTD_NAND_HISI504) += hisi504_nand.o
obj-$(CONFIG_MTD_NAND_BRCMNAND) += brcmnand/
obj-$(CONFIG_MTD_NAND_QCOM) += qcom_nandc.o
+obj-$(CONFIG_MTD_NAND_MTK) += mtk_nand.o mtk_ecc.o
nand-objs := nand_base.o nand_bbt.o nand_timings.o
diff --git a/drivers/mtd/nand/brcmnand/brcmnand.c b/drivers/mtd/nand/brcmnand/brcmnand.c
index b76ad7c0144f..8eb2c64df38c 100644
--- a/drivers/mtd/nand/brcmnand/brcmnand.c
+++ b/drivers/mtd/nand/brcmnand/brcmnand.c
@@ -340,6 +340,36 @@ static const u16 brcmnand_regs_v71[] = {
[BRCMNAND_FC_BASE] = 0x400,
};
+/* BRCMNAND v7.2 */
+static const u16 brcmnand_regs_v72[] = {
+ [BRCMNAND_CMD_START] = 0x04,
+ [BRCMNAND_CMD_EXT_ADDRESS] = 0x08,
+ [BRCMNAND_CMD_ADDRESS] = 0x0c,
+ [BRCMNAND_INTFC_STATUS] = 0x14,
+ [BRCMNAND_CS_SELECT] = 0x18,
+ [BRCMNAND_CS_XOR] = 0x1c,
+ [BRCMNAND_LL_OP] = 0x20,
+ [BRCMNAND_CS0_BASE] = 0x50,
+ [BRCMNAND_CS1_BASE] = 0,
+ [BRCMNAND_CORR_THRESHOLD] = 0xdc,
+ [BRCMNAND_CORR_THRESHOLD_EXT] = 0xe0,
+ [BRCMNAND_UNCORR_COUNT] = 0xfc,
+ [BRCMNAND_CORR_COUNT] = 0x100,
+ [BRCMNAND_CORR_EXT_ADDR] = 0x10c,
+ [BRCMNAND_CORR_ADDR] = 0x110,
+ [BRCMNAND_UNCORR_EXT_ADDR] = 0x114,
+ [BRCMNAND_UNCORR_ADDR] = 0x118,
+ [BRCMNAND_SEMAPHORE] = 0x150,
+ [BRCMNAND_ID] = 0x194,
+ [BRCMNAND_ID_EXT] = 0x198,
+ [BRCMNAND_LL_RDATA] = 0x19c,
+ [BRCMNAND_OOB_READ_BASE] = 0x200,
+ [BRCMNAND_OOB_READ_10_BASE] = 0,
+ [BRCMNAND_OOB_WRITE_BASE] = 0x400,
+ [BRCMNAND_OOB_WRITE_10_BASE] = 0,
+ [BRCMNAND_FC_BASE] = 0x600,
+};
+
enum brcmnand_cs_reg {
BRCMNAND_CS_CFG_EXT = 0,
BRCMNAND_CS_CFG,
@@ -435,7 +465,9 @@ static int brcmnand_revision_init(struct brcmnand_controller *ctrl)
}
/* Register offsets */
- if (ctrl->nand_version >= 0x0701)
+ if (ctrl->nand_version >= 0x0702)
+ ctrl->reg_offsets = brcmnand_regs_v72;
+ else if (ctrl->nand_version >= 0x0701)
ctrl->reg_offsets = brcmnand_regs_v71;
else if (ctrl->nand_version >= 0x0600)
ctrl->reg_offsets = brcmnand_regs_v60;
@@ -480,7 +512,9 @@ static int brcmnand_revision_init(struct brcmnand_controller *ctrl)
}
/* Maximum spare area sector size (per 512B) */
- if (ctrl->nand_version >= 0x0600)
+ if (ctrl->nand_version >= 0x0702)
+ ctrl->max_oob = 128;
+ else if (ctrl->nand_version >= 0x0600)
ctrl->max_oob = 64;
else if (ctrl->nand_version >= 0x0500)
ctrl->max_oob = 32;
@@ -583,14 +617,20 @@ static void brcmnand_wr_corr_thresh(struct brcmnand_host *host, u8 val)
enum brcmnand_reg reg = BRCMNAND_CORR_THRESHOLD;
int cs = host->cs;
- if (ctrl->nand_version >= 0x0600)
+ if (ctrl->nand_version >= 0x0702)
+ bits = 7;
+ else if (ctrl->nand_version >= 0x0600)
bits = 6;
else if (ctrl->nand_version >= 0x0500)
bits = 5;
else
bits = 4;
- if (ctrl->nand_version >= 0x0600) {
+ if (ctrl->nand_version >= 0x0702) {
+ if (cs >= 4)
+ reg = BRCMNAND_CORR_THRESHOLD_EXT;
+ shift = (cs % 4) * bits;
+ } else if (ctrl->nand_version >= 0x0600) {
if (cs >= 5)
reg = BRCMNAND_CORR_THRESHOLD_EXT;
shift = (cs % 5) * bits;
@@ -631,19 +671,28 @@ enum {
static inline u32 brcmnand_spare_area_mask(struct brcmnand_controller *ctrl)
{
- if (ctrl->nand_version >= 0x0600)
+ if (ctrl->nand_version >= 0x0702)
+ return GENMASK(7, 0);
+ else if (ctrl->nand_version >= 0x0600)
return GENMASK(6, 0);
else
return GENMASK(5, 0);
}
#define NAND_ACC_CONTROL_ECC_SHIFT 16
+#define NAND_ACC_CONTROL_ECC_EXT_SHIFT 13
static inline u32 brcmnand_ecc_level_mask(struct brcmnand_controller *ctrl)
{
u32 mask = (ctrl->nand_version >= 0x0600) ? 0x1f : 0x0f;
- return mask << NAND_ACC_CONTROL_ECC_SHIFT;
+ mask <<= NAND_ACC_CONTROL_ECC_SHIFT;
+
+ /* v7.2 includes additional ECC levels */
+ if (ctrl->nand_version >= 0x0702)
+ mask |= 0x7 << NAND_ACC_CONTROL_ECC_EXT_SHIFT;
+
+ return mask;
}
static void brcmnand_set_ecc_enabled(struct brcmnand_host *host, int en)
@@ -667,7 +716,9 @@ static void brcmnand_set_ecc_enabled(struct brcmnand_host *host, int en)
static inline int brcmnand_sector_1k_shift(struct brcmnand_controller *ctrl)
{
- if (ctrl->nand_version >= 0x0600)
+ if (ctrl->nand_version >= 0x0702)
+ return 9;
+ else if (ctrl->nand_version >= 0x0600)
return 7;
else if (ctrl->nand_version >= 0x0500)
return 6;
@@ -773,10 +824,16 @@ enum brcmnand_llop_type {
* Internal support functions
***********************************************************************/
-static inline bool is_hamming_ecc(struct brcmnand_cfg *cfg)
+static inline bool is_hamming_ecc(struct brcmnand_controller *ctrl,
+ struct brcmnand_cfg *cfg)
{
- return cfg->sector_size_1k == 0 && cfg->spare_area_size == 16 &&
- cfg->ecc_level == 15;
+ if (ctrl->nand_version <= 0x0701)
+ return cfg->sector_size_1k == 0 && cfg->spare_area_size == 16 &&
+ cfg->ecc_level == 15;
+ else
+ return cfg->sector_size_1k == 0 && ((cfg->spare_area_size == 16 &&
+ cfg->ecc_level == 15) ||
+ (cfg->spare_area_size == 28 && cfg->ecc_level == 16));
}
/*
@@ -931,7 +988,7 @@ static int brcmstb_choose_ecc_layout(struct brcmnand_host *host)
if (p->sector_size_1k)
ecc_level <<= 1;
- if (is_hamming_ecc(p)) {
+ if (is_hamming_ecc(host->ctrl, p)) {
ecc->bytes = 3 * sectors;
mtd_set_ooblayout(mtd, &brcmnand_hamming_ooblayout_ops);
return 0;
@@ -1108,7 +1165,7 @@ static void brcmnand_send_cmd(struct brcmnand_host *host, int cmd)
ctrl->cmd_pending = cmd;
intfc = brcmnand_read_reg(ctrl, BRCMNAND_INTFC_STATUS);
- BUG_ON(!(intfc & INTFC_CTLR_READY));
+ WARN_ON(!(intfc & INTFC_CTLR_READY));
mb(); /* flush previous writes */
brcmnand_write_reg(ctrl, BRCMNAND_CMD_START,
@@ -1545,6 +1602,56 @@ static int brcmnand_read_by_pio(struct mtd_info *mtd, struct nand_chip *chip,
return ret;
}
+/*
+ * Check a page to see if it is erased (w/ bitflips) after an uncorrectable ECC
+ * error
+ *
+ * Because the HW ECC signals an ECC error if an erase paged has even a single
+ * bitflip, we must check each ECC error to see if it is actually an erased
+ * page with bitflips, not a truly corrupted page.
+ *
+ * On a real error, return a negative error code (-EBADMSG for ECC error), and
+ * buf will contain raw data.
+ * Otherwise, buf gets filled with 0xffs and return the maximum number of
+ * bitflips-per-ECC-sector to the caller.
+ *
+ */
+static int brcmstb_nand_verify_erased_page(struct mtd_info *mtd,
+ struct nand_chip *chip, void *buf, u64 addr)
+{
+ int i, sas;
+ void *oob = chip->oob_poi;
+ int bitflips = 0;
+ int page = addr >> chip->page_shift;
+ int ret;
+
+ if (!buf) {
+ buf = chip->buffers->databuf;
+ /* Invalidate page cache */
+ chip->pagebuf = -1;
+ }
+
+ sas = mtd->oobsize / chip->ecc.steps;
+
+ /* read without ecc for verification */
+ chip->cmdfunc(mtd, NAND_CMD_READ0, 0x00, page);
+ ret = chip->ecc.read_page_raw(mtd, chip, buf, true, page);
+ if (ret)
+ return ret;
+
+ for (i = 0; i < chip->ecc.steps; i++, oob += sas) {
+ ret = nand_check_erased_ecc_chunk(buf, chip->ecc.size,
+ oob, sas, NULL, 0,
+ chip->ecc.strength);
+ if (ret < 0)
+ return ret;
+
+ bitflips = max(bitflips, ret);
+ }
+
+ return bitflips;
+}
+
static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
u64 addr, unsigned int trans, u32 *buf, u8 *oob)
{
@@ -1552,9 +1659,11 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
struct brcmnand_controller *ctrl = host->ctrl;
u64 err_addr = 0;
int err;
+ bool retry = true;
dev_dbg(ctrl->dev, "read %llx -> %p\n", (unsigned long long)addr, buf);
+try_dmaread:
brcmnand_write_reg(ctrl, BRCMNAND_UNCORR_COUNT, 0);
if (has_flash_dma(ctrl) && !oob && flash_dma_buf_ok(buf)) {
@@ -1575,6 +1684,34 @@ static int brcmnand_read(struct mtd_info *mtd, struct nand_chip *chip,
}
if (mtd_is_eccerr(err)) {
+ /*
+ * On controller version and 7.0, 7.1 , DMA read after a
+ * prior PIO read that reported uncorrectable error,
+ * the DMA engine captures this error following DMA read
+ * cleared only on subsequent DMA read, so just retry once
+ * to clear a possible false error reported for current DMA
+ * read
+ */
+ if ((ctrl->nand_version == 0x0700) ||
+ (ctrl->nand_version == 0x0701)) {
+ if (retry) {
+ retry = false;
+ goto try_dmaread;
+ }
+ }
+
+ /*
+ * Controller version 7.2 has hw encoder to detect erased page
+ * bitflips, apply sw verification for older controllers only
+ */
+ if (ctrl->nand_version < 0x0702) {
+ err = brcmstb_nand_verify_erased_page(mtd, chip, buf,
+ addr);
+ /* erased page bitflips corrected */
+ if (err > 0)
+ return err;
+ }
+
dev_dbg(ctrl->dev, "uncorrectable error at 0x%llx\n",
(unsigned long long)err_addr);
mtd->ecc_stats.failed++;
@@ -1857,7 +1994,8 @@ static int brcmnand_set_cfg(struct brcmnand_host *host,
return 0;
}
-static void brcmnand_print_cfg(char *buf, struct brcmnand_cfg *cfg)
+static void brcmnand_print_cfg(struct brcmnand_host *host,
+ char *buf, struct brcmnand_cfg *cfg)
{
buf += sprintf(buf,
"%lluMiB total, %uKiB blocks, %u%s pages, %uB OOB, %u-bit",
@@ -1868,7 +2006,7 @@ static void brcmnand_print_cfg(char *buf, struct brcmnand_cfg *cfg)
cfg->spare_area_size, cfg->device_width);
/* Account for Hamming ECC and for BCH 512B vs 1KiB sectors */
- if (is_hamming_ecc(cfg))
+ if (is_hamming_ecc(host->ctrl, cfg))
sprintf(buf, ", Hamming ECC");
else if (cfg->sector_size_1k)
sprintf(buf, ", BCH-%u (1KiB sector)", cfg->ecc_level << 1);
@@ -1987,7 +2125,7 @@ static int brcmnand_setup_dev(struct brcmnand_host *host)
brcmnand_set_ecc_enabled(host, 1);
- brcmnand_print_cfg(msg, cfg);
+ brcmnand_print_cfg(host, msg, cfg);
dev_info(ctrl->dev, "detected %s\n", msg);
/* Configure ACC_CONTROL */
@@ -1995,6 +2133,10 @@ static int brcmnand_setup_dev(struct brcmnand_host *host)
tmp = nand_readreg(ctrl, offs);
tmp &= ~ACC_CONTROL_PARTIAL_PAGE;
tmp &= ~ACC_CONTROL_RD_ERASED;
+
+ /* We need to turn on Read from erased paged protected by ECC */
+ if (ctrl->nand_version >= 0x0702)
+ tmp |= ACC_CONTROL_RD_ERASED;
tmp &= ~ACC_CONTROL_FAST_PGM_RDIN;
if (ctrl->features & BRCMNAND_HAS_PREFETCH) {
/*
@@ -2195,6 +2337,7 @@ static const struct of_device_id brcmnand_of_match[] = {
{ .compatible = "brcm,brcmnand-v6.2" },
{ .compatible = "brcm,brcmnand-v7.0" },
{ .compatible = "brcm,brcmnand-v7.1" },
+ { .compatible = "brcm,brcmnand-v7.2" },
{},
};
MODULE_DEVICE_TABLE(of, brcmnand_of_match);
diff --git a/drivers/mtd/nand/jz4780_bch.c b/drivers/mtd/nand/jz4780_bch.c
index d74f4ba4a6f4..731c6051d91e 100644
--- a/drivers/mtd/nand/jz4780_bch.c
+++ b/drivers/mtd/nand/jz4780_bch.c
@@ -375,6 +375,6 @@ static struct platform_driver jz4780_bch_driver = {
module_platform_driver(jz4780_bch_driver);
MODULE_AUTHOR("Alex Smith <alex@alex-smith.me.uk>");
-MODULE_AUTHOR("Harvey Hunt <harvey.hunt@imgtec.com>");
+MODULE_AUTHOR("Harvey Hunt <harveyhuntnexus@gmail.com>");
MODULE_DESCRIPTION("Ingenic JZ4780 BCH error correction driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/nand/jz4780_nand.c b/drivers/mtd/nand/jz4780_nand.c
index daf3c4217f4d..175f67da25af 100644
--- a/drivers/mtd/nand/jz4780_nand.c
+++ b/drivers/mtd/nand/jz4780_nand.c
@@ -412,6 +412,6 @@ static struct platform_driver jz4780_nand_driver = {
module_platform_driver(jz4780_nand_driver);
MODULE_AUTHOR("Alex Smith <alex@alex-smith.me.uk>");
-MODULE_AUTHOR("Harvey Hunt <harvey.hunt@imgtec.com>");
+MODULE_AUTHOR("Harvey Hunt <harveyhuntnexus@gmail.com>");
MODULE_DESCRIPTION("Ingenic JZ4780 NAND driver");
MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/nand/mtk_ecc.c b/drivers/mtd/nand/mtk_ecc.c
new file mode 100644
index 000000000000..25a4fbd4d24a
--- /dev/null
+++ b/drivers/mtd/nand/mtk_ecc.c
@@ -0,0 +1,530 @@
+/*
+ * MTK ECC controller driver.
+ * Copyright (C) 2016 MediaTek Inc.
+ * Authors: Xiaolei Li <xiaolei.li@mediatek.com>
+ * Jorge Ramirez-Ortiz <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/iopoll.h>
+#include <linux/of.h>
+#include <linux/of_platform.h>
+#include <linux/mutex.h>
+
+#include "mtk_ecc.h"
+
+#define ECC_IDLE_MASK BIT(0)
+#define ECC_IRQ_EN BIT(0)
+#define ECC_OP_ENABLE (1)
+#define ECC_OP_DISABLE (0)
+
+#define ECC_ENCCON (0x00)
+#define ECC_ENCCNFG (0x04)
+#define ECC_CNFG_4BIT (0)
+#define ECC_CNFG_6BIT (1)
+#define ECC_CNFG_8BIT (2)
+#define ECC_CNFG_10BIT (3)
+#define ECC_CNFG_12BIT (4)
+#define ECC_CNFG_14BIT (5)
+#define ECC_CNFG_16BIT (6)
+#define ECC_CNFG_18BIT (7)
+#define ECC_CNFG_20BIT (8)
+#define ECC_CNFG_22BIT (9)
+#define ECC_CNFG_24BIT (0xa)
+#define ECC_CNFG_28BIT (0xb)
+#define ECC_CNFG_32BIT (0xc)
+#define ECC_CNFG_36BIT (0xd)
+#define ECC_CNFG_40BIT (0xe)
+#define ECC_CNFG_44BIT (0xf)
+#define ECC_CNFG_48BIT (0x10)
+#define ECC_CNFG_52BIT (0x11)
+#define ECC_CNFG_56BIT (0x12)
+#define ECC_CNFG_60BIT (0x13)
+#define ECC_MODE_SHIFT (5)
+#define ECC_MS_SHIFT (16)
+#define ECC_ENCDIADDR (0x08)
+#define ECC_ENCIDLE (0x0C)
+#define ECC_ENCPAR(x) (0x10 + (x) * sizeof(u32))
+#define ECC_ENCIRQ_EN (0x80)
+#define ECC_ENCIRQ_STA (0x84)
+#define ECC_DECCON (0x100)
+#define ECC_DECCNFG (0x104)
+#define DEC_EMPTY_EN BIT(31)
+#define DEC_CNFG_CORRECT (0x3 << 12)
+#define ECC_DECIDLE (0x10C)
+#define ECC_DECENUM0 (0x114)
+#define ERR_MASK (0x3f)
+#define ECC_DECDONE (0x124)
+#define ECC_DECIRQ_EN (0x200)
+#define ECC_DECIRQ_STA (0x204)
+
+#define ECC_TIMEOUT (500000)
+
+#define ECC_IDLE_REG(op) ((op) == ECC_ENCODE ? ECC_ENCIDLE : ECC_DECIDLE)
+#define ECC_CTL_REG(op) ((op) == ECC_ENCODE ? ECC_ENCCON : ECC_DECCON)
+#define ECC_IRQ_REG(op) ((op) == ECC_ENCODE ? \
+ ECC_ENCIRQ_EN : ECC_DECIRQ_EN)
+
+struct mtk_ecc {
+ struct device *dev;
+ void __iomem *regs;
+ struct clk *clk;
+
+ struct completion done;
+ struct mutex lock;
+ u32 sectors;
+};
+
+static inline void mtk_ecc_wait_idle(struct mtk_ecc *ecc,
+ enum mtk_ecc_operation op)
+{
+ struct device *dev = ecc->dev;
+ u32 val;
+ int ret;
+
+ ret = readl_poll_timeout_atomic(ecc->regs + ECC_IDLE_REG(op), val,
+ val & ECC_IDLE_MASK,
+ 10, ECC_TIMEOUT);
+ if (ret)
+ dev_warn(dev, "%s NOT idle\n",
+ op == ECC_ENCODE ? "encoder" : "decoder");
+}
+
+static irqreturn_t mtk_ecc_irq(int irq, void *id)
+{
+ struct mtk_ecc *ecc = id;
+ enum mtk_ecc_operation op;
+ u32 dec, enc;
+
+ dec = readw(ecc->regs + ECC_DECIRQ_STA) & ECC_IRQ_EN;
+ if (dec) {
+ op = ECC_DECODE;
+ dec = readw(ecc->regs + ECC_DECDONE);
+ if (dec & ecc->sectors) {
+ ecc->sectors = 0;
+ complete(&ecc->done);
+ } else {
+ return IRQ_HANDLED;
+ }
+ } else {
+ enc = readl(ecc->regs + ECC_ENCIRQ_STA) & ECC_IRQ_EN;
+ if (enc) {
+ op = ECC_ENCODE;
+ complete(&ecc->done);
+ } else {
+ return IRQ_NONE;
+ }
+ }
+
+ writel(0, ecc->regs + ECC_IRQ_REG(op));
+
+ return IRQ_HANDLED;
+}
+
+static void mtk_ecc_config(struct mtk_ecc *ecc, struct mtk_ecc_config *config)
+{
+ u32 ecc_bit = ECC_CNFG_4BIT, dec_sz, enc_sz;
+ u32 reg;
+
+ switch (config->strength) {
+ case 4:
+ ecc_bit = ECC_CNFG_4BIT;
+ break;
+ case 6:
+ ecc_bit = ECC_CNFG_6BIT;
+ break;
+ case 8:
+ ecc_bit = ECC_CNFG_8BIT;
+ break;
+ case 10:
+ ecc_bit = ECC_CNFG_10BIT;
+ break;
+ case 12:
+ ecc_bit = ECC_CNFG_12BIT;
+ break;
+ case 14:
+ ecc_bit = ECC_CNFG_14BIT;
+ break;
+ case 16:
+ ecc_bit = ECC_CNFG_16BIT;
+ break;
+ case 18:
+ ecc_bit = ECC_CNFG_18BIT;
+ break;
+ case 20:
+ ecc_bit = ECC_CNFG_20BIT;
+ break;
+ case 22:
+ ecc_bit = ECC_CNFG_22BIT;
+ break;
+ case 24:
+ ecc_bit = ECC_CNFG_24BIT;
+ break;
+ case 28:
+ ecc_bit = ECC_CNFG_28BIT;
+ break;
+ case 32:
+ ecc_bit = ECC_CNFG_32BIT;
+ break;
+ case 36:
+ ecc_bit = ECC_CNFG_36BIT;
+ break;
+ case 40:
+ ecc_bit = ECC_CNFG_40BIT;
+ break;
+ case 44:
+ ecc_bit = ECC_CNFG_44BIT;
+ break;
+ case 48:
+ ecc_bit = ECC_CNFG_48BIT;
+ break;
+ case 52:
+ ecc_bit = ECC_CNFG_52BIT;
+ break;
+ case 56:
+ ecc_bit = ECC_CNFG_56BIT;
+ break;
+ case 60:
+ ecc_bit = ECC_CNFG_60BIT;
+ break;
+ default:
+ dev_err(ecc->dev, "invalid strength %d, default to 4 bits\n",
+ config->strength);
+ }
+
+ if (config->op == ECC_ENCODE) {
+ /* configure ECC encoder (in bits) */
+ enc_sz = config->len << 3;
+
+ reg = ecc_bit | (config->mode << ECC_MODE_SHIFT);
+ reg |= (enc_sz << ECC_MS_SHIFT);
+ writel(reg, ecc->regs + ECC_ENCCNFG);
+
+ if (config->mode != ECC_NFI_MODE)
+ writel(lower_32_bits(config->addr),
+ ecc->regs + ECC_ENCDIADDR);
+
+ } else {
+ /* configure ECC decoder (in bits) */
+ dec_sz = (config->len << 3) +
+ config->strength * ECC_PARITY_BITS;
+
+ reg = ecc_bit | (config->mode << ECC_MODE_SHIFT);
+ reg |= (dec_sz << ECC_MS_SHIFT) | DEC_CNFG_CORRECT;
+ reg |= DEC_EMPTY_EN;
+ writel(reg, ecc->regs + ECC_DECCNFG);
+
+ if (config->sectors)
+ ecc->sectors = 1 << (config->sectors - 1);
+ }
+}
+
+void mtk_ecc_get_stats(struct mtk_ecc *ecc, struct mtk_ecc_stats *stats,
+ int sectors)
+{
+ u32 offset, i, err;
+ u32 bitflips = 0;
+
+ stats->corrected = 0;
+ stats->failed = 0;
+
+ for (i = 0; i < sectors; i++) {
+ offset = (i >> 2) << 2;
+ err = readl(ecc->regs + ECC_DECENUM0 + offset);
+ err = err >> ((i % 4) * 8);
+ err &= ERR_MASK;
+ if (err == ERR_MASK) {
+ /* uncorrectable errors */
+ stats->failed++;
+ continue;
+ }
+
+ stats->corrected += err;
+ bitflips = max_t(u32, bitflips, err);
+ }
+
+ stats->bitflips = bitflips;
+}
+EXPORT_SYMBOL(mtk_ecc_get_stats);
+
+void mtk_ecc_release(struct mtk_ecc *ecc)
+{
+ clk_disable_unprepare(ecc->clk);
+ put_device(ecc->dev);
+}
+EXPORT_SYMBOL(mtk_ecc_release);
+
+static void mtk_ecc_hw_init(struct mtk_ecc *ecc)
+{
+ mtk_ecc_wait_idle(ecc, ECC_ENCODE);
+ writew(ECC_OP_DISABLE, ecc->regs + ECC_ENCCON);
+
+ mtk_ecc_wait_idle(ecc, ECC_DECODE);
+ writel(ECC_OP_DISABLE, ecc->regs + ECC_DECCON);
+}
+
+static struct mtk_ecc *mtk_ecc_get(struct device_node *np)
+{
+ struct platform_device *pdev;
+ struct mtk_ecc *ecc;
+
+ pdev = of_find_device_by_node(np);
+ if (!pdev || !platform_get_drvdata(pdev))
+ return ERR_PTR(-EPROBE_DEFER);
+
+ get_device(&pdev->dev);
+ ecc = platform_get_drvdata(pdev);
+ clk_prepare_enable(ecc->clk);
+ mtk_ecc_hw_init(ecc);
+
+ return ecc;
+}
+
+struct mtk_ecc *of_mtk_ecc_get(struct device_node *of_node)
+{
+ struct mtk_ecc *ecc = NULL;
+ struct device_node *np;
+
+ np = of_parse_phandle(of_node, "ecc-engine", 0);
+ if (np) {
+ ecc = mtk_ecc_get(np);
+ of_node_put(np);
+ }
+
+ return ecc;
+}
+EXPORT_SYMBOL(of_mtk_ecc_get);
+
+int mtk_ecc_enable(struct mtk_ecc *ecc, struct mtk_ecc_config *config)
+{
+ enum mtk_ecc_operation op = config->op;
+ int ret;
+
+ ret = mutex_lock_interruptible(&ecc->lock);
+ if (ret) {
+ dev_err(ecc->dev, "interrupted when attempting to lock\n");
+ return ret;
+ }
+
+ mtk_ecc_wait_idle(ecc, op);
+ mtk_ecc_config(ecc, config);
+ writew(ECC_OP_ENABLE, ecc->regs + ECC_CTL_REG(op));
+
+ init_completion(&ecc->done);
+ writew(ECC_IRQ_EN, ecc->regs + ECC_IRQ_REG(op));
+
+ return 0;
+}
+EXPORT_SYMBOL(mtk_ecc_enable);
+
+void mtk_ecc_disable(struct mtk_ecc *ecc)
+{
+ enum mtk_ecc_operation op = ECC_ENCODE;
+
+ /* find out the running operation */
+ if (readw(ecc->regs + ECC_CTL_REG(op)) != ECC_OP_ENABLE)
+ op = ECC_DECODE;
+
+ /* disable it */
+ mtk_ecc_wait_idle(ecc, op);
+ writew(0, ecc->regs + ECC_IRQ_REG(op));
+ writew(ECC_OP_DISABLE, ecc->regs + ECC_CTL_REG(op));
+
+ mutex_unlock(&ecc->lock);
+}
+EXPORT_SYMBOL(mtk_ecc_disable);
+
+int mtk_ecc_wait_done(struct mtk_ecc *ecc, enum mtk_ecc_operation op)
+{
+ int ret;
+
+ ret = wait_for_completion_timeout(&ecc->done, msecs_to_jiffies(500));
+ if (!ret) {
+ dev_err(ecc->dev, "%s timeout - interrupt did not arrive)\n",
+ (op == ECC_ENCODE) ? "encoder" : "decoder");
+ return -ETIMEDOUT;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL(mtk_ecc_wait_done);
+
+int mtk_ecc_encode(struct mtk_ecc *ecc, struct mtk_ecc_config *config,
+ u8 *data, u32 bytes)
+{
+ dma_addr_t addr;
+ u32 *p, len, i;
+ int ret = 0;
+
+ addr = dma_map_single(ecc->dev, data, bytes, DMA_TO_DEVICE);
+ ret = dma_mapping_error(ecc->dev, addr);
+ if (ret) {
+ dev_err(ecc->dev, "dma mapping error\n");
+ return -EINVAL;
+ }
+
+ config->op = ECC_ENCODE;
+ config->addr = addr;
+ ret = mtk_ecc_enable(ecc, config);
+ if (ret) {
+ dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE);
+ return ret;
+ }
+
+ ret = mtk_ecc_wait_done(ecc, ECC_ENCODE);
+ if (ret)
+ goto timeout;
+
+ mtk_ecc_wait_idle(ecc, ECC_ENCODE);
+
+ /* Program ECC bytes to OOB: per sector oob = FDM + ECC + SPARE */
+ len = (config->strength * ECC_PARITY_BITS + 7) >> 3;
+ p = (u32 *)(data + bytes);
+
+ /* write the parity bytes generated by the ECC back to the OOB region */
+ for (i = 0; i < len; i++)
+ p[i] = readl(ecc->regs + ECC_ENCPAR(i));
+timeout:
+
+ dma_unmap_single(ecc->dev, addr, bytes, DMA_TO_DEVICE);
+ mtk_ecc_disable(ecc);
+
+ return ret;
+}
+EXPORT_SYMBOL(mtk_ecc_encode);
+
+void mtk_ecc_adjust_strength(u32 *p)
+{
+ u32 ecc[] = {4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 28, 32, 36,
+ 40, 44, 48, 52, 56, 60};
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(ecc); i++) {
+ if (*p <= ecc[i]) {
+ if (!i)
+ *p = ecc[i];
+ else if (*p != ecc[i])
+ *p = ecc[i - 1];
+ return;
+ }
+ }
+
+ *p = ecc[ARRAY_SIZE(ecc) - 1];
+}
+EXPORT_SYMBOL(mtk_ecc_adjust_strength);
+
+static int mtk_ecc_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct mtk_ecc *ecc;
+ struct resource *res;
+ int irq, ret;
+
+ ecc = devm_kzalloc(dev, sizeof(*ecc), GFP_KERNEL);
+ if (!ecc)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ ecc->regs = devm_ioremap_resource(dev, res);
+ if (IS_ERR(ecc->regs)) {
+ dev_err(dev, "failed to map regs: %ld\n", PTR_ERR(ecc->regs));
+ return PTR_ERR(ecc->regs);
+ }
+
+ ecc->clk = devm_clk_get(dev, NULL);
+ if (IS_ERR(ecc->clk)) {
+ dev_err(dev, "failed to get clock: %ld\n", PTR_ERR(ecc->clk));
+ return PTR_ERR(ecc->clk);
+ }
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(dev, "failed to get irq\n");
+ return -EINVAL;
+ }
+
+ ret = dma_set_mask(dev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_err(dev, "failed to set DMA mask\n");
+ return ret;
+ }
+
+ ret = devm_request_irq(dev, irq, mtk_ecc_irq, 0x0, "mtk-ecc", ecc);
+ if (ret) {
+ dev_err(dev, "failed to request irq\n");
+ return -EINVAL;
+ }
+
+ ecc->dev = dev;
+ mutex_init(&ecc->lock);
+ platform_set_drvdata(pdev, ecc);
+ dev_info(dev, "probed\n");
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mtk_ecc_suspend(struct device *dev)
+{
+ struct mtk_ecc *ecc = dev_get_drvdata(dev);
+
+ clk_disable_unprepare(ecc->clk);
+
+ return 0;
+}
+
+static int mtk_ecc_resume(struct device *dev)
+{
+ struct mtk_ecc *ecc = dev_get_drvdata(dev);
+ int ret;
+
+ ret = clk_prepare_enable(ecc->clk);
+ if (ret) {
+ dev_err(dev, "failed to enable clk\n");
+ return ret;
+ }
+
+ mtk_ecc_hw_init(ecc);
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(mtk_ecc_pm_ops, mtk_ecc_suspend, mtk_ecc_resume);
+#endif
+
+static const struct of_device_id mtk_ecc_dt_match[] = {
+ { .compatible = "mediatek,mt2701-ecc" },
+ {},
+};
+
+MODULE_DEVICE_TABLE(of, mtk_ecc_dt_match);
+
+static struct platform_driver mtk_ecc_driver = {
+ .probe = mtk_ecc_probe,
+ .driver = {
+ .name = "mtk-ecc",
+ .of_match_table = of_match_ptr(mtk_ecc_dt_match),
+#ifdef CONFIG_PM_SLEEP
+ .pm = &mtk_ecc_pm_ops,
+#endif
+ },
+};
+
+module_platform_driver(mtk_ecc_driver);
+
+MODULE_AUTHOR("Xiaolei Li <xiaolei.li@mediatek.com>");
+MODULE_DESCRIPTION("MTK Nand ECC Driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/mtd/nand/mtk_ecc.h b/drivers/mtd/nand/mtk_ecc.h
new file mode 100644
index 000000000000..cbeba5cd1c13
--- /dev/null
+++ b/drivers/mtd/nand/mtk_ecc.h
@@ -0,0 +1,50 @@
+/*
+ * MTK SDG1 ECC controller
+ *
+ * Copyright (c) 2016 Mediatek
+ * Authors: Xiaolei Li <xiaolei.li@mediatek.com>
+ * Jorge Ramirez-Ortiz <jorge.ramirez-ortiz@linaro.org>
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published
+ * by the Free Software Foundation.
+ */
+
+#ifndef __DRIVERS_MTD_NAND_MTK_ECC_H__
+#define __DRIVERS_MTD_NAND_MTK_ECC_H__
+
+#include <linux/types.h>
+
+#define ECC_PARITY_BITS (14)
+
+enum mtk_ecc_mode {ECC_DMA_MODE = 0, ECC_NFI_MODE = 1};
+enum mtk_ecc_operation {ECC_ENCODE, ECC_DECODE};
+
+struct device_node;
+struct mtk_ecc;
+
+struct mtk_ecc_stats {
+ u32 corrected;
+ u32 bitflips;
+ u32 failed;
+};
+
+struct mtk_ecc_config {
+ enum mtk_ecc_operation op;
+ enum mtk_ecc_mode mode;
+ dma_addr_t addr;
+ u32 strength;
+ u32 sectors;
+ u32 len;
+};
+
+int mtk_ecc_encode(struct mtk_ecc *, struct mtk_ecc_config *, u8 *, u32);
+void mtk_ecc_get_stats(struct mtk_ecc *, struct mtk_ecc_stats *, int);
+int mtk_ecc_wait_done(struct mtk_ecc *, enum mtk_ecc_operation);
+int mtk_ecc_enable(struct mtk_ecc *, struct mtk_ecc_config *);
+void mtk_ecc_disable(struct mtk_ecc *);
+void mtk_ecc_adjust_strength(u32 *);
+
+struct mtk_ecc *of_mtk_ecc_get(struct device_node *);
+void mtk_ecc_release(struct mtk_ecc *);
+
+#endif
diff --git a/drivers/mtd/nand/mtk_nand.c b/drivers/mtd/nand/mtk_nand.c
new file mode 100644
index 000000000000..ddaa2acb9dd7
--- /dev/null
+++ b/drivers/mtd/nand/mtk_nand.c
@@ -0,0 +1,1526 @@
+/*
+ * MTK NAND Flash controller driver.
+ * Copyright (C) 2016 MediaTek Inc.
+ * Authors: Xiaolei Li <xiaolei.li@mediatek.com>
+ * Jorge Ramirez-Ortiz <jorge.ramirez-ortiz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/platform_device.h>
+#include <linux/dma-mapping.h>
+#include <linux/interrupt.h>
+#include <linux/delay.h>
+#include <linux/clk.h>
+#include <linux/mtd/nand.h>
+#include <linux/mtd/mtd.h>
+#include <linux/module.h>
+#include <linux/iopoll.h>
+#include <linux/of.h>
+#include "mtk_ecc.h"
+
+/* NAND controller register definition */
+#define NFI_CNFG (0x00)
+#define CNFG_AHB BIT(0)
+#define CNFG_READ_EN BIT(1)
+#define CNFG_DMA_BURST_EN BIT(2)
+#define CNFG_BYTE_RW BIT(6)
+#define CNFG_HW_ECC_EN BIT(8)
+#define CNFG_AUTO_FMT_EN BIT(9)
+#define CNFG_OP_CUST (6 << 12)
+#define NFI_PAGEFMT (0x04)
+#define PAGEFMT_FDM_ECC_SHIFT (12)
+#define PAGEFMT_FDM_SHIFT (8)
+#define PAGEFMT_SPARE_16 (0)
+#define PAGEFMT_SPARE_26 (1)
+#define PAGEFMT_SPARE_27 (2)
+#define PAGEFMT_SPARE_28 (3)
+#define PAGEFMT_SPARE_32 (4)
+#define PAGEFMT_SPARE_36 (5)
+#define PAGEFMT_SPARE_40 (6)
+#define PAGEFMT_SPARE_44 (7)
+#define PAGEFMT_SPARE_48 (8)
+#define PAGEFMT_SPARE_49 (9)
+#define PAGEFMT_SPARE_50 (0xa)
+#define PAGEFMT_SPARE_51 (0xb)
+#define PAGEFMT_SPARE_52 (0xc)
+#define PAGEFMT_SPARE_62 (0xd)
+#define PAGEFMT_SPARE_63 (0xe)
+#define PAGEFMT_SPARE_64 (0xf)
+#define PAGEFMT_SPARE_SHIFT (4)
+#define PAGEFMT_SEC_SEL_512 BIT(2)
+#define PAGEFMT_512_2K (0)
+#define PAGEFMT_2K_4K (1)
+#define PAGEFMT_4K_8K (2)
+#define PAGEFMT_8K_16K (3)
+/* NFI control */
+#define NFI_CON (0x08)
+#define CON_FIFO_FLUSH BIT(0)
+#define CON_NFI_RST BIT(1)
+#define CON_BRD BIT(8) /* burst read */
+#define CON_BWR BIT(9) /* burst write */
+#define CON_SEC_SHIFT (12)
+/* Timming control register */
+#define NFI_ACCCON (0x0C)
+#define NFI_INTR_EN (0x10)
+#define INTR_AHB_DONE_EN BIT(6)
+#define NFI_INTR_STA (0x14)
+#define NFI_CMD (0x20)
+#define NFI_ADDRNOB (0x30)
+#define NFI_COLADDR (0x34)
+#define NFI_ROWADDR (0x38)
+#define NFI_STRDATA (0x40)
+#define STAR_EN (1)
+#define STAR_DE (0)
+#define NFI_CNRNB (0x44)
+#define NFI_DATAW (0x50)
+#define NFI_DATAR (0x54)
+#define NFI_PIO_DIRDY (0x58)
+#define PIO_DI_RDY (0x01)
+#define NFI_STA (0x60)
+#define STA_CMD BIT(0)
+#define STA_ADDR BIT(1)
+#define STA_BUSY BIT(8)
+#define STA_EMP_PAGE BIT(12)
+#define NFI_FSM_CUSTDATA (0xe << 16)
+#define NFI_FSM_MASK (0xf << 16)
+#define NFI_ADDRCNTR (0x70)
+#define CNTR_MASK GENMASK(16, 12)
+#define NFI_STRADDR (0x80)
+#define NFI_BYTELEN (0x84)
+#define NFI_CSEL (0x90)
+#define NFI_FDML(x) (0xA0 + (x) * sizeof(u32) * 2)
+#define NFI_FDMM(x) (0xA4 + (x) * sizeof(u32) * 2)
+#define NFI_FDM_MAX_SIZE (8)
+#define NFI_FDM_MIN_SIZE (1)
+#define NFI_MASTER_STA (0x224)
+#define MASTER_STA_MASK (0x0FFF)
+#define NFI_EMPTY_THRESH (0x23C)
+
+#define MTK_NAME "mtk-nand"
+#define KB(x) ((x) * 1024UL)
+#define MB(x) (KB(x) * 1024UL)
+
+#define MTK_TIMEOUT (500000)
+#define MTK_RESET_TIMEOUT (1000000)
+#define MTK_MAX_SECTOR (16)
+#define MTK_NAND_MAX_NSELS (2)
+
+struct mtk_nfc_bad_mark_ctl {
+ void (*bm_swap)(struct mtd_info *, u8 *buf, int raw);
+ u32 sec;
+ u32 pos;
+};
+
+/*
+ * FDM: region used to store free OOB data
+ */
+struct mtk_nfc_fdm {
+ u32 reg_size;
+ u32 ecc_size;
+};
+
+struct mtk_nfc_nand_chip {
+ struct list_head node;
+ struct nand_chip nand;
+
+ struct mtk_nfc_bad_mark_ctl bad_mark;
+ struct mtk_nfc_fdm fdm;
+ u32 spare_per_sector;
+
+ int nsels;
+ u8 sels[0];
+ /* nothing after this field */
+};
+
+struct mtk_nfc_clk {
+ struct clk *nfi_clk;
+ struct clk *pad_clk;
+};
+
+struct mtk_nfc {
+ struct nand_hw_control controller;
+ struct mtk_ecc_config ecc_cfg;
+ struct mtk_nfc_clk clk;
+ struct mtk_ecc *ecc;
+
+ struct device *dev;
+ void __iomem *regs;
+
+ struct completion done;
+ struct list_head chips;
+
+ u8 *buffer;
+};
+
+static inline struct mtk_nfc_nand_chip *to_mtk_nand(struct nand_chip *nand)
+{
+ return container_of(nand, struct mtk_nfc_nand_chip, nand);
+}
+
+static inline u8 *data_ptr(struct nand_chip *chip, const u8 *p, int i)
+{
+ return (u8 *)p + i * chip->ecc.size;
+}
+
+static inline u8 *oob_ptr(struct nand_chip *chip, int i)
+{
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ u8 *poi;
+
+ /* map the sector's FDM data to free oob:
+ * the beginning of the oob area stores the FDM data of bad mark sectors
+ */
+
+ if (i < mtk_nand->bad_mark.sec)
+ poi = chip->oob_poi + (i + 1) * mtk_nand->fdm.reg_size;
+ else if (i == mtk_nand->bad_mark.sec)
+ poi = chip->oob_poi;
+ else
+ poi = chip->oob_poi + i * mtk_nand->fdm.reg_size;
+
+ return poi;
+}
+
+static inline int mtk_data_len(struct nand_chip *chip)
+{
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+
+ return chip->ecc.size + mtk_nand->spare_per_sector;
+}
+
+static inline u8 *mtk_data_ptr(struct nand_chip *chip, int i)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+
+ return nfc->buffer + i * mtk_data_len(chip);
+}
+
+static inline u8 *mtk_oob_ptr(struct nand_chip *chip, int i)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+
+ return nfc->buffer + i * mtk_data_len(chip) + chip->ecc.size;
+}
+
+static inline void nfi_writel(struct mtk_nfc *nfc, u32 val, u32 reg)
+{
+ writel(val, nfc->regs + reg);
+}
+
+static inline void nfi_writew(struct mtk_nfc *nfc, u16 val, u32 reg)
+{
+ writew(val, nfc->regs + reg);
+}
+
+static inline void nfi_writeb(struct mtk_nfc *nfc, u8 val, u32 reg)
+{
+ writeb(val, nfc->regs + reg);
+}
+
+static inline u32 nfi_readl(struct mtk_nfc *nfc, u32 reg)
+{
+ return readl_relaxed(nfc->regs + reg);
+}
+
+static inline u16 nfi_readw(struct mtk_nfc *nfc, u32 reg)
+{
+ return readw_relaxed(nfc->regs + reg);
+}
+
+static inline u8 nfi_readb(struct mtk_nfc *nfc, u32 reg)
+{
+ return readb_relaxed(nfc->regs + reg);
+}
+
+static void mtk_nfc_hw_reset(struct mtk_nfc *nfc)
+{
+ struct device *dev = nfc->dev;
+ u32 val;
+ int ret;
+
+ /* reset all registers and force the NFI master to terminate */
+ nfi_writel(nfc, CON_FIFO_FLUSH | CON_NFI_RST, NFI_CON);
+
+ /* wait for the master to finish the last transaction */
+ ret = readl_poll_timeout(nfc->regs + NFI_MASTER_STA, val,
+ !(val & MASTER_STA_MASK), 50,
+ MTK_RESET_TIMEOUT);
+ if (ret)
+ dev_warn(dev, "master active in reset [0x%x] = 0x%x\n",
+ NFI_MASTER_STA, val);
+
+ /* ensure any status register affected by the NFI master is reset */
+ nfi_writel(nfc, CON_FIFO_FLUSH | CON_NFI_RST, NFI_CON);
+ nfi_writew(nfc, STAR_DE, NFI_STRDATA);
+}
+
+static int mtk_nfc_send_command(struct mtk_nfc *nfc, u8 command)
+{
+ struct device *dev = nfc->dev;
+ u32 val;
+ int ret;
+
+ nfi_writel(nfc, command, NFI_CMD);
+
+ ret = readl_poll_timeout_atomic(nfc->regs + NFI_STA, val,
+ !(val & STA_CMD), 10, MTK_TIMEOUT);
+ if (ret) {
+ dev_warn(dev, "nfi core timed out entering command mode\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int mtk_nfc_send_address(struct mtk_nfc *nfc, int addr)
+{
+ struct device *dev = nfc->dev;
+ u32 val;
+ int ret;
+
+ nfi_writel(nfc, addr, NFI_COLADDR);
+ nfi_writel(nfc, 0, NFI_ROWADDR);
+ nfi_writew(nfc, 1, NFI_ADDRNOB);
+
+ ret = readl_poll_timeout_atomic(nfc->regs + NFI_STA, val,
+ !(val & STA_ADDR), 10, MTK_TIMEOUT);
+ if (ret) {
+ dev_warn(dev, "nfi core timed out entering address mode\n");
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static int mtk_nfc_hw_runtime_config(struct mtd_info *mtd)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ u32 fmt, spare;
+
+ if (!mtd->writesize)
+ return 0;
+
+ spare = mtk_nand->spare_per_sector;
+
+ switch (mtd->writesize) {
+ case 512:
+ fmt = PAGEFMT_512_2K | PAGEFMT_SEC_SEL_512;
+ break;
+ case KB(2):
+ if (chip->ecc.size == 512)
+ fmt = PAGEFMT_2K_4K | PAGEFMT_SEC_SEL_512;
+ else
+ fmt = PAGEFMT_512_2K;
+ break;
+ case KB(4):
+ if (chip->ecc.size == 512)
+ fmt = PAGEFMT_4K_8K | PAGEFMT_SEC_SEL_512;
+ else
+ fmt = PAGEFMT_2K_4K;
+ break;
+ case KB(8):
+ if (chip->ecc.size == 512)
+ fmt = PAGEFMT_8K_16K | PAGEFMT_SEC_SEL_512;
+ else
+ fmt = PAGEFMT_4K_8K;
+ break;
+ case KB(16):
+ fmt = PAGEFMT_8K_16K;
+ break;
+ default:
+ dev_err(nfc->dev, "invalid page len: %d\n", mtd->writesize);
+ return -EINVAL;
+ }
+
+ /*
+ * the hardware will double the value for this eccsize, so we need to
+ * halve it
+ */
+ if (chip->ecc.size == 1024)
+ spare >>= 1;
+
+ switch (spare) {
+ case 16:
+ fmt |= (PAGEFMT_SPARE_16 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 26:
+ fmt |= (PAGEFMT_SPARE_26 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 27:
+ fmt |= (PAGEFMT_SPARE_27 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 28:
+ fmt |= (PAGEFMT_SPARE_28 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 32:
+ fmt |= (PAGEFMT_SPARE_32 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 36:
+ fmt |= (PAGEFMT_SPARE_36 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 40:
+ fmt |= (PAGEFMT_SPARE_40 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 44:
+ fmt |= (PAGEFMT_SPARE_44 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 48:
+ fmt |= (PAGEFMT_SPARE_48 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 49:
+ fmt |= (PAGEFMT_SPARE_49 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 50:
+ fmt |= (PAGEFMT_SPARE_50 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 51:
+ fmt |= (PAGEFMT_SPARE_51 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 52:
+ fmt |= (PAGEFMT_SPARE_52 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 62:
+ fmt |= (PAGEFMT_SPARE_62 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 63:
+ fmt |= (PAGEFMT_SPARE_63 << PAGEFMT_SPARE_SHIFT);
+ break;
+ case 64:
+ fmt |= (PAGEFMT_SPARE_64 << PAGEFMT_SPARE_SHIFT);
+ break;
+ default:
+ dev_err(nfc->dev, "invalid spare per sector %d\n", spare);
+ return -EINVAL;
+ }
+
+ fmt |= mtk_nand->fdm.reg_size << PAGEFMT_FDM_SHIFT;
+ fmt |= mtk_nand->fdm.ecc_size << PAGEFMT_FDM_ECC_SHIFT;
+ nfi_writew(nfc, fmt, NFI_PAGEFMT);
+
+ nfc->ecc_cfg.strength = chip->ecc.strength;
+ nfc->ecc_cfg.len = chip->ecc.size + mtk_nand->fdm.ecc_size;
+
+ return 0;
+}
+
+static void mtk_nfc_select_chip(struct mtd_info *mtd, int chip)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ struct mtk_nfc *nfc = nand_get_controller_data(nand);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(nand);
+
+ if (chip < 0)
+ return;
+
+ mtk_nfc_hw_runtime_config(mtd);
+
+ nfi_writel(nfc, mtk_nand->sels[chip], NFI_CSEL);
+}
+
+static int mtk_nfc_dev_ready(struct mtd_info *mtd)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd));
+
+ if (nfi_readl(nfc, NFI_STA) & STA_BUSY)
+ return 0;
+
+ return 1;
+}
+
+static void mtk_nfc_cmd_ctrl(struct mtd_info *mtd, int dat, unsigned int ctrl)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd));
+
+ if (ctrl & NAND_ALE) {
+ mtk_nfc_send_address(nfc, dat);
+ } else if (ctrl & NAND_CLE) {
+ mtk_nfc_hw_reset(nfc);
+
+ nfi_writew(nfc, CNFG_OP_CUST, NFI_CNFG);
+ mtk_nfc_send_command(nfc, dat);
+ }
+}
+
+static inline void mtk_nfc_wait_ioready(struct mtk_nfc *nfc)
+{
+ int rc;
+ u8 val;
+
+ rc = readb_poll_timeout_atomic(nfc->regs + NFI_PIO_DIRDY, val,
+ val & PIO_DI_RDY, 10, MTK_TIMEOUT);
+ if (rc < 0)
+ dev_err(nfc->dev, "data not ready\n");
+}
+
+static inline u8 mtk_nfc_read_byte(struct mtd_info *mtd)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ u32 reg;
+
+ /* after each byte read, the NFI_STA reg is reset by the hardware */
+ reg = nfi_readl(nfc, NFI_STA) & NFI_FSM_MASK;
+ if (reg != NFI_FSM_CUSTDATA) {
+ reg = nfi_readw(nfc, NFI_CNFG);
+ reg |= CNFG_BYTE_RW | CNFG_READ_EN;
+ nfi_writew(nfc, reg, NFI_CNFG);
+
+ /*
+ * set to max sector to allow the HW to continue reading over
+ * unaligned accesses
+ */
+ reg = (MTK_MAX_SECTOR << CON_SEC_SHIFT) | CON_BRD;
+ nfi_writel(nfc, reg, NFI_CON);
+
+ /* trigger to fetch data */
+ nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+ }
+
+ mtk_nfc_wait_ioready(nfc);
+
+ return nfi_readb(nfc, NFI_DATAR);
+}
+
+static void mtk_nfc_read_buf(struct mtd_info *mtd, u8 *buf, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++)
+ buf[i] = mtk_nfc_read_byte(mtd);
+}
+
+static void mtk_nfc_write_byte(struct mtd_info *mtd, u8 byte)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(mtd_to_nand(mtd));
+ u32 reg;
+
+ reg = nfi_readl(nfc, NFI_STA) & NFI_FSM_MASK;
+
+ if (reg != NFI_FSM_CUSTDATA) {
+ reg = nfi_readw(nfc, NFI_CNFG) | CNFG_BYTE_RW;
+ nfi_writew(nfc, reg, NFI_CNFG);
+
+ reg = MTK_MAX_SECTOR << CON_SEC_SHIFT | CON_BWR;
+ nfi_writel(nfc, reg, NFI_CON);
+
+ nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+ }
+
+ mtk_nfc_wait_ioready(nfc);
+ nfi_writeb(nfc, byte, NFI_DATAW);
+}
+
+static void mtk_nfc_write_buf(struct mtd_info *mtd, const u8 *buf, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++)
+ mtk_nfc_write_byte(mtd, buf[i]);
+}
+
+static int mtk_nfc_sector_encode(struct nand_chip *chip, u8 *data)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ int size = chip->ecc.size + mtk_nand->fdm.reg_size;
+
+ nfc->ecc_cfg.mode = ECC_DMA_MODE;
+ nfc->ecc_cfg.op = ECC_ENCODE;
+
+ return mtk_ecc_encode(nfc->ecc, &nfc->ecc_cfg, data, size);
+}
+
+static void mtk_nfc_no_bad_mark_swap(struct mtd_info *a, u8 *b, int c)
+{
+ /* nop */
+}
+
+static void mtk_nfc_bad_mark_swap(struct mtd_info *mtd, u8 *buf, int raw)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *nand = to_mtk_nand(chip);
+ u32 bad_pos = nand->bad_mark.pos;
+
+ if (raw)
+ bad_pos += nand->bad_mark.sec * mtk_data_len(chip);
+ else
+ bad_pos += nand->bad_mark.sec * chip->ecc.size;
+
+ swap(chip->oob_poi[0], buf[bad_pos]);
+}
+
+static int mtk_nfc_format_subpage(struct mtd_info *mtd, u32 offset,
+ u32 len, const u8 *buf)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+ u32 start, end;
+ int i, ret;
+
+ start = offset / chip->ecc.size;
+ end = DIV_ROUND_UP(offset + len, chip->ecc.size);
+
+ memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize);
+ for (i = 0; i < chip->ecc.steps; i++) {
+ memcpy(mtk_data_ptr(chip, i), data_ptr(chip, buf, i),
+ chip->ecc.size);
+
+ if (start > i || i >= end)
+ continue;
+
+ if (i == mtk_nand->bad_mark.sec)
+ mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1);
+
+ memcpy(mtk_oob_ptr(chip, i), oob_ptr(chip, i), fdm->reg_size);
+
+ /* program the CRC back to the OOB */
+ ret = mtk_nfc_sector_encode(chip, mtk_data_ptr(chip, i));
+ if (ret < 0)
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mtk_nfc_format_page(struct mtd_info *mtd, const u8 *buf)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+ u32 i;
+
+ memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize);
+ for (i = 0; i < chip->ecc.steps; i++) {
+ if (buf)
+ memcpy(mtk_data_ptr(chip, i), data_ptr(chip, buf, i),
+ chip->ecc.size);
+
+ if (i == mtk_nand->bad_mark.sec)
+ mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1);
+
+ memcpy(mtk_oob_ptr(chip, i), oob_ptr(chip, i), fdm->reg_size);
+ }
+}
+
+static inline void mtk_nfc_read_fdm(struct nand_chip *chip, u32 start,
+ u32 sectors)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+ u32 vall, valm;
+ u8 *oobptr;
+ int i, j;
+
+ for (i = 0; i < sectors; i++) {
+ oobptr = oob_ptr(chip, start + i);
+ vall = nfi_readl(nfc, NFI_FDML(i));
+ valm = nfi_readl(nfc, NFI_FDMM(i));
+
+ for (j = 0; j < fdm->reg_size; j++)
+ oobptr[j] = (j >= 4 ? valm : vall) >> ((j % 4) * 8);
+ }
+}
+
+static inline void mtk_nfc_write_fdm(struct nand_chip *chip)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+ u32 vall, valm;
+ u8 *oobptr;
+ int i, j;
+
+ for (i = 0; i < chip->ecc.steps; i++) {
+ oobptr = oob_ptr(chip, i);
+ vall = 0;
+ valm = 0;
+ for (j = 0; j < 8; j++) {
+ if (j < 4)
+ vall |= (j < fdm->reg_size ? oobptr[j] : 0xff)
+ << (j * 8);
+ else
+ valm |= (j < fdm->reg_size ? oobptr[j] : 0xff)
+ << ((j - 4) * 8);
+ }
+ nfi_writel(nfc, vall, NFI_FDML(i));
+ nfi_writel(nfc, valm, NFI_FDMM(i));
+ }
+}
+
+static int mtk_nfc_do_write_page(struct mtd_info *mtd, struct nand_chip *chip,
+ const u8 *buf, int page, int len)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct device *dev = nfc->dev;
+ dma_addr_t addr;
+ u32 reg;
+ int ret;
+
+ addr = dma_map_single(dev, (void *)buf, len, DMA_TO_DEVICE);
+ ret = dma_mapping_error(nfc->dev, addr);
+ if (ret) {
+ dev_err(nfc->dev, "dma mapping error\n");
+ return -EINVAL;
+ }
+
+ reg = nfi_readw(nfc, NFI_CNFG) | CNFG_AHB | CNFG_DMA_BURST_EN;
+ nfi_writew(nfc, reg, NFI_CNFG);
+
+ nfi_writel(nfc, chip->ecc.steps << CON_SEC_SHIFT, NFI_CON);
+ nfi_writel(nfc, lower_32_bits(addr), NFI_STRADDR);
+ nfi_writew(nfc, INTR_AHB_DONE_EN, NFI_INTR_EN);
+
+ init_completion(&nfc->done);
+
+ reg = nfi_readl(nfc, NFI_CON) | CON_BWR;
+ nfi_writel(nfc, reg, NFI_CON);
+ nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+
+ ret = wait_for_completion_timeout(&nfc->done, msecs_to_jiffies(500));
+ if (!ret) {
+ dev_err(dev, "program ahb done timeout\n");
+ nfi_writew(nfc, 0, NFI_INTR_EN);
+ ret = -ETIMEDOUT;
+ goto timeout;
+ }
+
+ ret = readl_poll_timeout_atomic(nfc->regs + NFI_ADDRCNTR, reg,
+ (reg & CNTR_MASK) >= chip->ecc.steps,
+ 10, MTK_TIMEOUT);
+ if (ret)
+ dev_err(dev, "hwecc write timeout\n");
+
+timeout:
+
+ dma_unmap_single(nfc->dev, addr, len, DMA_TO_DEVICE);
+ nfi_writel(nfc, 0, NFI_CON);
+
+ return ret;
+}
+
+static int mtk_nfc_write_page(struct mtd_info *mtd, struct nand_chip *chip,
+ const u8 *buf, int page, int raw)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ size_t len;
+ const u8 *bufpoi;
+ u32 reg;
+ int ret;
+
+ if (!raw) {
+ /* OOB => FDM: from register, ECC: from HW */
+ reg = nfi_readw(nfc, NFI_CNFG) | CNFG_AUTO_FMT_EN;
+ nfi_writew(nfc, reg | CNFG_HW_ECC_EN, NFI_CNFG);
+
+ nfc->ecc_cfg.op = ECC_ENCODE;
+ nfc->ecc_cfg.mode = ECC_NFI_MODE;
+ ret = mtk_ecc_enable(nfc->ecc, &nfc->ecc_cfg);
+ if (ret) {
+ /* clear NFI config */
+ reg = nfi_readw(nfc, NFI_CNFG);
+ reg &= ~(CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN);
+ nfi_writew(nfc, reg, NFI_CNFG);
+
+ return ret;
+ }
+
+ memcpy(nfc->buffer, buf, mtd->writesize);
+ mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, raw);
+ bufpoi = nfc->buffer;
+
+ /* write OOB into the FDM registers (OOB area in MTK NAND) */
+ mtk_nfc_write_fdm(chip);
+ } else {
+ bufpoi = buf;
+ }
+
+ len = mtd->writesize + (raw ? mtd->oobsize : 0);
+ ret = mtk_nfc_do_write_page(mtd, chip, bufpoi, page, len);
+
+ if (!raw)
+ mtk_ecc_disable(nfc->ecc);
+
+ return ret;
+}
+
+static int mtk_nfc_write_page_hwecc(struct mtd_info *mtd,
+ struct nand_chip *chip, const u8 *buf,
+ int oob_on, int page)
+{
+ return mtk_nfc_write_page(mtd, chip, buf, page, 0);
+}
+
+static int mtk_nfc_write_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
+ const u8 *buf, int oob_on, int pg)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+
+ mtk_nfc_format_page(mtd, buf);
+ return mtk_nfc_write_page(mtd, chip, nfc->buffer, pg, 1);
+}
+
+static int mtk_nfc_write_subpage_hwecc(struct mtd_info *mtd,
+ struct nand_chip *chip, u32 offset,
+ u32 data_len, const u8 *buf,
+ int oob_on, int page)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ int ret;
+
+ ret = mtk_nfc_format_subpage(mtd, offset, data_len, buf);
+ if (ret < 0)
+ return ret;
+
+ /* use the data in the private buffer (now with FDM and CRC) */
+ return mtk_nfc_write_page(mtd, chip, nfc->buffer, page, 1);
+}
+
+static int mtk_nfc_write_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
+ int page)
+{
+ int ret;
+
+ chip->cmdfunc(mtd, NAND_CMD_SEQIN, 0x00, page);
+
+ ret = mtk_nfc_write_page_raw(mtd, chip, NULL, 1, page);
+ if (ret < 0)
+ return -EIO;
+
+ chip->cmdfunc(mtd, NAND_CMD_PAGEPROG, -1, -1);
+ ret = chip->waitfunc(mtd, chip);
+
+ return ret & NAND_STATUS_FAIL ? -EIO : 0;
+}
+
+static int mtk_nfc_update_ecc_stats(struct mtd_info *mtd, u8 *buf, u32 sectors)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_ecc_stats stats;
+ int rc, i;
+
+ rc = nfi_readl(nfc, NFI_STA) & STA_EMP_PAGE;
+ if (rc) {
+ memset(buf, 0xff, sectors * chip->ecc.size);
+ for (i = 0; i < sectors; i++)
+ memset(oob_ptr(chip, i), 0xff, mtk_nand->fdm.reg_size);
+ return 0;
+ }
+
+ mtk_ecc_get_stats(nfc->ecc, &stats, sectors);
+ mtd->ecc_stats.corrected += stats.corrected;
+ mtd->ecc_stats.failed += stats.failed;
+
+ return stats.bitflips;
+}
+
+static int mtk_nfc_read_subpage(struct mtd_info *mtd, struct nand_chip *chip,
+ u32 data_offs, u32 readlen,
+ u8 *bufpoi, int page, int raw)
+{
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ u32 spare = mtk_nand->spare_per_sector;
+ u32 column, sectors, start, end, reg;
+ dma_addr_t addr;
+ int bitflips;
+ size_t len;
+ u8 *buf;
+ int rc;
+
+ start = data_offs / chip->ecc.size;
+ end = DIV_ROUND_UP(data_offs + readlen, chip->ecc.size);
+
+ sectors = end - start;
+ column = start * (chip->ecc.size + spare);
+
+ len = sectors * chip->ecc.size + (raw ? sectors * spare : 0);
+ buf = bufpoi + start * chip->ecc.size;
+
+ if (column != 0)
+ chip->cmdfunc(mtd, NAND_CMD_RNDOUT, column, -1);
+
+ addr = dma_map_single(nfc->dev, buf, len, DMA_FROM_DEVICE);
+ rc = dma_mapping_error(nfc->dev, addr);
+ if (rc) {
+ dev_err(nfc->dev, "dma mapping error\n");
+
+ return -EINVAL;
+ }
+
+ reg = nfi_readw(nfc, NFI_CNFG);
+ reg |= CNFG_READ_EN | CNFG_DMA_BURST_EN | CNFG_AHB;
+ if (!raw) {
+ reg |= CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN;
+ nfi_writew(nfc, reg, NFI_CNFG);
+
+ nfc->ecc_cfg.mode = ECC_NFI_MODE;
+ nfc->ecc_cfg.sectors = sectors;
+ nfc->ecc_cfg.op = ECC_DECODE;
+ rc = mtk_ecc_enable(nfc->ecc, &nfc->ecc_cfg);
+ if (rc) {
+ dev_err(nfc->dev, "ecc enable\n");
+ /* clear NFI_CNFG */
+ reg &= ~(CNFG_DMA_BURST_EN | CNFG_AHB | CNFG_READ_EN |
+ CNFG_AUTO_FMT_EN | CNFG_HW_ECC_EN);
+ nfi_writew(nfc, reg, NFI_CNFG);
+ dma_unmap_single(nfc->dev, addr, len, DMA_FROM_DEVICE);
+
+ return rc;
+ }
+ } else {
+ nfi_writew(nfc, reg, NFI_CNFG);
+ }
+
+ nfi_writel(nfc, sectors << CON_SEC_SHIFT, NFI_CON);
+ nfi_writew(nfc, INTR_AHB_DONE_EN, NFI_INTR_EN);
+ nfi_writel(nfc, lower_32_bits(addr), NFI_STRADDR);
+
+ init_completion(&nfc->done);
+ reg = nfi_readl(nfc, NFI_CON) | CON_BRD;
+ nfi_writel(nfc, reg, NFI_CON);
+ nfi_writew(nfc, STAR_EN, NFI_STRDATA);
+
+ rc = wait_for_completion_timeout(&nfc->done, msecs_to_jiffies(500));
+ if (!rc)
+ dev_warn(nfc->dev, "read ahb/dma done timeout\n");
+
+ rc = readl_poll_timeout_atomic(nfc->regs + NFI_BYTELEN, reg,
+ (reg & CNTR_MASK) >= sectors, 10,
+ MTK_TIMEOUT);
+ if (rc < 0) {
+ dev_err(nfc->dev, "subpage done timeout\n");
+ bitflips = -EIO;
+ } else {
+ bitflips = 0;
+ if (!raw) {
+ rc = mtk_ecc_wait_done(nfc->ecc, ECC_DECODE);
+ bitflips = rc < 0 ? -ETIMEDOUT :
+ mtk_nfc_update_ecc_stats(mtd, buf, sectors);
+ mtk_nfc_read_fdm(chip, start, sectors);
+ }
+ }
+
+ dma_unmap_single(nfc->dev, addr, len, DMA_FROM_DEVICE);
+
+ if (raw)
+ goto done;
+
+ mtk_ecc_disable(nfc->ecc);
+
+ if (clamp(mtk_nand->bad_mark.sec, start, end) == mtk_nand->bad_mark.sec)
+ mtk_nand->bad_mark.bm_swap(mtd, bufpoi, raw);
+done:
+ nfi_writel(nfc, 0, NFI_CON);
+
+ return bitflips;
+}
+
+static int mtk_nfc_read_subpage_hwecc(struct mtd_info *mtd,
+ struct nand_chip *chip, u32 off,
+ u32 len, u8 *p, int pg)
+{
+ return mtk_nfc_read_subpage(mtd, chip, off, len, p, pg, 0);
+}
+
+static int mtk_nfc_read_page_hwecc(struct mtd_info *mtd,
+ struct nand_chip *chip, u8 *p,
+ int oob_on, int pg)
+{
+ return mtk_nfc_read_subpage(mtd, chip, 0, mtd->writesize, p, pg, 0);
+}
+
+static int mtk_nfc_read_page_raw(struct mtd_info *mtd, struct nand_chip *chip,
+ u8 *buf, int oob_on, int page)
+{
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc *nfc = nand_get_controller_data(chip);
+ struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+ int i, ret;
+
+ memset(nfc->buffer, 0xff, mtd->writesize + mtd->oobsize);
+ ret = mtk_nfc_read_subpage(mtd, chip, 0, mtd->writesize, nfc->buffer,
+ page, 1);
+ if (ret < 0)
+ return ret;
+
+ for (i = 0; i < chip->ecc.steps; i++) {
+ memcpy(oob_ptr(chip, i), mtk_oob_ptr(chip, i), fdm->reg_size);
+
+ if (i == mtk_nand->bad_mark.sec)
+ mtk_nand->bad_mark.bm_swap(mtd, nfc->buffer, 1);
+
+ if (buf)
+ memcpy(data_ptr(chip, buf, i), mtk_data_ptr(chip, i),
+ chip->ecc.size);
+ }
+
+ return ret;
+}
+
+static int mtk_nfc_read_oob_std(struct mtd_info *mtd, struct nand_chip *chip,
+ int page)
+{
+ chip->cmdfunc(mtd, NAND_CMD_READ0, 0, page);
+
+ return mtk_nfc_read_page_raw(mtd, chip, NULL, 1, page);
+}
+
+static inline void mtk_nfc_hw_init(struct mtk_nfc *nfc)
+{
+ /*
+ * ACCON: access timing control register
+ * -------------------------------------
+ * 31:28: minimum required time for CS post pulling down after accessing
+ * the device
+ * 27:22: minimum required time for CS pre pulling down before accessing
+ * the device
+ * 21:16: minimum required time from NCEB low to NREB low
+ * 15:12: minimum required time from NWEB high to NREB low.
+ * 11:08: write enable hold time
+ * 07:04: write wait states
+ * 03:00: read wait states
+ */
+ nfi_writel(nfc, 0x10804211, NFI_ACCCON);
+
+ /*
+ * CNRNB: nand ready/busy register
+ * -------------------------------
+ * 7:4: timeout register for polling the NAND busy/ready signal
+ * 0 : poll the status of the busy/ready signal after [7:4]*16 cycles.
+ */
+ nfi_writew(nfc, 0xf1, NFI_CNRNB);
+ nfi_writew(nfc, PAGEFMT_8K_16K, NFI_PAGEFMT);
+
+ mtk_nfc_hw_reset(nfc);
+
+ nfi_readl(nfc, NFI_INTR_STA);
+ nfi_writel(nfc, 0, NFI_INTR_EN);
+}
+
+static irqreturn_t mtk_nfc_irq(int irq, void *id)
+{
+ struct mtk_nfc *nfc = id;
+ u16 sta, ien;
+
+ sta = nfi_readw(nfc, NFI_INTR_STA);
+ ien = nfi_readw(nfc, NFI_INTR_EN);
+
+ if (!(sta & ien))
+ return IRQ_NONE;
+
+ nfi_writew(nfc, ~sta & ien, NFI_INTR_EN);
+ complete(&nfc->done);
+
+ return IRQ_HANDLED;
+}
+
+static int mtk_nfc_enable_clk(struct device *dev, struct mtk_nfc_clk *clk)
+{
+ int ret;
+
+ ret = clk_prepare_enable(clk->nfi_clk);
+ if (ret) {
+ dev_err(dev, "failed to enable nfi clk\n");
+ return ret;
+ }
+
+ ret = clk_prepare_enable(clk->pad_clk);
+ if (ret) {
+ dev_err(dev, "failed to enable pad clk\n");
+ clk_disable_unprepare(clk->nfi_clk);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void mtk_nfc_disable_clk(struct mtk_nfc_clk *clk)
+{
+ clk_disable_unprepare(clk->nfi_clk);
+ clk_disable_unprepare(clk->pad_clk);
+}
+
+static int mtk_nfc_ooblayout_free(struct mtd_info *mtd, int section,
+ struct mtd_oob_region *oob_region)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ struct mtk_nfc_fdm *fdm = &mtk_nand->fdm;
+ u32 eccsteps;
+
+ eccsteps = mtd->writesize / chip->ecc.size;
+
+ if (section >= eccsteps)
+ return -ERANGE;
+
+ oob_region->length = fdm->reg_size - fdm->ecc_size;
+ oob_region->offset = section * fdm->reg_size + fdm->ecc_size;
+
+ return 0;
+}
+
+static int mtk_nfc_ooblayout_ecc(struct mtd_info *mtd, int section,
+ struct mtd_oob_region *oob_region)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *mtk_nand = to_mtk_nand(chip);
+ u32 eccsteps;
+
+ if (section)
+ return -ERANGE;
+
+ eccsteps = mtd->writesize / chip->ecc.size;
+ oob_region->offset = mtk_nand->fdm.reg_size * eccsteps;
+ oob_region->length = mtd->oobsize - oob_region->offset;
+
+ return 0;
+}
+
+static const struct mtd_ooblayout_ops mtk_nfc_ooblayout_ops = {
+ .free = mtk_nfc_ooblayout_free,
+ .ecc = mtk_nfc_ooblayout_ecc,
+};
+
+static void mtk_nfc_set_fdm(struct mtk_nfc_fdm *fdm, struct mtd_info *mtd)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ struct mtk_nfc_nand_chip *chip = to_mtk_nand(nand);
+ u32 ecc_bytes;
+
+ ecc_bytes = DIV_ROUND_UP(nand->ecc.strength * ECC_PARITY_BITS, 8);
+
+ fdm->reg_size = chip->spare_per_sector - ecc_bytes;
+ if (fdm->reg_size > NFI_FDM_MAX_SIZE)
+ fdm->reg_size = NFI_FDM_MAX_SIZE;
+
+ /* bad block mark storage */
+ fdm->ecc_size = 1;
+}
+
+static void mtk_nfc_set_bad_mark_ctl(struct mtk_nfc_bad_mark_ctl *bm_ctl,
+ struct mtd_info *mtd)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+
+ if (mtd->writesize == 512) {
+ bm_ctl->bm_swap = mtk_nfc_no_bad_mark_swap;
+ } else {
+ bm_ctl->bm_swap = mtk_nfc_bad_mark_swap;
+ bm_ctl->sec = mtd->writesize / mtk_data_len(nand);
+ bm_ctl->pos = mtd->writesize % mtk_data_len(nand);
+ }
+}
+
+static void mtk_nfc_set_spare_per_sector(u32 *sps, struct mtd_info *mtd)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ u32 spare[] = {16, 26, 27, 28, 32, 36, 40, 44,
+ 48, 49, 50, 51, 52, 62, 63, 64};
+ u32 eccsteps, i;
+
+ eccsteps = mtd->writesize / nand->ecc.size;
+ *sps = mtd->oobsize / eccsteps;
+
+ if (nand->ecc.size == 1024)
+ *sps >>= 1;
+
+ for (i = 0; i < ARRAY_SIZE(spare); i++) {
+ if (*sps <= spare[i]) {
+ if (!i)
+ *sps = spare[i];
+ else if (*sps != spare[i])
+ *sps = spare[i - 1];
+ break;
+ }
+ }
+
+ if (i >= ARRAY_SIZE(spare))
+ *sps = spare[ARRAY_SIZE(spare) - 1];
+
+ if (nand->ecc.size == 1024)
+ *sps <<= 1;
+}
+
+static int mtk_nfc_ecc_init(struct device *dev, struct mtd_info *mtd)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ u32 spare;
+ int free;
+
+ /* support only ecc hw mode */
+ if (nand->ecc.mode != NAND_ECC_HW) {
+ dev_err(dev, "ecc.mode not supported\n");
+ return -EINVAL;
+ }
+
+ /* if optional dt settings not present */
+ if (!nand->ecc.size || !nand->ecc.strength) {
+ /* use datasheet requirements */
+ nand->ecc.strength = nand->ecc_strength_ds;
+ nand->ecc.size = nand->ecc_step_ds;
+
+ /*
+ * align eccstrength and eccsize
+ * this controller only supports 512 and 1024 sizes
+ */
+ if (nand->ecc.size < 1024) {
+ if (mtd->writesize > 512) {
+ nand->ecc.size = 1024;
+ nand->ecc.strength <<= 1;
+ } else {
+ nand->ecc.size = 512;
+ }
+ } else {
+ nand->ecc.size = 1024;
+ }
+
+ mtk_nfc_set_spare_per_sector(&spare, mtd);
+
+ /* calculate oob bytes except ecc parity data */
+ free = ((nand->ecc.strength * ECC_PARITY_BITS) + 7) >> 3;
+ free = spare - free;
+
+ /*
+ * enhance ecc strength if oob left is bigger than max FDM size
+ * or reduce ecc strength if oob size is not enough for ecc
+ * parity data.
+ */
+ if (free > NFI_FDM_MAX_SIZE) {
+ spare -= NFI_FDM_MAX_SIZE;
+ nand->ecc.strength = (spare << 3) / ECC_PARITY_BITS;
+ } else if (free < 0) {
+ spare -= NFI_FDM_MIN_SIZE;
+ nand->ecc.strength = (spare << 3) / ECC_PARITY_BITS;
+ }
+ }
+
+ mtk_ecc_adjust_strength(&nand->ecc.strength);
+
+ dev_info(dev, "eccsize %d eccstrength %d\n",
+ nand->ecc.size, nand->ecc.strength);
+
+ return 0;
+}
+
+static int mtk_nfc_nand_chip_init(struct device *dev, struct mtk_nfc *nfc,
+ struct device_node *np)
+{
+ struct mtk_nfc_nand_chip *chip;
+ struct nand_chip *nand;
+ struct mtd_info *mtd;
+ int nsels, len;
+ u32 tmp;
+ int ret;
+ int i;
+
+ if (!of_get_property(np, "reg", &nsels))
+ return -ENODEV;
+
+ nsels /= sizeof(u32);
+ if (!nsels || nsels > MTK_NAND_MAX_NSELS) {
+ dev_err(dev, "invalid reg property size %d\n", nsels);
+ return -EINVAL;
+ }
+
+ chip = devm_kzalloc(dev, sizeof(*chip) + nsels * sizeof(u8),
+ GFP_KERNEL);
+ if (!chip)
+ return -ENOMEM;
+
+ chip->nsels = nsels;
+ for (i = 0; i < nsels; i++) {
+ ret = of_property_read_u32_index(np, "reg", i, &tmp);
+ if (ret) {
+ dev_err(dev, "reg property failure : %d\n", ret);
+ return ret;
+ }
+ chip->sels[i] = tmp;
+ }
+
+ nand = &chip->nand;
+ nand->controller = &nfc->controller;
+
+ nand_set_flash_node(nand, np);
+ nand_set_controller_data(nand, nfc);
+
+ nand->options |= NAND_USE_BOUNCE_BUFFER | NAND_SUBPAGE_READ;
+ nand->dev_ready = mtk_nfc_dev_ready;
+ nand->select_chip = mtk_nfc_select_chip;
+ nand->write_byte = mtk_nfc_write_byte;
+ nand->write_buf = mtk_nfc_write_buf;
+ nand->read_byte = mtk_nfc_read_byte;
+ nand->read_buf = mtk_nfc_read_buf;
+ nand->cmd_ctrl = mtk_nfc_cmd_ctrl;
+
+ /* set default mode in case dt entry is missing */
+ nand->ecc.mode = NAND_ECC_HW;
+
+ nand->ecc.write_subpage = mtk_nfc_write_subpage_hwecc;
+ nand->ecc.write_page_raw = mtk_nfc_write_page_raw;
+ nand->ecc.write_page = mtk_nfc_write_page_hwecc;
+ nand->ecc.write_oob_raw = mtk_nfc_write_oob_std;
+ nand->ecc.write_oob = mtk_nfc_write_oob_std;
+
+ nand->ecc.read_subpage = mtk_nfc_read_subpage_hwecc;
+ nand->ecc.read_page_raw = mtk_nfc_read_page_raw;
+ nand->ecc.read_page = mtk_nfc_read_page_hwecc;
+ nand->ecc.read_oob_raw = mtk_nfc_read_oob_std;
+ nand->ecc.read_oob = mtk_nfc_read_oob_std;
+
+ mtd = nand_to_mtd(nand);
+ mtd->owner = THIS_MODULE;
+ mtd->dev.parent = dev;
+ mtd->name = MTK_NAME;
+ mtd_set_ooblayout(mtd, &mtk_nfc_ooblayout_ops);
+
+ mtk_nfc_hw_init(nfc);
+
+ ret = nand_scan_ident(mtd, nsels, NULL);
+ if (ret)
+ return -ENODEV;
+
+ /* store bbt magic in page, cause OOB is not protected */
+ if (nand->bbt_options & NAND_BBT_USE_FLASH)
+ nand->bbt_options |= NAND_BBT_NO_OOB;
+
+ ret = mtk_nfc_ecc_init(dev, mtd);
+ if (ret)
+ return -EINVAL;
+
+ if (nand->options & NAND_BUSWIDTH_16) {
+ dev_err(dev, "16bits buswidth not supported");
+ return -EINVAL;
+ }
+
+ mtk_nfc_set_spare_per_sector(&chip->spare_per_sector, mtd);
+ mtk_nfc_set_fdm(&chip->fdm, mtd);
+ mtk_nfc_set_bad_mark_ctl(&chip->bad_mark, mtd);
+
+ len = mtd->writesize + mtd->oobsize;
+ nfc->buffer = devm_kzalloc(dev, len, GFP_KERNEL);
+ if (!nfc->buffer)
+ return -ENOMEM;
+
+ ret = nand_scan_tail(mtd);
+ if (ret)
+ return -ENODEV;
+
+ ret = mtd_device_parse_register(mtd, NULL, NULL, NULL, 0);
+ if (ret) {
+ dev_err(dev, "mtd parse partition error\n");
+ nand_release(mtd);
+ return ret;
+ }
+
+ list_add_tail(&chip->node, &nfc->chips);
+
+ return 0;
+}
+
+static int mtk_nfc_nand_chips_init(struct device *dev, struct mtk_nfc *nfc)
+{
+ struct device_node *np = dev->of_node;
+ struct device_node *nand_np;
+ int ret;
+
+ for_each_child_of_node(np, nand_np) {
+ ret = mtk_nfc_nand_chip_init(dev, nfc, nand_np);
+ if (ret) {
+ of_node_put(nand_np);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+static int mtk_nfc_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct device_node *np = dev->of_node;
+ struct mtk_nfc *nfc;
+ struct resource *res;
+ int ret, irq;
+
+ nfc = devm_kzalloc(dev, sizeof(*nfc), GFP_KERNEL);
+ if (!nfc)
+ return -ENOMEM;
+
+ spin_lock_init(&nfc->controller.lock);
+ init_waitqueue_head(&nfc->controller.wq);
+ INIT_LIST_HEAD(&nfc->chips);
+
+ /* probe defer if not ready */
+ nfc->ecc = of_mtk_ecc_get(np);
+ if (IS_ERR(nfc->ecc))
+ return PTR_ERR(nfc->ecc);
+ else if (!nfc->ecc)
+ return -ENODEV;
+
+ nfc->dev = dev;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ nfc->regs = devm_ioremap_resource(dev, res);
+ if (IS_ERR(nfc->regs)) {
+ ret = PTR_ERR(nfc->regs);
+ dev_err(dev, "no nfi base\n");
+ goto release_ecc;
+ }
+
+ nfc->clk.nfi_clk = devm_clk_get(dev, "nfi_clk");
+ if (IS_ERR(nfc->clk.nfi_clk)) {
+ dev_err(dev, "no clk\n");
+ ret = PTR_ERR(nfc->clk.nfi_clk);
+ goto release_ecc;
+ }
+
+ nfc->clk.pad_clk = devm_clk_get(dev, "pad_clk");
+ if (IS_ERR(nfc->clk.pad_clk)) {
+ dev_err(dev, "no pad clk\n");
+ ret = PTR_ERR(nfc->clk.pad_clk);
+ goto release_ecc;
+ }
+
+ ret = mtk_nfc_enable_clk(dev, &nfc->clk);
+ if (ret)
+ goto release_ecc;
+
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(dev, "no nfi irq resource\n");
+ ret = -EINVAL;
+ goto clk_disable;
+ }
+
+ ret = devm_request_irq(dev, irq, mtk_nfc_irq, 0x0, "mtk-nand", nfc);
+ if (ret) {
+ dev_err(dev, "failed to request nfi irq\n");
+ goto clk_disable;
+ }
+
+ ret = dma_set_mask(dev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_err(dev, "failed to set dma mask\n");
+ goto clk_disable;
+ }
+
+ platform_set_drvdata(pdev, nfc);
+
+ ret = mtk_nfc_nand_chips_init(dev, nfc);
+ if (ret) {
+ dev_err(dev, "failed to init nand chips\n");
+ goto clk_disable;
+ }
+
+ return 0;
+
+clk_disable:
+ mtk_nfc_disable_clk(&nfc->clk);
+
+release_ecc:
+ mtk_ecc_release(nfc->ecc);
+
+ return ret;
+}
+
+static int mtk_nfc_remove(struct platform_device *pdev)
+{
+ struct mtk_nfc *nfc = platform_get_drvdata(pdev);
+ struct mtk_nfc_nand_chip *chip;
+
+ while (!list_empty(&nfc->chips)) {
+ chip = list_first_entry(&nfc->chips, struct mtk_nfc_nand_chip,
+ node);
+ nand_release(nand_to_mtd(&chip->nand));
+ list_del(&chip->node);
+ }
+
+ mtk_ecc_release(nfc->ecc);
+ mtk_nfc_disable_clk(&nfc->clk);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int mtk_nfc_suspend(struct device *dev)
+{
+ struct mtk_nfc *nfc = dev_get_drvdata(dev);
+
+ mtk_nfc_disable_clk(&nfc->clk);
+
+ return 0;
+}
+
+static int mtk_nfc_resume(struct device *dev)
+{
+ struct mtk_nfc *nfc = dev_get_drvdata(dev);
+ struct mtk_nfc_nand_chip *chip;
+ struct nand_chip *nand;
+ struct mtd_info *mtd;
+ int ret;
+ u32 i;
+
+ udelay(200);
+
+ ret = mtk_nfc_enable_clk(dev, &nfc->clk);
+ if (ret)
+ return ret;
+
+ mtk_nfc_hw_init(nfc);
+
+ /* reset NAND chip if VCC was powered off */
+ list_for_each_entry(chip, &nfc->chips, node) {
+ nand = &chip->nand;
+ mtd = nand_to_mtd(nand);
+ for (i = 0; i < chip->nsels; i++) {
+ nand->select_chip(mtd, i);
+ nand->cmdfunc(mtd, NAND_CMD_RESET, -1, -1);
+ }
+ }
+
+ return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(mtk_nfc_pm_ops, mtk_nfc_suspend, mtk_nfc_resume);
+#endif
+
+static const struct of_device_id mtk_nfc_id_table[] = {
+ { .compatible = "mediatek,mt2701-nfc" },
+ {}
+};
+MODULE_DEVICE_TABLE(of, mtk_nfc_id_table);
+
+static struct platform_driver mtk_nfc_driver = {
+ .probe = mtk_nfc_probe,
+ .remove = mtk_nfc_remove,
+ .driver = {
+ .name = MTK_NAME,
+ .of_match_table = mtk_nfc_id_table,
+#ifdef CONFIG_PM_SLEEP
+ .pm = &mtk_nfc_pm_ops,
+#endif
+ },
+};
+
+module_platform_driver(mtk_nfc_driver);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Xiaolei Li <xiaolei.li@mediatek.com>");
+MODULE_DESCRIPTION("MTK Nand Flash Controller Driver");
diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 0b0dc29d2af7..77533f7f2429 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2610,7 +2610,7 @@ static int nand_do_write_ops(struct mtd_info *mtd, loff_t to,
int cached = writelen > bytes && page != blockmask;
uint8_t *wbuf = buf;
int use_bufpoi;
- int part_pagewr = (column || writelen < (mtd->writesize - 1));
+ int part_pagewr = (column || writelen < mtd->writesize);
if (part_pagewr)
use_bufpoi = 1;
diff --git a/drivers/mtd/nand/nand_ids.c b/drivers/mtd/nand/nand_ids.c
index ccc05f5b2695..2af9869a115e 100644
--- a/drivers/mtd/nand/nand_ids.c
+++ b/drivers/mtd/nand/nand_ids.c
@@ -168,6 +168,7 @@ struct nand_flash_dev nand_flash_ids[] = {
/* Manufacturer IDs */
struct nand_manufacturers nand_manuf_ids[] = {
{NAND_MFR_TOSHIBA, "Toshiba"},
+ {NAND_MFR_ESMT, "ESMT"},
{NAND_MFR_SAMSUNG, "Samsung"},
{NAND_MFR_FUJITSU, "Fujitsu"},
{NAND_MFR_NATIONAL, "National"},
diff --git a/drivers/mtd/nand/omap2.c b/drivers/mtd/nand/omap2.c
index a136da8df6fe..a59361c36f40 100644
--- a/drivers/mtd/nand/omap2.c
+++ b/drivers/mtd/nand/omap2.c
@@ -118,8 +118,6 @@
#define PREFETCH_STATUS_FIFO_CNT(val) ((val >> 24) & 0x7F)
#define STATUS_BUFF_EMPTY 0x00000001
-#define OMAP24XX_DMA_GPMC 4
-
#define SECTOR_BYTES 512
/* 4 bit padding to make byte aligned, 56 = 52 + 4 */
#define BCH4_BIT_PAD 4
@@ -1811,7 +1809,6 @@ static int omap_nand_probe(struct platform_device *pdev)
struct nand_chip *nand_chip;
int err;
dma_cap_mask_t mask;
- unsigned sig;
struct resource *res;
struct device *dev = &pdev->dev;
int min_oobbytes = BADBLOCK_MARKER_LENGTH;
@@ -1924,11 +1921,11 @@ static int omap_nand_probe(struct platform_device *pdev)
case NAND_OMAP_PREFETCH_DMA:
dma_cap_zero(mask);
dma_cap_set(DMA_SLAVE, mask);
- sig = OMAP24XX_DMA_GPMC;
- info->dma = dma_request_channel(mask, omap_dma_filter_fn, &sig);
- if (!info->dma) {
+ info->dma = dma_request_chan(pdev->dev.parent, "rxtx");
+
+ if (IS_ERR(info->dma)) {
dev_err(&pdev->dev, "DMA engine request failed\n");
- err = -ENXIO;
+ err = PTR_ERR(info->dma);
goto return_error;
} else {
struct dma_slave_config cfg;
diff --git a/drivers/mtd/nand/sunxi_nand.c b/drivers/mtd/nand/sunxi_nand.c
index a83a690688b4..e414b31b71c1 100644
--- a/drivers/mtd/nand/sunxi_nand.c
+++ b/drivers/mtd/nand/sunxi_nand.c
@@ -39,6 +39,7 @@
#include <linux/gpio.h>
#include <linux/interrupt.h>
#include <linux/iopoll.h>
+#include <linux/reset.h>
#define NFC_REG_CTL 0x0000
#define NFC_REG_ST 0x0004
@@ -153,6 +154,7 @@
/* define bit use in NFC_ECC_ST */
#define NFC_ECC_ERR(x) BIT(x)
+#define NFC_ECC_ERR_MSK GENMASK(15, 0)
#define NFC_ECC_PAT_FOUND(x) BIT(x + 16)
#define NFC_ECC_ERR_CNT(b, x) (((x) >> (((b) % 4) * 8)) & 0xff)
@@ -269,10 +271,12 @@ struct sunxi_nfc {
void __iomem *regs;
struct clk *ahb_clk;
struct clk *mod_clk;
+ struct reset_control *reset;
unsigned long assigned_cs;
unsigned long clk_rate;
struct list_head chips;
struct completion complete;
+ struct dma_chan *dmac;
};
static inline struct sunxi_nfc *to_sunxi_nfc(struct nand_hw_control *ctrl)
@@ -365,6 +369,67 @@ static int sunxi_nfc_rst(struct sunxi_nfc *nfc)
return ret;
}
+static int sunxi_nfc_dma_op_prepare(struct mtd_info *mtd, const void *buf,
+ int chunksize, int nchunks,
+ enum dma_data_direction ddir,
+ struct scatterlist *sg)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+ struct dma_async_tx_descriptor *dmad;
+ enum dma_transfer_direction tdir;
+ dma_cookie_t dmat;
+ int ret;
+
+ if (ddir == DMA_FROM_DEVICE)
+ tdir = DMA_DEV_TO_MEM;
+ else
+ tdir = DMA_MEM_TO_DEV;
+
+ sg_init_one(sg, buf, nchunks * chunksize);
+ ret = dma_map_sg(nfc->dev, sg, 1, ddir);
+ if (!ret)
+ return -ENOMEM;
+
+ dmad = dmaengine_prep_slave_sg(nfc->dmac, sg, 1, tdir, DMA_CTRL_ACK);
+ if (!dmad) {
+ ret = -EINVAL;
+ goto err_unmap_buf;
+ }
+
+ writel(readl(nfc->regs + NFC_REG_CTL) | NFC_RAM_METHOD,
+ nfc->regs + NFC_REG_CTL);
+ writel(nchunks, nfc->regs + NFC_REG_SECTOR_NUM);
+ writel(chunksize, nfc->regs + NFC_REG_CNT);
+ dmat = dmaengine_submit(dmad);
+
+ ret = dma_submit_error(dmat);
+ if (ret)
+ goto err_clr_dma_flag;
+
+ return 0;
+
+err_clr_dma_flag:
+ writel(readl(nfc->regs + NFC_REG_CTL) & ~NFC_RAM_METHOD,
+ nfc->regs + NFC_REG_CTL);
+
+err_unmap_buf:
+ dma_unmap_sg(nfc->dev, sg, 1, ddir);
+ return ret;
+}
+
+static void sunxi_nfc_dma_op_cleanup(struct mtd_info *mtd,
+ enum dma_data_direction ddir,
+ struct scatterlist *sg)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+
+ dma_unmap_sg(nfc->dev, sg, 1, ddir);
+ writel(readl(nfc->regs + NFC_REG_CTL) & ~NFC_RAM_METHOD,
+ nfc->regs + NFC_REG_CTL);
+}
+
static int sunxi_nfc_dev_ready(struct mtd_info *mtd)
{
struct nand_chip *nand = mtd_to_nand(mtd);
@@ -822,17 +887,15 @@ static void sunxi_nfc_hw_ecc_update_stats(struct mtd_info *mtd,
}
static int sunxi_nfc_hw_ecc_correct(struct mtd_info *mtd, u8 *data, u8 *oob,
- int step, bool *erased)
+ int step, u32 status, bool *erased)
{
struct nand_chip *nand = mtd_to_nand(mtd);
struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
struct nand_ecc_ctrl *ecc = &nand->ecc;
- u32 status, tmp;
+ u32 tmp;
*erased = false;
- status = readl(nfc->regs + NFC_REG_ECC_ST);
-
if (status & NFC_ECC_ERR(step))
return -EBADMSG;
@@ -898,6 +961,7 @@ static int sunxi_nfc_hw_ecc_read_chunk(struct mtd_info *mtd,
*cur_off = oob_off + ecc->bytes + 4;
ret = sunxi_nfc_hw_ecc_correct(mtd, data, oob_required ? oob : NULL, 0,
+ readl(nfc->regs + NFC_REG_ECC_ST),
&erased);
if (erased)
return 1;
@@ -967,6 +1031,130 @@ static void sunxi_nfc_hw_ecc_read_extra_oob(struct mtd_info *mtd,
*cur_off = mtd->oobsize + mtd->writesize;
}
+static int sunxi_nfc_hw_ecc_read_chunks_dma(struct mtd_info *mtd, uint8_t *buf,
+ int oob_required, int page,
+ int nchunks)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ bool randomized = nand->options & NAND_NEED_SCRAMBLING;
+ struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+ struct nand_ecc_ctrl *ecc = &nand->ecc;
+ unsigned int max_bitflips = 0;
+ int ret, i, raw_mode = 0;
+ struct scatterlist sg;
+ u32 status;
+
+ ret = sunxi_nfc_wait_cmd_fifo_empty(nfc);
+ if (ret)
+ return ret;
+
+ ret = sunxi_nfc_dma_op_prepare(mtd, buf, ecc->size, nchunks,
+ DMA_FROM_DEVICE, &sg);
+ if (ret)
+ return ret;
+
+ sunxi_nfc_hw_ecc_enable(mtd);
+ sunxi_nfc_randomizer_config(mtd, page, false);
+ sunxi_nfc_randomizer_enable(mtd);
+
+ writel((NAND_CMD_RNDOUTSTART << 16) | (NAND_CMD_RNDOUT << 8) |
+ NAND_CMD_READSTART, nfc->regs + NFC_REG_RCMD_SET);
+
+ dma_async_issue_pending(nfc->dmac);
+
+ writel(NFC_PAGE_OP | NFC_DATA_SWAP_METHOD | NFC_DATA_TRANS,
+ nfc->regs + NFC_REG_CMD);
+
+ ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
+ if (ret)
+ dmaengine_terminate_all(nfc->dmac);
+
+ sunxi_nfc_randomizer_disable(mtd);
+ sunxi_nfc_hw_ecc_disable(mtd);
+
+ sunxi_nfc_dma_op_cleanup(mtd, DMA_FROM_DEVICE, &sg);
+
+ if (ret)
+ return ret;
+
+ status = readl(nfc->regs + NFC_REG_ECC_ST);
+
+ for (i = 0; i < nchunks; i++) {
+ int data_off = i * ecc->size;
+ int oob_off = i * (ecc->bytes + 4);
+ u8 *data = buf + data_off;
+ u8 *oob = nand->oob_poi + oob_off;
+ bool erased;
+
+ ret = sunxi_nfc_hw_ecc_correct(mtd, randomized ? data : NULL,
+ oob_required ? oob : NULL,
+ i, status, &erased);
+
+ /* ECC errors are handled in the second loop. */
+ if (ret < 0)
+ continue;
+
+ if (oob_required && !erased) {
+ /* TODO: use DMA to retrieve OOB */
+ nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
+ mtd->writesize + oob_off, -1);
+ nand->read_buf(mtd, oob, ecc->bytes + 4);
+
+ sunxi_nfc_hw_ecc_get_prot_oob_bytes(mtd, oob, i,
+ !i, page);
+ }
+
+ if (erased)
+ raw_mode = 1;
+
+ sunxi_nfc_hw_ecc_update_stats(mtd, &max_bitflips, ret);
+ }
+
+ if (status & NFC_ECC_ERR_MSK) {
+ for (i = 0; i < nchunks; i++) {
+ int data_off = i * ecc->size;
+ int oob_off = i * (ecc->bytes + 4);
+ u8 *data = buf + data_off;
+ u8 *oob = nand->oob_poi + oob_off;
+
+ if (!(status & NFC_ECC_ERR(i)))
+ continue;
+
+ /*
+ * Re-read the data with the randomizer disabled to
+ * identify bitflips in erased pages.
+ */
+ if (randomized) {
+ /* TODO: use DMA to read page in raw mode */
+ nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
+ data_off, -1);
+ nand->read_buf(mtd, data, ecc->size);
+ }
+
+ /* TODO: use DMA to retrieve OOB */
+ nand->cmdfunc(mtd, NAND_CMD_RNDOUT,
+ mtd->writesize + oob_off, -1);
+ nand->read_buf(mtd, oob, ecc->bytes + 4);
+
+ ret = nand_check_erased_ecc_chunk(data, ecc->size,
+ oob, ecc->bytes + 4,
+ NULL, 0,
+ ecc->strength);
+ if (ret >= 0)
+ raw_mode = 1;
+
+ sunxi_nfc_hw_ecc_update_stats(mtd, &max_bitflips, ret);
+ }
+ }
+
+ if (oob_required)
+ sunxi_nfc_hw_ecc_read_extra_oob(mtd, nand->oob_poi,
+ NULL, !raw_mode,
+ page);
+
+ return max_bitflips;
+}
+
static int sunxi_nfc_hw_ecc_write_chunk(struct mtd_info *mtd,
const u8 *data, int data_off,
const u8 *oob, int oob_off,
@@ -1065,6 +1253,23 @@ static int sunxi_nfc_hw_ecc_read_page(struct mtd_info *mtd,
return max_bitflips;
}
+static int sunxi_nfc_hw_ecc_read_page_dma(struct mtd_info *mtd,
+ struct nand_chip *chip, u8 *buf,
+ int oob_required, int page)
+{
+ int ret;
+
+ ret = sunxi_nfc_hw_ecc_read_chunks_dma(mtd, buf, oob_required, page,
+ chip->ecc.steps);
+ if (ret >= 0)
+ return ret;
+
+ /* Fallback to PIO mode */
+ chip->cmdfunc(mtd, NAND_CMD_RNDOUT, 0, -1);
+
+ return sunxi_nfc_hw_ecc_read_page(mtd, chip, buf, oob_required, page);
+}
+
static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd,
struct nand_chip *chip,
u32 data_offs, u32 readlen,
@@ -1098,6 +1303,25 @@ static int sunxi_nfc_hw_ecc_read_subpage(struct mtd_info *mtd,
return max_bitflips;
}
+static int sunxi_nfc_hw_ecc_read_subpage_dma(struct mtd_info *mtd,
+ struct nand_chip *chip,
+ u32 data_offs, u32 readlen,
+ u8 *buf, int page)
+{
+ int nchunks = DIV_ROUND_UP(data_offs + readlen, chip->ecc.size);
+ int ret;
+
+ ret = sunxi_nfc_hw_ecc_read_chunks_dma(mtd, buf, false, page, nchunks);
+ if (ret >= 0)
+ return ret;
+
+ /* Fallback to PIO mode */
+ chip->cmdfunc(mtd, NAND_CMD_RNDOUT, 0, -1);
+
+ return sunxi_nfc_hw_ecc_read_subpage(mtd, chip, data_offs, readlen,
+ buf, page);
+}
+
static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd,
struct nand_chip *chip,
const uint8_t *buf, int oob_required,
@@ -1130,6 +1354,99 @@ static int sunxi_nfc_hw_ecc_write_page(struct mtd_info *mtd,
return 0;
}
+static int sunxi_nfc_hw_ecc_write_subpage(struct mtd_info *mtd,
+ struct nand_chip *chip,
+ u32 data_offs, u32 data_len,
+ const u8 *buf, int oob_required,
+ int page)
+{
+ struct nand_ecc_ctrl *ecc = &chip->ecc;
+ int ret, i, cur_off = 0;
+
+ sunxi_nfc_hw_ecc_enable(mtd);
+
+ for (i = data_offs / ecc->size;
+ i < DIV_ROUND_UP(data_offs + data_len, ecc->size); i++) {
+ int data_off = i * ecc->size;
+ int oob_off = i * (ecc->bytes + 4);
+ const u8 *data = buf + data_off;
+ const u8 *oob = chip->oob_poi + oob_off;
+
+ ret = sunxi_nfc_hw_ecc_write_chunk(mtd, data, data_off, oob,
+ oob_off + mtd->writesize,
+ &cur_off, !i, page);
+ if (ret)
+ return ret;
+ }
+
+ sunxi_nfc_hw_ecc_disable(mtd);
+
+ return 0;
+}
+
+static int sunxi_nfc_hw_ecc_write_page_dma(struct mtd_info *mtd,
+ struct nand_chip *chip,
+ const u8 *buf,
+ int oob_required,
+ int page)
+{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ struct sunxi_nfc *nfc = to_sunxi_nfc(nand->controller);
+ struct nand_ecc_ctrl *ecc = &nand->ecc;
+ struct scatterlist sg;
+ int ret, i;
+
+ ret = sunxi_nfc_wait_cmd_fifo_empty(nfc);
+ if (ret)
+ return ret;
+
+ ret = sunxi_nfc_dma_op_prepare(mtd, buf, ecc->size, ecc->steps,
+ DMA_TO_DEVICE, &sg);
+ if (ret)
+ goto pio_fallback;
+
+ for (i = 0; i < ecc->steps; i++) {
+ const u8 *oob = nand->oob_poi + (i * (ecc->bytes + 4));
+
+ sunxi_nfc_hw_ecc_set_prot_oob_bytes(mtd, oob, i, !i, page);
+ }
+
+ sunxi_nfc_hw_ecc_enable(mtd);
+ sunxi_nfc_randomizer_config(mtd, page, false);
+ sunxi_nfc_randomizer_enable(mtd);
+
+ writel((NAND_CMD_RNDIN << 8) | NAND_CMD_PAGEPROG,
+ nfc->regs + NFC_REG_RCMD_SET);
+
+ dma_async_issue_pending(nfc->dmac);
+
+ writel(NFC_PAGE_OP | NFC_DATA_SWAP_METHOD |
+ NFC_DATA_TRANS | NFC_ACCESS_DIR,
+ nfc->regs + NFC_REG_CMD);
+
+ ret = sunxi_nfc_wait_events(nfc, NFC_CMD_INT_FLAG, true, 0);
+ if (ret)
+ dmaengine_terminate_all(nfc->dmac);
+
+ sunxi_nfc_randomizer_disable(mtd);
+ sunxi_nfc_hw_ecc_disable(mtd);
+
+ sunxi_nfc_dma_op_cleanup(mtd, DMA_TO_DEVICE, &sg);
+
+ if (ret)
+ return ret;
+
+ if (oob_required || (chip->options & NAND_NEED_SCRAMBLING))
+ /* TODO: use DMA to transfer extra OOB bytes ? */
+ sunxi_nfc_hw_ecc_write_extra_oob(mtd, chip->oob_poi,
+ NULL, page);
+
+ return 0;
+
+pio_fallback:
+ return sunxi_nfc_hw_ecc_write_page(mtd, chip, buf, oob_required, page);
+}
+
static int sunxi_nfc_hw_syndrome_ecc_read_page(struct mtd_info *mtd,
struct nand_chip *chip,
uint8_t *buf, int oob_required,
@@ -1497,10 +1814,19 @@ static int sunxi_nand_hw_common_ecc_ctrl_init(struct mtd_info *mtd,
int ret;
int i;
+ if (ecc->size != 512 && ecc->size != 1024)
+ return -EINVAL;
+
data = kzalloc(sizeof(*data), GFP_KERNEL);
if (!data)
return -ENOMEM;
+ /* Prefer 1k ECC chunk over 512 ones */
+ if (ecc->size == 512 && mtd->writesize > 512) {
+ ecc->size = 1024;
+ ecc->strength *= 2;
+ }
+
/* Add ECC info retrieval from DT */
for (i = 0; i < ARRAY_SIZE(strengths); i++) {
if (ecc->strength <= strengths[i])
@@ -1550,14 +1876,28 @@ static int sunxi_nand_hw_ecc_ctrl_init(struct mtd_info *mtd,
struct nand_ecc_ctrl *ecc,
struct device_node *np)
{
+ struct nand_chip *nand = mtd_to_nand(mtd);
+ struct sunxi_nand_chip *sunxi_nand = to_sunxi_nand(nand);
+ struct sunxi_nfc *nfc = to_sunxi_nfc(sunxi_nand->nand.controller);
int ret;
ret = sunxi_nand_hw_common_ecc_ctrl_init(mtd, ecc, np);
if (ret)
return ret;
- ecc->read_page = sunxi_nfc_hw_ecc_read_page;
- ecc->write_page = sunxi_nfc_hw_ecc_write_page;
+ if (nfc->dmac) {
+ ecc->read_page = sunxi_nfc_hw_ecc_read_page_dma;
+ ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage_dma;
+ ecc->write_page = sunxi_nfc_hw_ecc_write_page_dma;
+ nand->options |= NAND_USE_BOUNCE_BUFFER;
+ } else {
+ ecc->read_page = sunxi_nfc_hw_ecc_read_page;
+ ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage;
+ ecc->write_page = sunxi_nfc_hw_ecc_write_page;
+ }
+
+ /* TODO: support DMA for raw accesses and subpage write */
+ ecc->write_subpage = sunxi_nfc_hw_ecc_write_subpage;
ecc->read_oob_raw = nand_read_oob_std;
ecc->write_oob_raw = nand_write_oob_std;
ecc->read_subpage = sunxi_nfc_hw_ecc_read_subpage;
@@ -1871,26 +2211,59 @@ static int sunxi_nfc_probe(struct platform_device *pdev)
if (ret)
goto out_ahb_clk_unprepare;
+ nfc->reset = devm_reset_control_get_optional(dev, "ahb");
+ if (!IS_ERR(nfc->reset)) {
+ ret = reset_control_deassert(nfc->reset);
+ if (ret) {
+ dev_err(dev, "reset err %d\n", ret);
+ goto out_mod_clk_unprepare;
+ }
+ } else if (PTR_ERR(nfc->reset) != -ENOENT) {
+ ret = PTR_ERR(nfc->reset);
+ goto out_mod_clk_unprepare;
+ }
+
ret = sunxi_nfc_rst(nfc);
if (ret)
- goto out_mod_clk_unprepare;
+ goto out_ahb_reset_reassert;
writel(0, nfc->regs + NFC_REG_INT);
ret = devm_request_irq(dev, irq, sunxi_nfc_interrupt,
0, "sunxi-nand", nfc);
if (ret)
- goto out_mod_clk_unprepare;
+ goto out_ahb_reset_reassert;
+
+ nfc->dmac = dma_request_slave_channel(dev, "rxtx");
+ if (nfc->dmac) {
+ struct dma_slave_config dmac_cfg = { };
+
+ dmac_cfg.src_addr = r->start + NFC_REG_IO_DATA;
+ dmac_cfg.dst_addr = dmac_cfg.src_addr;
+ dmac_cfg.src_addr_width = DMA_SLAVE_BUSWIDTH_4_BYTES;
+ dmac_cfg.dst_addr_width = dmac_cfg.src_addr_width;
+ dmac_cfg.src_maxburst = 4;
+ dmac_cfg.dst_maxburst = 4;
+ dmaengine_slave_config(nfc->dmac, &dmac_cfg);
+ } else {
+ dev_warn(dev, "failed to request rxtx DMA channel\n");
+ }
platform_set_drvdata(pdev, nfc);
ret = sunxi_nand_chips_init(dev, nfc);
if (ret) {
dev_err(dev, "failed to init nand chips\n");
- goto out_mod_clk_unprepare;
+ goto out_release_dmac;
}
return 0;
+out_release_dmac:
+ if (nfc->dmac)
+ dma_release_channel(nfc->dmac);
+out_ahb_reset_reassert:
+ if (!IS_ERR(nfc->reset))
+ reset_control_assert(nfc->reset);
out_mod_clk_unprepare:
clk_disable_unprepare(nfc->mod_clk);
out_ahb_clk_unprepare:
@@ -1904,6 +2277,12 @@ static int sunxi_nfc_remove(struct platform_device *pdev)
struct sunxi_nfc *nfc = platform_get_drvdata(pdev);
sunxi_nand_chips_cleanup(nfc);
+
+ if (!IS_ERR(nfc->reset))
+ reset_control_assert(nfc->reset);
+
+ if (nfc->dmac)
+ dma_release_channel(nfc->dmac);
clk_disable_unprepare(nfc->mod_clk);
clk_disable_unprepare(nfc->ahb_clk);
diff --git a/drivers/mtd/nand/xway_nand.c b/drivers/mtd/nand/xway_nand.c
index 0cf0ac07a8c2..1f2948c0c458 100644
--- a/drivers/mtd/nand/xway_nand.c
+++ b/drivers/mtd/nand/xway_nand.c
@@ -4,6 +4,7 @@
* by the Free Software Foundation.
*
* Copyright © 2012 John Crispin <blogic@openwrt.org>
+ * Copyright © 2016 Hauke Mehrtens <hauke@hauke-m.de>
*/
#include <linux/mtd/nand.h>
@@ -16,20 +17,28 @@
#define EBU_ADDSEL1 0x24
#define EBU_NAND_CON 0xB0
#define EBU_NAND_WAIT 0xB4
+#define NAND_WAIT_RD BIT(0) /* NAND flash status output */
+#define NAND_WAIT_WR_C BIT(3) /* NAND Write/Read complete */
#define EBU_NAND_ECC0 0xB8
#define EBU_NAND_ECC_AC 0xBC
-/* nand commands */
-#define NAND_CMD_ALE (1 << 2)
-#define NAND_CMD_CLE (1 << 3)
-#define NAND_CMD_CS (1 << 4)
-#define NAND_WRITE_CMD_RESET 0xff
+/*
+ * nand commands
+ * The pins of the NAND chip are selected based on the address bits of the
+ * "register" read and write. There are no special registers, but an
+ * address range and the lower address bits are used to activate the
+ * correct line. For example when the bit (1 << 2) is set in the address
+ * the ALE pin will be activated.
+ */
+#define NAND_CMD_ALE BIT(2) /* address latch enable */
+#define NAND_CMD_CLE BIT(3) /* command latch enable */
+#define NAND_CMD_CS BIT(4) /* chip select */
+#define NAND_CMD_SE BIT(5) /* spare area access latch */
+#define NAND_CMD_WP BIT(6) /* write protect */
#define NAND_WRITE_CMD (NAND_CMD_CS | NAND_CMD_CLE)
#define NAND_WRITE_ADDR (NAND_CMD_CS | NAND_CMD_ALE)
#define NAND_WRITE_DATA (NAND_CMD_CS)
#define NAND_READ_DATA (NAND_CMD_CS)
-#define NAND_WAIT_WR_C (1 << 3)
-#define NAND_WAIT_RD (0x1)
/* we need to tel the ebu which addr we mapped the nand to */
#define ADDSEL1_MASK(x) (x << 4)
@@ -54,31 +63,41 @@
#define NAND_CON_CSMUX (1 << 1)
#define NAND_CON_NANDM 1
-static void xway_reset_chip(struct nand_chip *chip)
+struct xway_nand_data {
+ struct nand_chip chip;
+ unsigned long csflags;
+ void __iomem *nandaddr;
+};
+
+static u8 xway_readb(struct mtd_info *mtd, int op)
{
- unsigned long nandaddr = (unsigned long) chip->IO_ADDR_W;
- unsigned long flags;
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct xway_nand_data *data = nand_get_controller_data(chip);
- nandaddr &= ~NAND_WRITE_ADDR;
- nandaddr |= NAND_WRITE_CMD;
+ return readb(data->nandaddr + op);
+}
- /* finish with a reset */
- spin_lock_irqsave(&ebu_lock, flags);
- writeb(NAND_WRITE_CMD_RESET, (void __iomem *) nandaddr);
- while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0)
- ;
- spin_unlock_irqrestore(&ebu_lock, flags);
+static void xway_writeb(struct mtd_info *mtd, int op, u8 value)
+{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct xway_nand_data *data = nand_get_controller_data(chip);
+
+ writeb(value, data->nandaddr + op);
}
-static void xway_select_chip(struct mtd_info *mtd, int chip)
+static void xway_select_chip(struct mtd_info *mtd, int select)
{
+ struct nand_chip *chip = mtd_to_nand(mtd);
+ struct xway_nand_data *data = nand_get_controller_data(chip);
- switch (chip) {
+ switch (select) {
case -1:
ltq_ebu_w32_mask(NAND_CON_CE, 0, EBU_NAND_CON);
ltq_ebu_w32_mask(NAND_CON_NANDM, 0, EBU_NAND_CON);
+ spin_unlock_irqrestore(&ebu_lock, data->csflags);
break;
case 0:
+ spin_lock_irqsave(&ebu_lock, data->csflags);
ltq_ebu_w32_mask(0, NAND_CON_NANDM, EBU_NAND_CON);
ltq_ebu_w32_mask(0, NAND_CON_CE, EBU_NAND_CON);
break;
@@ -89,26 +108,16 @@ static void xway_select_chip(struct mtd_info *mtd, int chip)
static void xway_cmd_ctrl(struct mtd_info *mtd, int cmd, unsigned int ctrl)
{
- struct nand_chip *this = mtd_to_nand(mtd);
- unsigned long nandaddr = (unsigned long) this->IO_ADDR_W;
- unsigned long flags;
-
- if (ctrl & NAND_CTRL_CHANGE) {
- nandaddr &= ~(NAND_WRITE_CMD | NAND_WRITE_ADDR);
- if (ctrl & NAND_CLE)
- nandaddr |= NAND_WRITE_CMD;
- else
- nandaddr |= NAND_WRITE_ADDR;
- this->IO_ADDR_W = (void __iomem *) nandaddr;
- }
+ if (cmd == NAND_CMD_NONE)
+ return;
- if (cmd != NAND_CMD_NONE) {
- spin_lock_irqsave(&ebu_lock, flags);
- writeb(cmd, this->IO_ADDR_W);
- while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0)
- ;
- spin_unlock_irqrestore(&ebu_lock, flags);
- }
+ if (ctrl & NAND_CLE)
+ xway_writeb(mtd, NAND_WRITE_CMD, cmd);
+ else if (ctrl & NAND_ALE)
+ xway_writeb(mtd, NAND_WRITE_ADDR, cmd);
+
+ while ((ltq_ebu_r32(EBU_NAND_WAIT) & NAND_WAIT_WR_C) == 0)
+ ;
}
static int xway_dev_ready(struct mtd_info *mtd)
@@ -118,80 +127,122 @@ static int xway_dev_ready(struct mtd_info *mtd)
static unsigned char xway_read_byte(struct mtd_info *mtd)
{
- struct nand_chip *this = mtd_to_nand(mtd);
- unsigned long nandaddr = (unsigned long) this->IO_ADDR_R;
- unsigned long flags;
- int ret;
+ return xway_readb(mtd, NAND_READ_DATA);
+}
+
+static void xway_read_buf(struct mtd_info *mtd, u_char *buf, int len)
+{
+ int i;
- spin_lock_irqsave(&ebu_lock, flags);
- ret = ltq_r8((void __iomem *)(nandaddr + NAND_READ_DATA));
- spin_unlock_irqrestore(&ebu_lock, flags);
+ for (i = 0; i < len; i++)
+ buf[i] = xway_readb(mtd, NAND_WRITE_DATA);
+}
- return ret;
+static void xway_write_buf(struct mtd_info *mtd, const u_char *buf, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++)
+ xway_writeb(mtd, NAND_WRITE_DATA, buf[i]);
}
+/*
+ * Probe for the NAND device.
+ */
static int xway_nand_probe(struct platform_device *pdev)
{
- struct nand_chip *this = platform_get_drvdata(pdev);
- unsigned long nandaddr = (unsigned long) this->IO_ADDR_W;
- const __be32 *cs = of_get_property(pdev->dev.of_node,
- "lantiq,cs", NULL);
+ struct xway_nand_data *data;
+ struct mtd_info *mtd;
+ struct resource *res;
+ int err;
+ u32 cs;
u32 cs_flag = 0;
+ /* Allocate memory for the device structure (and zero it) */
+ data = devm_kzalloc(&pdev->dev, sizeof(struct xway_nand_data),
+ GFP_KERNEL);
+ if (!data)
+ return -ENOMEM;
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ data->nandaddr = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(data->nandaddr))
+ return PTR_ERR(data->nandaddr);
+
+ nand_set_flash_node(&data->chip, pdev->dev.of_node);
+ mtd = nand_to_mtd(&data->chip);
+ mtd->dev.parent = &pdev->dev;
+
+ data->chip.cmd_ctrl = xway_cmd_ctrl;
+ data->chip.dev_ready = xway_dev_ready;
+ data->chip.select_chip = xway_select_chip;
+ data->chip.write_buf = xway_write_buf;
+ data->chip.read_buf = xway_read_buf;
+ data->chip.read_byte = xway_read_byte;
+ data->chip.chip_delay = 30;
+
+ data->chip.ecc.mode = NAND_ECC_SOFT;
+ data->chip.ecc.algo = NAND_ECC_HAMMING;
+
+ platform_set_drvdata(pdev, data);
+ nand_set_controller_data(&data->chip, data);
+
/* load our CS from the DT. Either we find a valid 1 or default to 0 */
- if (cs && (*cs == 1))
+ err = of_property_read_u32(pdev->dev.of_node, "lantiq,cs", &cs);
+ if (!err && cs == 1)
cs_flag = NAND_CON_IN_CS1 | NAND_CON_OUT_CS1;
/* setup the EBU to run in NAND mode on our base addr */
- ltq_ebu_w32(CPHYSADDR(nandaddr)
- | ADDSEL1_MASK(3) | ADDSEL1_REGEN, EBU_ADDSEL1);
+ ltq_ebu_w32(CPHYSADDR(data->nandaddr)
+ | ADDSEL1_MASK(3) | ADDSEL1_REGEN, EBU_ADDSEL1);
ltq_ebu_w32(BUSCON1_SETUP | BUSCON1_BCGEN_RES | BUSCON1_WAITWRC2
- | BUSCON1_WAITRDC2 | BUSCON1_HOLDC1 | BUSCON1_RECOVC1
- | BUSCON1_CMULT4, LTQ_EBU_BUSCON1);
+ | BUSCON1_WAITRDC2 | BUSCON1_HOLDC1 | BUSCON1_RECOVC1
+ | BUSCON1_CMULT4, LTQ_EBU_BUSCON1);
ltq_ebu_w32(NAND_CON_NANDM | NAND_CON_CSMUX | NAND_CON_CS_P
- | NAND_CON_SE_P | NAND_CON_WP_P | NAND_CON_PRE_P
- | cs_flag, EBU_NAND_CON);
+ | NAND_CON_SE_P | NAND_CON_WP_P | NAND_CON_PRE_P
+ | cs_flag, EBU_NAND_CON);
- /* finish with a reset */
- xway_reset_chip(this);
+ /* Scan to find existence of the device */
+ err = nand_scan(mtd, 1);
+ if (err)
+ return err;
- return 0;
-}
+ err = mtd_device_register(mtd, NULL, 0);
+ if (err)
+ nand_release(mtd);
-static struct platform_nand_data xway_nand_data = {
- .chip = {
- .nr_chips = 1,
- .chip_delay = 30,
- },
- .ctrl = {
- .probe = xway_nand_probe,
- .cmd_ctrl = xway_cmd_ctrl,
- .dev_ready = xway_dev_ready,
- .select_chip = xway_select_chip,
- .read_byte = xway_read_byte,
- }
-};
+ return err;
+}
/*
- * Try to find the node inside the DT. If it is available attach out
- * platform_nand_data
+ * Remove a NAND device.
*/
-static int __init xway_register_nand(void)
+static int xway_nand_remove(struct platform_device *pdev)
{
- struct device_node *node;
- struct platform_device *pdev;
-
- node = of_find_compatible_node(NULL, NULL, "lantiq,nand-xway");
- if (!node)
- return -ENOENT;
- pdev = of_find_device_by_node(node);
- if (!pdev)
- return -EINVAL;
- pdev->dev.platform_data = &xway_nand_data;
- of_node_put(node);
+ struct xway_nand_data *data = platform_get_drvdata(pdev);
+
+ nand_release(nand_to_mtd(&data->chip));
+
return 0;
}
-subsys_initcall(xway_register_nand);
+static const struct of_device_id xway_nand_match[] = {
+ { .compatible = "lantiq,nand-xway" },
+ {},
+};
+MODULE_DEVICE_TABLE(of, xway_nand_match);
+
+static struct platform_driver xway_nand_driver = {
+ .probe = xway_nand_probe,
+ .remove = xway_nand_remove,
+ .driver = {
+ .name = "lantiq,nand-xway",
+ .of_match_table = xway_nand_match,
+ },
+};
+
+module_platform_driver(xway_nand_driver);
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c
index a4b029a417f0..1a6d0e367b89 100644
--- a/drivers/mtd/onenand/onenand_base.c
+++ b/drivers/mtd/onenand/onenand_base.c
@@ -3188,13 +3188,13 @@ static int onenand_otp_walk(struct mtd_info *mtd, loff_t from, size_t len,
size_t tmp_retlen;
ret = action(mtd, from, len, &tmp_retlen, buf);
+ if (ret)
+ break;
buf += tmp_retlen;
len -= tmp_retlen;
*retlen += tmp_retlen;
- if (ret)
- break;
}
otp_pages--;
}
diff --git a/drivers/mtd/spi-nor/Kconfig b/drivers/mtd/spi-nor/Kconfig
index d42c98e1f581..4a682ee0f632 100644
--- a/drivers/mtd/spi-nor/Kconfig
+++ b/drivers/mtd/spi-nor/Kconfig
@@ -29,6 +29,26 @@ config MTD_SPI_NOR_USE_4K_SECTORS
Please note that some tools/drivers/filesystems may not work with
4096 B erase size (e.g. UBIFS requires 15 KiB as a minimum).
+config SPI_ATMEL_QUADSPI
+ tristate "Atmel Quad SPI Controller"
+ depends on ARCH_AT91 || (ARM && COMPILE_TEST)
+ depends on OF && HAS_IOMEM
+ help
+ This enables support for the Quad SPI controller in master mode.
+ This driver does not support generic SPI. The implementation only
+ supports SPI NOR.
+
+config SPI_CADENCE_QUADSPI
+ tristate "Cadence Quad SPI controller"
+ depends on OF && ARM
+ help
+ Enable support for the Cadence Quad SPI Flash controller.
+
+ Cadence QSPI is a specialized controller for connecting an SPI
+ Flash over 1/2/4-bit wide bus. Enable this option if you have a
+ device with a Cadence QSPI controller and want to access the
+ Flash as an MTD device.
+
config SPI_FSL_QUADSPI
tristate "Freescale Quad SPI controller"
depends on ARCH_MXC || SOC_LS1021A || ARCH_LAYERSCAPE || COMPILE_TEST
@@ -38,6 +58,13 @@ config SPI_FSL_QUADSPI
This controller does not support generic SPI. It only supports
SPI NOR.
+config SPI_HISI_SFC
+ tristate "Hisilicon SPI-NOR Flash Controller(SFC)"
+ depends on ARCH_HISI || COMPILE_TEST
+ depends on HAS_IOMEM && HAS_DMA
+ help
+ This enables support for hisilicon SPI-NOR flash controller.
+
config SPI_NXP_SPIFI
tristate "NXP SPI Flash Interface (SPIFI)"
depends on OF && (ARCH_LPC18XX || COMPILE_TEST)
diff --git a/drivers/mtd/spi-nor/Makefile b/drivers/mtd/spi-nor/Makefile
index 0bf3a7f81675..121695e83542 100644
--- a/drivers/mtd/spi-nor/Makefile
+++ b/drivers/mtd/spi-nor/Makefile
@@ -1,4 +1,7 @@
obj-$(CONFIG_MTD_SPI_NOR) += spi-nor.o
+obj-$(CONFIG_SPI_ATMEL_QUADSPI) += atmel-quadspi.o
+obj-$(CONFIG_SPI_CADENCE_QUADSPI) += cadence-quadspi.o
obj-$(CONFIG_SPI_FSL_QUADSPI) += fsl-quadspi.o
+obj-$(CONFIG_SPI_HISI_SFC) += hisi-sfc.o
obj-$(CONFIG_MTD_MT81xx_NOR) += mtk-quadspi.o
obj-$(CONFIG_SPI_NXP_SPIFI) += nxp-spifi.o
diff --git a/drivers/mtd/spi-nor/atmel-quadspi.c b/drivers/mtd/spi-nor/atmel-quadspi.c
new file mode 100644
index 000000000000..47937d9beec6
--- /dev/null
+++ b/drivers/mtd/spi-nor/atmel-quadspi.c
@@ -0,0 +1,732 @@
+/*
+ * Driver for Atmel QSPI Controller
+ *
+ * Copyright (C) 2015 Atmel Corporation
+ *
+ * Author: Cyrille Pitchen <cyrille.pitchen@atmel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * This driver is based on drivers/mtd/spi-nor/fsl-quadspi.c from Freescale.
+ */
+
+#include <linux/kernel.h>
+#include <linux/clk.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/platform_data/atmel.h>
+#include <linux/of.h>
+
+#include <linux/io.h>
+#include <linux/gpio.h>
+#include <linux/pinctrl/consumer.h>
+
+/* QSPI register offsets */
+#define QSPI_CR 0x0000 /* Control Register */
+#define QSPI_MR 0x0004 /* Mode Register */
+#define QSPI_RD 0x0008 /* Receive Data Register */
+#define QSPI_TD 0x000c /* Transmit Data Register */
+#define QSPI_SR 0x0010 /* Status Register */
+#define QSPI_IER 0x0014 /* Interrupt Enable Register */
+#define QSPI_IDR 0x0018 /* Interrupt Disable Register */
+#define QSPI_IMR 0x001c /* Interrupt Mask Register */
+#define QSPI_SCR 0x0020 /* Serial Clock Register */
+
+#define QSPI_IAR 0x0030 /* Instruction Address Register */
+#define QSPI_ICR 0x0034 /* Instruction Code Register */
+#define QSPI_IFR 0x0038 /* Instruction Frame Register */
+
+#define QSPI_SMR 0x0040 /* Scrambling Mode Register */
+#define QSPI_SKR 0x0044 /* Scrambling Key Register */
+
+#define QSPI_WPMR 0x00E4 /* Write Protection Mode Register */
+#define QSPI_WPSR 0x00E8 /* Write Protection Status Register */
+
+#define QSPI_VERSION 0x00FC /* Version Register */
+
+
+/* Bitfields in QSPI_CR (Control Register) */
+#define QSPI_CR_QSPIEN BIT(0)
+#define QSPI_CR_QSPIDIS BIT(1)
+#define QSPI_CR_SWRST BIT(7)
+#define QSPI_CR_LASTXFER BIT(24)
+
+/* Bitfields in QSPI_MR (Mode Register) */
+#define QSPI_MR_SSM BIT(0)
+#define QSPI_MR_LLB BIT(1)
+#define QSPI_MR_WDRBT BIT(2)
+#define QSPI_MR_SMRM BIT(3)
+#define QSPI_MR_CSMODE_MASK GENMASK(5, 4)
+#define QSPI_MR_CSMODE_NOT_RELOADED (0 << 4)
+#define QSPI_MR_CSMODE_LASTXFER (1 << 4)
+#define QSPI_MR_CSMODE_SYSTEMATICALLY (2 << 4)
+#define QSPI_MR_NBBITS_MASK GENMASK(11, 8)
+#define QSPI_MR_NBBITS(n) ((((n) - 8) << 8) & QSPI_MR_NBBITS_MASK)
+#define QSPI_MR_DLYBCT_MASK GENMASK(23, 16)
+#define QSPI_MR_DLYBCT(n) (((n) << 16) & QSPI_MR_DLYBCT_MASK)
+#define QSPI_MR_DLYCS_MASK GENMASK(31, 24)
+#define QSPI_MR_DLYCS(n) (((n) << 24) & QSPI_MR_DLYCS_MASK)
+
+/* Bitfields in QSPI_SR/QSPI_IER/QSPI_IDR/QSPI_IMR */
+#define QSPI_SR_RDRF BIT(0)
+#define QSPI_SR_TDRE BIT(1)
+#define QSPI_SR_TXEMPTY BIT(2)
+#define QSPI_SR_OVRES BIT(3)
+#define QSPI_SR_CSR BIT(8)
+#define QSPI_SR_CSS BIT(9)
+#define QSPI_SR_INSTRE BIT(10)
+#define QSPI_SR_QSPIENS BIT(24)
+
+#define QSPI_SR_CMD_COMPLETED (QSPI_SR_INSTRE | QSPI_SR_CSR)
+
+/* Bitfields in QSPI_SCR (Serial Clock Register) */
+#define QSPI_SCR_CPOL BIT(0)
+#define QSPI_SCR_CPHA BIT(1)
+#define QSPI_SCR_SCBR_MASK GENMASK(15, 8)
+#define QSPI_SCR_SCBR(n) (((n) << 8) & QSPI_SCR_SCBR_MASK)
+#define QSPI_SCR_DLYBS_MASK GENMASK(23, 16)
+#define QSPI_SCR_DLYBS(n) (((n) << 16) & QSPI_SCR_DLYBS_MASK)
+
+/* Bitfields in QSPI_ICR (Instruction Code Register) */
+#define QSPI_ICR_INST_MASK GENMASK(7, 0)
+#define QSPI_ICR_INST(inst) (((inst) << 0) & QSPI_ICR_INST_MASK)
+#define QSPI_ICR_OPT_MASK GENMASK(23, 16)
+#define QSPI_ICR_OPT(opt) (((opt) << 16) & QSPI_ICR_OPT_MASK)
+
+/* Bitfields in QSPI_IFR (Instruction Frame Register) */
+#define QSPI_IFR_WIDTH_MASK GENMASK(2, 0)
+#define QSPI_IFR_WIDTH_SINGLE_BIT_SPI (0 << 0)
+#define QSPI_IFR_WIDTH_DUAL_OUTPUT (1 << 0)
+#define QSPI_IFR_WIDTH_QUAD_OUTPUT (2 << 0)
+#define QSPI_IFR_WIDTH_DUAL_IO (3 << 0)
+#define QSPI_IFR_WIDTH_QUAD_IO (4 << 0)
+#define QSPI_IFR_WIDTH_DUAL_CMD (5 << 0)
+#define QSPI_IFR_WIDTH_QUAD_CMD (6 << 0)
+#define QSPI_IFR_INSTEN BIT(4)
+#define QSPI_IFR_ADDREN BIT(5)
+#define QSPI_IFR_OPTEN BIT(6)
+#define QSPI_IFR_DATAEN BIT(7)
+#define QSPI_IFR_OPTL_MASK GENMASK(9, 8)
+#define QSPI_IFR_OPTL_1BIT (0 << 8)
+#define QSPI_IFR_OPTL_2BIT (1 << 8)
+#define QSPI_IFR_OPTL_4BIT (2 << 8)
+#define QSPI_IFR_OPTL_8BIT (3 << 8)
+#define QSPI_IFR_ADDRL BIT(10)
+#define QSPI_IFR_TFRTYP_MASK GENMASK(13, 12)
+#define QSPI_IFR_TFRTYP_TRSFR_READ (0 << 12)
+#define QSPI_IFR_TFRTYP_TRSFR_READ_MEM (1 << 12)
+#define QSPI_IFR_TFRTYP_TRSFR_WRITE (2 << 12)
+#define QSPI_IFR_TFRTYP_TRSFR_WRITE_MEM (3 << 13)
+#define QSPI_IFR_CRM BIT(14)
+#define QSPI_IFR_NBDUM_MASK GENMASK(20, 16)
+#define QSPI_IFR_NBDUM(n) (((n) << 16) & QSPI_IFR_NBDUM_MASK)
+
+/* Bitfields in QSPI_SMR (Scrambling Mode Register) */
+#define QSPI_SMR_SCREN BIT(0)
+#define QSPI_SMR_RVDIS BIT(1)
+
+/* Bitfields in QSPI_WPMR (Write Protection Mode Register) */
+#define QSPI_WPMR_WPEN BIT(0)
+#define QSPI_WPMR_WPKEY_MASK GENMASK(31, 8)
+#define QSPI_WPMR_WPKEY(wpkey) (((wpkey) << 8) & QSPI_WPMR_WPKEY_MASK)
+
+/* Bitfields in QSPI_WPSR (Write Protection Status Register) */
+#define QSPI_WPSR_WPVS BIT(0)
+#define QSPI_WPSR_WPVSRC_MASK GENMASK(15, 8)
+#define QSPI_WPSR_WPVSRC(src) (((src) << 8) & QSPI_WPSR_WPVSRC)
+
+
+struct atmel_qspi {
+ void __iomem *regs;
+ void __iomem *mem;
+ struct clk *clk;
+ struct platform_device *pdev;
+ u32 pending;
+
+ struct spi_nor nor;
+ u32 clk_rate;
+ struct completion cmd_completion;
+};
+
+struct atmel_qspi_command {
+ union {
+ struct {
+ u32 instruction:1;
+ u32 address:3;
+ u32 mode:1;
+ u32 dummy:1;
+ u32 data:1;
+ u32 reserved:25;
+ } bits;
+ u32 word;
+ } enable;
+ u8 instruction;
+ u8 mode;
+ u8 num_mode_cycles;
+ u8 num_dummy_cycles;
+ u32 address;
+
+ size_t buf_len;
+ const void *tx_buf;
+ void *rx_buf;
+};
+
+/* Register access functions */
+static inline u32 qspi_readl(struct atmel_qspi *aq, u32 reg)
+{
+ return readl_relaxed(aq->regs + reg);
+}
+
+static inline void qspi_writel(struct atmel_qspi *aq, u32 reg, u32 value)
+{
+ writel_relaxed(value, aq->regs + reg);
+}
+
+static int atmel_qspi_run_transfer(struct atmel_qspi *aq,
+ const struct atmel_qspi_command *cmd)
+{
+ void __iomem *ahb_mem;
+
+ /* Then fallback to a PIO transfer (memcpy() DOES NOT work!) */
+ ahb_mem = aq->mem;
+ if (cmd->enable.bits.address)
+ ahb_mem += cmd->address;
+ if (cmd->tx_buf)
+ _memcpy_toio(ahb_mem, cmd->tx_buf, cmd->buf_len);
+ else
+ _memcpy_fromio(cmd->rx_buf, ahb_mem, cmd->buf_len);
+
+ return 0;
+}
+
+#ifdef DEBUG
+static void atmel_qspi_debug_command(struct atmel_qspi *aq,
+ const struct atmel_qspi_command *cmd,
+ u32 ifr)
+{
+ u8 cmd_buf[SPI_NOR_MAX_CMD_SIZE];
+ size_t len = 0;
+ int i;
+
+ if (cmd->enable.bits.instruction)
+ cmd_buf[len++] = cmd->instruction;
+
+ for (i = cmd->enable.bits.address-1; i >= 0; --i)
+ cmd_buf[len++] = (cmd->address >> (i << 3)) & 0xff;
+
+ if (cmd->enable.bits.mode)
+ cmd_buf[len++] = cmd->mode;
+
+ if (cmd->enable.bits.dummy) {
+ int num = cmd->num_dummy_cycles;
+
+ switch (ifr & QSPI_IFR_WIDTH_MASK) {
+ case QSPI_IFR_WIDTH_SINGLE_BIT_SPI:
+ case QSPI_IFR_WIDTH_DUAL_OUTPUT:
+ case QSPI_IFR_WIDTH_QUAD_OUTPUT:
+ num >>= 3;
+ break;
+ case QSPI_IFR_WIDTH_DUAL_IO:
+ case QSPI_IFR_WIDTH_DUAL_CMD:
+ num >>= 2;
+ break;
+ case QSPI_IFR_WIDTH_QUAD_IO:
+ case QSPI_IFR_WIDTH_QUAD_CMD:
+ num >>= 1;
+ break;
+ default:
+ return;
+ }
+
+ for (i = 0; i < num; ++i)
+ cmd_buf[len++] = 0;
+ }
+
+ /* Dump the SPI command */
+ print_hex_dump(KERN_DEBUG, "qspi cmd: ", DUMP_PREFIX_NONE,
+ 32, 1, cmd_buf, len, false);
+
+#ifdef VERBOSE_DEBUG
+ /* If verbose debug is enabled, also dump the TX data */
+ if (cmd->enable.bits.data && cmd->tx_buf)
+ print_hex_dump(KERN_DEBUG, "qspi tx : ", DUMP_PREFIX_NONE,
+ 32, 1, cmd->tx_buf, cmd->buf_len, false);
+#endif
+}
+#else
+#define atmel_qspi_debug_command(aq, cmd, ifr)
+#endif
+
+static int atmel_qspi_run_command(struct atmel_qspi *aq,
+ const struct atmel_qspi_command *cmd,
+ u32 ifr_tfrtyp, u32 ifr_width)
+{
+ u32 iar, icr, ifr, sr;
+ int err = 0;
+
+ iar = 0;
+ icr = 0;
+ ifr = ifr_tfrtyp | ifr_width;
+
+ /* Compute instruction parameters */
+ if (cmd->enable.bits.instruction) {
+ icr |= QSPI_ICR_INST(cmd->instruction);
+ ifr |= QSPI_IFR_INSTEN;
+ }
+
+ /* Compute address parameters */
+ switch (cmd->enable.bits.address) {
+ case 4:
+ ifr |= QSPI_IFR_ADDRL;
+ /* fall through to the 24bit (3 byte) address case. */
+ case 3:
+ iar = (cmd->enable.bits.data) ? 0 : cmd->address;
+ ifr |= QSPI_IFR_ADDREN;
+ break;
+ case 0:
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* Compute option parameters */
+ if (cmd->enable.bits.mode && cmd->num_mode_cycles) {
+ u32 mode_cycle_bits, mode_bits;
+
+ icr |= QSPI_ICR_OPT(cmd->mode);
+ ifr |= QSPI_IFR_OPTEN;
+
+ switch (ifr & QSPI_IFR_WIDTH_MASK) {
+ case QSPI_IFR_WIDTH_SINGLE_BIT_SPI:
+ case QSPI_IFR_WIDTH_DUAL_OUTPUT:
+ case QSPI_IFR_WIDTH_QUAD_OUTPUT:
+ mode_cycle_bits = 1;
+ break;
+ case QSPI_IFR_WIDTH_DUAL_IO:
+ case QSPI_IFR_WIDTH_DUAL_CMD:
+ mode_cycle_bits = 2;
+ break;
+ case QSPI_IFR_WIDTH_QUAD_IO:
+ case QSPI_IFR_WIDTH_QUAD_CMD:
+ mode_cycle_bits = 4;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ mode_bits = cmd->num_mode_cycles * mode_cycle_bits;
+ switch (mode_bits) {
+ case 1:
+ ifr |= QSPI_IFR_OPTL_1BIT;
+ break;
+
+ case 2:
+ ifr |= QSPI_IFR_OPTL_2BIT;
+ break;
+
+ case 4:
+ ifr |= QSPI_IFR_OPTL_4BIT;
+ break;
+
+ case 8:
+ ifr |= QSPI_IFR_OPTL_8BIT;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+ }
+
+ /* Set number of dummy cycles */
+ if (cmd->enable.bits.dummy)
+ ifr |= QSPI_IFR_NBDUM(cmd->num_dummy_cycles);
+
+ /* Set data enable */
+ if (cmd->enable.bits.data) {
+ ifr |= QSPI_IFR_DATAEN;
+
+ /* Special case for Continuous Read Mode */
+ if (!cmd->tx_buf && !cmd->rx_buf)
+ ifr |= QSPI_IFR_CRM;
+ }
+
+ /* Clear pending interrupts */
+ (void)qspi_readl(aq, QSPI_SR);
+
+ /* Set QSPI Instruction Frame registers */
+ atmel_qspi_debug_command(aq, cmd, ifr);
+ qspi_writel(aq, QSPI_IAR, iar);
+ qspi_writel(aq, QSPI_ICR, icr);
+ qspi_writel(aq, QSPI_IFR, ifr);
+
+ /* Skip to the final steps if there is no data */
+ if (!cmd->enable.bits.data)
+ goto no_data;
+
+ /* Dummy read of QSPI_IFR to synchronize APB and AHB accesses */
+ (void)qspi_readl(aq, QSPI_IFR);
+
+ /* Stop here for continuous read */
+ if (!cmd->tx_buf && !cmd->rx_buf)
+ return 0;
+ /* Send/Receive data */
+ err = atmel_qspi_run_transfer(aq, cmd);
+
+ /* Release the chip-select */
+ qspi_writel(aq, QSPI_CR, QSPI_CR_LASTXFER);
+
+ if (err)
+ return err;
+
+#if defined(DEBUG) && defined(VERBOSE_DEBUG)
+ /*
+ * If verbose debug is enabled, also dump the RX data in addition to
+ * the SPI command previously dumped by atmel_qspi_debug_command()
+ */
+ if (cmd->rx_buf)
+ print_hex_dump(KERN_DEBUG, "qspi rx : ", DUMP_PREFIX_NONE,
+ 32, 1, cmd->rx_buf, cmd->buf_len, false);
+#endif
+no_data:
+ /* Poll INSTRuction End status */
+ sr = qspi_readl(aq, QSPI_SR);
+ if ((sr & QSPI_SR_CMD_COMPLETED) == QSPI_SR_CMD_COMPLETED)
+ return err;
+
+ /* Wait for INSTRuction End interrupt */
+ reinit_completion(&aq->cmd_completion);
+ aq->pending = sr & QSPI_SR_CMD_COMPLETED;
+ qspi_writel(aq, QSPI_IER, QSPI_SR_CMD_COMPLETED);
+ if (!wait_for_completion_timeout(&aq->cmd_completion,
+ msecs_to_jiffies(1000)))
+ err = -ETIMEDOUT;
+ qspi_writel(aq, QSPI_IDR, QSPI_SR_CMD_COMPLETED);
+
+ return err;
+}
+
+static int atmel_qspi_read_reg(struct spi_nor *nor, u8 opcode,
+ u8 *buf, int len)
+{
+ struct atmel_qspi *aq = nor->priv;
+ struct atmel_qspi_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.enable.bits.instruction = 1;
+ cmd.enable.bits.data = 1;
+ cmd.instruction = opcode;
+ cmd.rx_buf = buf;
+ cmd.buf_len = len;
+ return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_READ,
+ QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+}
+
+static int atmel_qspi_write_reg(struct spi_nor *nor, u8 opcode,
+ u8 *buf, int len)
+{
+ struct atmel_qspi *aq = nor->priv;
+ struct atmel_qspi_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.enable.bits.instruction = 1;
+ cmd.enable.bits.data = (buf != NULL && len > 0);
+ cmd.instruction = opcode;
+ cmd.tx_buf = buf;
+ cmd.buf_len = len;
+ return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE,
+ QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+}
+
+static ssize_t atmel_qspi_write(struct spi_nor *nor, loff_t to, size_t len,
+ const u_char *write_buf)
+{
+ struct atmel_qspi *aq = nor->priv;
+ struct atmel_qspi_command cmd;
+ ssize_t ret;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.enable.bits.instruction = 1;
+ cmd.enable.bits.address = nor->addr_width;
+ cmd.enable.bits.data = 1;
+ cmd.instruction = nor->program_opcode;
+ cmd.address = (u32)to;
+ cmd.tx_buf = write_buf;
+ cmd.buf_len = len;
+ ret = atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE_MEM,
+ QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+ return (ret < 0) ? ret : len;
+}
+
+static int atmel_qspi_erase(struct spi_nor *nor, loff_t offs)
+{
+ struct atmel_qspi *aq = nor->priv;
+ struct atmel_qspi_command cmd;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.enable.bits.instruction = 1;
+ cmd.enable.bits.address = nor->addr_width;
+ cmd.instruction = nor->erase_opcode;
+ cmd.address = (u32)offs;
+ return atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_WRITE,
+ QSPI_IFR_WIDTH_SINGLE_BIT_SPI);
+}
+
+static ssize_t atmel_qspi_read(struct spi_nor *nor, loff_t from, size_t len,
+ u_char *read_buf)
+{
+ struct atmel_qspi *aq = nor->priv;
+ struct atmel_qspi_command cmd;
+ u8 num_mode_cycles, num_dummy_cycles;
+ u32 ifr_width;
+ ssize_t ret;
+
+ switch (nor->flash_read) {
+ case SPI_NOR_NORMAL:
+ case SPI_NOR_FAST:
+ ifr_width = QSPI_IFR_WIDTH_SINGLE_BIT_SPI;
+ break;
+
+ case SPI_NOR_DUAL:
+ ifr_width = QSPI_IFR_WIDTH_DUAL_OUTPUT;
+ break;
+
+ case SPI_NOR_QUAD:
+ ifr_width = QSPI_IFR_WIDTH_QUAD_OUTPUT;
+ break;
+
+ default:
+ return -EINVAL;
+ }
+
+ if (nor->read_dummy >= 2) {
+ num_mode_cycles = 2;
+ num_dummy_cycles = nor->read_dummy - 2;
+ } else {
+ num_mode_cycles = nor->read_dummy;
+ num_dummy_cycles = 0;
+ }
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.enable.bits.instruction = 1;
+ cmd.enable.bits.address = nor->addr_width;
+ cmd.enable.bits.mode = (num_mode_cycles > 0);
+ cmd.enable.bits.dummy = (num_dummy_cycles > 0);
+ cmd.enable.bits.data = 1;
+ cmd.instruction = nor->read_opcode;
+ cmd.address = (u32)from;
+ cmd.mode = 0xff; /* This value prevents from entering the 0-4-4 mode */
+ cmd.num_mode_cycles = num_mode_cycles;
+ cmd.num_dummy_cycles = num_dummy_cycles;
+ cmd.rx_buf = read_buf;
+ cmd.buf_len = len;
+ ret = atmel_qspi_run_command(aq, &cmd, QSPI_IFR_TFRTYP_TRSFR_READ_MEM,
+ ifr_width);
+ return (ret < 0) ? ret : len;
+}
+
+static int atmel_qspi_init(struct atmel_qspi *aq)
+{
+ unsigned long src_rate;
+ u32 mr, scr, scbr;
+
+ /* Reset the QSPI controller */
+ qspi_writel(aq, QSPI_CR, QSPI_CR_SWRST);
+
+ /* Set the QSPI controller in Serial Memory Mode */
+ mr = QSPI_MR_NBBITS(8) | QSPI_MR_SSM;
+ qspi_writel(aq, QSPI_MR, mr);
+
+ src_rate = clk_get_rate(aq->clk);
+ if (!src_rate)
+ return -EINVAL;
+
+ /* Compute the QSPI baudrate */
+ scbr = DIV_ROUND_UP(src_rate, aq->clk_rate);
+ if (scbr > 0)
+ scbr--;
+ scr = QSPI_SCR_SCBR(scbr);
+ qspi_writel(aq, QSPI_SCR, scr);
+
+ /* Enable the QSPI controller */
+ qspi_writel(aq, QSPI_CR, QSPI_CR_QSPIEN);
+
+ return 0;
+}
+
+static irqreturn_t atmel_qspi_interrupt(int irq, void *dev_id)
+{
+ struct atmel_qspi *aq = (struct atmel_qspi *)dev_id;
+ u32 status, mask, pending;
+
+ status = qspi_readl(aq, QSPI_SR);
+ mask = qspi_readl(aq, QSPI_IMR);
+ pending = status & mask;
+
+ if (!pending)
+ return IRQ_NONE;
+
+ aq->pending |= pending;
+ if ((aq->pending & QSPI_SR_CMD_COMPLETED) == QSPI_SR_CMD_COMPLETED)
+ complete(&aq->cmd_completion);
+
+ return IRQ_HANDLED;
+}
+
+static int atmel_qspi_probe(struct platform_device *pdev)
+{
+ struct device_node *child, *np = pdev->dev.of_node;
+ struct atmel_qspi *aq;
+ struct resource *res;
+ struct spi_nor *nor;
+ struct mtd_info *mtd;
+ int irq, err = 0;
+
+ if (of_get_child_count(np) != 1)
+ return -ENODEV;
+ child = of_get_next_child(np, NULL);
+
+ aq = devm_kzalloc(&pdev->dev, sizeof(*aq), GFP_KERNEL);
+ if (!aq) {
+ err = -ENOMEM;
+ goto exit;
+ }
+
+ platform_set_drvdata(pdev, aq);
+ init_completion(&aq->cmd_completion);
+ aq->pdev = pdev;
+
+ /* Map the registers */
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_base");
+ aq->regs = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(aq->regs)) {
+ dev_err(&pdev->dev, "missing registers\n");
+ err = PTR_ERR(aq->regs);
+ goto exit;
+ }
+
+ /* Map the AHB memory */
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "qspi_mmap");
+ aq->mem = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(aq->mem)) {
+ dev_err(&pdev->dev, "missing AHB memory\n");
+ err = PTR_ERR(aq->mem);
+ goto exit;
+ }
+
+ /* Get the peripheral clock */
+ aq->clk = devm_clk_get(&pdev->dev, NULL);
+ if (IS_ERR(aq->clk)) {
+ dev_err(&pdev->dev, "missing peripheral clock\n");
+ err = PTR_ERR(aq->clk);
+ goto exit;
+ }
+
+ /* Enable the peripheral clock */
+ err = clk_prepare_enable(aq->clk);
+ if (err) {
+ dev_err(&pdev->dev, "failed to enable the peripheral clock\n");
+ goto exit;
+ }
+
+ /* Request the IRQ */
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(&pdev->dev, "missing IRQ\n");
+ err = irq;
+ goto disable_clk;
+ }
+ err = devm_request_irq(&pdev->dev, irq, atmel_qspi_interrupt,
+ 0, dev_name(&pdev->dev), aq);
+ if (err)
+ goto disable_clk;
+
+ /* Setup the spi-nor */
+ nor = &aq->nor;
+ mtd = &nor->mtd;
+
+ nor->dev = &pdev->dev;
+ spi_nor_set_flash_node(nor, child);
+ nor->priv = aq;
+ mtd->priv = nor;
+
+ nor->read_reg = atmel_qspi_read_reg;
+ nor->write_reg = atmel_qspi_write_reg;
+ nor->read = atmel_qspi_read;
+ nor->write = atmel_qspi_write;
+ nor->erase = atmel_qspi_erase;
+
+ err = of_property_read_u32(child, "spi-max-frequency", &aq->clk_rate);
+ if (err < 0)
+ goto disable_clk;
+
+ err = atmel_qspi_init(aq);
+ if (err)
+ goto disable_clk;
+
+ err = spi_nor_scan(nor, NULL, SPI_NOR_QUAD);
+ if (err)
+ goto disable_clk;
+
+ err = mtd_device_register(mtd, NULL, 0);
+ if (err)
+ goto disable_clk;
+
+ of_node_put(child);
+
+ return 0;
+
+disable_clk:
+ clk_disable_unprepare(aq->clk);
+exit:
+ of_node_put(child);
+
+ return err;
+}
+
+static int atmel_qspi_remove(struct platform_device *pdev)
+{
+ struct atmel_qspi *aq = platform_get_drvdata(pdev);
+
+ mtd_device_unregister(&aq->nor.mtd);
+ qspi_writel(aq, QSPI_CR, QSPI_CR_QSPIDIS);
+ clk_disable_unprepare(aq->clk);
+ return 0;
+}
+
+
+static const struct of_device_id atmel_qspi_dt_ids[] = {
+ { .compatible = "atmel,sama5d2-qspi" },
+ { /* sentinel */ }
+};
+
+MODULE_DEVICE_TABLE(of, atmel_qspi_dt_ids);
+
+static struct platform_driver atmel_qspi_driver = {
+ .driver = {
+ .name = "atmel_qspi",
+ .of_match_table = atmel_qspi_dt_ids,
+ },
+ .probe = atmel_qspi_probe,
+ .remove = atmel_qspi_remove,
+};
+module_platform_driver(atmel_qspi_driver);
+
+MODULE_AUTHOR("Cyrille Pitchen <cyrille.pitchen@atmel.com>");
+MODULE_DESCRIPTION("Atmel QSPI Controller driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/mtd/spi-nor/cadence-quadspi.c b/drivers/mtd/spi-nor/cadence-quadspi.c
new file mode 100644
index 000000000000..d403ba7b8f43
--- /dev/null
+++ b/drivers/mtd/spi-nor/cadence-quadspi.c
@@ -0,0 +1,1299 @@
+/*
+ * Driver for Cadence QSPI Controller
+ *
+ * Copyright Altera Corporation (C) 2012-2014. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/clk.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/partitions.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/of_device.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/spi/spi.h>
+#include <linux/timer.h>
+
+#define CQSPI_NAME "cadence-qspi"
+#define CQSPI_MAX_CHIPSELECT 16
+
+struct cqspi_st;
+
+struct cqspi_flash_pdata {
+ struct spi_nor nor;
+ struct cqspi_st *cqspi;
+ u32 clk_rate;
+ u32 read_delay;
+ u32 tshsl_ns;
+ u32 tsd2d_ns;
+ u32 tchsh_ns;
+ u32 tslch_ns;
+ u8 inst_width;
+ u8 addr_width;
+ u8 data_width;
+ u8 cs;
+ bool registered;
+};
+
+struct cqspi_st {
+ struct platform_device *pdev;
+
+ struct clk *clk;
+ unsigned int sclk;
+
+ void __iomem *iobase;
+ void __iomem *ahb_base;
+ struct completion transfer_complete;
+ struct mutex bus_mutex;
+
+ int current_cs;
+ int current_page_size;
+ int current_erase_size;
+ int current_addr_width;
+ unsigned long master_ref_clk_hz;
+ bool is_decoded_cs;
+ u32 fifo_depth;
+ u32 fifo_width;
+ u32 trigger_address;
+ struct cqspi_flash_pdata f_pdata[CQSPI_MAX_CHIPSELECT];
+};
+
+/* Operation timeout value */
+#define CQSPI_TIMEOUT_MS 500
+#define CQSPI_READ_TIMEOUT_MS 10
+
+/* Instruction type */
+#define CQSPI_INST_TYPE_SINGLE 0
+#define CQSPI_INST_TYPE_DUAL 1
+#define CQSPI_INST_TYPE_QUAD 2
+
+#define CQSPI_DUMMY_CLKS_PER_BYTE 8
+#define CQSPI_DUMMY_BYTES_MAX 4
+#define CQSPI_DUMMY_CLKS_MAX 31
+
+#define CQSPI_STIG_DATA_LEN_MAX 8
+
+/* Register map */
+#define CQSPI_REG_CONFIG 0x00
+#define CQSPI_REG_CONFIG_ENABLE_MASK BIT(0)
+#define CQSPI_REG_CONFIG_DECODE_MASK BIT(9)
+#define CQSPI_REG_CONFIG_CHIPSELECT_LSB 10
+#define CQSPI_REG_CONFIG_DMA_MASK BIT(15)
+#define CQSPI_REG_CONFIG_BAUD_LSB 19
+#define CQSPI_REG_CONFIG_IDLE_LSB 31
+#define CQSPI_REG_CONFIG_CHIPSELECT_MASK 0xF
+#define CQSPI_REG_CONFIG_BAUD_MASK 0xF
+
+#define CQSPI_REG_RD_INSTR 0x04
+#define CQSPI_REG_RD_INSTR_OPCODE_LSB 0
+#define CQSPI_REG_RD_INSTR_TYPE_INSTR_LSB 8
+#define CQSPI_REG_RD_INSTR_TYPE_ADDR_LSB 12
+#define CQSPI_REG_RD_INSTR_TYPE_DATA_LSB 16
+#define CQSPI_REG_RD_INSTR_MODE_EN_LSB 20
+#define CQSPI_REG_RD_INSTR_DUMMY_LSB 24
+#define CQSPI_REG_RD_INSTR_TYPE_INSTR_MASK 0x3
+#define CQSPI_REG_RD_INSTR_TYPE_ADDR_MASK 0x3
+#define CQSPI_REG_RD_INSTR_TYPE_DATA_MASK 0x3
+#define CQSPI_REG_RD_INSTR_DUMMY_MASK 0x1F
+
+#define CQSPI_REG_WR_INSTR 0x08
+#define CQSPI_REG_WR_INSTR_OPCODE_LSB 0
+#define CQSPI_REG_WR_INSTR_TYPE_ADDR_LSB 12
+#define CQSPI_REG_WR_INSTR_TYPE_DATA_LSB 16
+
+#define CQSPI_REG_DELAY 0x0C
+#define CQSPI_REG_DELAY_TSLCH_LSB 0
+#define CQSPI_REG_DELAY_TCHSH_LSB 8
+#define CQSPI_REG_DELAY_TSD2D_LSB 16
+#define CQSPI_REG_DELAY_TSHSL_LSB 24
+#define CQSPI_REG_DELAY_TSLCH_MASK 0xFF
+#define CQSPI_REG_DELAY_TCHSH_MASK 0xFF
+#define CQSPI_REG_DELAY_TSD2D_MASK 0xFF
+#define CQSPI_REG_DELAY_TSHSL_MASK 0xFF
+
+#define CQSPI_REG_READCAPTURE 0x10
+#define CQSPI_REG_READCAPTURE_BYPASS_LSB 0
+#define CQSPI_REG_READCAPTURE_DELAY_LSB 1
+#define CQSPI_REG_READCAPTURE_DELAY_MASK 0xF
+
+#define CQSPI_REG_SIZE 0x14
+#define CQSPI_REG_SIZE_ADDRESS_LSB 0
+#define CQSPI_REG_SIZE_PAGE_LSB 4
+#define CQSPI_REG_SIZE_BLOCK_LSB 16
+#define CQSPI_REG_SIZE_ADDRESS_MASK 0xF
+#define CQSPI_REG_SIZE_PAGE_MASK 0xFFF
+#define CQSPI_REG_SIZE_BLOCK_MASK 0x3F
+
+#define CQSPI_REG_SRAMPARTITION 0x18
+#define CQSPI_REG_INDIRECTTRIGGER 0x1C
+
+#define CQSPI_REG_DMA 0x20
+#define CQSPI_REG_DMA_SINGLE_LSB 0
+#define CQSPI_REG_DMA_BURST_LSB 8
+#define CQSPI_REG_DMA_SINGLE_MASK 0xFF
+#define CQSPI_REG_DMA_BURST_MASK 0xFF
+
+#define CQSPI_REG_REMAP 0x24
+#define CQSPI_REG_MODE_BIT 0x28
+
+#define CQSPI_REG_SDRAMLEVEL 0x2C
+#define CQSPI_REG_SDRAMLEVEL_RD_LSB 0
+#define CQSPI_REG_SDRAMLEVEL_WR_LSB 16
+#define CQSPI_REG_SDRAMLEVEL_RD_MASK 0xFFFF
+#define CQSPI_REG_SDRAMLEVEL_WR_MASK 0xFFFF
+
+#define CQSPI_REG_IRQSTATUS 0x40
+#define CQSPI_REG_IRQMASK 0x44
+
+#define CQSPI_REG_INDIRECTRD 0x60
+#define CQSPI_REG_INDIRECTRD_START_MASK BIT(0)
+#define CQSPI_REG_INDIRECTRD_CANCEL_MASK BIT(1)
+#define CQSPI_REG_INDIRECTRD_DONE_MASK BIT(5)
+
+#define CQSPI_REG_INDIRECTRDWATERMARK 0x64
+#define CQSPI_REG_INDIRECTRDSTARTADDR 0x68
+#define CQSPI_REG_INDIRECTRDBYTES 0x6C
+
+#define CQSPI_REG_CMDCTRL 0x90
+#define CQSPI_REG_CMDCTRL_EXECUTE_MASK BIT(0)
+#define CQSPI_REG_CMDCTRL_INPROGRESS_MASK BIT(1)
+#define CQSPI_REG_CMDCTRL_WR_BYTES_LSB 12
+#define CQSPI_REG_CMDCTRL_WR_EN_LSB 15
+#define CQSPI_REG_CMDCTRL_ADD_BYTES_LSB 16
+#define CQSPI_REG_CMDCTRL_ADDR_EN_LSB 19
+#define CQSPI_REG_CMDCTRL_RD_BYTES_LSB 20
+#define CQSPI_REG_CMDCTRL_RD_EN_LSB 23
+#define CQSPI_REG_CMDCTRL_OPCODE_LSB 24
+#define CQSPI_REG_CMDCTRL_WR_BYTES_MASK 0x7
+#define CQSPI_REG_CMDCTRL_ADD_BYTES_MASK 0x3
+#define CQSPI_REG_CMDCTRL_RD_BYTES_MASK 0x7
+
+#define CQSPI_REG_INDIRECTWR 0x70
+#define CQSPI_REG_INDIRECTWR_START_MASK BIT(0)
+#define CQSPI_REG_INDIRECTWR_CANCEL_MASK BIT(1)
+#define CQSPI_REG_INDIRECTWR_DONE_MASK BIT(5)
+
+#define CQSPI_REG_INDIRECTWRWATERMARK 0x74
+#define CQSPI_REG_INDIRECTWRSTARTADDR 0x78
+#define CQSPI_REG_INDIRECTWRBYTES 0x7C
+
+#define CQSPI_REG_CMDADDRESS 0x94
+#define CQSPI_REG_CMDREADDATALOWER 0xA0
+#define CQSPI_REG_CMDREADDATAUPPER 0xA4
+#define CQSPI_REG_CMDWRITEDATALOWER 0xA8
+#define CQSPI_REG_CMDWRITEDATAUPPER 0xAC
+
+/* Interrupt status bits */
+#define CQSPI_REG_IRQ_MODE_ERR BIT(0)
+#define CQSPI_REG_IRQ_UNDERFLOW BIT(1)
+#define CQSPI_REG_IRQ_IND_COMP BIT(2)
+#define CQSPI_REG_IRQ_IND_RD_REJECT BIT(3)
+#define CQSPI_REG_IRQ_WR_PROTECTED_ERR BIT(4)
+#define CQSPI_REG_IRQ_ILLEGAL_AHB_ERR BIT(5)
+#define CQSPI_REG_IRQ_WATERMARK BIT(6)
+#define CQSPI_REG_IRQ_IND_SRAM_FULL BIT(12)
+
+#define CQSPI_IRQ_MASK_RD (CQSPI_REG_IRQ_WATERMARK | \
+ CQSPI_REG_IRQ_IND_SRAM_FULL | \
+ CQSPI_REG_IRQ_IND_COMP)
+
+#define CQSPI_IRQ_MASK_WR (CQSPI_REG_IRQ_IND_COMP | \
+ CQSPI_REG_IRQ_WATERMARK | \
+ CQSPI_REG_IRQ_UNDERFLOW)
+
+#define CQSPI_IRQ_STATUS_MASK 0x1FFFF
+
+static int cqspi_wait_for_bit(void __iomem *reg, const u32 mask, bool clear)
+{
+ unsigned long end = jiffies + msecs_to_jiffies(CQSPI_TIMEOUT_MS);
+ u32 val;
+
+ while (1) {
+ val = readl(reg);
+ if (clear)
+ val = ~val;
+ val &= mask;
+
+ if (val == mask)
+ return 0;
+
+ if (time_after(jiffies, end))
+ return -ETIMEDOUT;
+ }
+}
+
+static bool cqspi_is_idle(struct cqspi_st *cqspi)
+{
+ u32 reg = readl(cqspi->iobase + CQSPI_REG_CONFIG);
+
+ return reg & (1 << CQSPI_REG_CONFIG_IDLE_LSB);
+}
+
+static u32 cqspi_get_rd_sram_level(struct cqspi_st *cqspi)
+{
+ u32 reg = readl(cqspi->iobase + CQSPI_REG_SDRAMLEVEL);
+
+ reg >>= CQSPI_REG_SDRAMLEVEL_RD_LSB;
+ return reg & CQSPI_REG_SDRAMLEVEL_RD_MASK;
+}
+
+static irqreturn_t cqspi_irq_handler(int this_irq, void *dev)
+{
+ struct cqspi_st *cqspi = dev;
+ unsigned int irq_status;
+
+ /* Read interrupt status */
+ irq_status = readl(cqspi->iobase + CQSPI_REG_IRQSTATUS);
+
+ /* Clear interrupt */
+ writel(irq_status, cqspi->iobase + CQSPI_REG_IRQSTATUS);
+
+ irq_status &= CQSPI_IRQ_MASK_RD | CQSPI_IRQ_MASK_WR;
+
+ if (irq_status)
+ complete(&cqspi->transfer_complete);
+
+ return IRQ_HANDLED;
+}
+
+static unsigned int cqspi_calc_rdreg(struct spi_nor *nor, const u8 opcode)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ u32 rdreg = 0;
+
+ rdreg |= f_pdata->inst_width << CQSPI_REG_RD_INSTR_TYPE_INSTR_LSB;
+ rdreg |= f_pdata->addr_width << CQSPI_REG_RD_INSTR_TYPE_ADDR_LSB;
+ rdreg |= f_pdata->data_width << CQSPI_REG_RD_INSTR_TYPE_DATA_LSB;
+
+ return rdreg;
+}
+
+static int cqspi_wait_idle(struct cqspi_st *cqspi)
+{
+ const unsigned int poll_idle_retry = 3;
+ unsigned int count = 0;
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(CQSPI_TIMEOUT_MS);
+ while (1) {
+ /*
+ * Read few times in succession to ensure the controller
+ * is indeed idle, that is, the bit does not transition
+ * low again.
+ */
+ if (cqspi_is_idle(cqspi))
+ count++;
+ else
+ count = 0;
+
+ if (count >= poll_idle_retry)
+ return 0;
+
+ if (time_after(jiffies, timeout)) {
+ /* Timeout, in busy mode. */
+ dev_err(&cqspi->pdev->dev,
+ "QSPI is still busy after %dms timeout.\n",
+ CQSPI_TIMEOUT_MS);
+ return -ETIMEDOUT;
+ }
+
+ cpu_relax();
+ }
+}
+
+static int cqspi_exec_flash_cmd(struct cqspi_st *cqspi, unsigned int reg)
+{
+ void __iomem *reg_base = cqspi->iobase;
+ int ret;
+
+ /* Write the CMDCTRL without start execution. */
+ writel(reg, reg_base + CQSPI_REG_CMDCTRL);
+ /* Start execute */
+ reg |= CQSPI_REG_CMDCTRL_EXECUTE_MASK;
+ writel(reg, reg_base + CQSPI_REG_CMDCTRL);
+
+ /* Polling for completion. */
+ ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_CMDCTRL,
+ CQSPI_REG_CMDCTRL_INPROGRESS_MASK, 1);
+ if (ret) {
+ dev_err(&cqspi->pdev->dev,
+ "Flash command execution timed out.\n");
+ return ret;
+ }
+
+ /* Polling QSPI idle status. */
+ return cqspi_wait_idle(cqspi);
+}
+
+static int cqspi_command_read(struct spi_nor *nor,
+ const u8 *txbuf, const unsigned n_tx,
+ u8 *rxbuf, const unsigned n_rx)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int rdreg;
+ unsigned int reg;
+ unsigned int read_len;
+ int status;
+
+ if (!n_rx || n_rx > CQSPI_STIG_DATA_LEN_MAX || !rxbuf) {
+ dev_err(nor->dev, "Invalid input argument, len %d rxbuf 0x%p\n",
+ n_rx, rxbuf);
+ return -EINVAL;
+ }
+
+ reg = txbuf[0] << CQSPI_REG_CMDCTRL_OPCODE_LSB;
+
+ rdreg = cqspi_calc_rdreg(nor, txbuf[0]);
+ writel(rdreg, reg_base + CQSPI_REG_RD_INSTR);
+
+ reg |= (0x1 << CQSPI_REG_CMDCTRL_RD_EN_LSB);
+
+ /* 0 means 1 byte. */
+ reg |= (((n_rx - 1) & CQSPI_REG_CMDCTRL_RD_BYTES_MASK)
+ << CQSPI_REG_CMDCTRL_RD_BYTES_LSB);
+ status = cqspi_exec_flash_cmd(cqspi, reg);
+ if (status)
+ return status;
+
+ reg = readl(reg_base + CQSPI_REG_CMDREADDATALOWER);
+
+ /* Put the read value into rx_buf */
+ read_len = (n_rx > 4) ? 4 : n_rx;
+ memcpy(rxbuf, &reg, read_len);
+ rxbuf += read_len;
+
+ if (n_rx > 4) {
+ reg = readl(reg_base + CQSPI_REG_CMDREADDATAUPPER);
+
+ read_len = n_rx - read_len;
+ memcpy(rxbuf, &reg, read_len);
+ }
+
+ return 0;
+}
+
+static int cqspi_command_write(struct spi_nor *nor, const u8 opcode,
+ const u8 *txbuf, const unsigned n_tx)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int reg;
+ unsigned int data;
+ int ret;
+
+ if (n_tx > 4 || (n_tx && !txbuf)) {
+ dev_err(nor->dev,
+ "Invalid input argument, cmdlen %d txbuf 0x%p\n",
+ n_tx, txbuf);
+ return -EINVAL;
+ }
+
+ reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB;
+ if (n_tx) {
+ reg |= (0x1 << CQSPI_REG_CMDCTRL_WR_EN_LSB);
+ reg |= ((n_tx - 1) & CQSPI_REG_CMDCTRL_WR_BYTES_MASK)
+ << CQSPI_REG_CMDCTRL_WR_BYTES_LSB;
+ data = 0;
+ memcpy(&data, txbuf, n_tx);
+ writel(data, reg_base + CQSPI_REG_CMDWRITEDATALOWER);
+ }
+
+ ret = cqspi_exec_flash_cmd(cqspi, reg);
+ return ret;
+}
+
+static int cqspi_command_write_addr(struct spi_nor *nor,
+ const u8 opcode, const unsigned int addr)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int reg;
+
+ reg = opcode << CQSPI_REG_CMDCTRL_OPCODE_LSB;
+ reg |= (0x1 << CQSPI_REG_CMDCTRL_ADDR_EN_LSB);
+ reg |= ((nor->addr_width - 1) & CQSPI_REG_CMDCTRL_ADD_BYTES_MASK)
+ << CQSPI_REG_CMDCTRL_ADD_BYTES_LSB;
+
+ writel(addr, reg_base + CQSPI_REG_CMDADDRESS);
+
+ return cqspi_exec_flash_cmd(cqspi, reg);
+}
+
+static int cqspi_indirect_read_setup(struct spi_nor *nor,
+ const unsigned int from_addr)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int dummy_clk = 0;
+ unsigned int reg;
+
+ writel(from_addr, reg_base + CQSPI_REG_INDIRECTRDSTARTADDR);
+
+ reg = nor->read_opcode << CQSPI_REG_RD_INSTR_OPCODE_LSB;
+ reg |= cqspi_calc_rdreg(nor, nor->read_opcode);
+
+ /* Setup dummy clock cycles */
+ dummy_clk = nor->read_dummy;
+ if (dummy_clk > CQSPI_DUMMY_CLKS_MAX)
+ dummy_clk = CQSPI_DUMMY_CLKS_MAX;
+
+ if (dummy_clk / 8) {
+ reg |= (1 << CQSPI_REG_RD_INSTR_MODE_EN_LSB);
+ /* Set mode bits high to ensure chip doesn't enter XIP */
+ writel(0xFF, reg_base + CQSPI_REG_MODE_BIT);
+
+ /* Need to subtract the mode byte (8 clocks). */
+ if (f_pdata->inst_width != CQSPI_INST_TYPE_QUAD)
+ dummy_clk -= 8;
+
+ if (dummy_clk)
+ reg |= (dummy_clk & CQSPI_REG_RD_INSTR_DUMMY_MASK)
+ << CQSPI_REG_RD_INSTR_DUMMY_LSB;
+ }
+
+ writel(reg, reg_base + CQSPI_REG_RD_INSTR);
+
+ /* Set address width */
+ reg = readl(reg_base + CQSPI_REG_SIZE);
+ reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+ reg |= (nor->addr_width - 1);
+ writel(reg, reg_base + CQSPI_REG_SIZE);
+ return 0;
+}
+
+static int cqspi_indirect_read_execute(struct spi_nor *nor,
+ u8 *rxbuf, const unsigned n_rx)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ void __iomem *ahb_base = cqspi->ahb_base;
+ unsigned int remaining = n_rx;
+ unsigned int bytes_to_read = 0;
+ int ret = 0;
+
+ writel(remaining, reg_base + CQSPI_REG_INDIRECTRDBYTES);
+
+ /* Clear all interrupts. */
+ writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS);
+
+ writel(CQSPI_IRQ_MASK_RD, reg_base + CQSPI_REG_IRQMASK);
+
+ reinit_completion(&cqspi->transfer_complete);
+ writel(CQSPI_REG_INDIRECTRD_START_MASK,
+ reg_base + CQSPI_REG_INDIRECTRD);
+
+ while (remaining > 0) {
+ ret = wait_for_completion_timeout(&cqspi->transfer_complete,
+ msecs_to_jiffies
+ (CQSPI_READ_TIMEOUT_MS));
+
+ bytes_to_read = cqspi_get_rd_sram_level(cqspi);
+
+ if (!ret && bytes_to_read == 0) {
+ dev_err(nor->dev, "Indirect read timeout, no bytes\n");
+ ret = -ETIMEDOUT;
+ goto failrd;
+ }
+
+ while (bytes_to_read != 0) {
+ bytes_to_read *= cqspi->fifo_width;
+ bytes_to_read = bytes_to_read > remaining ?
+ remaining : bytes_to_read;
+ readsl(ahb_base, rxbuf, DIV_ROUND_UP(bytes_to_read, 4));
+ rxbuf += bytes_to_read;
+ remaining -= bytes_to_read;
+ bytes_to_read = cqspi_get_rd_sram_level(cqspi);
+ }
+
+ if (remaining > 0)
+ reinit_completion(&cqspi->transfer_complete);
+ }
+
+ /* Check indirect done status */
+ ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTRD,
+ CQSPI_REG_INDIRECTRD_DONE_MASK, 0);
+ if (ret) {
+ dev_err(nor->dev,
+ "Indirect read completion error (%i)\n", ret);
+ goto failrd;
+ }
+
+ /* Disable interrupt */
+ writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+ /* Clear indirect completion status */
+ writel(CQSPI_REG_INDIRECTRD_DONE_MASK, reg_base + CQSPI_REG_INDIRECTRD);
+
+ return 0;
+
+failrd:
+ /* Disable interrupt */
+ writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+ /* Cancel the indirect read */
+ writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK,
+ reg_base + CQSPI_REG_INDIRECTRD);
+ return ret;
+}
+
+static int cqspi_indirect_write_setup(struct spi_nor *nor,
+ const unsigned int to_addr)
+{
+ unsigned int reg;
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+
+ /* Set opcode. */
+ reg = nor->program_opcode << CQSPI_REG_WR_INSTR_OPCODE_LSB;
+ writel(reg, reg_base + CQSPI_REG_WR_INSTR);
+ reg = cqspi_calc_rdreg(nor, nor->program_opcode);
+ writel(reg, reg_base + CQSPI_REG_RD_INSTR);
+
+ writel(to_addr, reg_base + CQSPI_REG_INDIRECTWRSTARTADDR);
+
+ reg = readl(reg_base + CQSPI_REG_SIZE);
+ reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+ reg |= (nor->addr_width - 1);
+ writel(reg, reg_base + CQSPI_REG_SIZE);
+ return 0;
+}
+
+static int cqspi_indirect_write_execute(struct spi_nor *nor,
+ const u8 *txbuf, const unsigned n_tx)
+{
+ const unsigned int page_size = nor->page_size;
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int remaining = n_tx;
+ unsigned int write_bytes;
+ int ret;
+
+ writel(remaining, reg_base + CQSPI_REG_INDIRECTWRBYTES);
+
+ /* Clear all interrupts. */
+ writel(CQSPI_IRQ_STATUS_MASK, reg_base + CQSPI_REG_IRQSTATUS);
+
+ writel(CQSPI_IRQ_MASK_WR, reg_base + CQSPI_REG_IRQMASK);
+
+ reinit_completion(&cqspi->transfer_complete);
+ writel(CQSPI_REG_INDIRECTWR_START_MASK,
+ reg_base + CQSPI_REG_INDIRECTWR);
+
+ while (remaining > 0) {
+ write_bytes = remaining > page_size ? page_size : remaining;
+ writesl(cqspi->ahb_base, txbuf, DIV_ROUND_UP(write_bytes, 4));
+
+ ret = wait_for_completion_timeout(&cqspi->transfer_complete,
+ msecs_to_jiffies
+ (CQSPI_TIMEOUT_MS));
+ if (!ret) {
+ dev_err(nor->dev, "Indirect write timeout\n");
+ ret = -ETIMEDOUT;
+ goto failwr;
+ }
+
+ txbuf += write_bytes;
+ remaining -= write_bytes;
+
+ if (remaining > 0)
+ reinit_completion(&cqspi->transfer_complete);
+ }
+
+ /* Check indirect done status */
+ ret = cqspi_wait_for_bit(reg_base + CQSPI_REG_INDIRECTWR,
+ CQSPI_REG_INDIRECTWR_DONE_MASK, 0);
+ if (ret) {
+ dev_err(nor->dev,
+ "Indirect write completion error (%i)\n", ret);
+ goto failwr;
+ }
+
+ /* Disable interrupt. */
+ writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+ /* Clear indirect completion status */
+ writel(CQSPI_REG_INDIRECTWR_DONE_MASK, reg_base + CQSPI_REG_INDIRECTWR);
+
+ cqspi_wait_idle(cqspi);
+
+ return 0;
+
+failwr:
+ /* Disable interrupt. */
+ writel(0, reg_base + CQSPI_REG_IRQMASK);
+
+ /* Cancel the indirect write */
+ writel(CQSPI_REG_INDIRECTWR_CANCEL_MASK,
+ reg_base + CQSPI_REG_INDIRECTWR);
+ return ret;
+}
+
+static void cqspi_chipselect(struct spi_nor *nor)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int chip_select = f_pdata->cs;
+ unsigned int reg;
+
+ reg = readl(reg_base + CQSPI_REG_CONFIG);
+ if (cqspi->is_decoded_cs) {
+ reg |= CQSPI_REG_CONFIG_DECODE_MASK;
+ } else {
+ reg &= ~CQSPI_REG_CONFIG_DECODE_MASK;
+
+ /* Convert CS if without decoder.
+ * CS0 to 4b'1110
+ * CS1 to 4b'1101
+ * CS2 to 4b'1011
+ * CS3 to 4b'0111
+ */
+ chip_select = 0xF & ~(1 << chip_select);
+ }
+
+ reg &= ~(CQSPI_REG_CONFIG_CHIPSELECT_MASK
+ << CQSPI_REG_CONFIG_CHIPSELECT_LSB);
+ reg |= (chip_select & CQSPI_REG_CONFIG_CHIPSELECT_MASK)
+ << CQSPI_REG_CONFIG_CHIPSELECT_LSB;
+ writel(reg, reg_base + CQSPI_REG_CONFIG);
+}
+
+static void cqspi_configure_cs_and_sizes(struct spi_nor *nor)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *iobase = cqspi->iobase;
+ unsigned int reg;
+
+ /* configure page size and block size. */
+ reg = readl(iobase + CQSPI_REG_SIZE);
+ reg &= ~(CQSPI_REG_SIZE_PAGE_MASK << CQSPI_REG_SIZE_PAGE_LSB);
+ reg &= ~(CQSPI_REG_SIZE_BLOCK_MASK << CQSPI_REG_SIZE_BLOCK_LSB);
+ reg &= ~CQSPI_REG_SIZE_ADDRESS_MASK;
+ reg |= (nor->page_size << CQSPI_REG_SIZE_PAGE_LSB);
+ reg |= (ilog2(nor->mtd.erasesize) << CQSPI_REG_SIZE_BLOCK_LSB);
+ reg |= (nor->addr_width - 1);
+ writel(reg, iobase + CQSPI_REG_SIZE);
+
+ /* configure the chip select */
+ cqspi_chipselect(nor);
+
+ /* Store the new configuration of the controller */
+ cqspi->current_page_size = nor->page_size;
+ cqspi->current_erase_size = nor->mtd.erasesize;
+ cqspi->current_addr_width = nor->addr_width;
+}
+
+static unsigned int calculate_ticks_for_ns(const unsigned int ref_clk_hz,
+ const unsigned int ns_val)
+{
+ unsigned int ticks;
+
+ ticks = ref_clk_hz / 1000; /* kHz */
+ ticks = DIV_ROUND_UP(ticks * ns_val, 1000000);
+
+ return ticks;
+}
+
+static void cqspi_delay(struct spi_nor *nor)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ void __iomem *iobase = cqspi->iobase;
+ const unsigned int ref_clk_hz = cqspi->master_ref_clk_hz;
+ unsigned int tshsl, tchsh, tslch, tsd2d;
+ unsigned int reg;
+ unsigned int tsclk;
+
+ /* calculate the number of ref ticks for one sclk tick */
+ tsclk = DIV_ROUND_UP(ref_clk_hz, cqspi->sclk);
+
+ tshsl = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tshsl_ns);
+ /* this particular value must be at least one sclk */
+ if (tshsl < tsclk)
+ tshsl = tsclk;
+
+ tchsh = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tchsh_ns);
+ tslch = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tslch_ns);
+ tsd2d = calculate_ticks_for_ns(ref_clk_hz, f_pdata->tsd2d_ns);
+
+ reg = (tshsl & CQSPI_REG_DELAY_TSHSL_MASK)
+ << CQSPI_REG_DELAY_TSHSL_LSB;
+ reg |= (tchsh & CQSPI_REG_DELAY_TCHSH_MASK)
+ << CQSPI_REG_DELAY_TCHSH_LSB;
+ reg |= (tslch & CQSPI_REG_DELAY_TSLCH_MASK)
+ << CQSPI_REG_DELAY_TSLCH_LSB;
+ reg |= (tsd2d & CQSPI_REG_DELAY_TSD2D_MASK)
+ << CQSPI_REG_DELAY_TSD2D_LSB;
+ writel(reg, iobase + CQSPI_REG_DELAY);
+}
+
+static void cqspi_config_baudrate_div(struct cqspi_st *cqspi)
+{
+ const unsigned int ref_clk_hz = cqspi->master_ref_clk_hz;
+ void __iomem *reg_base = cqspi->iobase;
+ u32 reg, div;
+
+ /* Recalculate the baudrate divisor based on QSPI specification. */
+ div = DIV_ROUND_UP(ref_clk_hz, 2 * cqspi->sclk) - 1;
+
+ reg = readl(reg_base + CQSPI_REG_CONFIG);
+ reg &= ~(CQSPI_REG_CONFIG_BAUD_MASK << CQSPI_REG_CONFIG_BAUD_LSB);
+ reg |= (div & CQSPI_REG_CONFIG_BAUD_MASK) << CQSPI_REG_CONFIG_BAUD_LSB;
+ writel(reg, reg_base + CQSPI_REG_CONFIG);
+}
+
+static void cqspi_readdata_capture(struct cqspi_st *cqspi,
+ const unsigned int bypass,
+ const unsigned int delay)
+{
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int reg;
+
+ reg = readl(reg_base + CQSPI_REG_READCAPTURE);
+
+ if (bypass)
+ reg |= (1 << CQSPI_REG_READCAPTURE_BYPASS_LSB);
+ else
+ reg &= ~(1 << CQSPI_REG_READCAPTURE_BYPASS_LSB);
+
+ reg &= ~(CQSPI_REG_READCAPTURE_DELAY_MASK
+ << CQSPI_REG_READCAPTURE_DELAY_LSB);
+
+ reg |= (delay & CQSPI_REG_READCAPTURE_DELAY_MASK)
+ << CQSPI_REG_READCAPTURE_DELAY_LSB;
+
+ writel(reg, reg_base + CQSPI_REG_READCAPTURE);
+}
+
+static void cqspi_controller_enable(struct cqspi_st *cqspi, bool enable)
+{
+ void __iomem *reg_base = cqspi->iobase;
+ unsigned int reg;
+
+ reg = readl(reg_base + CQSPI_REG_CONFIG);
+
+ if (enable)
+ reg |= CQSPI_REG_CONFIG_ENABLE_MASK;
+ else
+ reg &= ~CQSPI_REG_CONFIG_ENABLE_MASK;
+
+ writel(reg, reg_base + CQSPI_REG_CONFIG);
+}
+
+static void cqspi_configure(struct spi_nor *nor)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+ const unsigned int sclk = f_pdata->clk_rate;
+ int switch_cs = (cqspi->current_cs != f_pdata->cs);
+ int switch_ck = (cqspi->sclk != sclk);
+
+ if ((cqspi->current_page_size != nor->page_size) ||
+ (cqspi->current_erase_size != nor->mtd.erasesize) ||
+ (cqspi->current_addr_width != nor->addr_width))
+ switch_cs = 1;
+
+ if (switch_cs || switch_ck)
+ cqspi_controller_enable(cqspi, 0);
+
+ /* Switch chip select. */
+ if (switch_cs) {
+ cqspi->current_cs = f_pdata->cs;
+ cqspi_configure_cs_and_sizes(nor);
+ }
+
+ /* Setup baudrate divisor and delays */
+ if (switch_ck) {
+ cqspi->sclk = sclk;
+ cqspi_config_baudrate_div(cqspi);
+ cqspi_delay(nor);
+ cqspi_readdata_capture(cqspi, 1, f_pdata->read_delay);
+ }
+
+ if (switch_cs || switch_ck)
+ cqspi_controller_enable(cqspi, 1);
+}
+
+static int cqspi_set_protocol(struct spi_nor *nor, const int read)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+
+ f_pdata->inst_width = CQSPI_INST_TYPE_SINGLE;
+ f_pdata->addr_width = CQSPI_INST_TYPE_SINGLE;
+ f_pdata->data_width = CQSPI_INST_TYPE_SINGLE;
+
+ if (read) {
+ switch (nor->flash_read) {
+ case SPI_NOR_NORMAL:
+ case SPI_NOR_FAST:
+ f_pdata->data_width = CQSPI_INST_TYPE_SINGLE;
+ break;
+ case SPI_NOR_DUAL:
+ f_pdata->data_width = CQSPI_INST_TYPE_DUAL;
+ break;
+ case SPI_NOR_QUAD:
+ f_pdata->data_width = CQSPI_INST_TYPE_QUAD;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
+ cqspi_configure(nor);
+
+ return 0;
+}
+
+static ssize_t cqspi_write(struct spi_nor *nor, loff_t to,
+ size_t len, const u_char *buf)
+{
+ int ret;
+
+ ret = cqspi_set_protocol(nor, 0);
+ if (ret)
+ return ret;
+
+ ret = cqspi_indirect_write_setup(nor, to);
+ if (ret)
+ return ret;
+
+ ret = cqspi_indirect_write_execute(nor, buf, len);
+ if (ret)
+ return ret;
+
+ return (ret < 0) ? ret : len;
+}
+
+static ssize_t cqspi_read(struct spi_nor *nor, loff_t from,
+ size_t len, u_char *buf)
+{
+ int ret;
+
+ ret = cqspi_set_protocol(nor, 1);
+ if (ret)
+ return ret;
+
+ ret = cqspi_indirect_read_setup(nor, from);
+ if (ret)
+ return ret;
+
+ ret = cqspi_indirect_read_execute(nor, buf, len);
+ if (ret)
+ return ret;
+
+ return (ret < 0) ? ret : len;
+}
+
+static int cqspi_erase(struct spi_nor *nor, loff_t offs)
+{
+ int ret;
+
+ ret = cqspi_set_protocol(nor, 0);
+ if (ret)
+ return ret;
+
+ /* Send write enable, then erase commands. */
+ ret = nor->write_reg(nor, SPINOR_OP_WREN, NULL, 0);
+ if (ret)
+ return ret;
+
+ /* Set up command buffer. */
+ ret = cqspi_command_write_addr(nor, nor->erase_opcode, offs);
+ if (ret)
+ return ret;
+
+ return 0;
+}
+
+static int cqspi_prep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+
+ mutex_lock(&cqspi->bus_mutex);
+
+ return 0;
+}
+
+static void cqspi_unprep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+ struct cqspi_flash_pdata *f_pdata = nor->priv;
+ struct cqspi_st *cqspi = f_pdata->cqspi;
+
+ mutex_unlock(&cqspi->bus_mutex);
+}
+
+static int cqspi_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
+{
+ int ret;
+
+ ret = cqspi_set_protocol(nor, 0);
+ if (!ret)
+ ret = cqspi_command_read(nor, &opcode, 1, buf, len);
+
+ return ret;
+}
+
+static int cqspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
+{
+ int ret;
+
+ ret = cqspi_set_protocol(nor, 0);
+ if (!ret)
+ ret = cqspi_command_write(nor, opcode, buf, len);
+
+ return ret;
+}
+
+static int cqspi_of_get_flash_pdata(struct platform_device *pdev,
+ struct cqspi_flash_pdata *f_pdata,
+ struct device_node *np)
+{
+ if (of_property_read_u32(np, "cdns,read-delay", &f_pdata->read_delay)) {
+ dev_err(&pdev->dev, "couldn't determine read-delay\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "cdns,tshsl-ns", &f_pdata->tshsl_ns)) {
+ dev_err(&pdev->dev, "couldn't determine tshsl-ns\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "cdns,tsd2d-ns", &f_pdata->tsd2d_ns)) {
+ dev_err(&pdev->dev, "couldn't determine tsd2d-ns\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "cdns,tchsh-ns", &f_pdata->tchsh_ns)) {
+ dev_err(&pdev->dev, "couldn't determine tchsh-ns\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "cdns,tslch-ns", &f_pdata->tslch_ns)) {
+ dev_err(&pdev->dev, "couldn't determine tslch-ns\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "spi-max-frequency", &f_pdata->clk_rate)) {
+ dev_err(&pdev->dev, "couldn't determine spi-max-frequency\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static int cqspi_of_get_pdata(struct platform_device *pdev)
+{
+ struct device_node *np = pdev->dev.of_node;
+ struct cqspi_st *cqspi = platform_get_drvdata(pdev);
+
+ cqspi->is_decoded_cs = of_property_read_bool(np, "cdns,is-decoded-cs");
+
+ if (of_property_read_u32(np, "cdns,fifo-depth", &cqspi->fifo_depth)) {
+ dev_err(&pdev->dev, "couldn't determine fifo-depth\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "cdns,fifo-width", &cqspi->fifo_width)) {
+ dev_err(&pdev->dev, "couldn't determine fifo-width\n");
+ return -ENXIO;
+ }
+
+ if (of_property_read_u32(np, "cdns,trigger-address",
+ &cqspi->trigger_address)) {
+ dev_err(&pdev->dev, "couldn't determine trigger-address\n");
+ return -ENXIO;
+ }
+
+ return 0;
+}
+
+static void cqspi_controller_init(struct cqspi_st *cqspi)
+{
+ cqspi_controller_enable(cqspi, 0);
+
+ /* Configure the remap address register, no remap */
+ writel(0, cqspi->iobase + CQSPI_REG_REMAP);
+
+ /* Disable all interrupts. */
+ writel(0, cqspi->iobase + CQSPI_REG_IRQMASK);
+
+ /* Configure the SRAM split to 1:1 . */
+ writel(cqspi->fifo_depth / 2, cqspi->iobase + CQSPI_REG_SRAMPARTITION);
+
+ /* Load indirect trigger address. */
+ writel(cqspi->trigger_address,
+ cqspi->iobase + CQSPI_REG_INDIRECTTRIGGER);
+
+ /* Program read watermark -- 1/2 of the FIFO. */
+ writel(cqspi->fifo_depth * cqspi->fifo_width / 2,
+ cqspi->iobase + CQSPI_REG_INDIRECTRDWATERMARK);
+ /* Program write watermark -- 1/8 of the FIFO. */
+ writel(cqspi->fifo_depth * cqspi->fifo_width / 8,
+ cqspi->iobase + CQSPI_REG_INDIRECTWRWATERMARK);
+
+ cqspi_controller_enable(cqspi, 1);
+}
+
+static int cqspi_setup_flash(struct cqspi_st *cqspi, struct device_node *np)
+{
+ struct platform_device *pdev = cqspi->pdev;
+ struct device *dev = &pdev->dev;
+ struct cqspi_flash_pdata *f_pdata;
+ struct spi_nor *nor;
+ struct mtd_info *mtd;
+ unsigned int cs;
+ int i, ret;
+
+ /* Get flash device data */
+ for_each_available_child_of_node(dev->of_node, np) {
+ if (of_property_read_u32(np, "reg", &cs)) {
+ dev_err(dev, "Couldn't determine chip select.\n");
+ goto err;
+ }
+
+ if (cs > CQSPI_MAX_CHIPSELECT) {
+ dev_err(dev, "Chip select %d out of range.\n", cs);
+ goto err;
+ }
+
+ f_pdata = &cqspi->f_pdata[cs];
+ f_pdata->cqspi = cqspi;
+ f_pdata->cs = cs;
+
+ ret = cqspi_of_get_flash_pdata(pdev, f_pdata, np);
+ if (ret)
+ goto err;
+
+ nor = &f_pdata->nor;
+ mtd = &nor->mtd;
+
+ mtd->priv = nor;
+
+ nor->dev = dev;
+ spi_nor_set_flash_node(nor, np);
+ nor->priv = f_pdata;
+
+ nor->read_reg = cqspi_read_reg;
+ nor->write_reg = cqspi_write_reg;
+ nor->read = cqspi_read;
+ nor->write = cqspi_write;
+ nor->erase = cqspi_erase;
+ nor->prepare = cqspi_prep;
+ nor->unprepare = cqspi_unprep;
+
+ mtd->name = devm_kasprintf(dev, GFP_KERNEL, "%s.%d",
+ dev_name(dev), cs);
+ if (!mtd->name) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ ret = spi_nor_scan(nor, NULL, SPI_NOR_QUAD);
+ if (ret)
+ goto err;
+
+ ret = mtd_device_register(mtd, NULL, 0);
+ if (ret)
+ goto err;
+
+ f_pdata->registered = true;
+ }
+
+ return 0;
+
+err:
+ for (i = 0; i < CQSPI_MAX_CHIPSELECT; i++)
+ if (cqspi->f_pdata[i].registered)
+ mtd_device_unregister(&cqspi->f_pdata[i].nor.mtd);
+ return ret;
+}
+
+static int cqspi_probe(struct platform_device *pdev)
+{
+ struct device_node *np = pdev->dev.of_node;
+ struct device *dev = &pdev->dev;
+ struct cqspi_st *cqspi;
+ struct resource *res;
+ struct resource *res_ahb;
+ int ret;
+ int irq;
+
+ cqspi = devm_kzalloc(dev, sizeof(*cqspi), GFP_KERNEL);
+ if (!cqspi)
+ return -ENOMEM;
+
+ mutex_init(&cqspi->bus_mutex);
+ cqspi->pdev = pdev;
+ platform_set_drvdata(pdev, cqspi);
+
+ /* Obtain configuration from OF. */
+ ret = cqspi_of_get_pdata(pdev);
+ if (ret) {
+ dev_err(dev, "Cannot get mandatory OF data.\n");
+ return -ENODEV;
+ }
+
+ /* Obtain QSPI clock. */
+ cqspi->clk = devm_clk_get(dev, NULL);
+ if (IS_ERR(cqspi->clk)) {
+ dev_err(dev, "Cannot claim QSPI clock.\n");
+ return PTR_ERR(cqspi->clk);
+ }
+
+ /* Obtain and remap controller address. */
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ cqspi->iobase = devm_ioremap_resource(dev, res);
+ if (IS_ERR(cqspi->iobase)) {
+ dev_err(dev, "Cannot remap controller address.\n");
+ return PTR_ERR(cqspi->iobase);
+ }
+
+ /* Obtain and remap AHB address. */
+ res_ahb = platform_get_resource(pdev, IORESOURCE_MEM, 1);
+ cqspi->ahb_base = devm_ioremap_resource(dev, res_ahb);
+ if (IS_ERR(cqspi->ahb_base)) {
+ dev_err(dev, "Cannot remap AHB address.\n");
+ return PTR_ERR(cqspi->ahb_base);
+ }
+
+ init_completion(&cqspi->transfer_complete);
+
+ /* Obtain IRQ line. */
+ irq = platform_get_irq(pdev, 0);
+ if (irq < 0) {
+ dev_err(dev, "Cannot obtain IRQ.\n");
+ return -ENXIO;
+ }
+
+ ret = clk_prepare_enable(cqspi->clk);
+ if (ret) {
+ dev_err(dev, "Cannot enable QSPI clock.\n");
+ return ret;
+ }
+
+ cqspi->master_ref_clk_hz = clk_get_rate(cqspi->clk);
+
+ ret = devm_request_irq(dev, irq, cqspi_irq_handler, 0,
+ pdev->name, cqspi);
+ if (ret) {
+ dev_err(dev, "Cannot request IRQ.\n");
+ goto probe_irq_failed;
+ }
+
+ cqspi_wait_idle(cqspi);
+ cqspi_controller_init(cqspi);
+ cqspi->current_cs = -1;
+ cqspi->sclk = 0;
+
+ ret = cqspi_setup_flash(cqspi, np);
+ if (ret) {
+ dev_err(dev, "Cadence QSPI NOR probe failed %d\n", ret);
+ goto probe_setup_failed;
+ }
+
+ return ret;
+probe_irq_failed:
+ cqspi_controller_enable(cqspi, 0);
+probe_setup_failed:
+ clk_disable_unprepare(cqspi->clk);
+ return ret;
+}
+
+static int cqspi_remove(struct platform_device *pdev)
+{
+ struct cqspi_st *cqspi = platform_get_drvdata(pdev);
+ int i;
+
+ for (i = 0; i < CQSPI_MAX_CHIPSELECT; i++)
+ if (cqspi->f_pdata[i].registered)
+ mtd_device_unregister(&cqspi->f_pdata[i].nor.mtd);
+
+ cqspi_controller_enable(cqspi, 0);
+
+ clk_disable_unprepare(cqspi->clk);
+
+ return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int cqspi_suspend(struct device *dev)
+{
+ struct cqspi_st *cqspi = dev_get_drvdata(dev);
+
+ cqspi_controller_enable(cqspi, 0);
+ return 0;
+}
+
+static int cqspi_resume(struct device *dev)
+{
+ struct cqspi_st *cqspi = dev_get_drvdata(dev);
+
+ cqspi_controller_enable(cqspi, 1);
+ return 0;
+}
+
+static const struct dev_pm_ops cqspi__dev_pm_ops = {
+ .suspend = cqspi_suspend,
+ .resume = cqspi_resume,
+};
+
+#define CQSPI_DEV_PM_OPS (&cqspi__dev_pm_ops)
+#else
+#define CQSPI_DEV_PM_OPS NULL
+#endif
+
+static struct of_device_id const cqspi_dt_ids[] = {
+ {.compatible = "cdns,qspi-nor",},
+ { /* end of table */ }
+};
+
+MODULE_DEVICE_TABLE(of, cqspi_dt_ids);
+
+static struct platform_driver cqspi_platform_driver = {
+ .probe = cqspi_probe,
+ .remove = cqspi_remove,
+ .driver = {
+ .name = CQSPI_NAME,
+ .pm = CQSPI_DEV_PM_OPS,
+ .of_match_table = cqspi_dt_ids,
+ },
+};
+
+module_platform_driver(cqspi_platform_driver);
+
+MODULE_DESCRIPTION("Cadence QSPI Controller Driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:" CQSPI_NAME);
+MODULE_AUTHOR("Ley Foon Tan <lftan@altera.com>");
+MODULE_AUTHOR("Graham Moore <grmoore@opensource.altera.com>");
diff --git a/drivers/mtd/spi-nor/fsl-quadspi.c b/drivers/mtd/spi-nor/fsl-quadspi.c
index 9ab2b51d54b8..5c82e4ef1904 100644
--- a/drivers/mtd/spi-nor/fsl-quadspi.c
+++ b/drivers/mtd/spi-nor/fsl-quadspi.c
@@ -618,9 +618,9 @@ static inline void fsl_qspi_invalid(struct fsl_qspi *q)
qspi_writel(q, reg, q->iobase + QUADSPI_MCR);
}
-static int fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor,
+static ssize_t fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor,
u8 opcode, unsigned int to, u32 *txbuf,
- unsigned count, size_t *retlen)
+ unsigned count)
{
int ret, i, j;
u32 tmp;
@@ -647,8 +647,8 @@ static int fsl_qspi_nor_write(struct fsl_qspi *q, struct spi_nor *nor,
/* Trigger it */
ret = fsl_qspi_runcmd(q, opcode, to, count);
- if (ret == 0 && retlen)
- *retlen += count;
+ if (ret == 0)
+ return count;
return ret;
}
@@ -859,7 +859,9 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
} else if (len > 0) {
ret = fsl_qspi_nor_write(q, nor, opcode, 0,
- (u32 *)buf, len, NULL);
+ (u32 *)buf, len);
+ if (ret > 0)
+ return 0;
} else {
dev_err(q->dev, "invalid cmd %d\n", opcode);
ret = -EINVAL;
@@ -868,20 +870,20 @@ static int fsl_qspi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
return ret;
}
-static void fsl_qspi_write(struct spi_nor *nor, loff_t to,
- size_t len, size_t *retlen, const u_char *buf)
+static ssize_t fsl_qspi_write(struct spi_nor *nor, loff_t to,
+ size_t len, const u_char *buf)
{
struct fsl_qspi *q = nor->priv;
-
- fsl_qspi_nor_write(q, nor, nor->program_opcode, to,
- (u32 *)buf, len, retlen);
+ ssize_t ret = fsl_qspi_nor_write(q, nor, nor->program_opcode, to,
+ (u32 *)buf, len);
/* invalid the data in the AHB buffer. */
fsl_qspi_invalid(q);
+ return ret;
}
-static int fsl_qspi_read(struct spi_nor *nor, loff_t from,
- size_t len, size_t *retlen, u_char *buf)
+static ssize_t fsl_qspi_read(struct spi_nor *nor, loff_t from,
+ size_t len, u_char *buf)
{
struct fsl_qspi *q = nor->priv;
u8 cmd = nor->read_opcode;
@@ -923,8 +925,7 @@ static int fsl_qspi_read(struct spi_nor *nor, loff_t from,
memcpy(buf, q->ahb_addr + q->chip_base_addr + from - q->memmap_offs,
len);
- *retlen += len;
- return 0;
+ return len;
}
static int fsl_qspi_erase(struct spi_nor *nor, loff_t offs)
diff --git a/drivers/mtd/spi-nor/hisi-sfc.c b/drivers/mtd/spi-nor/hisi-sfc.c
new file mode 100644
index 000000000000..20378b0d55e9
--- /dev/null
+++ b/drivers/mtd/spi-nor/hisi-sfc.c
@@ -0,0 +1,489 @@
+/*
+ * HiSilicon SPI Nor Flash Controller Driver
+ *
+ * Copyright (c) 2015-2016 HiSilicon Technologies Co., Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#include <linux/bitops.h>
+#include <linux/clk.h>
+#include <linux/dma-mapping.h>
+#include <linux/iopoll.h>
+#include <linux/module.h>
+#include <linux/mtd/mtd.h>
+#include <linux/mtd/spi-nor.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+/* Hardware register offsets and field definitions */
+#define FMC_CFG 0x00
+#define FMC_CFG_OP_MODE_MASK BIT_MASK(0)
+#define FMC_CFG_OP_MODE_BOOT 0
+#define FMC_CFG_OP_MODE_NORMAL 1
+#define FMC_CFG_FLASH_SEL(type) (((type) & 0x3) << 1)
+#define FMC_CFG_FLASH_SEL_MASK 0x6
+#define FMC_ECC_TYPE(type) (((type) & 0x7) << 5)
+#define FMC_ECC_TYPE_MASK GENMASK(7, 5)
+#define SPI_NOR_ADDR_MODE_MASK BIT_MASK(10)
+#define SPI_NOR_ADDR_MODE_3BYTES (0x0 << 10)
+#define SPI_NOR_ADDR_MODE_4BYTES (0x1 << 10)
+#define FMC_GLOBAL_CFG 0x04
+#define FMC_GLOBAL_CFG_WP_ENABLE BIT(6)
+#define FMC_SPI_TIMING_CFG 0x08
+#define TIMING_CFG_TCSH(nr) (((nr) & 0xf) << 8)
+#define TIMING_CFG_TCSS(nr) (((nr) & 0xf) << 4)
+#define TIMING_CFG_TSHSL(nr) ((nr) & 0xf)
+#define CS_HOLD_TIME 0x6
+#define CS_SETUP_TIME 0x6
+#define CS_DESELECT_TIME 0xf
+#define FMC_INT 0x18
+#define FMC_INT_OP_DONE BIT(0)
+#define FMC_INT_CLR 0x20
+#define FMC_CMD 0x24
+#define FMC_CMD_CMD1(cmd) ((cmd) & 0xff)
+#define FMC_ADDRL 0x2c
+#define FMC_OP_CFG 0x30
+#define OP_CFG_FM_CS(cs) ((cs) << 11)
+#define OP_CFG_MEM_IF_TYPE(type) (((type) & 0x7) << 7)
+#define OP_CFG_ADDR_NUM(addr) (((addr) & 0x7) << 4)
+#define OP_CFG_DUMMY_NUM(dummy) ((dummy) & 0xf)
+#define FMC_DATA_NUM 0x38
+#define FMC_DATA_NUM_CNT(cnt) ((cnt) & GENMASK(13, 0))
+#define FMC_OP 0x3c
+#define FMC_OP_DUMMY_EN BIT(8)
+#define FMC_OP_CMD1_EN BIT(7)
+#define FMC_OP_ADDR_EN BIT(6)
+#define FMC_OP_WRITE_DATA_EN BIT(5)
+#define FMC_OP_READ_DATA_EN BIT(2)
+#define FMC_OP_READ_STATUS_EN BIT(1)
+#define FMC_OP_REG_OP_START BIT(0)
+#define FMC_DMA_LEN 0x40
+#define FMC_DMA_LEN_SET(len) ((len) & GENMASK(27, 0))
+#define FMC_DMA_SADDR_D0 0x4c
+#define HIFMC_DMA_MAX_LEN (4096)
+#define HIFMC_DMA_MASK (HIFMC_DMA_MAX_LEN - 1)
+#define FMC_OP_DMA 0x68
+#define OP_CTRL_RD_OPCODE(code) (((code) & 0xff) << 16)
+#define OP_CTRL_WR_OPCODE(code) (((code) & 0xff) << 8)
+#define OP_CTRL_RW_OP(op) ((op) << 1)
+#define OP_CTRL_DMA_OP_READY BIT(0)
+#define FMC_OP_READ 0x0
+#define FMC_OP_WRITE 0x1
+#define FMC_WAIT_TIMEOUT 1000000
+
+enum hifmc_iftype {
+ IF_TYPE_STD,
+ IF_TYPE_DUAL,
+ IF_TYPE_DIO,
+ IF_TYPE_QUAD,
+ IF_TYPE_QIO,
+};
+
+struct hifmc_priv {
+ u32 chipselect;
+ u32 clkrate;
+ struct hifmc_host *host;
+};
+
+#define HIFMC_MAX_CHIP_NUM 2
+struct hifmc_host {
+ struct device *dev;
+ struct mutex lock;
+
+ void __iomem *regbase;
+ void __iomem *iobase;
+ struct clk *clk;
+ void *buffer;
+ dma_addr_t dma_buffer;
+
+ struct spi_nor *nor[HIFMC_MAX_CHIP_NUM];
+ u32 num_chip;
+};
+
+static inline int wait_op_finish(struct hifmc_host *host)
+{
+ u32 reg;
+
+ return readl_poll_timeout(host->regbase + FMC_INT, reg,
+ (reg & FMC_INT_OP_DONE), 0, FMC_WAIT_TIMEOUT);
+}
+
+static int get_if_type(enum read_mode flash_read)
+{
+ enum hifmc_iftype if_type;
+
+ switch (flash_read) {
+ case SPI_NOR_DUAL:
+ if_type = IF_TYPE_DUAL;
+ break;
+ case SPI_NOR_QUAD:
+ if_type = IF_TYPE_QUAD;
+ break;
+ case SPI_NOR_NORMAL:
+ case SPI_NOR_FAST:
+ default:
+ if_type = IF_TYPE_STD;
+ break;
+ }
+
+ return if_type;
+}
+
+static void hisi_spi_nor_init(struct hifmc_host *host)
+{
+ u32 reg;
+
+ reg = TIMING_CFG_TCSH(CS_HOLD_TIME)
+ | TIMING_CFG_TCSS(CS_SETUP_TIME)
+ | TIMING_CFG_TSHSL(CS_DESELECT_TIME);
+ writel(reg, host->regbase + FMC_SPI_TIMING_CFG);
+}
+
+static int hisi_spi_nor_prep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+ int ret;
+
+ mutex_lock(&host->lock);
+
+ ret = clk_set_rate(host->clk, priv->clkrate);
+ if (ret)
+ goto out;
+
+ ret = clk_prepare_enable(host->clk);
+ if (ret)
+ goto out;
+
+ return 0;
+
+out:
+ mutex_unlock(&host->lock);
+ return ret;
+}
+
+static void hisi_spi_nor_unprep(struct spi_nor *nor, enum spi_nor_ops ops)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+
+ clk_disable_unprepare(host->clk);
+ mutex_unlock(&host->lock);
+}
+
+static int hisi_spi_nor_op_reg(struct spi_nor *nor,
+ u8 opcode, int len, u8 optype)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+ u32 reg;
+
+ reg = FMC_CMD_CMD1(opcode);
+ writel(reg, host->regbase + FMC_CMD);
+
+ reg = FMC_DATA_NUM_CNT(len);
+ writel(reg, host->regbase + FMC_DATA_NUM);
+
+ reg = OP_CFG_FM_CS(priv->chipselect);
+ writel(reg, host->regbase + FMC_OP_CFG);
+
+ writel(0xff, host->regbase + FMC_INT_CLR);
+ reg = FMC_OP_CMD1_EN | FMC_OP_REG_OP_START | optype;
+ writel(reg, host->regbase + FMC_OP);
+
+ return wait_op_finish(host);
+}
+
+static int hisi_spi_nor_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf,
+ int len)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+ int ret;
+
+ ret = hisi_spi_nor_op_reg(nor, opcode, len, FMC_OP_READ_DATA_EN);
+ if (ret)
+ return ret;
+
+ memcpy_fromio(buf, host->iobase, len);
+ return 0;
+}
+
+static int hisi_spi_nor_write_reg(struct spi_nor *nor, u8 opcode,
+ u8 *buf, int len)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+
+ if (len)
+ memcpy_toio(host->iobase, buf, len);
+
+ return hisi_spi_nor_op_reg(nor, opcode, len, FMC_OP_WRITE_DATA_EN);
+}
+
+static int hisi_spi_nor_dma_transfer(struct spi_nor *nor, loff_t start_off,
+ dma_addr_t dma_buf, size_t len, u8 op_type)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+ u8 if_type = 0;
+ u32 reg;
+
+ reg = readl(host->regbase + FMC_CFG);
+ reg &= ~(FMC_CFG_OP_MODE_MASK | SPI_NOR_ADDR_MODE_MASK);
+ reg |= FMC_CFG_OP_MODE_NORMAL;
+ reg |= (nor->addr_width == 4) ? SPI_NOR_ADDR_MODE_4BYTES
+ : SPI_NOR_ADDR_MODE_3BYTES;
+ writel(reg, host->regbase + FMC_CFG);
+
+ writel(start_off, host->regbase + FMC_ADDRL);
+ writel(dma_buf, host->regbase + FMC_DMA_SADDR_D0);
+ writel(FMC_DMA_LEN_SET(len), host->regbase + FMC_DMA_LEN);
+
+ reg = OP_CFG_FM_CS(priv->chipselect);
+ if_type = get_if_type(nor->flash_read);
+ reg |= OP_CFG_MEM_IF_TYPE(if_type);
+ if (op_type == FMC_OP_READ)
+ reg |= OP_CFG_DUMMY_NUM(nor->read_dummy >> 3);
+ writel(reg, host->regbase + FMC_OP_CFG);
+
+ writel(0xff, host->regbase + FMC_INT_CLR);
+ reg = OP_CTRL_RW_OP(op_type) | OP_CTRL_DMA_OP_READY;
+ reg |= (op_type == FMC_OP_READ)
+ ? OP_CTRL_RD_OPCODE(nor->read_opcode)
+ : OP_CTRL_WR_OPCODE(nor->program_opcode);
+ writel(reg, host->regbase + FMC_OP_DMA);
+
+ return wait_op_finish(host);
+}
+
+static ssize_t hisi_spi_nor_read(struct spi_nor *nor, loff_t from, size_t len,
+ u_char *read_buf)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+ size_t offset;
+ int ret;
+
+ for (offset = 0; offset < len; offset += HIFMC_DMA_MAX_LEN) {
+ size_t trans = min_t(size_t, HIFMC_DMA_MAX_LEN, len - offset);
+
+ ret = hisi_spi_nor_dma_transfer(nor,
+ from + offset, host->dma_buffer, trans, FMC_OP_READ);
+ if (ret) {
+ dev_warn(nor->dev, "DMA read timeout\n");
+ return ret;
+ }
+ memcpy(read_buf + offset, host->buffer, trans);
+ }
+
+ return len;
+}
+
+static ssize_t hisi_spi_nor_write(struct spi_nor *nor, loff_t to,
+ size_t len, const u_char *write_buf)
+{
+ struct hifmc_priv *priv = nor->priv;
+ struct hifmc_host *host = priv->host;
+ size_t offset;
+ int ret;
+
+ for (offset = 0; offset < len; offset += HIFMC_DMA_MAX_LEN) {
+ size_t trans = min_t(size_t, HIFMC_DMA_MAX_LEN, len - offset);
+
+ memcpy(host->buffer, write_buf + offset, trans);
+ ret = hisi_spi_nor_dma_transfer(nor,
+ to + offset, host->dma_buffer, trans, FMC_OP_WRITE);
+ if (ret) {
+ dev_warn(nor->dev, "DMA write timeout\n");
+ return ret;
+ }
+ }
+
+ return len;
+}
+
+/**
+ * Get spi flash device information and register it as a mtd device.
+ */
+static int hisi_spi_nor_register(struct device_node *np,
+ struct hifmc_host *host)
+{
+ struct device *dev = host->dev;
+ struct spi_nor *nor;
+ struct hifmc_priv *priv;
+ struct mtd_info *mtd;
+ int ret;
+
+ nor = devm_kzalloc(dev, sizeof(*nor), GFP_KERNEL);
+ if (!nor)
+ return -ENOMEM;
+
+ nor->dev = dev;
+ spi_nor_set_flash_node(nor, np);
+
+ priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
+ if (!priv)
+ return -ENOMEM;
+
+ ret = of_property_read_u32(np, "reg", &priv->chipselect);
+ if (ret) {
+ dev_err(dev, "There's no reg property for %s\n",
+ np->full_name);
+ return ret;
+ }
+
+ ret = of_property_read_u32(np, "spi-max-frequency",
+ &priv->clkrate);
+ if (ret) {
+ dev_err(dev, "There's no spi-max-frequency property for %s\n",
+ np->full_name);
+ return ret;
+ }
+ priv->host = host;
+ nor->priv = priv;
+
+ nor->prepare = hisi_spi_nor_prep;
+ nor->unprepare = hisi_spi_nor_unprep;
+ nor->read_reg = hisi_spi_nor_read_reg;
+ nor->write_reg = hisi_spi_nor_write_reg;
+ nor->read = hisi_spi_nor_read;
+ nor->write = hisi_spi_nor_write;
+ nor->erase = NULL;
+ ret = spi_nor_scan(nor, NULL, SPI_NOR_QUAD);
+ if (ret)
+ return ret;
+
+ mtd = &nor->mtd;
+ mtd->name = np->name;
+ ret = mtd_device_register(mtd, NULL, 0);
+ if (ret)
+ return ret;
+
+ host->nor[host->num_chip] = nor;
+ host->num_chip++;
+ return 0;
+}
+
+static void hisi_spi_nor_unregister_all(struct hifmc_host *host)
+{
+ int i;
+
+ for (i = 0; i < host->num_chip; i++)
+ mtd_device_unregister(&host->nor[i]->mtd);
+}
+
+static int hisi_spi_nor_register_all(struct hifmc_host *host)
+{
+ struct device *dev = host->dev;
+ struct device_node *np;
+ int ret;
+
+ for_each_available_child_of_node(dev->of_node, np) {
+ ret = hisi_spi_nor_register(np, host);
+ if (ret)
+ goto fail;
+
+ if (host->num_chip == HIFMC_MAX_CHIP_NUM) {
+ dev_warn(dev, "Flash device number exceeds the maximum chipselect number\n");
+ break;
+ }
+ }
+
+ return 0;
+
+fail:
+ hisi_spi_nor_unregister_all(host);
+ return ret;
+}
+
+static int hisi_spi_nor_probe(struct platform_device *pdev)
+{
+ struct device *dev = &pdev->dev;
+ struct resource *res;
+ struct hifmc_host *host;
+ int ret;
+
+ host = devm_kzalloc(dev, sizeof(*host), GFP_KERNEL);
+ if (!host)
+ return -ENOMEM;
+
+ platform_set_drvdata(pdev, host);
+ host->dev = dev;
+
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "control");
+ host->regbase = devm_ioremap_resource(dev, res);
+ if (IS_ERR(host->regbase))
+ return PTR_ERR(host->regbase);
+
+ res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "memory");
+ host->iobase = devm_ioremap_resource(dev, res);
+ if (IS_ERR(host->iobase))
+ return PTR_ERR(host->iobase);
+
+ host->clk = devm_clk_get(dev, NULL);
+ if (IS_ERR(host->clk))
+ return PTR_ERR(host->clk);
+
+ ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(32));
+ if (ret) {
+ dev_warn(dev, "Unable to set dma mask\n");
+ return ret;
+ }
+
+ host->buffer = dmam_alloc_coherent(dev, HIFMC_DMA_MAX_LEN,
+ &host->dma_buffer, GFP_KERNEL);
+ if (!host->buffer)
+ return -ENOMEM;
+
+ mutex_init(&host->lock);
+ clk_prepare_enable(host->clk);
+ hisi_spi_nor_init(host);
+ ret = hisi_spi_nor_register_all(host);
+ if (ret)
+ mutex_destroy(&host->lock);
+
+ clk_disable_unprepare(host->clk);
+ return ret;
+}
+
+static int hisi_spi_nor_remove(struct platform_device *pdev)
+{
+ struct hifmc_host *host = platform_get_drvdata(pdev);
+
+ hisi_spi_nor_unregister_all(host);
+ mutex_destroy(&host->lock);
+ clk_disable_unprepare(host->clk);
+ return 0;
+}
+
+static const struct of_device_id hisi_spi_nor_dt_ids[] = {
+ { .compatible = "hisilicon,fmc-spi-nor"},
+ { /* sentinel */ }
+};
+MODULE_DEVICE_TABLE(of, hisi_spi_nor_dt_ids);
+
+static struct platform_driver hisi_spi_nor_driver = {
+ .driver = {
+ .name = "hisi-sfc",
+ .of_match_table = hisi_spi_nor_dt_ids,
+ },
+ .probe = hisi_spi_nor_probe,
+ .remove = hisi_spi_nor_remove,
+};
+module_platform_driver(hisi_spi_nor_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("HiSilicon SPI Nor Flash Controller Driver");
diff --git a/drivers/mtd/spi-nor/mtk-quadspi.c b/drivers/mtd/spi-nor/mtk-quadspi.c
index 8bed1a4cb79c..e661877c23de 100644
--- a/drivers/mtd/spi-nor/mtk-quadspi.c
+++ b/drivers/mtd/spi-nor/mtk-quadspi.c
@@ -21,7 +21,6 @@
#include <linux/ioport.h>
#include <linux/math64.h>
#include <linux/module.h>
-#include <linux/mtd/mtd.h>
#include <linux/mutex.h>
#include <linux/of.h>
#include <linux/of_device.h>
@@ -243,8 +242,8 @@ static void mt8173_nor_set_addr(struct mt8173_nor *mt8173_nor, u32 addr)
writeb(addr & 0xff, mt8173_nor->base + MTK_NOR_RADR3_REG);
}
-static int mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length,
- size_t *retlen, u_char *buffer)
+static ssize_t mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length,
+ u_char *buffer)
{
int i, ret;
int addr = (int)from;
@@ -255,13 +254,13 @@ static int mt8173_nor_read(struct spi_nor *nor, loff_t from, size_t length,
mt8173_nor_set_read_mode(mt8173_nor);
mt8173_nor_set_addr(mt8173_nor, addr);
- for (i = 0; i < length; i++, (*retlen)++) {
+ for (i = 0; i < length; i++) {
ret = mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_PIO_READ_CMD);
if (ret < 0)
return ret;
buf[i] = readb(mt8173_nor->base + MTK_NOR_RDATA_REG);
}
- return 0;
+ return length;
}
static int mt8173_nor_write_single_byte(struct mt8173_nor *mt8173_nor,
@@ -297,36 +296,44 @@ static int mt8173_nor_write_buffer(struct mt8173_nor *mt8173_nor, int addr,
return mt8173_nor_execute_cmd(mt8173_nor, MTK_NOR_WR_CMD);
}
-static void mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len,
- size_t *retlen, const u_char *buf)
+static ssize_t mt8173_nor_write(struct spi_nor *nor, loff_t to, size_t len,
+ const u_char *buf)
{
int ret;
struct mt8173_nor *mt8173_nor = nor->priv;
+ size_t i;
ret = mt8173_nor_write_buffer_enable(mt8173_nor);
- if (ret < 0)
+ if (ret < 0) {
dev_warn(mt8173_nor->dev, "write buffer enable failed!\n");
+ return ret;
+ }
- while (len >= SFLASH_WRBUF_SIZE) {
+ for (i = 0; i + SFLASH_WRBUF_SIZE <= len; i += SFLASH_WRBUF_SIZE) {
ret = mt8173_nor_write_buffer(mt8173_nor, to, buf);
- if (ret < 0)
+ if (ret < 0) {
dev_err(mt8173_nor->dev, "write buffer failed!\n");
- len -= SFLASH_WRBUF_SIZE;
+ return ret;
+ }
to += SFLASH_WRBUF_SIZE;
buf += SFLASH_WRBUF_SIZE;
- (*retlen) += SFLASH_WRBUF_SIZE;
}
ret = mt8173_nor_write_buffer_disable(mt8173_nor);
- if (ret < 0)
+ if (ret < 0) {
dev_warn(mt8173_nor->dev, "write buffer disable failed!\n");
+ return ret;
+ }
- if (len) {
- ret = mt8173_nor_write_single_byte(mt8173_nor, to, (int)len,
- (u8 *)buf);
- if (ret < 0)
+ if (i < len) {
+ ret = mt8173_nor_write_single_byte(mt8173_nor, to,
+ (int)(len - i), (u8 *)buf);
+ if (ret < 0) {
dev_err(mt8173_nor->dev, "write single byte failed!\n");
- (*retlen) += len;
+ return ret;
+ }
}
+
+ return len;
}
static int mt8173_nor_read_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
diff --git a/drivers/mtd/spi-nor/nxp-spifi.c b/drivers/mtd/spi-nor/nxp-spifi.c
index ae428cb0e04b..73a14f40928b 100644
--- a/drivers/mtd/spi-nor/nxp-spifi.c
+++ b/drivers/mtd/spi-nor/nxp-spifi.c
@@ -172,8 +172,8 @@ static int nxp_spifi_write_reg(struct spi_nor *nor, u8 opcode, u8 *buf, int len)
return nxp_spifi_wait_for_cmd(spifi);
}
-static int nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len,
- size_t *retlen, u_char *buf)
+static ssize_t nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len,
+ u_char *buf)
{
struct nxp_spifi *spifi = nor->priv;
int ret;
@@ -183,24 +183,23 @@ static int nxp_spifi_read(struct spi_nor *nor, loff_t from, size_t len,
return ret;
memcpy_fromio(buf, spifi->flash_base + from, len);
- *retlen += len;
- return 0;
+ return len;
}
-static void nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
- size_t *retlen, const u_char *buf)
+static ssize_t nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
+ const u_char *buf)
{
struct nxp_spifi *spifi = nor->priv;
u32 cmd;
int ret;
+ size_t i;
ret = nxp_spifi_set_memory_mode_off(spifi);
if (ret)
- return;
+ return ret;
writel(to, spifi->io_base + SPIFI_ADDR);
- *retlen += len;
cmd = SPIFI_CMD_DOUT |
SPIFI_CMD_DATALEN(len) |
@@ -209,10 +208,14 @@ static void nxp_spifi_write(struct spi_nor *nor, loff_t to, size_t len,
SPIFI_CMD_FRAMEFORM(spifi->nor.addr_width + 1);
writel(cmd, spifi->io_base + SPIFI_CMD);
- while (len--)
- writeb(*buf++, spifi->io_base + SPIFI_DATA);
+ for (i = 0; i < len; i++)
+ writeb(buf[i], spifi->io_base + SPIFI_DATA);
+
+ ret = nxp_spifi_wait_for_cmd(spifi);
+ if (ret)
+ return ret;
- nxp_spifi_wait_for_cmd(spifi);
+ return len;
}
static int nxp_spifi_erase(struct spi_nor *nor, loff_t offs)
diff --git a/drivers/mtd/spi-nor/spi-nor.c b/drivers/mtd/spi-nor/spi-nor.c
index c52e45594bfd..d0fc165d7d66 100644
--- a/drivers/mtd/spi-nor/spi-nor.c
+++ b/drivers/mtd/spi-nor/spi-nor.c
@@ -661,7 +661,7 @@ static int stm_unlock(struct spi_nor *nor, loff_t ofs, uint64_t len)
status_new = (status_old & ~mask & ~SR_TB) | val;
/* Don't protect status register if we're fully unlocked */
- if (lock_len == mtd->size)
+ if (lock_len == 0)
status_new &= ~SR_SRWD;
if (!use_top)
@@ -830,10 +830,26 @@ static const struct flash_info spi_nor_ids[] = {
{ "mb85rs1mt", INFO(0x047f27, 0, 128 * 1024, 1, SPI_NOR_NO_ERASE) },
/* GigaDevice */
- { "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64, SECT_4K) },
- { "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128, SECT_4K) },
- { "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128, SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ) },
- { "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256, SECT_4K) },
+ {
+ "gd25q32", INFO(0xc84016, 0, 64 * 1024, 64,
+ SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+ SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+ },
+ {
+ "gd25q64", INFO(0xc84017, 0, 64 * 1024, 128,
+ SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+ SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+ },
+ {
+ "gd25lq64c", INFO(0xc86017, 0, 64 * 1024, 128,
+ SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+ SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+ },
+ {
+ "gd25q128", INFO(0xc84018, 0, 64 * 1024, 256,
+ SECT_4K | SPI_NOR_DUAL_READ | SPI_NOR_QUAD_READ |
+ SPI_NOR_HAS_LOCK | SPI_NOR_HAS_TB)
+ },
/* Intel/Numonyx -- xxxs33b */
{ "160s33b", INFO(0x898911, 0, 64 * 1024, 32, 0) },
@@ -871,6 +887,7 @@ static const struct flash_info spi_nor_ids[] = {
{ "n25q512a", INFO(0x20bb20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
{ "n25q512ax3", INFO(0x20ba20, 0, 64 * 1024, 1024, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
{ "n25q00", INFO(0x20ba21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
+ { "n25q00a", INFO(0x20bb21, 0, 64 * 1024, 2048, SECT_4K | USE_FSR | SPI_NOR_QUAD_READ) },
/* PMC */
{ "pm25lv512", INFO(0, 0, 32 * 1024, 2, SECT_4K_PMC) },
@@ -1031,8 +1048,25 @@ static int spi_nor_read(struct mtd_info *mtd, loff_t from, size_t len,
if (ret)
return ret;
- ret = nor->read(nor, from, len, retlen, buf);
+ while (len) {
+ ret = nor->read(nor, from, len, buf);
+ if (ret == 0) {
+ /* We shouldn't see 0-length reads */
+ ret = -EIO;
+ goto read_err;
+ }
+ if (ret < 0)
+ goto read_err;
+
+ WARN_ON(ret > len);
+ *retlen += ret;
+ buf += ret;
+ from += ret;
+ len -= ret;
+ }
+ ret = 0;
+read_err:
spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_READ);
return ret;
}
@@ -1060,10 +1094,14 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len,
nor->program_opcode = SPINOR_OP_BP;
/* write one byte. */
- nor->write(nor, to, 1, retlen, buf);
+ ret = nor->write(nor, to, 1, buf);
+ if (ret < 0)
+ goto sst_write_err;
+ WARN(ret != 1, "While writing 1 byte written %i bytes\n",
+ (int)ret);
ret = spi_nor_wait_till_ready(nor);
if (ret)
- goto time_out;
+ goto sst_write_err;
}
to += actual;
@@ -1072,10 +1110,14 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len,
nor->program_opcode = SPINOR_OP_AAI_WP;
/* write two bytes. */
- nor->write(nor, to, 2, retlen, buf + actual);
+ ret = nor->write(nor, to, 2, buf + actual);
+ if (ret < 0)
+ goto sst_write_err;
+ WARN(ret != 2, "While writing 2 bytes written %i bytes\n",
+ (int)ret);
ret = spi_nor_wait_till_ready(nor);
if (ret)
- goto time_out;
+ goto sst_write_err;
to += 2;
nor->sst_write_second = true;
}
@@ -1084,21 +1126,26 @@ static int sst_write(struct mtd_info *mtd, loff_t to, size_t len,
write_disable(nor);
ret = spi_nor_wait_till_ready(nor);
if (ret)
- goto time_out;
+ goto sst_write_err;
/* Write out trailing byte if it exists. */
if (actual != len) {
write_enable(nor);
nor->program_opcode = SPINOR_OP_BP;
- nor->write(nor, to, 1, retlen, buf + actual);
-
+ ret = nor->write(nor, to, 1, buf + actual);
+ if (ret < 0)
+ goto sst_write_err;
+ WARN(ret != 1, "While writing 1 byte written %i bytes\n",
+ (int)ret);
ret = spi_nor_wait_till_ready(nor);
if (ret)
- goto time_out;
+ goto sst_write_err;
write_disable(nor);
+ actual += 1;
}
-time_out:
+sst_write_err:
+ *retlen += actual;
spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE);
return ret;
}
@@ -1112,8 +1159,8 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
size_t *retlen, const u_char *buf)
{
struct spi_nor *nor = mtd_to_spi_nor(mtd);
- u32 page_offset, page_size, i;
- int ret;
+ size_t page_offset, page_remain, i;
+ ssize_t ret;
dev_dbg(nor->dev, "to 0x%08x, len %zd\n", (u32)to, len);
@@ -1121,35 +1168,37 @@ static int spi_nor_write(struct mtd_info *mtd, loff_t to, size_t len,
if (ret)
return ret;
- write_enable(nor);
-
- page_offset = to & (nor->page_size - 1);
+ for (i = 0; i < len; ) {
+ ssize_t written;
- /* do all the bytes fit onto one page? */
- if (page_offset + len <= nor->page_size) {
- nor->write(nor, to, len, retlen, buf);
- } else {
+ page_offset = (to + i) & (nor->page_size - 1);
+ WARN_ONCE(page_offset,
+ "Writing at offset %zu into a NOR page. Writing partial pages may decrease reliability and increase wear of NOR flash.",
+ page_offset);
/* the size of data remaining on the first page */
- page_size = nor->page_size - page_offset;
- nor->write(nor, to, page_size, retlen, buf);
-
- /* write everything in nor->page_size chunks */
- for (i = page_size; i < len; i += page_size) {
- page_size = len - i;
- if (page_size > nor->page_size)
- page_size = nor->page_size;
+ page_remain = min_t(size_t,
+ nor->page_size - page_offset, len - i);
- ret = spi_nor_wait_till_ready(nor);
- if (ret)
- goto write_err;
-
- write_enable(nor);
+ write_enable(nor);
+ ret = nor->write(nor, to + i, page_remain, buf + i);
+ if (ret < 0)
+ goto write_err;
+ written = ret;
- nor->write(nor, to + i, page_size, retlen, buf + i);
+ ret = spi_nor_wait_till_ready(nor);
+ if (ret)
+ goto write_err;
+ *retlen += written;
+ i += written;
+ if (written != page_remain) {
+ dev_err(nor->dev,
+ "While writing %zu bytes written %zd bytes\n",
+ page_remain, written);
+ ret = -EIO;
+ goto write_err;
}
}
- ret = spi_nor_wait_till_ready(nor);
write_err:
spi_nor_unlock_and_unprep(nor, SPI_NOR_OPS_WRITE);
return ret;
diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c
index daf82ba7aba0..41b13d1cdcc4 100644
--- a/drivers/mtd/ssfdc.c
+++ b/drivers/mtd/ssfdc.c
@@ -380,8 +380,7 @@ static int ssfdcr_readsect(struct mtd_blktrans_dev *dev,
" block_addr=%d\n", logic_sect_no, sectors_per_block, offset,
block_address);
- if (block_address >= ssfdc->map_len)
- BUG();
+ BUG_ON(block_address >= ssfdc->map_len);
block_address = ssfdc->logic_block_map[block_address];
diff --git a/drivers/mtd/tests/nandbiterrs.c b/drivers/mtd/tests/nandbiterrs.c
index 09a4ccac53a2..f26dec896afa 100644
--- a/drivers/mtd/tests/nandbiterrs.c
+++ b/drivers/mtd/tests/nandbiterrs.c
@@ -290,7 +290,7 @@ static int overwrite_test(void)
while (opno < max_overwrite) {
- err = rewrite_page(0);
+ err = write_page(0);
if (err)
break;
diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c
index e708e360a9e3..6453148d066a 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1251,7 +1251,7 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
struct alx_priv *alx;
struct alx_hw *hw;
bool phy_configured;
- int bars, err;
+ int err;
err = pci_enable_device_mem(pdev);
if (err)
@@ -1271,11 +1271,10 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
}
}
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
- err = pci_request_selected_regions(pdev, bars, alx_drv_name);
+ err = pci_request_mem_regions(pdev, alx_drv_name);
if (err) {
dev_err(&pdev->dev,
- "pci_request_selected_regions failed(bars:%d)\n", bars);
+ "pci_request_mem_regions failed\n");
goto out_pci_disable;
}
@@ -1401,7 +1400,7 @@ out_unmap:
out_free_netdev:
free_netdev(netdev);
out_pci_release:
- pci_release_selected_regions(pdev, bars);
+ pci_release_mem_regions(pdev);
out_pci_disable:
pci_disable_device(pdev);
return err;
@@ -1420,8 +1419,7 @@ static void alx_remove(struct pci_dev *pdev)
unregister_netdev(alx->dev);
iounmap(hw->hw_addr);
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c b/drivers/net/ethernet/intel/e1000e/netdev.c
index 41f32c0b341e..02f443958f31 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -7330,8 +7330,7 @@ err_flashmap:
err_ioremap:
free_netdev(netdev);
err_alloc_etherdev:
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:
pci_disable_device(pdev);
@@ -7398,8 +7397,7 @@ static void e1000_remove(struct pci_dev *pdev)
if ((adapter->hw.flash_address) &&
(adapter->hw.mac.type < e1000_pch_spt))
iounmap(adapter->hw.flash_address);
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
free_netdev(netdev);
diff --git a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
index b8245c734c96..774a5654bf42 100644
--- a/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
+++ b/drivers/net/ethernet/intel/fm10k/fm10k_pci.c
@@ -1963,10 +1963,7 @@ static int fm10k_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
goto err_dma;
}
- err = pci_request_selected_regions(pdev,
- pci_select_bars(pdev,
- IORESOURCE_MEM),
- fm10k_driver_name);
+ err = pci_request_mem_regions(pdev, fm10k_driver_name);
if (err) {
dev_err(&pdev->dev,
"pci_request_selected_regions failed: %d\n", err);
@@ -2070,8 +2067,7 @@ err_sw_init:
err_ioremap:
free_netdev(netdev);
err_alloc_netdev:
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:
pci_disable_device(pdev);
@@ -2119,8 +2115,7 @@ static void fm10k_remove(struct pci_dev *pdev)
free_netdev(netdev);
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
pci_disable_pcie_error_reporting(pdev);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 339d99be4702..81c99e1be708 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -10710,8 +10710,7 @@ static int i40e_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
}
/* set up pci connections */
- err = pci_request_selected_regions(pdev, pci_select_bars(pdev,
- IORESOURCE_MEM), i40e_driver_name);
+ err = pci_request_mem_regions(pdev, i40e_driver_name);
if (err) {
dev_info(&pdev->dev,
"pci_request_selected_regions failed %d\n", err);
@@ -11208,8 +11207,7 @@ err_ioremap:
kfree(pf);
err_pf_alloc:
pci_disable_pcie_error_reporting(pdev);
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:
pci_disable_device(pdev);
@@ -11320,8 +11318,7 @@ static void i40e_remove(struct pci_dev *pdev)
iounmap(hw->hw_addr);
kfree(pf);
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9bcba42abb91..942a89fb0090 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2324,9 +2324,7 @@ static int igb_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
}
}
- err = pci_request_selected_regions(pdev, pci_select_bars(pdev,
- IORESOURCE_MEM),
- igb_driver_name);
+ err = pci_request_mem_regions(pdev, igb_driver_name);
if (err)
goto err_pci_reg;
@@ -2750,8 +2748,7 @@ err_sw_init:
err_ioremap:
free_netdev(netdev);
err_alloc_etherdev:
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:
pci_disable_device(pdev);
@@ -2916,8 +2913,7 @@ static void igb_remove(struct pci_dev *pdev)
pci_iounmap(pdev, adapter->io_addr);
if (hw->flash_address)
iounmap(hw->flash_address);
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
kfree(adapter->shadow_vfta);
free_netdev(netdev);
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 7871f538f0ad..5418c69a7463 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9353,8 +9353,7 @@ static int ixgbe_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
pci_using_dac = 0;
}
- err = pci_request_selected_regions(pdev, pci_select_bars(pdev,
- IORESOURCE_MEM), ixgbe_driver_name);
+ err = pci_request_mem_regions(pdev, ixgbe_driver_name);
if (err) {
dev_err(&pdev->dev,
"pci_request_selected_regions failed 0x%x\n", err);
@@ -9740,8 +9739,7 @@ err_ioremap:
disable_dev = !test_and_set_bit(__IXGBE_DISABLED, &adapter->state);
free_netdev(netdev);
err_alloc_etherdev:
- pci_release_selected_regions(pdev,
- pci_select_bars(pdev, IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
err_pci_reg:
err_dma:
if (!adapter || disable_dev)
@@ -9808,8 +9806,7 @@ static void ixgbe_remove(struct pci_dev *pdev)
#endif
iounmap(adapter->io_addr);
- pci_release_selected_regions(pdev, pci_select_bars(pdev,
- IORESOURCE_MEM));
+ pci_release_mem_regions(pdev);
e_dev_info("complete\n");
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4cb9b156cab7..d7c33f9361aa 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1661,14 +1661,9 @@ static int nvme_pci_enable(struct nvme_dev *dev)
static void nvme_dev_unmap(struct nvme_dev *dev)
{
- struct pci_dev *pdev = to_pci_dev(dev->dev);
- int bars;
-
if (dev->bar)
iounmap(dev->bar);
-
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
- pci_release_selected_regions(pdev, bars);
+ pci_release_mem_regions(to_pci_dev(dev->dev));
}
static void nvme_pci_disable(struct nvme_dev *dev)
@@ -1897,13 +1892,9 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
static int nvme_dev_map(struct nvme_dev *dev)
{
- int bars;
struct pci_dev *pdev = to_pci_dev(dev->dev);
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
- if (!bars)
- return -ENODEV;
- if (pci_request_selected_regions(pdev, bars, "nvme"))
+ if (pci_request_mem_regions(pdev, "nvme"))
return -ENODEV;
dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
@@ -1912,7 +1903,7 @@ static int nvme_dev_map(struct nvme_dev *dev)
return 0;
release:
- pci_release_selected_regions(pdev, bars);
+ pci_release_mem_regions(pdev);
return -ENODEV;
}
diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig
index 56389be5d08b..67f9916ff14d 100644
--- a/drivers/pci/Kconfig
+++ b/drivers/pci/Kconfig
@@ -25,7 +25,7 @@ config PCI_MSI
If you don't know what to do here, say Y.
config PCI_MSI_IRQ_DOMAIN
- bool
+ def_bool ARM || ARM64 || X86
depends on PCI_MSI
select GENERIC_MSI_IRQ_DOMAIN
diff --git a/drivers/pci/bus.c b/drivers/pci/bus.c
index dd7cdbee8029..c288e5a52575 100644
--- a/drivers/pci/bus.c
+++ b/drivers/pci/bus.c
@@ -91,6 +91,35 @@ void pci_bus_remove_resources(struct pci_bus *bus)
}
}
+int devm_request_pci_bus_resources(struct device *dev,
+ struct list_head *resources)
+{
+ struct resource_entry *win;
+ struct resource *parent, *res;
+ int err;
+
+ resource_list_for_each_entry(win, resources) {
+ res = win->res;
+ switch (resource_type(res)) {
+ case IORESOURCE_IO:
+ parent = &ioport_resource;
+ break;
+ case IORESOURCE_MEM:
+ parent = &iomem_resource;
+ break;
+ default:
+ continue;
+ }
+
+ err = devm_request_resource(dev, parent, res);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devm_request_pci_bus_resources);
+
static struct pci_bus_region pci_32_bit = {0, 0xffffffffULL};
#ifdef CONFIG_PCI_BUS_ADDR_T_64BIT
static struct pci_bus_region pci_64_bit = {0,
@@ -291,6 +320,7 @@ void pci_bus_add_device(struct pci_dev *dev)
pci_fixup_device(pci_fixup_final, dev);
pci_create_sysfs_dev_files(dev);
pci_proc_attach_device(dev);
+ pci_bridge_d3_device_changed(dev);
dev->match_driver = true;
retval = device_attach(&dev->dev);
@@ -397,4 +427,3 @@ void pci_bus_put(struct pci_bus *bus)
put_device(&bus->dev);
}
EXPORT_SYMBOL(pci_bus_put);
-
diff --git a/drivers/pci/ecam.c b/drivers/pci/ecam.c
index f9832ad8efe2..66e0d718472f 100644
--- a/drivers/pci/ecam.c
+++ b/drivers/pci/ecam.c
@@ -19,10 +19,9 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/pci.h>
+#include <linux/pci-ecam.h>
#include <linux/slab.h>
-#include "ecam.h"
-
/*
* On 64-bit systems, we do a single ioremap for the whole config space
* since we have enough virtual address range available. On 32-bit, we
@@ -52,6 +51,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
if (!cfg)
return ERR_PTR(-ENOMEM);
+ cfg->parent = dev;
cfg->ops = ops;
cfg->busr.start = busr->start;
cfg->busr.end = busr->end;
@@ -95,7 +95,7 @@ struct pci_config_window *pci_ecam_create(struct device *dev,
}
if (ops->init) {
- err = ops->init(dev, cfg);
+ err = ops->init(cfg);
if (err)
goto err_exit;
}
diff --git a/drivers/pci/host/Kconfig b/drivers/pci/host/Kconfig
index 5d2374e4ee7f..9b485d873b0d 100644
--- a/drivers/pci/host/Kconfig
+++ b/drivers/pci/host/Kconfig
@@ -3,8 +3,9 @@ menu "PCI host controller drivers"
config PCI_DRA7XX
bool "TI DRA7xx PCIe controller"
- select PCIE_DW
depends on OF && HAS_IOMEM && TI_PIPE3
+ depends on PCI_MSI_IRQ_DOMAIN
+ select PCIE_DW
help
Enables support for the PCIe controller in the DRA7xx SoC. There
are two instances of PCIe controller in DRA7xx. This controller can
@@ -16,11 +17,20 @@ config PCI_MVEBU
depends on ARM
depends on OF
+config PCI_AARDVARK
+ bool "Aardvark PCIe controller"
+ depends on ARCH_MVEBU && ARM64
+ depends on OF
+ depends on PCI_MSI_IRQ_DOMAIN
+ help
+ Add support for Aardvark 64bit PCIe Host Controller. This
+ controller is part of the South Bridge of the Marvel Armada
+ 3700 SoC.
config PCIE_XILINX_NWL
bool "NWL PCIe Core"
depends on ARCH_ZYNQMP
- select PCI_MSI_IRQ_DOMAIN if PCI_MSI
+ depends on PCI_MSI_IRQ_DOMAIN
help
Say 'Y' here if you want kernel support for Xilinx
NWL PCIe controller. The controller can act as Root Port
@@ -29,6 +39,7 @@ config PCIE_XILINX_NWL
config PCIE_DW_PLAT
bool "Platform bus based DesignWare PCIe Controller"
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIE_DW
---help---
This selects the DesignWare PCIe controller support. Select this if
@@ -40,16 +51,19 @@ config PCIE_DW_PLAT
config PCIE_DW
bool
+ depends on PCI_MSI_IRQ_DOMAIN
config PCI_EXYNOS
bool "Samsung Exynos PCIe controller"
depends on SOC_EXYNOS5440
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIEPORTBUS
select PCIE_DW
config PCI_IMX6
bool "Freescale i.MX6 PCIe controller"
depends on SOC_IMX6Q
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIEPORTBUS
select PCIE_DW
@@ -72,8 +86,7 @@ config PCI_RCAR_GEN2
config PCIE_RCAR
bool "Renesas R-Car PCIe controller"
depends on ARCH_RENESAS || (ARM && COMPILE_TEST)
- select PCI_MSI
- select PCI_MSI_IRQ_DOMAIN
+ depends on PCI_MSI_IRQ_DOMAIN
help
Say Y here if you want PCIe controller support on R-Car SoCs.
@@ -85,6 +98,7 @@ config PCI_HOST_GENERIC
bool "Generic PCI host controller"
depends on (ARM || ARM64) && OF
select PCI_HOST_COMMON
+ select IRQ_DOMAIN
help
Say Y here if you want to support a simple generic PCI host
controller, such as the one emulated by kvmtool.
@@ -92,6 +106,7 @@ config PCI_HOST_GENERIC
config PCIE_SPEAR13XX
bool "STMicroelectronics SPEAr PCIe controller"
depends on ARCH_SPEAR13XX
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIEPORTBUS
select PCIE_DW
help
@@ -100,6 +115,7 @@ config PCIE_SPEAR13XX
config PCI_KEYSTONE
bool "TI Keystone PCIe controller"
depends on ARCH_KEYSTONE
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIE_DW
select PCIEPORTBUS
help
@@ -120,7 +136,6 @@ config PCI_XGENE
depends on ARCH_XGENE
depends on OF
select PCIEPORTBUS
- select PCI_MSI_IRQ_DOMAIN if PCI_MSI
help
Say Y here if you want internal PCI support on APM X-Gene SoC.
There are 5 internal PCIe ports available. Each port is GEN3 capable
@@ -128,7 +143,8 @@ config PCI_XGENE
config PCI_XGENE_MSI
bool "X-Gene v1 PCIe MSI feature"
- depends on PCI_XGENE && PCI_MSI
+ depends on PCI_XGENE
+ depends on PCI_MSI_IRQ_DOMAIN
default y
help
Say Y here if you want PCIe MSI support for the APM X-Gene v1 SoC.
@@ -137,6 +153,7 @@ config PCI_XGENE_MSI
config PCI_LAYERSCAPE
bool "Freescale Layerscape PCIe controller"
depends on OF && (ARM || ARCH_LAYERSCAPE)
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIE_DW
select MFD_SYSCON
help
@@ -177,8 +194,7 @@ config PCIE_IPROC_BCMA
config PCIE_IPROC_MSI
bool "Broadcom iProc PCIe MSI support"
depends on PCIE_IPROC_PLATFORM || PCIE_IPROC_BCMA
- depends on PCI_MSI
- select PCI_MSI_IRQ_DOMAIN
+ depends on PCI_MSI_IRQ_DOMAIN
default ARCH_BCM_IPROC
help
Say Y here if you want to enable MSI support for Broadcom's iProc
@@ -195,8 +211,8 @@ config PCIE_ALTERA
config PCIE_ALTERA_MSI
bool "Altera PCIe MSI feature"
- depends on PCIE_ALTERA && PCI_MSI
- select PCI_MSI_IRQ_DOMAIN
+ depends on PCIE_ALTERA
+ depends on PCI_MSI_IRQ_DOMAIN
help
Say Y here if you want PCIe MSI support for the Altera FPGA.
This MSI driver supports Altera MSI to GIC controller IP.
@@ -204,6 +220,7 @@ config PCIE_ALTERA_MSI
config PCI_HISI
depends on OF && ARM64
bool "HiSilicon Hip05 and Hip06 SoCs PCIe controllers"
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIEPORTBUS
select PCIE_DW
help
@@ -213,6 +230,7 @@ config PCI_HISI
config PCIE_QCOM
bool "Qualcomm PCIe controller"
depends on ARCH_QCOM && OF
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIE_DW
select PCIEPORTBUS
help
@@ -237,6 +255,7 @@ config PCI_HOST_THUNDER_ECAM
config PCIE_ARMADA_8K
bool "Marvell Armada-8K PCIe controller"
depends on ARCH_MVEBU
+ depends on PCI_MSI_IRQ_DOMAIN
select PCIE_DW
select PCIEPORTBUS
help
@@ -245,4 +264,14 @@ config PCIE_ARMADA_8K
Designware hardware and therefore the driver re-uses the
Designware core functions to implement the driver.
+config PCIE_ARTPEC6
+ bool "Axis ARTPEC-6 PCIe controller"
+ depends on MACH_ARTPEC6
+ depends on PCI_MSI_IRQ_DOMAIN
+ select PCIE_DW
+ select PCIEPORTBUS
+ help
+ Say Y here to enable PCIe controller support on Axis ARTPEC-6
+ SoCs. This PCIe controller uses the DesignWare core.
+
endmenu
diff --git a/drivers/pci/host/Makefile b/drivers/pci/host/Makefile
index 9c8698e89e96..88434101e4c4 100644
--- a/drivers/pci/host/Makefile
+++ b/drivers/pci/host/Makefile
@@ -5,6 +5,7 @@ obj-$(CONFIG_PCI_EXYNOS) += pci-exynos.o
obj-$(CONFIG_PCI_IMX6) += pci-imx6.o
obj-$(CONFIG_PCI_HYPERV) += pci-hyperv.o
obj-$(CONFIG_PCI_MVEBU) += pci-mvebu.o
+obj-$(CONFIG_PCI_AARDVARK) += pci-aardvark.o
obj-$(CONFIG_PCI_TEGRA) += pci-tegra.o
obj-$(CONFIG_PCI_RCAR_GEN2) += pci-rcar-gen2.o
obj-$(CONFIG_PCIE_RCAR) += pcie-rcar.o
@@ -29,3 +30,4 @@ obj-$(CONFIG_PCIE_QCOM) += pcie-qcom.o
obj-$(CONFIG_PCI_HOST_THUNDER_ECAM) += pci-thunder-ecam.o
obj-$(CONFIG_PCI_HOST_THUNDER_PEM) += pci-thunder-pem.o
obj-$(CONFIG_PCIE_ARMADA_8K) += pcie-armada8k.o
+obj-$(CONFIG_PCIE_ARTPEC6) += pcie-artpec6.o
diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c
new file mode 100644
index 000000000000..ef9893fa3176
--- /dev/null
+++ b/drivers/pci/host/pci-aardvark.c
@@ -0,0 +1,1001 @@
+/*
+ * Driver for the Aardvark PCIe controller, used on Marvell Armada
+ * 3700.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Author: Hezi Shahmoon <hezi.shahmoon@marvell.com>
+ *
+ * This file is licensed under the terms of the GNU General Public
+ * License version 2. This program is licensed "as is" without any
+ * warranty of any kind, whether express or implied.
+ */
+
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/irqdomain.h>
+#include <linux/kernel.h>
+#include <linux/pci.h>
+#include <linux/init.h>
+#include <linux/platform_device.h>
+#include <linux/of_address.h>
+#include <linux/of_pci.h>
+
+/* PCIe core registers */
+#define PCIE_CORE_CMD_STATUS_REG 0x4
+#define PCIE_CORE_CMD_IO_ACCESS_EN BIT(0)
+#define PCIE_CORE_CMD_MEM_ACCESS_EN BIT(1)
+#define PCIE_CORE_CMD_MEM_IO_REQ_EN BIT(2)
+#define PCIE_CORE_DEV_CTRL_STATS_REG 0xc8
+#define PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE (0 << 4)
+#define PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT 5
+#define PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE (0 << 11)
+#define PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT 12
+#define PCIE_CORE_LINK_CTRL_STAT_REG 0xd0
+#define PCIE_CORE_LINK_L0S_ENTRY BIT(0)
+#define PCIE_CORE_LINK_TRAINING BIT(5)
+#define PCIE_CORE_LINK_WIDTH_SHIFT 20
+#define PCIE_CORE_ERR_CAPCTL_REG 0x118
+#define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX BIT(5)
+#define PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN BIT(6)
+#define PCIE_CORE_ERR_CAPCTL_ECRC_CHCK BIT(7)
+#define PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV BIT(8)
+
+/* PIO registers base address and register offsets */
+#define PIO_BASE_ADDR 0x4000
+#define PIO_CTRL (PIO_BASE_ADDR + 0x0)
+#define PIO_CTRL_TYPE_MASK GENMASK(3, 0)
+#define PIO_CTRL_ADDR_WIN_DISABLE BIT(24)
+#define PIO_STAT (PIO_BASE_ADDR + 0x4)
+#define PIO_COMPLETION_STATUS_SHIFT 7
+#define PIO_COMPLETION_STATUS_MASK GENMASK(9, 7)
+#define PIO_COMPLETION_STATUS_OK 0
+#define PIO_COMPLETION_STATUS_UR 1
+#define PIO_COMPLETION_STATUS_CRS 2
+#define PIO_COMPLETION_STATUS_CA 4
+#define PIO_NON_POSTED_REQ BIT(0)
+#define PIO_ADDR_LS (PIO_BASE_ADDR + 0x8)
+#define PIO_ADDR_MS (PIO_BASE_ADDR + 0xc)
+#define PIO_WR_DATA (PIO_BASE_ADDR + 0x10)
+#define PIO_WR_DATA_STRB (PIO_BASE_ADDR + 0x14)
+#define PIO_RD_DATA (PIO_BASE_ADDR + 0x18)
+#define PIO_START (PIO_BASE_ADDR + 0x1c)
+#define PIO_ISR (PIO_BASE_ADDR + 0x20)
+#define PIO_ISRM (PIO_BASE_ADDR + 0x24)
+
+/* Aardvark Control registers */
+#define CONTROL_BASE_ADDR 0x4800
+#define PCIE_CORE_CTRL0_REG (CONTROL_BASE_ADDR + 0x0)
+#define PCIE_GEN_SEL_MSK 0x3
+#define PCIE_GEN_SEL_SHIFT 0x0
+#define SPEED_GEN_1 0
+#define SPEED_GEN_2 1
+#define SPEED_GEN_3 2
+#define IS_RC_MSK 1
+#define IS_RC_SHIFT 2
+#define LANE_CNT_MSK 0x18
+#define LANE_CNT_SHIFT 0x3
+#define LANE_COUNT_1 (0 << LANE_CNT_SHIFT)
+#define LANE_COUNT_2 (1 << LANE_CNT_SHIFT)
+#define LANE_COUNT_4 (2 << LANE_CNT_SHIFT)
+#define LANE_COUNT_8 (3 << LANE_CNT_SHIFT)
+#define LINK_TRAINING_EN BIT(6)
+#define LEGACY_INTA BIT(28)
+#define LEGACY_INTB BIT(29)
+#define LEGACY_INTC BIT(30)
+#define LEGACY_INTD BIT(31)
+#define PCIE_CORE_CTRL1_REG (CONTROL_BASE_ADDR + 0x4)
+#define HOT_RESET_GEN BIT(0)
+#define PCIE_CORE_CTRL2_REG (CONTROL_BASE_ADDR + 0x8)
+#define PCIE_CORE_CTRL2_RESERVED 0x7
+#define PCIE_CORE_CTRL2_TD_ENABLE BIT(4)
+#define PCIE_CORE_CTRL2_STRICT_ORDER_ENABLE BIT(5)
+#define PCIE_CORE_CTRL2_OB_WIN_ENABLE BIT(6)
+#define PCIE_CORE_CTRL2_MSI_ENABLE BIT(10)
+#define PCIE_ISR0_REG (CONTROL_BASE_ADDR + 0x40)
+#define PCIE_ISR0_MASK_REG (CONTROL_BASE_ADDR + 0x44)
+#define PCIE_ISR0_MSI_INT_PENDING BIT(24)
+#define PCIE_ISR0_INTX_ASSERT(val) BIT(16 + (val))
+#define PCIE_ISR0_INTX_DEASSERT(val) BIT(20 + (val))
+#define PCIE_ISR0_ALL_MASK GENMASK(26, 0)
+#define PCIE_ISR1_REG (CONTROL_BASE_ADDR + 0x48)
+#define PCIE_ISR1_MASK_REG (CONTROL_BASE_ADDR + 0x4C)
+#define PCIE_ISR1_POWER_STATE_CHANGE BIT(4)
+#define PCIE_ISR1_FLUSH BIT(5)
+#define PCIE_ISR1_ALL_MASK GENMASK(5, 4)
+#define PCIE_MSI_ADDR_LOW_REG (CONTROL_BASE_ADDR + 0x50)
+#define PCIE_MSI_ADDR_HIGH_REG (CONTROL_BASE_ADDR + 0x54)
+#define PCIE_MSI_STATUS_REG (CONTROL_BASE_ADDR + 0x58)
+#define PCIE_MSI_MASK_REG (CONTROL_BASE_ADDR + 0x5C)
+#define PCIE_MSI_PAYLOAD_REG (CONTROL_BASE_ADDR + 0x9C)
+
+/* PCIe window configuration */
+#define OB_WIN_BASE_ADDR 0x4c00
+#define OB_WIN_BLOCK_SIZE 0x20
+#define OB_WIN_REG_ADDR(win, offset) (OB_WIN_BASE_ADDR + \
+ OB_WIN_BLOCK_SIZE * (win) + \
+ (offset))
+#define OB_WIN_MATCH_LS(win) OB_WIN_REG_ADDR(win, 0x00)
+#define OB_WIN_MATCH_MS(win) OB_WIN_REG_ADDR(win, 0x04)
+#define OB_WIN_REMAP_LS(win) OB_WIN_REG_ADDR(win, 0x08)
+#define OB_WIN_REMAP_MS(win) OB_WIN_REG_ADDR(win, 0x0c)
+#define OB_WIN_MASK_LS(win) OB_WIN_REG_ADDR(win, 0x10)
+#define OB_WIN_MASK_MS(win) OB_WIN_REG_ADDR(win, 0x14)
+#define OB_WIN_ACTIONS(win) OB_WIN_REG_ADDR(win, 0x18)
+
+/* PCIe window types */
+#define OB_PCIE_MEM 0x0
+#define OB_PCIE_IO 0x4
+
+/* LMI registers base address and register offsets */
+#define LMI_BASE_ADDR 0x6000
+#define CFG_REG (LMI_BASE_ADDR + 0x0)
+#define LTSSM_SHIFT 24
+#define LTSSM_MASK 0x3f
+#define LTSSM_L0 0x10
+#define RC_BAR_CONFIG 0x300
+
+/* PCIe core controller registers */
+#define CTRL_CORE_BASE_ADDR 0x18000
+#define CTRL_CONFIG_REG (CTRL_CORE_BASE_ADDR + 0x0)
+#define CTRL_MODE_SHIFT 0x0
+#define CTRL_MODE_MASK 0x1
+#define PCIE_CORE_MODE_DIRECT 0x0
+#define PCIE_CORE_MODE_COMMAND 0x1
+
+/* PCIe Central Interrupts Registers */
+#define CENTRAL_INT_BASE_ADDR 0x1b000
+#define HOST_CTRL_INT_STATUS_REG (CENTRAL_INT_BASE_ADDR + 0x0)
+#define HOST_CTRL_INT_MASK_REG (CENTRAL_INT_BASE_ADDR + 0x4)
+#define PCIE_IRQ_CMDQ_INT BIT(0)
+#define PCIE_IRQ_MSI_STATUS_INT BIT(1)
+#define PCIE_IRQ_CMD_SENT_DONE BIT(3)
+#define PCIE_IRQ_DMA_INT BIT(4)
+#define PCIE_IRQ_IB_DXFERDONE BIT(5)
+#define PCIE_IRQ_OB_DXFERDONE BIT(6)
+#define PCIE_IRQ_OB_RXFERDONE BIT(7)
+#define PCIE_IRQ_COMPQ_INT BIT(12)
+#define PCIE_IRQ_DIR_RD_DDR_DET BIT(13)
+#define PCIE_IRQ_DIR_WR_DDR_DET BIT(14)
+#define PCIE_IRQ_CORE_INT BIT(16)
+#define PCIE_IRQ_CORE_INT_PIO BIT(17)
+#define PCIE_IRQ_DPMU_INT BIT(18)
+#define PCIE_IRQ_PCIE_MIS_INT BIT(19)
+#define PCIE_IRQ_MSI_INT1_DET BIT(20)
+#define PCIE_IRQ_MSI_INT2_DET BIT(21)
+#define PCIE_IRQ_RC_DBELL_DET BIT(22)
+#define PCIE_IRQ_EP_STATUS BIT(23)
+#define PCIE_IRQ_ALL_MASK 0xfff0fb
+#define PCIE_IRQ_ENABLE_INTS_MASK PCIE_IRQ_CORE_INT
+
+/* Transaction types */
+#define PCIE_CONFIG_RD_TYPE0 0x8
+#define PCIE_CONFIG_RD_TYPE1 0x9
+#define PCIE_CONFIG_WR_TYPE0 0xa
+#define PCIE_CONFIG_WR_TYPE1 0xb
+
+/* PCI_BDF shifts 8bit, so we need extra 4bit shift */
+#define PCIE_BDF(dev) (dev << 4)
+#define PCIE_CONF_BUS(bus) (((bus) & 0xff) << 20)
+#define PCIE_CONF_DEV(dev) (((dev) & 0x1f) << 15)
+#define PCIE_CONF_FUNC(fun) (((fun) & 0x7) << 12)
+#define PCIE_CONF_REG(reg) ((reg) & 0xffc)
+#define PCIE_CONF_ADDR(bus, devfn, where) \
+ (PCIE_CONF_BUS(bus) | PCIE_CONF_DEV(PCI_SLOT(devfn)) | \
+ PCIE_CONF_FUNC(PCI_FUNC(devfn)) | PCIE_CONF_REG(where))
+
+#define PIO_TIMEOUT_MS 1
+
+#define LINK_WAIT_MAX_RETRIES 10
+#define LINK_WAIT_USLEEP_MIN 90000
+#define LINK_WAIT_USLEEP_MAX 100000
+
+#define LEGACY_IRQ_NUM 4
+#define MSI_IRQ_NUM 32
+
+struct advk_pcie {
+ struct platform_device *pdev;
+ void __iomem *base;
+ struct list_head resources;
+ struct irq_domain *irq_domain;
+ struct irq_chip irq_chip;
+ struct msi_controller msi;
+ struct irq_domain *msi_domain;
+ struct irq_chip msi_irq_chip;
+ DECLARE_BITMAP(msi_irq_in_use, MSI_IRQ_NUM);
+ struct mutex msi_used_lock;
+ u16 msi_msg;
+ int root_bus_nr;
+};
+
+static inline void advk_writel(struct advk_pcie *pcie, u32 val, u64 reg)
+{
+ writel(val, pcie->base + reg);
+}
+
+static inline u32 advk_readl(struct advk_pcie *pcie, u64 reg)
+{
+ return readl(pcie->base + reg);
+}
+
+static int advk_pcie_link_up(struct advk_pcie *pcie)
+{
+ u32 val, ltssm_state;
+
+ val = advk_readl(pcie, CFG_REG);
+ ltssm_state = (val >> LTSSM_SHIFT) & LTSSM_MASK;
+ return ltssm_state >= LTSSM_L0;
+}
+
+static int advk_pcie_wait_for_link(struct advk_pcie *pcie)
+{
+ int retries;
+
+ /* check if the link is up or not */
+ for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
+ if (advk_pcie_link_up(pcie)) {
+ dev_info(&pcie->pdev->dev, "link up\n");
+ return 0;
+ }
+
+ usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
+ }
+
+ dev_err(&pcie->pdev->dev, "link never came up\n");
+
+ return -ETIMEDOUT;
+}
+
+/*
+ * Set PCIe address window register which could be used for memory
+ * mapping.
+ */
+static void advk_pcie_set_ob_win(struct advk_pcie *pcie,
+ u32 win_num, u32 match_ms,
+ u32 match_ls, u32 mask_ms,
+ u32 mask_ls, u32 remap_ms,
+ u32 remap_ls, u32 action)
+{
+ advk_writel(pcie, match_ls, OB_WIN_MATCH_LS(win_num));
+ advk_writel(pcie, match_ms, OB_WIN_MATCH_MS(win_num));
+ advk_writel(pcie, mask_ms, OB_WIN_MASK_MS(win_num));
+ advk_writel(pcie, mask_ls, OB_WIN_MASK_LS(win_num));
+ advk_writel(pcie, remap_ms, OB_WIN_REMAP_MS(win_num));
+ advk_writel(pcie, remap_ls, OB_WIN_REMAP_LS(win_num));
+ advk_writel(pcie, action, OB_WIN_ACTIONS(win_num));
+ advk_writel(pcie, match_ls | BIT(0), OB_WIN_MATCH_LS(win_num));
+}
+
+static void advk_pcie_setup_hw(struct advk_pcie *pcie)
+{
+ u32 reg;
+ int i;
+
+ /* Point PCIe unit MBUS decode windows to DRAM space */
+ for (i = 0; i < 8; i++)
+ advk_pcie_set_ob_win(pcie, i, 0, 0, 0, 0, 0, 0, 0);
+
+ /* Set to Direct mode */
+ reg = advk_readl(pcie, CTRL_CONFIG_REG);
+ reg &= ~(CTRL_MODE_MASK << CTRL_MODE_SHIFT);
+ reg |= ((PCIE_CORE_MODE_DIRECT & CTRL_MODE_MASK) << CTRL_MODE_SHIFT);
+ advk_writel(pcie, reg, CTRL_CONFIG_REG);
+
+ /* Set PCI global control register to RC mode */
+ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+ reg |= (IS_RC_MSK << IS_RC_SHIFT);
+ advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+ /* Set Advanced Error Capabilities and Control PF0 register */
+ reg = PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX |
+ PCIE_CORE_ERR_CAPCTL_ECRC_CHK_TX_EN |
+ PCIE_CORE_ERR_CAPCTL_ECRC_CHCK |
+ PCIE_CORE_ERR_CAPCTL_ECRC_CHCK_RCV;
+ advk_writel(pcie, reg, PCIE_CORE_ERR_CAPCTL_REG);
+
+ /* Set PCIe Device Control and Status 1 PF0 register */
+ reg = PCIE_CORE_DEV_CTRL_STATS_RELAX_ORDER_DISABLE |
+ (7 << PCIE_CORE_DEV_CTRL_STATS_MAX_PAYLOAD_SZ_SHIFT) |
+ PCIE_CORE_DEV_CTRL_STATS_SNOOP_DISABLE |
+ PCIE_CORE_DEV_CTRL_STATS_MAX_RD_REQ_SIZE_SHIFT;
+ advk_writel(pcie, reg, PCIE_CORE_DEV_CTRL_STATS_REG);
+
+ /* Program PCIe Control 2 to disable strict ordering */
+ reg = PCIE_CORE_CTRL2_RESERVED |
+ PCIE_CORE_CTRL2_TD_ENABLE;
+ advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG);
+
+ /* Set GEN2 */
+ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+ reg &= ~PCIE_GEN_SEL_MSK;
+ reg |= SPEED_GEN_2;
+ advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+ /* Set lane X1 */
+ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+ reg &= ~LANE_CNT_MSK;
+ reg |= LANE_COUNT_1;
+ advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+ /* Enable link training */
+ reg = advk_readl(pcie, PCIE_CORE_CTRL0_REG);
+ reg |= LINK_TRAINING_EN;
+ advk_writel(pcie, reg, PCIE_CORE_CTRL0_REG);
+
+ /* Enable MSI */
+ reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG);
+ reg |= PCIE_CORE_CTRL2_MSI_ENABLE;
+ advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG);
+
+ /* Clear all interrupts */
+ advk_writel(pcie, PCIE_ISR0_ALL_MASK, PCIE_ISR0_REG);
+ advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_REG);
+ advk_writel(pcie, PCIE_IRQ_ALL_MASK, HOST_CTRL_INT_STATUS_REG);
+
+ /* Disable All ISR0/1 Sources */
+ reg = PCIE_ISR0_ALL_MASK;
+ reg &= ~PCIE_ISR0_MSI_INT_PENDING;
+ advk_writel(pcie, reg, PCIE_ISR0_MASK_REG);
+
+ advk_writel(pcie, PCIE_ISR1_ALL_MASK, PCIE_ISR1_MASK_REG);
+
+ /* Unmask all MSI's */
+ advk_writel(pcie, 0, PCIE_MSI_MASK_REG);
+
+ /* Enable summary interrupt for GIC SPI source */
+ reg = PCIE_IRQ_ALL_MASK & (~PCIE_IRQ_ENABLE_INTS_MASK);
+ advk_writel(pcie, reg, HOST_CTRL_INT_MASK_REG);
+
+ reg = advk_readl(pcie, PCIE_CORE_CTRL2_REG);
+ reg |= PCIE_CORE_CTRL2_OB_WIN_ENABLE;
+ advk_writel(pcie, reg, PCIE_CORE_CTRL2_REG);
+
+ /* Bypass the address window mapping for PIO */
+ reg = advk_readl(pcie, PIO_CTRL);
+ reg |= PIO_CTRL_ADDR_WIN_DISABLE;
+ advk_writel(pcie, reg, PIO_CTRL);
+
+ /* Start link training */
+ reg = advk_readl(pcie, PCIE_CORE_LINK_CTRL_STAT_REG);
+ reg |= PCIE_CORE_LINK_TRAINING;
+ advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG);
+
+ advk_pcie_wait_for_link(pcie);
+
+ reg = PCIE_CORE_LINK_L0S_ENTRY |
+ (1 << PCIE_CORE_LINK_WIDTH_SHIFT);
+ advk_writel(pcie, reg, PCIE_CORE_LINK_CTRL_STAT_REG);
+
+ reg = advk_readl(pcie, PCIE_CORE_CMD_STATUS_REG);
+ reg |= PCIE_CORE_CMD_MEM_ACCESS_EN |
+ PCIE_CORE_CMD_IO_ACCESS_EN |
+ PCIE_CORE_CMD_MEM_IO_REQ_EN;
+ advk_writel(pcie, reg, PCIE_CORE_CMD_STATUS_REG);
+}
+
+static void advk_pcie_check_pio_status(struct advk_pcie *pcie)
+{
+ u32 reg;
+ unsigned int status;
+ char *strcomp_status, *str_posted;
+
+ reg = advk_readl(pcie, PIO_STAT);
+ status = (reg & PIO_COMPLETION_STATUS_MASK) >>
+ PIO_COMPLETION_STATUS_SHIFT;
+
+ if (!status)
+ return;
+
+ switch (status) {
+ case PIO_COMPLETION_STATUS_UR:
+ strcomp_status = "UR";
+ break;
+ case PIO_COMPLETION_STATUS_CRS:
+ strcomp_status = "CRS";
+ break;
+ case PIO_COMPLETION_STATUS_CA:
+ strcomp_status = "CA";
+ break;
+ default:
+ strcomp_status = "Unknown";
+ break;
+ }
+
+ if (reg & PIO_NON_POSTED_REQ)
+ str_posted = "Non-posted";
+ else
+ str_posted = "Posted";
+
+ dev_err(&pcie->pdev->dev, "%s PIO Response Status: %s, %#x @ %#x\n",
+ str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS));
+}
+
+static int advk_pcie_wait_pio(struct advk_pcie *pcie)
+{
+ unsigned long timeout;
+
+ timeout = jiffies + msecs_to_jiffies(PIO_TIMEOUT_MS);
+
+ while (time_before(jiffies, timeout)) {
+ u32 start, isr;
+
+ start = advk_readl(pcie, PIO_START);
+ isr = advk_readl(pcie, PIO_ISR);
+ if (!start && isr)
+ return 0;
+ }
+
+ dev_err(&pcie->pdev->dev, "config read/write timed out\n");
+ return -ETIMEDOUT;
+}
+
+static int advk_pcie_rd_conf(struct pci_bus *bus, u32 devfn,
+ int where, int size, u32 *val)
+{
+ struct advk_pcie *pcie = bus->sysdata;
+ u32 reg;
+ int ret;
+
+ if (PCI_SLOT(devfn) != 0) {
+ *val = 0xffffffff;
+ return PCIBIOS_DEVICE_NOT_FOUND;
+ }
+
+ /* Start PIO */
+ advk_writel(pcie, 0, PIO_START);
+ advk_writel(pcie, 1, PIO_ISR);
+
+ /* Program the control register */
+ reg = advk_readl(pcie, PIO_CTRL);
+ reg &= ~PIO_CTRL_TYPE_MASK;
+ if (bus->number == pcie->root_bus_nr)
+ reg |= PCIE_CONFIG_RD_TYPE0;
+ else
+ reg |= PCIE_CONFIG_RD_TYPE1;
+ advk_writel(pcie, reg, PIO_CTRL);
+
+ /* Program the address registers */
+ reg = PCIE_BDF(devfn) | PCIE_CONF_REG(where);
+ advk_writel(pcie, reg, PIO_ADDR_LS);
+ advk_writel(pcie, 0, PIO_ADDR_MS);
+
+ /* Program the data strobe */
+ advk_writel(pcie, 0xf, PIO_WR_DATA_STRB);
+
+ /* Start the transfer */
+ advk_writel(pcie, 1, PIO_START);
+
+ ret = advk_pcie_wait_pio(pcie);
+ if (ret < 0)
+ return PCIBIOS_SET_FAILED;
+
+ advk_pcie_check_pio_status(pcie);
+
+ /* Get the read result */
+ *val = advk_readl(pcie, PIO_RD_DATA);
+ if (size == 1)
+ *val = (*val >> (8 * (where & 3))) & 0xff;
+ else if (size == 2)
+ *val = (*val >> (8 * (where & 3))) & 0xffff;
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static int advk_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
+ int where, int size, u32 val)
+{
+ struct advk_pcie *pcie = bus->sysdata;
+ u32 reg;
+ u32 data_strobe = 0x0;
+ int offset;
+ int ret;
+
+ if (PCI_SLOT(devfn) != 0)
+ return PCIBIOS_DEVICE_NOT_FOUND;
+
+ if (where % size)
+ return PCIBIOS_SET_FAILED;
+
+ /* Start PIO */
+ advk_writel(pcie, 0, PIO_START);
+ advk_writel(pcie, 1, PIO_ISR);
+
+ /* Program the control register */
+ reg = advk_readl(pcie, PIO_CTRL);
+ reg &= ~PIO_CTRL_TYPE_MASK;
+ if (bus->number == pcie->root_bus_nr)
+ reg |= PCIE_CONFIG_WR_TYPE0;
+ else
+ reg |= PCIE_CONFIG_WR_TYPE1;
+ advk_writel(pcie, reg, PIO_CTRL);
+
+ /* Program the address registers */
+ reg = PCIE_CONF_ADDR(bus->number, devfn, where);
+ advk_writel(pcie, reg, PIO_ADDR_LS);
+ advk_writel(pcie, 0, PIO_ADDR_MS);
+
+ /* Calculate the write strobe */
+ offset = where & 0x3;
+ reg = val << (8 * offset);
+ data_strobe = GENMASK(size - 1, 0) << offset;
+
+ /* Program the data register */
+ advk_writel(pcie, reg, PIO_WR_DATA);
+
+ /* Program the data strobe */
+ advk_writel(pcie, data_strobe, PIO_WR_DATA_STRB);
+
+ /* Start the transfer */
+ advk_writel(pcie, 1, PIO_START);
+
+ ret = advk_pcie_wait_pio(pcie);
+ if (ret < 0)
+ return PCIBIOS_SET_FAILED;
+
+ advk_pcie_check_pio_status(pcie);
+
+ return PCIBIOS_SUCCESSFUL;
+}
+
+static struct pci_ops advk_pcie_ops = {
+ .read = advk_pcie_rd_conf,
+ .write = advk_pcie_wr_conf,
+};
+
+static int advk_pcie_alloc_msi(struct advk_pcie *pcie)
+{
+ int hwirq;
+
+ mutex_lock(&pcie->msi_used_lock);
+ hwirq = find_first_zero_bit(pcie->msi_irq_in_use, MSI_IRQ_NUM);
+ if (hwirq >= MSI_IRQ_NUM)
+ hwirq = -ENOSPC;
+ else
+ set_bit(hwirq, pcie->msi_irq_in_use);
+ mutex_unlock(&pcie->msi_used_lock);
+
+ return hwirq;
+}
+
+static void advk_pcie_free_msi(struct advk_pcie *pcie, int hwirq)
+{
+ mutex_lock(&pcie->msi_used_lock);
+ if (!test_bit(hwirq, pcie->msi_irq_in_use))
+ dev_err(&pcie->pdev->dev, "trying to free unused MSI#%d\n",
+ hwirq);
+ else
+ clear_bit(hwirq, pcie->msi_irq_in_use);
+ mutex_unlock(&pcie->msi_used_lock);
+}
+
+static int advk_pcie_setup_msi_irq(struct msi_controller *chip,
+ struct pci_dev *pdev,
+ struct msi_desc *desc)
+{
+ struct advk_pcie *pcie = pdev->bus->sysdata;
+ struct msi_msg msg;
+ int virq, hwirq;
+ phys_addr_t msi_msg_phys;
+
+ /* We support MSI, but not MSI-X */
+ if (desc->msi_attrib.is_msix)
+ return -EINVAL;
+
+ hwirq = advk_pcie_alloc_msi(pcie);
+ if (hwirq < 0)
+ return hwirq;
+
+ virq = irq_create_mapping(pcie->msi_domain, hwirq);
+ if (!virq) {
+ advk_pcie_free_msi(pcie, hwirq);
+ return -EINVAL;
+ }
+
+ irq_set_msi_desc(virq, desc);
+
+ msi_msg_phys = virt_to_phys(&pcie->msi_msg);
+
+ msg.address_lo = lower_32_bits(msi_msg_phys);
+ msg.address_hi = upper_32_bits(msi_msg_phys);
+ msg.data = virq;
+
+ pci_write_msi_msg(virq, &msg);
+
+ return 0;
+}
+
+static void advk_pcie_teardown_msi_irq(struct msi_controller *chip,
+ unsigned int irq)
+{
+ struct irq_data *d = irq_get_irq_data(irq);
+ struct msi_desc *msi = irq_data_get_msi_desc(d);
+ struct advk_pcie *pcie = msi_desc_to_pci_sysdata(msi);
+ unsigned long hwirq = d->hwirq;
+
+ irq_dispose_mapping(irq);
+ advk_pcie_free_msi(pcie, hwirq);
+}
+
+static int advk_pcie_msi_map(struct irq_domain *domain,
+ unsigned int virq, irq_hw_number_t hw)
+{
+ struct advk_pcie *pcie = domain->host_data;
+
+ irq_set_chip_and_handler(virq, &pcie->msi_irq_chip,
+ handle_simple_irq);
+
+ return 0;
+}
+
+static const struct irq_domain_ops advk_pcie_msi_irq_ops = {
+ .map = advk_pcie_msi_map,
+};
+
+static void advk_pcie_irq_mask(struct irq_data *d)
+{
+ struct advk_pcie *pcie = d->domain->host_data;
+ irq_hw_number_t hwirq = irqd_to_hwirq(d);
+ u32 mask;
+
+ mask = advk_readl(pcie, PCIE_ISR0_MASK_REG);
+ mask |= PCIE_ISR0_INTX_ASSERT(hwirq);
+ advk_writel(pcie, mask, PCIE_ISR0_MASK_REG);
+}
+
+static void advk_pcie_irq_unmask(struct irq_data *d)
+{
+ struct advk_pcie *pcie = d->domain->host_data;
+ irq_hw_number_t hwirq = irqd_to_hwirq(d);
+ u32 mask;
+
+ mask = advk_readl(pcie, PCIE_ISR0_MASK_REG);
+ mask &= ~PCIE_ISR0_INTX_ASSERT(hwirq);
+ advk_writel(pcie, mask, PCIE_ISR0_MASK_REG);
+}
+
+static int advk_pcie_irq_map(struct irq_domain *h,
+ unsigned int virq, irq_hw_number_t hwirq)
+{
+ struct advk_pcie *pcie = h->host_data;
+
+ advk_pcie_irq_mask(irq_get_irq_data(virq));
+ irq_set_status_flags(virq, IRQ_LEVEL);
+ irq_set_chip_and_handler(virq, &pcie->irq_chip,
+ handle_level_irq);
+ irq_set_chip_data(virq, pcie);
+
+ return 0;
+}
+
+static const struct irq_domain_ops advk_pcie_irq_domain_ops = {
+ .map = advk_pcie_irq_map,
+ .xlate = irq_domain_xlate_onecell,
+};
+
+static int advk_pcie_init_msi_irq_domain(struct advk_pcie *pcie)
+{
+ struct device *dev = &pcie->pdev->dev;
+ struct device_node *node = dev->of_node;
+ struct irq_chip *msi_irq_chip;
+ struct msi_controller *msi;
+ phys_addr_t msi_msg_phys;
+ int ret;
+
+ msi_irq_chip = &pcie->msi_irq_chip;
+
+ msi_irq_chip->name = devm_kasprintf(dev, GFP_KERNEL, "%s-msi",
+ dev_name(dev));
+ if (!msi_irq_chip->name)
+ return -ENOMEM;
+
+ msi_irq_chip->irq_enable = pci_msi_unmask_irq;
+ msi_irq_chip->irq_disable = pci_msi_mask_irq;
+ msi_irq_chip->irq_mask = pci_msi_mask_irq;
+ msi_irq_chip->irq_unmask = pci_msi_unmask_irq;
+
+ msi = &pcie->msi;
+
+ msi->setup_irq = advk_pcie_setup_msi_irq;
+ msi->teardown_irq = advk_pcie_teardown_msi_irq;
+ msi->of_node = node;
+
+ mutex_init(&pcie->msi_used_lock);
+
+ msi_msg_phys = virt_to_phys(&pcie->msi_msg);
+
+ advk_writel(pcie, lower_32_bits(msi_msg_phys),
+ PCIE_MSI_ADDR_LOW_REG);
+ advk_writel(pcie, upper_32_bits(msi_msg_phys),
+ PCIE_MSI_ADDR_HIGH_REG);
+
+ pcie->msi_domain =
+ irq_domain_add_linear(NULL, MSI_IRQ_NUM,
+ &advk_pcie_msi_irq_ops, pcie);
+ if (!pcie->msi_domain)
+ return -ENOMEM;
+
+ ret = of_pci_msi_chip_add(msi);
+ if (ret < 0) {
+ irq_domain_remove(pcie->msi_domain);
+ return ret;
+ }
+
+ return 0;
+}
+
+static void advk_pcie_remove_msi_irq_domain(struct advk_pcie *pcie)
+{
+ of_pci_msi_chip_remove(&pcie->msi);
+ irq_domain_remove(pcie->msi_domain);
+}
+
+static int advk_pcie_init_irq_domain(struct advk_pcie *pcie)
+{
+ struct device *dev = &pcie->pdev->dev;
+ struct device_node *node = dev->of_node;
+ struct device_node *pcie_intc_node;
+ struct irq_chip *irq_chip;
+
+ pcie_intc_node = of_get_next_child(node, NULL);
+ if (!pcie_intc_node) {
+ dev_err(dev, "No PCIe Intc node found\n");
+ return -ENODEV;
+ }
+
+ irq_chip = &pcie->irq_chip;
+
+ irq_chip->name = devm_kasprintf(dev, GFP_KERNEL, "%s-irq",
+ dev_name(dev));
+ if (!irq_chip->name) {
+ of_node_put(pcie_intc_node);
+ return -ENOMEM;
+ }
+
+ irq_chip->irq_mask = advk_pcie_irq_mask;
+ irq_chip->irq_mask_ack = advk_pcie_irq_mask;
+ irq_chip->irq_unmask = advk_pcie_irq_unmask;
+
+ pcie->irq_domain =
+ irq_domain_add_linear(pcie_intc_node, LEGACY_IRQ_NUM,
+ &advk_pcie_irq_domain_ops, pcie);
+ if (!pcie->irq_domain) {
+ dev_err(dev, "Failed to get a INTx IRQ domain\n");
+ of_node_put(pcie_intc_node);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void advk_pcie_remove_irq_domain(struct advk_pcie *pcie)
+{
+ irq_domain_remove(pcie->irq_domain);
+}
+
+static void advk_pcie_handle_msi(struct advk_pcie *pcie)
+{
+ u32 msi_val, msi_mask, msi_status, msi_idx;
+ u16 msi_data;
+
+ msi_mask = advk_readl(pcie, PCIE_MSI_MASK_REG);
+ msi_val = advk_readl(pcie, PCIE_MSI_STATUS_REG);
+ msi_status = msi_val & ~msi_mask;
+
+ for (msi_idx = 0; msi_idx < MSI_IRQ_NUM; msi_idx++) {
+ if (!(BIT(msi_idx) & msi_status))
+ continue;
+
+ advk_writel(pcie, BIT(msi_idx), PCIE_MSI_STATUS_REG);
+ msi_data = advk_readl(pcie, PCIE_MSI_PAYLOAD_REG) & 0xFF;
+ generic_handle_irq(msi_data);
+ }
+
+ advk_writel(pcie, PCIE_ISR0_MSI_INT_PENDING,
+ PCIE_ISR0_REG);
+}
+
+static void advk_pcie_handle_int(struct advk_pcie *pcie)
+{
+ u32 val, mask, status;
+ int i, virq;
+
+ val = advk_readl(pcie, PCIE_ISR0_REG);
+ mask = advk_readl(pcie, PCIE_ISR0_MASK_REG);
+ status = val & ((~mask) & PCIE_ISR0_ALL_MASK);
+
+ if (!status) {
+ advk_writel(pcie, val, PCIE_ISR0_REG);
+ return;
+ }
+
+ /* Process MSI interrupts */
+ if (status & PCIE_ISR0_MSI_INT_PENDING)
+ advk_pcie_handle_msi(pcie);
+
+ /* Process legacy interrupts */
+ for (i = 0; i < LEGACY_IRQ_NUM; i++) {
+ if (!(status & PCIE_ISR0_INTX_ASSERT(i)))
+ continue;
+
+ advk_writel(pcie, PCIE_ISR0_INTX_ASSERT(i),
+ PCIE_ISR0_REG);
+
+ virq = irq_find_mapping(pcie->irq_domain, i);
+ generic_handle_irq(virq);
+ }
+}
+
+static irqreturn_t advk_pcie_irq_handler(int irq, void *arg)
+{
+ struct advk_pcie *pcie = arg;
+ u32 status;
+
+ status = advk_readl(pcie, HOST_CTRL_INT_STATUS_REG);
+ if (!(status & PCIE_IRQ_CORE_INT))
+ return IRQ_NONE;
+
+ advk_pcie_handle_int(pcie);
+
+ /* Clear interrupt */
+ advk_writel(pcie, PCIE_IRQ_CORE_INT, HOST_CTRL_INT_STATUS_REG);
+
+ return IRQ_HANDLED;
+}
+
+static int advk_pcie_parse_request_of_pci_ranges(struct advk_pcie *pcie)
+{
+ int err, res_valid = 0;
+ struct device *dev = &pcie->pdev->dev;
+ struct device_node *np = dev->of_node;
+ struct resource_entry *win;
+ resource_size_t iobase;
+
+ INIT_LIST_HEAD(&pcie->resources);
+
+ err = of_pci_get_host_bridge_resources(np, 0, 0xff, &pcie->resources,
+ &iobase);
+ if (err)
+ return err;
+
+ err = devm_request_pci_bus_resources(dev, &pcie->resources);
+ if (err)
+ goto out_release_res;
+
+ resource_list_for_each_entry(win, &pcie->resources) {
+ struct resource *res = win->res;
+
+ switch (resource_type(res)) {
+ case IORESOURCE_IO:
+ advk_pcie_set_ob_win(pcie, 1,
+ upper_32_bits(res->start),
+ lower_32_bits(res->start),
+ 0, 0xF8000000, 0,
+ lower_32_bits(res->start),
+ OB_PCIE_IO);
+ err = pci_remap_iospace(res, iobase);
+ if (err)
+ dev_warn(dev, "error %d: failed to map resource %pR\n",
+ err, res);
+ break;
+ case IORESOURCE_MEM:
+ advk_pcie_set_ob_win(pcie, 0,
+ upper_32_bits(res->start),
+ lower_32_bits(res->start),
+ 0x0, 0xF8000000, 0,
+ lower_32_bits(res->start),
+ (2 << 20) | OB_PCIE_MEM);
+ res_valid |= !(res->flags & IORESOURCE_PREFETCH);
+ break;
+ case IORESOURCE_BUS:
+ pcie->root_bus_nr = res->start;
+ break;
+ }
+ }
+
+ if (!res_valid) {
+ dev_err(dev, "non-prefetchable memory resource required\n");
+ err = -EINVAL;
+ goto out_release_res;
+ }
+
+ return 0;
+
+out_release_res:
+ pci_free_resource_list(&pcie->resources);
+ return err;
+}
+
+static int advk_pcie_probe(struct platform_device *pdev)
+{
+ struct advk_pcie *pcie;
+ struct resource *res;
+ struct pci_bus *bus, *child;
+ struct msi_controller *msi;
+ struct device_node *msi_node;
+ int ret, irq;
+
+ pcie = devm_kzalloc(&pdev->dev, sizeof(struct advk_pcie),
+ GFP_KERNEL);
+ if (!pcie)
+ return -ENOMEM;
+
+ pcie->pdev = pdev;
+ platform_set_drvdata(pdev, pcie);
+
+ res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+ pcie->base = devm_ioremap_resource(&pdev->dev, res);
+ if (IS_ERR(pcie->base)) {
+ dev_err(&pdev->dev, "Failed to map registers\n");
+ return PTR_ERR(pcie->base);
+ }
+
+ irq = platform_get_irq(pdev, 0);
+ ret = devm_request_irq(&pdev->dev, irq, advk_pcie_irq_handler,
+ IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie",
+ pcie);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to register interrupt\n");
+ return ret;
+ }
+
+ ret = advk_pcie_parse_request_of_pci_ranges(pcie);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to parse resources\n");
+ return ret;
+ }
+
+ advk_pcie_setup_hw(pcie);
+
+ ret = advk_pcie_init_irq_domain(pcie);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to initialize irq\n");
+ return ret;
+ }
+
+ ret = advk_pcie_init_msi_irq_domain(pcie);
+ if (ret) {
+ dev_err(&pdev->dev, "Failed to initialize irq\n");
+ advk_pcie_remove_irq_domain(pcie);
+ return ret;
+ }
+
+ msi_node = of_parse_phandle(pdev->dev.of_node, "msi-parent", 0);
+ if (msi_node)
+ msi = of_pci_find_msi_chip_by_node(msi_node);
+ else
+ msi = NULL;
+
+ bus = pci_scan_root_bus_msi(&pdev->dev, 0, &advk_pcie_ops,
+ pcie, &pcie->resources, &pcie->msi);
+ if (!bus) {
+ advk_pcie_remove_msi_irq_domain(pcie);
+ advk_pcie_remove_irq_domain(pcie);
+ return -ENOMEM;
+ }
+
+ pci_bus_assign_resources(bus);
+
+ list_for_each_entry(child, &bus->children, node)
+ pcie_bus_configure_settings(child);
+
+ pci_bus_add_devices(bus);
+
+ return 0;
+}
+
+static const struct of_device_id advk_pcie_of_match_table[] = {
+ { .compatible = "marvell,armada-3700-pcie", },
+ {},
+};
+
+static struct platform_driver advk_pcie_driver = {
+ .driver = {
+ .name = "advk-pcie",
+ .of_match_table = advk_pcie_of_match_table,
+ /* Driver unloading/unbinding currently not supported */
+ .suppress_bind_attrs = true,
+ },
+ .probe = advk_pcie_probe,
+};
+builtin_platform_driver(advk_pcie_driver);
diff --git a/drivers/pci/host/pci-dra7xx.c b/drivers/pci/host/pci-dra7xx.c
index f441130407e7..81b3949a26db 100644
--- a/drivers/pci/host/pci-dra7xx.c
+++ b/drivers/pci/host/pci-dra7xx.c
@@ -181,14 +181,14 @@ static int dra7xx_pcie_init_irq_domain(struct pcie_port *pp)
if (!pcie_intc_node) {
dev_err(dev, "No PCIe Intc node found\n");
- return PTR_ERR(pcie_intc_node);
+ return -ENODEV;
}
pp->irq_domain = irq_domain_add_linear(pcie_intc_node, 4,
&intx_domain_ops, pp);
if (!pp->irq_domain) {
dev_err(dev, "Failed to get a INTx IRQ domain\n");
- return PTR_ERR(pp->irq_domain);
+ return -ENODEV;
}
return 0;
diff --git a/drivers/pci/host/pci-host-common.c b/drivers/pci/host/pci-host-common.c
index 8cba7ab73df9..9d9d34e959b6 100644
--- a/drivers/pci/host/pci-host-common.c
+++ b/drivers/pci/host/pci-host-common.c
@@ -20,10 +20,9 @@
#include <linux/module.h>
#include <linux/of_address.h>
#include <linux/of_pci.h>
+#include <linux/pci-ecam.h>
#include <linux/platform_device.h>
-#include "../ecam.h"
-
static int gen_pci_parse_request_of_pci_ranges(struct device *dev,
struct list_head *resources, struct resource **bus_range)
{
@@ -36,44 +35,34 @@ static int gen_pci_parse_request_of_pci_ranges(struct device *dev,
if (err)
return err;
+ err = devm_request_pci_bus_resources(dev, resources);
+ if (err)
+ return err;
+
resource_list_for_each_entry(win, resources) {
- struct resource *parent, *res = win->res;
+ struct resource *res = win->res;
switch (resource_type(res)) {
case IORESOURCE_IO:
- parent = &ioport_resource;
err = pci_remap_iospace(res, iobase);
- if (err) {
+ if (err)
dev_warn(dev, "error %d: failed to map resource %pR\n",
err, res);
- continue;
- }
break;
case IORESOURCE_MEM:
- parent = &iomem_resource;
res_valid |= !(res->flags & IORESOURCE_PREFETCH);
break;
case IORESOURCE_BUS:
*bus_range = res;
- default:
- continue;
+ break;
}
-
- err = devm_request_resource(dev, parent, res);
- if (err)
- goto out_release_res;
- }
-
- if (!res_valid) {
- dev_err(dev, "non-prefetchable memory resource required\n");
- err = -EINVAL;
- goto out_release_res;
}
- return 0;
+ if (res_valid)
+ return 0;
-out_release_res:
- return err;
+ dev_err(dev, "non-prefetchable memory resource required\n");
+ return -EINVAL;
}
static void gen_pci_unmap_cfg(void *ptr)
@@ -155,7 +144,14 @@ int pci_host_common_probe(struct platform_device *pdev,
pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci);
- if (!pci_has_flag(PCI_PROBE_ONLY)) {
+ /*
+ * We insert PCI resources into the iomem_resource and
+ * ioport_resource trees in either pci_bus_claim_resources()
+ * or pci_bus_assign_resources().
+ */
+ if (pci_has_flag(PCI_PROBE_ONLY)) {
+ pci_bus_claim_resources(bus);
+ } else {
pci_bus_size_bridges(bus);
pci_bus_assign_resources(bus);
diff --git a/drivers/pci/host/pci-host-generic.c b/drivers/pci/host/pci-host-generic.c
index 6eaceab1bf04..c05ea9d72f69 100644
--- a/drivers/pci/host/pci-host-generic.c
+++ b/drivers/pci/host/pci-host-generic.c
@@ -20,13 +20,12 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/of_address.h>
#include <linux/of_pci.h>
+#include <linux/pci-ecam.h>
#include <linux/platform_device.h>
-#include "../ecam.h"
-
static struct pci_ecam_ops gen_pci_cfg_cam_bus_ops = {
.bus_shift = 16,
.pci_ops = {
@@ -46,8 +45,6 @@ static const struct of_device_id gen_pci_of_match[] = {
{ },
};
-MODULE_DEVICE_TABLE(of, gen_pci_of_match);
-
static int gen_pci_probe(struct platform_device *pdev)
{
const struct of_device_id *of_id;
@@ -66,8 +63,4 @@ static struct platform_driver gen_pci_driver = {
},
.probe = gen_pci_probe,
};
-module_platform_driver(gen_pci_driver);
-
-MODULE_DESCRIPTION("Generic PCI host driver");
-MODULE_AUTHOR("Will Deacon <will.deacon@arm.com>");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(gen_pci_driver);
diff --git a/drivers/pci/host/pci-hyperv.c b/drivers/pci/host/pci-hyperv.c
index 7e9b2de2aa24..6955ffdb89f3 100644
--- a/drivers/pci/host/pci-hyperv.c
+++ b/drivers/pci/host/pci-hyperv.c
@@ -732,16 +732,18 @@ static void hv_msi_free(struct irq_domain *domain, struct msi_domain_info *info,
pdev = msi_desc_to_pci_dev(msi);
hbus = info->data;
- hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
- if (!hpdev)
+ int_desc = irq_data_get_irq_chip_data(irq_data);
+ if (!int_desc)
return;
- int_desc = irq_data_get_irq_chip_data(irq_data);
- if (int_desc) {
- irq_data->chip_data = NULL;
- hv_int_desc_free(hpdev, int_desc);
+ irq_data->chip_data = NULL;
+ hpdev = get_pcichild_wslot(hbus, devfn_to_wslot(pdev->devfn));
+ if (!hpdev) {
+ kfree(int_desc);
+ return;
}
+ hv_int_desc_free(hpdev, int_desc);
put_pcichild(hpdev, hv_pcidev_ref_by_slot);
}
@@ -1657,14 +1659,16 @@ static void hv_pci_onchannelcallback(void *context)
continue;
}
+ /* Zero length indicates there are no more packets. */
+ if (ret || !bytes_recvd)
+ break;
+
/*
* All incoming packets must be at least as large as a
* response.
*/
- if (bytes_recvd <= sizeof(struct pci_response)) {
- kfree(buffer);
- return;
- }
+ if (bytes_recvd <= sizeof(struct pci_response))
+ continue;
desc = (struct vmpacket_descriptor *)buffer;
switch (desc->type) {
@@ -1679,8 +1683,7 @@ static void hv_pci_onchannelcallback(void *context)
comp_packet->completion_func(comp_packet->compl_ctxt,
response,
bytes_recvd);
- kfree(buffer);
- return;
+ break;
case VM_PKT_DATA_INBAND:
@@ -1727,8 +1730,9 @@ static void hv_pci_onchannelcallback(void *context)
desc->type, req_id, bytes_recvd);
break;
}
- break;
}
+
+ kfree(buffer);
}
/**
diff --git a/drivers/pci/host/pci-keystone.c b/drivers/pci/host/pci-keystone.c
index 6b8301ef21ca..8ba28834d470 100644
--- a/drivers/pci/host/pci-keystone.c
+++ b/drivers/pci/host/pci-keystone.c
@@ -17,7 +17,7 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/irqdomain.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/msi.h>
#include <linux/of_irq.h>
#include <linux/of.h>
@@ -360,7 +360,6 @@ static const struct of_device_id ks_pcie_of_match[] = {
},
{ },
};
-MODULE_DEVICE_TABLE(of, ks_pcie_of_match);
static int __exit ks_pcie_remove(struct platform_device *pdev)
{
@@ -439,9 +438,4 @@ static struct platform_driver ks_pcie_driver __refdata = {
.of_match_table = of_match_ptr(ks_pcie_of_match),
},
};
-
-module_platform_driver(ks_pcie_driver);
-
-MODULE_AUTHOR("Murali Karicheri <m-karicheri2@ti.com>");
-MODULE_DESCRIPTION("Keystone PCIe host controller driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(ks_pcie_driver);
diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c
index a21e229d95e0..114ba819277a 100644
--- a/drivers/pci/host/pci-layerscape.c
+++ b/drivers/pci/host/pci-layerscape.c
@@ -12,7 +12,7 @@
#include <linux/kernel.h>
#include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/of_pci.h>
#include <linux/of_platform.h>
#include <linux/of_irq.h>
@@ -211,7 +211,6 @@ static const struct of_device_id ls_pcie_of_match[] = {
{ .compatible = "fsl,ls2085a-pcie", .data = &ls2080_drvdata },
{ },
};
-MODULE_DEVICE_TABLE(of, ls_pcie_of_match);
static int __init ls_add_pcie_port(struct pcie_port *pp,
struct platform_device *pdev)
@@ -275,9 +274,4 @@ static struct platform_driver ls_pcie_driver = {
.of_match_table = ls_pcie_of_match,
},
};
-
-module_platform_driver_probe(ls_pcie_driver, ls_pcie_probe);
-
-MODULE_AUTHOR("Minghuan Lian <Minghuan.Lian@freescale.com>");
-MODULE_DESCRIPTION("Freescale Layerscape PCIe host controller driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver_probe(ls_pcie_driver, ls_pcie_probe);
diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c
index 6b451df6502c..307f81d6b479 100644
--- a/drivers/pci/host/pci-mvebu.c
+++ b/drivers/pci/host/pci-mvebu.c
@@ -1,6 +1,8 @@
/*
* PCIe driver for Marvell Armada 370 and Armada XP SoCs
*
+ * Author: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+ *
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
* warranty of any kind, whether express or implied.
@@ -11,7 +13,7 @@
#include <linux/clk.h>
#include <linux/delay.h>
#include <linux/gpio.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/mbus.h>
#include <linux/msi.h>
#include <linux/slab.h>
@@ -839,25 +841,22 @@ static struct pci_ops mvebu_pcie_ops = {
static int mvebu_pcie_setup(int nr, struct pci_sys_data *sys)
{
struct mvebu_pcie *pcie = sys_to_pcie(sys);
- int i;
+ int err, i;
pcie->mem.name = "PCI MEM";
pcie->realio.name = "PCI I/O";
- if (request_resource(&iomem_resource, &pcie->mem))
- return 0;
-
- if (resource_size(&pcie->realio) != 0) {
- if (request_resource(&ioport_resource, &pcie->realio)) {
- release_resource(&pcie->mem);
- return 0;
- }
+ if (resource_size(&pcie->realio) != 0)
pci_add_resource_offset(&sys->resources, &pcie->realio,
sys->io_offset);
- }
+
pci_add_resource_offset(&sys->resources, &pcie->mem, sys->mem_offset);
pci_add_resource(&sys->resources, &pcie->busn);
+ err = devm_request_pci_bus_resources(&pcie->pdev->dev, &sys->resources);
+ if (err)
+ return 0;
+
for (i = 0; i < pcie->nports; i++) {
struct mvebu_pcie_port *port = &pcie->ports[i];
@@ -1298,7 +1297,6 @@ static const struct of_device_id mvebu_pcie_of_match_table[] = {
{ .compatible = "marvell,kirkwood-pcie", },
{},
};
-MODULE_DEVICE_TABLE(of, mvebu_pcie_of_match_table);
static const struct dev_pm_ops mvebu_pcie_pm_ops = {
SET_NOIRQ_SYSTEM_SLEEP_PM_OPS(mvebu_pcie_suspend, mvebu_pcie_resume)
@@ -1314,8 +1312,4 @@ static struct platform_driver mvebu_pcie_driver = {
},
.probe = mvebu_pcie_probe,
};
-module_platform_driver(mvebu_pcie_driver);
-
-MODULE_AUTHOR("Thomas Petazzoni <thomas.petazzoni@free-electrons.com>");
-MODULE_DESCRIPTION("Marvell EBU PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(mvebu_pcie_driver);
diff --git a/drivers/pci/host/pci-rcar-gen2.c b/drivers/pci/host/pci-rcar-gen2.c
index 9980a4bdae7e..597566f96f5e 100644
--- a/drivers/pci/host/pci-rcar-gen2.c
+++ b/drivers/pci/host/pci-rcar-gen2.c
@@ -4,6 +4,8 @@
* Copyright (C) 2013 Renesas Solutions Corp.
* Copyright (C) 2013 Cogent Embedded, Inc.
*
+ * Author: Valentine Barshak <valentine.barshak@cogentembedded.com>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
@@ -14,7 +16,6 @@
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kernel.h>
-#include <linux/module.h>
#include <linux/of_address.h>
#include <linux/of_pci.h>
#include <linux/pci.h>
@@ -97,7 +98,6 @@
struct rcar_pci_priv {
struct device *dev;
void __iomem *reg;
- struct resource io_res;
struct resource mem_res;
struct resource *cfg_res;
unsigned busnr;
@@ -194,6 +194,7 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
struct rcar_pci_priv *priv = sys->private_data;
void __iomem *reg = priv->reg;
u32 val;
+ int ret;
pm_runtime_enable(priv->dev);
pm_runtime_get_sync(priv->dev);
@@ -273,8 +274,10 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
rcar_pci_setup_errirq(priv);
/* Add PCI resources */
- pci_add_resource(&sys->resources, &priv->io_res);
pci_add_resource(&sys->resources, &priv->mem_res);
+ ret = devm_request_pci_bus_resources(priv->dev, &sys->resources);
+ if (ret < 0)
+ return ret;
/* Setup bus number based on platform device id / of bus-range */
sys->busnr = priv->busnr;
@@ -371,14 +374,6 @@ static int rcar_pci_probe(struct platform_device *pdev)
return -ENOMEM;
priv->mem_res = *mem_res;
- /*
- * The controller does not support/use port I/O,
- * so setup a dummy port I/O region here.
- */
- priv->io_res.start = priv->mem_res.start;
- priv->io_res.end = priv->mem_res.end;
- priv->io_res.flags = IORESOURCE_IO;
-
priv->cfg_res = cfg_res;
priv->irq = platform_get_irq(pdev, 0);
@@ -421,6 +416,7 @@ static int rcar_pci_probe(struct platform_device *pdev)
hw_private[0] = priv;
memset(&hw, 0, sizeof(hw));
hw.nr_controllers = ARRAY_SIZE(hw_private);
+ hw.io_optional = 1;
hw.private_data = hw_private;
hw.map_irq = rcar_pci_map_irq;
hw.ops = &rcar_pci_ops;
@@ -437,8 +433,6 @@ static struct of_device_id rcar_pci_of_match[] = {
{ },
};
-MODULE_DEVICE_TABLE(of, rcar_pci_of_match);
-
static struct platform_driver rcar_pci_driver = {
.driver = {
.name = "pci-rcar-gen2",
@@ -447,9 +441,4 @@ static struct platform_driver rcar_pci_driver = {
},
.probe = rcar_pci_probe,
};
-
-module_platform_driver(rcar_pci_driver);
-
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Renesas R-Car Gen2 internal PCI");
-MODULE_AUTHOR("Valentine Barshak <valentine.barshak@cogentembedded.com>");
+builtin_platform_driver(rcar_pci_driver);
diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c
index c388468c202a..6de0757b11e4 100644
--- a/drivers/pci/host/pci-tegra.c
+++ b/drivers/pci/host/pci-tegra.c
@@ -9,6 +9,8 @@
*
* Bits taken from arch/arm/mach-dove/pcie.c
*
+ * Author: Thierry Reding <treding@nvidia.com>
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
@@ -32,7 +34,7 @@
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/msi.h>
#include <linux/of_address.h>
#include <linux/of_pci.h>
@@ -183,26 +185,26 @@
#define AFI_PEXBIAS_CTRL_0 0x168
-#define RP_VEND_XP 0x00000F00
+#define RP_VEND_XP 0x00000f00
#define RP_VEND_XP_DL_UP (1 << 30)
-#define RP_PRIV_MISC 0x00000FE0
-#define RP_PRIV_MISC_PRSNT_MAP_EP_PRSNT (0xE << 0)
-#define RP_PRIV_MISC_PRSNT_MAP_EP_ABSNT (0xF << 0)
+#define RP_PRIV_MISC 0x00000fe0
+#define RP_PRIV_MISC_PRSNT_MAP_EP_PRSNT (0xe << 0)
+#define RP_PRIV_MISC_PRSNT_MAP_EP_ABSNT (0xf << 0)
#define RP_LINK_CONTROL_STATUS 0x00000090
#define RP_LINK_CONTROL_STATUS_DL_LINK_ACTIVE 0x20000000
#define RP_LINK_CONTROL_STATUS_LINKSTAT_MASK 0x3fff0000
-#define PADS_CTL_SEL 0x0000009C
+#define PADS_CTL_SEL 0x0000009c
-#define PADS_CTL 0x000000A0
+#define PADS_CTL 0x000000a0
#define PADS_CTL_IDDQ_1L (1 << 0)
#define PADS_CTL_TX_DATA_EN_1L (1 << 6)
#define PADS_CTL_RX_DATA_EN_1L (1 << 10)
-#define PADS_PLL_CTL_TEGRA20 0x000000B8
-#define PADS_PLL_CTL_TEGRA30 0x000000B4
+#define PADS_PLL_CTL_TEGRA20 0x000000b8
+#define PADS_PLL_CTL_TEGRA30 0x000000b4
#define PADS_PLL_CTL_RST_B4SM (1 << 1)
#define PADS_PLL_CTL_LOCKDET (1 << 8)
#define PADS_PLL_CTL_REFCLK_MASK (0x3 << 16)
@@ -214,9 +216,9 @@
#define PADS_PLL_CTL_TXCLKREF_DIV5 (1 << 20)
#define PADS_PLL_CTL_TXCLKREF_BUF_EN (1 << 22)
-#define PADS_REFCLK_CFG0 0x000000C8
-#define PADS_REFCLK_CFG1 0x000000CC
-#define PADS_REFCLK_BIAS 0x000000D0
+#define PADS_REFCLK_CFG0 0x000000c8
+#define PADS_REFCLK_CFG1 0x000000cc
+#define PADS_REFCLK_BIAS 0x000000d0
/*
* Fields in PADS_REFCLK_CFG*. Those registers form an array of 16-bit
@@ -228,15 +230,6 @@
#define PADS_REFCLK_CFG_PREDI_SHIFT 8 /* 11:8 */
#define PADS_REFCLK_CFG_DRVI_SHIFT 12 /* 15:12 */
-/* Default value provided by HW engineering is 0xfa5c */
-#define PADS_REFCLK_CFG_VALUE \
- ( \
- (0x17 << PADS_REFCLK_CFG_TERM_SHIFT) | \
- (0 << PADS_REFCLK_CFG_E_TERM_SHIFT) | \
- (0xa << PADS_REFCLK_CFG_PREDI_SHIFT) | \
- (0xf << PADS_REFCLK_CFG_DRVI_SHIFT) \
- )
-
struct tegra_msi {
struct msi_controller chip;
DECLARE_BITMAP(used, INT_PCI_MSI_NR);
@@ -252,6 +245,8 @@ struct tegra_pcie_soc_data {
unsigned int msi_base_shift;
u32 pads_pll_ctl;
u32 tx_ref_sel;
+ u32 pads_refclk_cfg0;
+ u32 pads_refclk_cfg1;
bool has_pex_clkreq_en;
bool has_pex_bias_ctrl;
bool has_intr_prsnt_sense;
@@ -274,7 +269,6 @@ struct tegra_pcie {
struct list_head buses;
struct resource *cs;
- struct resource all;
struct resource io;
struct resource pio;
struct resource mem;
@@ -623,30 +617,21 @@ static int tegra_pcie_setup(int nr, struct pci_sys_data *sys)
sys->mem_offset = pcie->offset.mem;
sys->io_offset = pcie->offset.io;
- err = devm_request_resource(pcie->dev, &pcie->all, &pcie->io);
- if (err < 0)
- return err;
-
- err = devm_request_resource(pcie->dev, &ioport_resource, &pcie->pio);
- if (err < 0)
- return err;
-
- err = devm_request_resource(pcie->dev, &pcie->all, &pcie->mem);
+ err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->io);
if (err < 0)
return err;
- err = devm_request_resource(pcie->dev, &pcie->all, &pcie->prefetch);
- if (err)
- return err;
-
pci_add_resource_offset(&sys->resources, &pcie->pio, sys->io_offset);
pci_add_resource_offset(&sys->resources, &pcie->mem, sys->mem_offset);
pci_add_resource_offset(&sys->resources, &pcie->prefetch,
sys->mem_offset);
pci_add_resource(&sys->resources, &pcie->busn);
- pci_ioremap_io(pcie->pio.start, pcie->io.start);
+ err = devm_request_pci_bus_resources(pcie->dev, &sys->resources);
+ if (err < 0)
+ return err;
+ pci_remap_iospace(&pcie->pio, pcie->io.start);
return 1;
}
@@ -838,12 +823,6 @@ static int tegra_pcie_phy_enable(struct tegra_pcie *pcie)
value |= PADS_PLL_CTL_RST_B4SM;
pads_writel(pcie, value, soc->pads_pll_ctl);
- /* Configure the reference clock driver */
- value = PADS_REFCLK_CFG_VALUE | (PADS_REFCLK_CFG_VALUE << 16);
- pads_writel(pcie, value, PADS_REFCLK_CFG0);
- if (soc->num_ports > 2)
- pads_writel(pcie, PADS_REFCLK_CFG_VALUE, PADS_REFCLK_CFG1);
-
/* wait for the PLL to lock */
err = tegra_pcie_pll_wait(pcie, 500);
if (err < 0) {
@@ -927,6 +906,7 @@ static int tegra_pcie_port_phy_power_off(struct tegra_pcie_port *port)
static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
{
+ const struct tegra_pcie_soc_data *soc = pcie->soc_data;
struct tegra_pcie_port *port;
int err;
@@ -952,6 +932,12 @@ static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
}
}
+ /* Configure the reference clock driver */
+ pads_writel(pcie, soc->pads_refclk_cfg0, PADS_REFCLK_CFG0);
+
+ if (soc->num_ports > 2)
+ pads_writel(pcie, soc->pads_refclk_cfg1, PADS_REFCLK_CFG1);
+
return 0;
}
@@ -1822,12 +1808,6 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
struct resource res;
int err;
- memset(&pcie->all, 0, sizeof(pcie->all));
- pcie->all.flags = IORESOURCE_MEM;
- pcie->all.name = np->full_name;
- pcie->all.start = ~0;
- pcie->all.end = 0;
-
if (of_pci_range_parser_init(&parser, np)) {
dev_err(pcie->dev, "missing \"ranges\" property\n");
return -EINVAL;
@@ -1880,18 +1860,8 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
}
break;
}
-
- if (res.start <= pcie->all.start)
- pcie->all.start = res.start;
-
- if (res.end >= pcie->all.end)
- pcie->all.end = res.end;
}
- err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->all);
- if (err < 0)
- return err;
-
err = of_pci_parse_bus_range(np, &pcie->busn);
if (err < 0) {
dev_err(pcie->dev, "failed to parse ranges property: %d\n",
@@ -2078,6 +2048,7 @@ static const struct tegra_pcie_soc_data tegra20_pcie_data = {
.msi_base_shift = 0,
.pads_pll_ctl = PADS_PLL_CTL_TEGRA20,
.tx_ref_sel = PADS_PLL_CTL_TXCLKREF_DIV10,
+ .pads_refclk_cfg0 = 0xfa5cfa5c,
.has_pex_clkreq_en = false,
.has_pex_bias_ctrl = false,
.has_intr_prsnt_sense = false,
@@ -2090,6 +2061,8 @@ static const struct tegra_pcie_soc_data tegra30_pcie_data = {
.msi_base_shift = 8,
.pads_pll_ctl = PADS_PLL_CTL_TEGRA30,
.tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN,
+ .pads_refclk_cfg0 = 0xfa5cfa5c,
+ .pads_refclk_cfg1 = 0xfa5cfa5c,
.has_pex_clkreq_en = true,
.has_pex_bias_ctrl = true,
.has_intr_prsnt_sense = true,
@@ -2102,6 +2075,7 @@ static const struct tegra_pcie_soc_data tegra124_pcie_data = {
.msi_base_shift = 8,
.pads_pll_ctl = PADS_PLL_CTL_TEGRA30,
.tx_ref_sel = PADS_PLL_CTL_TXCLKREF_BUF_EN,
+ .pads_refclk_cfg0 = 0x44ac44ac,
.has_pex_clkreq_en = true,
.has_pex_bias_ctrl = true,
.has_intr_prsnt_sense = true,
@@ -2115,7 +2089,6 @@ static const struct of_device_id tegra_pcie_of_match[] = {
{ .compatible = "nvidia,tegra20-pcie", .data = &tegra20_pcie_data },
{ },
};
-MODULE_DEVICE_TABLE(of, tegra_pcie_of_match);
static void *tegra_pcie_ports_seq_start(struct seq_file *s, loff_t *pos)
{
@@ -2249,8 +2222,6 @@ static int tegra_pcie_probe(struct platform_device *pdev)
if (err < 0)
return err;
- pcibios_min_mem = 0;
-
err = tegra_pcie_get_resources(pcie);
if (err < 0) {
dev_err(&pdev->dev, "failed to request resources: %d\n", err);
@@ -2306,8 +2277,4 @@ static struct platform_driver tegra_pcie_driver = {
},
.probe = tegra_pcie_probe,
};
-module_platform_driver(tegra_pcie_driver);
-
-MODULE_AUTHOR("Thierry Reding <treding@nvidia.com>");
-MODULE_DESCRIPTION("NVIDIA Tegra PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(tegra_pcie_driver);
diff --git a/drivers/pci/host/pci-thunder-ecam.c b/drivers/pci/host/pci-thunder-ecam.c
index 540d030613eb..d50a3dc2d8db 100644
--- a/drivers/pci/host/pci-thunder-ecam.c
+++ b/drivers/pci/host/pci-thunder-ecam.c
@@ -7,14 +7,13 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/of_pci.h>
#include <linux/of.h>
+#include <linux/pci-ecam.h>
#include <linux/platform_device.h>
-#include "../ecam.h"
-
static void set_val(u32 v, int where, int size, u32 *val)
{
int shift = (where & 3) * 8;
@@ -360,7 +359,6 @@ static const struct of_device_id thunder_ecam_of_match[] = {
{ .compatible = "cavium,pci-host-thunder-ecam" },
{ },
};
-MODULE_DEVICE_TABLE(of, thunder_ecam_of_match);
static int thunder_ecam_probe(struct platform_device *pdev)
{
@@ -374,7 +372,4 @@ static struct platform_driver thunder_ecam_driver = {
},
.probe = thunder_ecam_probe,
};
-module_platform_driver(thunder_ecam_driver);
-
-MODULE_DESCRIPTION("Thunder ECAM PCI host driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(thunder_ecam_driver);
diff --git a/drivers/pci/host/pci-thunder-pem.c b/drivers/pci/host/pci-thunder-pem.c
index 9b8ab94f3c8c..6abaf80ffb39 100644
--- a/drivers/pci/host/pci-thunder-pem.c
+++ b/drivers/pci/host/pci-thunder-pem.c
@@ -15,13 +15,12 @@
*/
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/of_address.h>
#include <linux/of_pci.h>
+#include <linux/pci-ecam.h>
#include <linux/platform_device.h>
-#include "../ecam.h"
-
#define PEM_CFG_WR 0x28
#define PEM_CFG_RD 0x30
@@ -285,8 +284,9 @@ static int thunder_pem_config_write(struct pci_bus *bus, unsigned int devfn,
return pci_generic_config_write(bus, devfn, where, size, val);
}
-static int thunder_pem_init(struct device *dev, struct pci_config_window *cfg)
+static int thunder_pem_init(struct pci_config_window *cfg)
{
+ struct device *dev = cfg->parent;
resource_size_t bar4_start;
struct resource *res_pem;
struct thunder_pem_pci *pem_pci;
@@ -346,7 +346,6 @@ static const struct of_device_id thunder_pem_of_match[] = {
{ .compatible = "cavium,pci-host-thunder-pem" },
{ },
};
-MODULE_DEVICE_TABLE(of, thunder_pem_of_match);
static int thunder_pem_probe(struct platform_device *pdev)
{
@@ -360,7 +359,4 @@ static struct platform_driver thunder_pem_driver = {
},
.probe = thunder_pem_probe,
};
-module_platform_driver(thunder_pem_driver);
-
-MODULE_DESCRIPTION("Thunder PEM PCIe host driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(thunder_pem_driver);
diff --git a/drivers/pci/host/pci-versatile.c b/drivers/pci/host/pci-versatile.c
index f843a72dc51c..f234405770ab 100644
--- a/drivers/pci/host/pci-versatile.c
+++ b/drivers/pci/host/pci-versatile.c
@@ -80,21 +80,21 @@ static int versatile_pci_parse_request_of_pci_ranges(struct device *dev,
if (err)
return err;
+ err = devm_request_pci_bus_resources(dev, res);
+ if (err)
+ goto out_release_res;
+
resource_list_for_each_entry(win, res) {
- struct resource *parent, *res = win->res;
+ struct resource *res = win->res;
switch (resource_type(res)) {
case IORESOURCE_IO:
- parent = &ioport_resource;
err = pci_remap_iospace(res, iobase);
- if (err) {
+ if (err)
dev_warn(dev, "error %d: failed to map resource %pR\n",
err, res);
- continue;
- }
break;
case IORESOURCE_MEM:
- parent = &iomem_resource;
res_valid |= !(res->flags & IORESOURCE_PREFETCH);
writel(res->start >> 28, PCI_IMAP(mem));
@@ -102,23 +102,14 @@ static int versatile_pci_parse_request_of_pci_ranges(struct device *dev,
mem++;
break;
- case IORESOURCE_BUS:
- default:
- continue;
}
-
- err = devm_request_resource(dev, parent, res);
- if (err)
- goto out_release_res;
}
- if (!res_valid) {
- dev_err(dev, "non-prefetchable memory resource required\n");
- err = -EINVAL;
- goto out_release_res;
- }
+ if (res_valid)
+ return 0;
- return 0;
+ dev_err(dev, "non-prefetchable memory resource required\n");
+ err = -EINVAL;
out_release_res:
pci_free_resource_list(res);
diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c
index ae00ce22d5a6..a81273c23341 100644
--- a/drivers/pci/host/pci-xgene.c
+++ b/drivers/pci/host/pci-xgene.c
@@ -21,7 +21,7 @@
#include <linux/io.h>
#include <linux/jiffies.h>
#include <linux/memblock.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/of.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
@@ -540,14 +540,20 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
if (ret)
return ret;
+ ret = devm_request_pci_bus_resources(&pdev->dev, &res);
+ if (ret)
+ goto error;
+
ret = xgene_pcie_setup(port, &res, iobase);
if (ret)
- return ret;
+ goto error;
bus = pci_create_root_bus(&pdev->dev, 0,
&xgene_pcie_ops, port, &res);
- if (!bus)
- return -ENOMEM;
+ if (!bus) {
+ ret = -ENOMEM;
+ goto error;
+ }
pci_scan_child_bus(bus);
pci_assign_unassigned_bus_resources(bus);
@@ -555,6 +561,10 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
platform_set_drvdata(pdev, port);
return 0;
+
+error:
+ pci_free_resource_list(&res);
+ return ret;
}
static const struct of_device_id xgene_pcie_match_table[] = {
@@ -569,8 +579,4 @@ static struct platform_driver xgene_pcie_driver = {
},
.probe = xgene_pcie_probe_bridge,
};
-module_platform_driver(xgene_pcie_driver);
-
-MODULE_AUTHOR("Tanmay Inamdar <tinamdar@apm.com>");
-MODULE_DESCRIPTION("APM X-Gene PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(xgene_pcie_driver);
diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c
index dbac6fb3f0bd..2b7837650db8 100644
--- a/drivers/pci/host/pcie-altera.c
+++ b/drivers/pci/host/pcie-altera.c
@@ -61,6 +61,8 @@
#define TLP_LOOP 500
#define RP_DEVFN 0
+#define LINK_UP_TIMEOUT 5000
+
#define INTX_NUM 4
#define DWORD_MASK 3
@@ -81,9 +83,30 @@ struct tlp_rp_regpair_t {
u32 reg1;
};
+static inline void cra_writel(struct altera_pcie *pcie, const u32 value,
+ const u32 reg)
+{
+ writel_relaxed(value, pcie->cra_base + reg);
+}
+
+static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg)
+{
+ return readl_relaxed(pcie->cra_base + reg);
+}
+
+static bool altera_pcie_link_is_up(struct altera_pcie *pcie)
+{
+ return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0);
+}
+
static void altera_pcie_retrain(struct pci_dev *dev)
{
u16 linkcap, linkstat;
+ struct altera_pcie *pcie = dev->bus->sysdata;
+ int timeout = 0;
+
+ if (!altera_pcie_link_is_up(pcie))
+ return;
/*
* Set the retrain bit if the PCIe rootport support > 2.5GB/s, but
@@ -95,9 +118,16 @@ static void altera_pcie_retrain(struct pci_dev *dev)
return;
pcie_capability_read_word(dev, PCI_EXP_LNKSTA, &linkstat);
- if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB)
+ if ((linkstat & PCI_EXP_LNKSTA_CLS) == PCI_EXP_LNKSTA_CLS_2_5GB) {
pcie_capability_set_word(dev, PCI_EXP_LNKCTL,
PCI_EXP_LNKCTL_RL);
+ while (!altera_pcie_link_is_up(pcie)) {
+ timeout++;
+ if (timeout > LINK_UP_TIMEOUT)
+ break;
+ udelay(5);
+ }
+ }
}
DECLARE_PCI_FIXUP_EARLY(0x1172, PCI_ANY_ID, altera_pcie_retrain);
@@ -120,17 +150,6 @@ static bool altera_pcie_hide_rc_bar(struct pci_bus *bus, unsigned int devfn,
return false;
}
-static inline void cra_writel(struct altera_pcie *pcie, const u32 value,
- const u32 reg)
-{
- writel_relaxed(value, pcie->cra_base + reg);
-}
-
-static inline u32 cra_readl(struct altera_pcie *pcie, const u32 reg)
-{
- return readl_relaxed(pcie->cra_base + reg);
-}
-
static void tlp_write_tx(struct altera_pcie *pcie,
struct tlp_rp_regpair_t *tlp_rp_regdata)
{
@@ -139,11 +158,6 @@ static void tlp_write_tx(struct altera_pcie *pcie,
cra_writel(pcie, tlp_rp_regdata->ctrl, RP_TX_CNTRL);
}
-static bool altera_pcie_link_is_up(struct altera_pcie *pcie)
-{
- return !!((cra_readl(pcie, RP_LTSSM) & RP_LTSSM_MASK) == LTSSM_L0);
-}
-
static bool altera_pcie_valid_config(struct altera_pcie *pcie,
struct pci_bus *bus, int dev)
{
@@ -415,11 +429,6 @@ static void altera_pcie_isr(struct irq_desc *desc)
chained_irq_exit(chip, desc);
}
-static void altera_pcie_release_of_pci_ranges(struct altera_pcie *pcie)
-{
- pci_free_resource_list(&pcie->resources);
-}
-
static int altera_pcie_parse_request_of_pci_ranges(struct altera_pcie *pcie)
{
int err, res_valid = 0;
@@ -432,33 +441,25 @@ static int altera_pcie_parse_request_of_pci_ranges(struct altera_pcie *pcie)
if (err)
return err;
+ err = devm_request_pci_bus_resources(dev, &pcie->resources);
+ if (err)
+ goto out_release_res;
+
resource_list_for_each_entry(win, &pcie->resources) {
- struct resource *parent, *res = win->res;
+ struct resource *res = win->res;
- switch (resource_type(res)) {
- case IORESOURCE_MEM:
- parent = &iomem_resource;
+ if (resource_type(res) == IORESOURCE_MEM)
res_valid |= !(res->flags & IORESOURCE_PREFETCH);
- break;
- default:
- continue;
- }
-
- err = devm_request_resource(dev, parent, res);
- if (err)
- goto out_release_res;
}
- if (!res_valid) {
- dev_err(dev, "non-prefetchable memory resource required\n");
- err = -EINVAL;
- goto out_release_res;
- }
+ if (res_valid)
+ return 0;
- return 0;
+ dev_err(dev, "non-prefetchable memory resource required\n");
+ err = -EINVAL;
out_release_res:
- altera_pcie_release_of_pci_ranges(pcie);
+ pci_free_resource_list(&pcie->resources);
return err;
}
diff --git a/drivers/pci/host/pcie-armada8k.c b/drivers/pci/host/pcie-armada8k.c
index 55723567b5d4..0f4f570068e3 100644
--- a/drivers/pci/host/pcie-armada8k.c
+++ b/drivers/pci/host/pcie-armada8k.c
@@ -5,6 +5,9 @@
*
* Copyright (C) 2016 Marvell Technology Group Ltd.
*
+ * Author: Yehuda Yitshak <yehuday@marvell.com>
+ * Author: Shadi Ammouri <shadi@marvell.com>
+ *
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
* warranty of any kind, whether express or implied.
@@ -14,7 +17,7 @@
#include <linux/delay.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/of.h>
#include <linux/pci.h>
#include <linux/phy/phy.h>
@@ -244,7 +247,6 @@ static const struct of_device_id armada8k_pcie_of_match[] = {
{ .compatible = "marvell,armada8k-pcie", },
{},
};
-MODULE_DEVICE_TABLE(of, armada8k_pcie_of_match);
static struct platform_driver armada8k_pcie_driver = {
.probe = armada8k_pcie_probe,
@@ -253,10 +255,4 @@ static struct platform_driver armada8k_pcie_driver = {
.of_match_table = of_match_ptr(armada8k_pcie_of_match),
},
};
-
-module_platform_driver(armada8k_pcie_driver);
-
-MODULE_DESCRIPTION("Armada 8k PCIe host controller driver");
-MODULE_AUTHOR("Yehuda Yitshak <yehuday@marvell.com>");
-MODULE_AUTHOR("Shadi Ammouri <shadi@marvell.com>");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(armada8k_pcie_driver);
diff --git a/drivers/pci/host/pcie-artpec6.c b/drivers/pci/host/pcie-artpec6.c
new file mode 100644
index 000000000000..16ba70b7ec65
--- /dev/null
+++ b/drivers/pci/host/pcie-artpec6.c
@@ -0,0 +1,280 @@
+/*
+ * PCIe host controller driver for Axis ARTPEC-6 SoC
+ *
+ * Author: Niklas Cassel <niklas.cassel@axis.com>
+ *
+ * Based on work done by Phil Edworthy <phil@edworthys.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/pci.h>
+#include <linux/platform_device.h>
+#include <linux/resource.h>
+#include <linux/signal.h>
+#include <linux/types.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
+
+#include "pcie-designware.h"
+
+#define to_artpec6_pcie(x) container_of(x, struct artpec6_pcie, pp)
+
+struct artpec6_pcie {
+ struct pcie_port pp;
+ struct regmap *regmap;
+ void __iomem *phy_base;
+};
+
+/* PCIe Port Logic registers (memory-mapped) */
+#define PL_OFFSET 0x700
+#define PCIE_PHY_DEBUG_R0 (PL_OFFSET + 0x28)
+#define PCIE_PHY_DEBUG_R1 (PL_OFFSET + 0x2c)
+
+#define MISC_CONTROL_1_OFF (PL_OFFSET + 0x1bc)
+#define DBI_RO_WR_EN 1
+
+/* ARTPEC-6 specific registers */
+#define PCIECFG 0x18
+#define PCIECFG_DBG_OEN (1 << 24)
+#define PCIECFG_CORE_RESET_REQ (1 << 21)
+#define PCIECFG_LTSSM_ENABLE (1 << 20)
+#define PCIECFG_CLKREQ_B (1 << 11)
+#define PCIECFG_REFCLK_ENABLE (1 << 10)
+#define PCIECFG_PLL_ENABLE (1 << 9)
+#define PCIECFG_PCLK_ENABLE (1 << 8)
+#define PCIECFG_RISRCREN (1 << 4)
+#define PCIECFG_MODE_TX_DRV_EN (1 << 3)
+#define PCIECFG_CISRREN (1 << 2)
+#define PCIECFG_MACRO_ENABLE (1 << 0)
+
+#define NOCCFG 0x40
+#define NOCCFG_ENABLE_CLK_PCIE (1 << 4)
+#define NOCCFG_POWER_PCIE_IDLEACK (1 << 3)
+#define NOCCFG_POWER_PCIE_IDLE (1 << 2)
+#define NOCCFG_POWER_PCIE_IDLEREQ (1 << 1)
+
+#define PHY_STATUS 0x118
+#define PHY_COSPLLLOCK (1 << 0)
+
+#define ARTPEC6_CPU_TO_BUS_ADDR 0x0fffffff
+
+static int artpec6_pcie_establish_link(struct pcie_port *pp)
+{
+ struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp);
+ u32 val;
+ unsigned int retries;
+
+ /* Hold DW core in reset */
+ regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+ val |= PCIECFG_CORE_RESET_REQ;
+ regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+
+ regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+ val |= PCIECFG_RISRCREN | /* Receiver term. 50 Ohm */
+ PCIECFG_MODE_TX_DRV_EN |
+ PCIECFG_CISRREN | /* Reference clock term. 100 Ohm */
+ PCIECFG_MACRO_ENABLE;
+ val |= PCIECFG_REFCLK_ENABLE;
+ val &= ~PCIECFG_DBG_OEN;
+ val &= ~PCIECFG_CLKREQ_B;
+ regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+ usleep_range(5000, 6000);
+
+ regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+ val |= NOCCFG_ENABLE_CLK_PCIE;
+ regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+ usleep_range(20, 30);
+
+ regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+ val |= PCIECFG_PCLK_ENABLE | PCIECFG_PLL_ENABLE;
+ regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+ usleep_range(6000, 7000);
+
+ regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+ val &= ~NOCCFG_POWER_PCIE_IDLEREQ;
+ regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+
+ retries = 50;
+ do {
+ usleep_range(1000, 2000);
+ regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+ retries--;
+ } while (retries &&
+ (val & (NOCCFG_POWER_PCIE_IDLEACK | NOCCFG_POWER_PCIE_IDLE)));
+
+ retries = 50;
+ do {
+ usleep_range(1000, 2000);
+ val = readl(artpec6_pcie->phy_base + PHY_STATUS);
+ retries--;
+ } while (retries && !(val & PHY_COSPLLLOCK));
+
+ /* Take DW core out of reset */
+ regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+ val &= ~PCIECFG_CORE_RESET_REQ;
+ regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+ usleep_range(100, 200);
+
+ /*
+ * Enable writing to config regs. This is required as the Synopsys
+ * driver changes the class code. That register needs DBI write enable.
+ */
+ writel(DBI_RO_WR_EN, pp->dbi_base + MISC_CONTROL_1_OFF);
+
+ pp->io_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+ pp->mem_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+ pp->cfg0_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+ pp->cfg1_base &= ARTPEC6_CPU_TO_BUS_ADDR;
+
+ /* setup root complex */
+ dw_pcie_setup_rc(pp);
+
+ /* assert LTSSM enable */
+ regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+ val |= PCIECFG_LTSSM_ENABLE;
+ regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+
+ /* check if the link is up or not */
+ if (!dw_pcie_wait_for_link(pp))
+ return 0;
+
+ dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
+ readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
+ readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
+
+ return -ETIMEDOUT;
+}
+
+static void artpec6_pcie_enable_interrupts(struct pcie_port *pp)
+{
+ if (IS_ENABLED(CONFIG_PCI_MSI))
+ dw_pcie_msi_init(pp);
+}
+
+static void artpec6_pcie_host_init(struct pcie_port *pp)
+{
+ artpec6_pcie_establish_link(pp);
+ artpec6_pcie_enable_interrupts(pp);
+}
+
+static int artpec6_pcie_link_up(struct pcie_port *pp)
+{
+ u32 rc;
+
+ /*
+ * Get status from Synopsys IP
+ * link is debug bit 36, debug register 1 starts at bit 32
+ */
+ rc = readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) & (0x1 << (36 - 32));
+ if (rc)
+ return 1;
+
+ return 0;
+}
+
+static struct pcie_host_ops artpec6_pcie_host_ops = {
+ .link_up = artpec6_pcie_link_up,
+ .host_init = artpec6_pcie_host_init,
+};
+
+static irqreturn_t artpec6_pcie_msi_handler(int irq, void *arg)
+{
+ struct pcie_port *pp = arg;
+
+ return dw_handle_msi_irq(pp);
+}
+
+static int __init artpec6_add_pcie_port(struct pcie_port *pp,
+ struct platform_device *pdev)
+{
+ int ret;
+
+ if (IS_ENABLED(CONFIG_PCI_MSI)) {
+ pp->msi_irq = platform_get_irq_byname(pdev, "msi");
+ if (pp->msi_irq <= 0) {
+ dev_err(&pdev->dev, "failed to get MSI irq\n");
+ return -ENODEV;
+ }
+
+ ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+ artpec6_pcie_msi_handler,
+ IRQF_SHARED | IRQF_NO_THREAD,
+ "artpec6-pcie-msi", pp);
+ if (ret) {
+ dev_err(&pdev->dev, "failed to request MSI irq\n");
+ return ret;
+ }
+ }
+
+ pp->root_bus_nr = -1;
+ pp->ops = &artpec6_pcie_host_ops;
+
+ ret = dw_pcie_host_init(pp);
+ if (ret) {
+ dev_err(&pdev->dev, "failed to initialize host\n");
+ return ret;
+ }
+
+ return 0;
+}
+
+static int artpec6_pcie_probe(struct platform_device *pdev)
+{
+ struct artpec6_pcie *artpec6_pcie;
+ struct pcie_port *pp;
+ struct resource *dbi_base;
+ struct resource *phy_base;
+ int ret;
+
+ artpec6_pcie = devm_kzalloc(&pdev->dev, sizeof(*artpec6_pcie),
+ GFP_KERNEL);
+ if (!artpec6_pcie)
+ return -ENOMEM;
+
+ pp = &artpec6_pcie->pp;
+ pp->dev = &pdev->dev;
+
+ dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
+ pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base);
+ if (IS_ERR(pp->dbi_base))
+ return PTR_ERR(pp->dbi_base);
+
+ phy_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "phy");
+ artpec6_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base);
+ if (IS_ERR(artpec6_pcie->phy_base))
+ return PTR_ERR(artpec6_pcie->phy_base);
+
+ artpec6_pcie->regmap =
+ syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+ "axis,syscon-pcie");
+ if (IS_ERR(artpec6_pcie->regmap))
+ return PTR_ERR(artpec6_pcie->regmap);
+
+ ret = artpec6_add_pcie_port(pp, pdev);
+ if (ret < 0)
+ return ret;
+
+ platform_set_drvdata(pdev, artpec6_pcie);
+ return 0;
+}
+
+static const struct of_device_id artpec6_pcie_of_match[] = {
+ { .compatible = "axis,artpec6-pcie", },
+ {},
+};
+
+static struct platform_driver artpec6_pcie_driver = {
+ .probe = artpec6_pcie_probe,
+ .driver = {
+ .name = "artpec6-pcie",
+ .of_match_table = artpec6_pcie_of_match,
+ },
+};
+builtin_platform_driver(artpec6_pcie_driver);
diff --git a/drivers/pci/host/pcie-designware-plat.c b/drivers/pci/host/pcie-designware-plat.c
index b3500994d08a..c8079dc81c10 100644
--- a/drivers/pci/host/pcie-designware-plat.c
+++ b/drivers/pci/host/pcie-designware-plat.c
@@ -14,7 +14,7 @@
#include <linux/gpio.h>
#include <linux/interrupt.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/of_gpio.h>
#include <linux/pci.h>
#include <linux/platform_device.h>
@@ -121,7 +121,6 @@ static const struct of_device_id dw_plat_pcie_of_match[] = {
{ .compatible = "snps,dw-pcie", },
{},
};
-MODULE_DEVICE_TABLE(of, dw_plat_pcie_of_match);
static struct platform_driver dw_plat_pcie_driver = {
.driver = {
@@ -130,9 +129,4 @@ static struct platform_driver dw_plat_pcie_driver = {
},
.probe = dw_plat_pcie_probe,
};
-
-module_platform_driver(dw_plat_pcie_driver);
-
-MODULE_AUTHOR("Joao Pinto <Joao.Pinto@synopsys.com>");
-MODULE_DESCRIPTION("Synopsys PCIe host controller glue platform driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(dw_plat_pcie_driver);
diff --git a/drivers/pci/host/pcie-designware.c b/drivers/pci/host/pcie-designware.c
index aafd766546f3..12afce19890b 100644
--- a/drivers/pci/host/pcie-designware.c
+++ b/drivers/pci/host/pcie-designware.c
@@ -452,6 +452,10 @@ int dw_pcie_host_init(struct pcie_port *pp)
if (ret)
return ret;
+ ret = devm_request_pci_bus_resources(&pdev->dev, &res);
+ if (ret)
+ goto error;
+
/* Get the I/O and memory ranges from DT */
resource_list_for_each_entry(win, &res) {
switch (resource_type(win->res)) {
@@ -461,11 +465,9 @@ int dw_pcie_host_init(struct pcie_port *pp)
pp->io_size = resource_size(pp->io);
pp->io_bus_addr = pp->io->start - win->offset;
ret = pci_remap_iospace(pp->io, pp->io_base);
- if (ret) {
+ if (ret)
dev_warn(pp->dev, "error %d: failed to map resource %pR\n",
ret, pp->io);
- continue;
- }
break;
case IORESOURCE_MEM:
pp->mem = win->res;
@@ -483,8 +485,6 @@ int dw_pcie_host_init(struct pcie_port *pp)
case IORESOURCE_BUS:
pp->busn = win->res;
break;
- default:
- continue;
}
}
@@ -493,7 +493,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
resource_size(pp->cfg));
if (!pp->dbi_base) {
dev_err(pp->dev, "error with ioremap\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
}
@@ -504,7 +505,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
pp->cfg0_size);
if (!pp->va_cfg0_base) {
dev_err(pp->dev, "error with ioremap in function\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
}
@@ -513,7 +515,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
pp->cfg1_size);
if (!pp->va_cfg1_base) {
dev_err(pp->dev, "error with ioremap\n");
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto error;
}
}
@@ -528,7 +531,8 @@ int dw_pcie_host_init(struct pcie_port *pp)
&dw_pcie_msi_chip);
if (!pp->irq_domain) {
dev_err(pp->dev, "irq domain init failed\n");
- return -ENXIO;
+ ret = -ENXIO;
+ goto error;
}
for (i = 0; i < MAX_MSI_IRQS; i++)
@@ -536,7 +540,7 @@ int dw_pcie_host_init(struct pcie_port *pp)
} else {
ret = pp->ops->msi_host_init(pp, &dw_pcie_msi_chip);
if (ret < 0)
- return ret;
+ goto error;
}
}
@@ -552,8 +556,10 @@ int dw_pcie_host_init(struct pcie_port *pp)
} else
bus = pci_scan_root_bus(pp->dev, pp->root_bus_nr, &dw_pcie_ops,
pp, &res);
- if (!bus)
- return -ENOMEM;
+ if (!bus) {
+ ret = -ENOMEM;
+ goto error;
+ }
if (pp->ops->scan_bus)
pp->ops->scan_bus(pp);
@@ -571,6 +577,10 @@ int dw_pcie_host_init(struct pcie_port *pp)
pci_bus_add_devices(bus);
return 0;
+
+error:
+ pci_free_resource_list(&res);
+ return ret;
}
static int dw_pcie_rd_other_conf(struct pcie_port *pp, struct pci_bus *bus,
diff --git a/drivers/pci/host/pcie-hisi.c b/drivers/pci/host/pcie-hisi.c
index 3e98d4edae2d..7ee9dfcc45fb 100644
--- a/drivers/pci/host/pcie-hisi.c
+++ b/drivers/pci/host/pcie-hisi.c
@@ -12,7 +12,7 @@
* published by the Free Software Foundation.
*/
#include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/mfd/syscon.h>
#include <linux/of_address.h>
#include <linux/of_pci.h>
@@ -235,9 +235,6 @@ static const struct of_device_id hisi_pcie_of_match[] = {
{},
};
-
-MODULE_DEVICE_TABLE(of, hisi_pcie_of_match);
-
static struct platform_driver hisi_pcie_driver = {
.probe = hisi_pcie_probe,
.driver = {
@@ -245,10 +242,4 @@ static struct platform_driver hisi_pcie_driver = {
.of_match_table = hisi_pcie_of_match,
},
};
-
-module_platform_driver(hisi_pcie_driver);
-
-MODULE_AUTHOR("Zhou Wang <wangzhou1@hisilicon.com>");
-MODULE_AUTHOR("Dacai Zhu <zhudacai@hisilicon.com>");
-MODULE_AUTHOR("Gabriele Paoloni <gabriele.paoloni@huawei.com>");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(hisi_pcie_driver);
diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c
index a576aeeb22da..e167b2f0098d 100644
--- a/drivers/pci/host/pcie-iproc.c
+++ b/drivers/pci/host/pcie-iproc.c
@@ -462,6 +462,10 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
if (!pcie || !pcie->dev || !pcie->base)
return -EINVAL;
+ ret = devm_request_pci_bus_resources(pcie->dev, res);
+ if (ret)
+ return ret;
+
ret = phy_init(pcie->phy);
if (ret) {
dev_err(pcie->dev, "unable to initialize PCIe PHY\n");
diff --git a/drivers/pci/host/pcie-rcar.c b/drivers/pci/host/pcie-rcar.c
index 35092188039b..65db7a221509 100644
--- a/drivers/pci/host/pcie-rcar.c
+++ b/drivers/pci/host/pcie-rcar.c
@@ -7,6 +7,8 @@
* arch/sh/drivers/pci/ops-sh7786.c
* Copyright (C) 2009 - 2011 Paul Mundt
*
+ * Author: Phil Edworthy <phil.edworthy@renesas.com>
+ *
* This file is licensed under the terms of the GNU General Public
* License version 2. This program is licensed "as is" without any
* warranty of any kind, whether express or implied.
@@ -18,7 +20,7 @@
#include <linux/irq.h>
#include <linux/irqdomain.h>
#include <linux/kernel.h>
-#include <linux/module.h>
+#include <linux/init.h>
#include <linux/msi.h>
#include <linux/of_address.h>
#include <linux/of_irq.h>
@@ -936,12 +938,6 @@ static const struct of_device_id rcar_pcie_of_match[] = {
{ .compatible = "renesas,pcie-r8a7795", .data = rcar_pcie_hw_init },
{},
};
-MODULE_DEVICE_TABLE(of, rcar_pcie_of_match);
-
-static void rcar_pcie_release_of_pci_ranges(struct rcar_pcie *pci)
-{
- pci_free_resource_list(&pci->resources);
-}
static int rcar_pcie_parse_request_of_pci_ranges(struct rcar_pcie *pci)
{
@@ -955,37 +951,25 @@ static int rcar_pcie_parse_request_of_pci_ranges(struct rcar_pcie *pci)
if (err)
return err;
+ err = devm_request_pci_bus_resources(dev, &pci->resources);
+ if (err)
+ goto out_release_res;
+
resource_list_for_each_entry(win, &pci->resources) {
- struct resource *parent, *res = win->res;
+ struct resource *res = win->res;
- switch (resource_type(res)) {
- case IORESOURCE_IO:
- parent = &ioport_resource;
+ if (resource_type(res) == IORESOURCE_IO) {
err = pci_remap_iospace(res, iobase);
- if (err) {
+ if (err)
dev_warn(dev, "error %d: failed to map resource %pR\n",
err, res);
- continue;
- }
- break;
- case IORESOURCE_MEM:
- parent = &iomem_resource;
- break;
-
- case IORESOURCE_BUS:
- default:
- continue;
}
-
- err = devm_request_resource(dev, parent, res);
- if (err)
- goto out_release_res;
}
return 0;
out_release_res:
- rcar_pcie_release_of_pci_ranges(pci);
+ pci_free_resource_list(&pci->resources);
return err;
}
@@ -1073,8 +1057,4 @@ static struct platform_driver rcar_pcie_driver = {
},
.probe = rcar_pcie_probe,
};
-module_platform_driver(rcar_pcie_driver);
-
-MODULE_AUTHOR("Phil Edworthy <phil.edworthy@renesas.com>");
-MODULE_DESCRIPTION("Renesas R-Car PCIe driver");
-MODULE_LICENSE("GPL v2");
+builtin_platform_driver(rcar_pcie_driver);
diff --git a/drivers/pci/host/pcie-xilinx-nwl.c b/drivers/pci/host/pcie-xilinx-nwl.c
index 3479d30e2be8..0b597d9190b4 100644
--- a/drivers/pci/host/pcie-xilinx-nwl.c
+++ b/drivers/pci/host/pcie-xilinx-nwl.c
@@ -825,27 +825,33 @@ static int nwl_pcie_probe(struct platform_device *pdev)
err = of_pci_get_host_bridge_resources(node, 0, 0xff, &res, &iobase);
if (err) {
- pr_err("Getting bridge resources failed\n");
+ dev_err(pcie->dev, "Getting bridge resources failed\n");
return err;
}
+ err = devm_request_pci_bus_resources(pcie->dev, &res);
+ if (err)
+ goto error;
+
err = nwl_pcie_init_irq_domain(pcie);
if (err) {
dev_err(pcie->dev, "Failed creating IRQ Domain\n");
- return err;
+ goto error;
}
bus = pci_create_root_bus(&pdev->dev, pcie->root_busno,
&nwl_pcie_ops, pcie, &res);
- if (!bus)
- return -ENOMEM;
+ if (!bus) {
+ err = -ENOMEM;
+ goto error;
+ }
if (IS_ENABLED(CONFIG_PCI_MSI)) {
err = nwl_pcie_enable_msi(pcie, bus);
if (err < 0) {
dev_err(&pdev->dev,
"failed to enable MSI support: %d\n", err);
- return err;
+ goto error;
}
}
pci_scan_child_bus(bus);
@@ -855,6 +861,10 @@ static int nwl_pcie_probe(struct platform_device *pdev)
pci_bus_add_devices(bus);
platform_set_drvdata(pdev, pcie);
return 0;
+
+error:
+ pci_free_resource_list(&res);
+ return err;
}
static int nwl_pcie_remove(struct platform_device *pdev)
diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c
index 65f0fe0c2eaf..a30e01639557 100644
--- a/drivers/pci/host/pcie-xilinx.c
+++ b/drivers/pci/host/pcie-xilinx.c
@@ -550,7 +550,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
pcie_intc_node = of_get_next_child(node, NULL);
if (!pcie_intc_node) {
dev_err(dev, "No PCIe Intc node found\n");
- return PTR_ERR(pcie_intc_node);
+ return -ENODEV;
}
port->irq_domain = irq_domain_add_linear(pcie_intc_node, 4,
@@ -558,7 +558,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
port);
if (!port->irq_domain) {
dev_err(dev, "Failed to get a INTx IRQ domain\n");
- return PTR_ERR(port->irq_domain);
+ return -ENODEV;
}
/* Setup MSI */
@@ -569,7 +569,7 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
&xilinx_pcie_msi_chip);
if (!port->irq_domain) {
dev_err(dev, "Failed to get a MSI IRQ domain\n");
- return PTR_ERR(port->irq_domain);
+ return -ENODEV;
}
xilinx_pcie_enable_msi(port);
@@ -660,7 +660,6 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
struct xilinx_pcie_port *port;
struct device *dev = &pdev->dev;
struct pci_bus *bus;
-
int err;
resource_size_t iobase = 0;
LIST_HEAD(res);
@@ -694,10 +693,17 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
dev_err(dev, "Getting bridge resources failed\n");
return err;
}
+
+ err = devm_request_pci_bus_resources(dev, &res);
+ if (err)
+ goto error;
+
bus = pci_create_root_bus(&pdev->dev, 0,
&xilinx_pcie_ops, port, &res);
- if (!bus)
- return -ENOMEM;
+ if (!bus) {
+ err = -ENOMEM;
+ goto error;
+ }
#ifdef CONFIG_PCI_MSI
xilinx_pcie_msi_chip.dev = port->dev;
@@ -712,6 +718,10 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, port);
return 0;
+
+error:
+ pci_free_resource_list(&res);
+ return err;
}
/**
diff --git a/drivers/pci/hotplug/acpiphp_glue.c b/drivers/pci/hotplug/acpiphp_glue.c
index fa49f9143b80..6a33ddcfa20b 100644
--- a/drivers/pci/hotplug/acpiphp_glue.c
+++ b/drivers/pci/hotplug/acpiphp_glue.c
@@ -675,6 +675,8 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
if (bridge->is_going_away)
return;
+ pm_runtime_get_sync(&bridge->pci_dev->dev);
+
list_for_each_entry(slot, &bridge->slots, node) {
struct pci_bus *bus = slot->bus;
struct pci_dev *dev, *tmp;
@@ -694,6 +696,8 @@ static void acpiphp_check_bridge(struct acpiphp_bridge *bridge)
disable_slot(slot);
}
}
+
+ pm_runtime_put(&bridge->pci_dev->dev);
}
/*
diff --git a/drivers/pci/hotplug/pciehp_hpc.c b/drivers/pci/hotplug/pciehp_hpc.c
index 5c24e938042f..08e84d61874e 100644
--- a/drivers/pci/hotplug/pciehp_hpc.c
+++ b/drivers/pci/hotplug/pciehp_hpc.c
@@ -546,6 +546,10 @@ static irqreturn_t pcie_isr(int irq, void *dev_id)
u8 present;
bool link;
+ /* Interrupts cannot originate from a controller that's asleep */
+ if (pdev->current_state == PCI_D3cold)
+ return IRQ_NONE;
+
/*
* In order to guarantee that all interrupt events are
* serviced, we need to re-inspect Slot Status register after
diff --git a/drivers/pci/msi.c b/drivers/pci/msi.c
index a080f4496fe2..a02981efdad5 100644
--- a/drivers/pci/msi.c
+++ b/drivers/pci/msi.c
@@ -4,6 +4,7 @@
*
* Copyright (C) 2003-2004 Intel
* Copyright (C) Tom Long Nguyen (tom.l.nguyen@intel.com)
+ * Copyright (C) 2016 Christoph Hellwig.
*/
#include <linux/err.h>
@@ -207,6 +208,12 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
desc->masked = __pci_msi_desc_mask_irq(desc, mask, flag);
}
+static void __iomem *pci_msix_desc_addr(struct msi_desc *desc)
+{
+ return desc->mask_base +
+ desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+}
+
/*
* This internal function does not flush PCI writes to the device.
* All users must ensure that they read from the device before either
@@ -217,8 +224,6 @@ static void msi_mask_irq(struct msi_desc *desc, u32 mask, u32 flag)
u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
{
u32 mask_bits = desc->masked;
- unsigned offset = desc->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE +
- PCI_MSIX_ENTRY_VECTOR_CTRL;
if (pci_msi_ignore_mask)
return 0;
@@ -226,7 +231,7 @@ u32 __pci_msix_desc_mask_irq(struct msi_desc *desc, u32 flag)
mask_bits &= ~PCI_MSIX_ENTRY_CTRL_MASKBIT;
if (flag)
mask_bits |= PCI_MSIX_ENTRY_CTRL_MASKBIT;
- writel(mask_bits, desc->mask_base + offset);
+ writel(mask_bits, pci_msix_desc_addr(desc) + PCI_MSIX_ENTRY_VECTOR_CTRL);
return mask_bits;
}
@@ -284,8 +289,7 @@ void __pci_read_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
BUG_ON(dev->current_state != PCI_D0);
if (entry->msi_attrib.is_msix) {
- void __iomem *base = entry->mask_base +
- entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+ void __iomem *base = pci_msix_desc_addr(entry);
msg->address_lo = readl(base + PCI_MSIX_ENTRY_LOWER_ADDR);
msg->address_hi = readl(base + PCI_MSIX_ENTRY_UPPER_ADDR);
@@ -315,9 +319,7 @@ void __pci_write_msi_msg(struct msi_desc *entry, struct msi_msg *msg)
if (dev->current_state != PCI_D0) {
/* Don't touch the hardware now */
} else if (entry->msi_attrib.is_msix) {
- void __iomem *base;
- base = entry->mask_base +
- entry->msi_attrib.entry_nr * PCI_MSIX_ENTRY_SIZE;
+ void __iomem *base = pci_msix_desc_addr(entry);
writel(msg->address_lo, base + PCI_MSIX_ENTRY_LOWER_ADDR);
writel(msg->address_hi, base + PCI_MSIX_ENTRY_UPPER_ADDR);
@@ -567,6 +569,7 @@ static struct msi_desc *msi_setup_entry(struct pci_dev *dev, int nvec)
entry->msi_attrib.multi_cap = (control & PCI_MSI_FLAGS_QMASK) >> 1;
entry->msi_attrib.multiple = ilog2(__roundup_pow_of_two(nvec));
entry->nvec_used = nvec;
+ entry->affinity = dev->irq_affinity;
if (control & PCI_MSI_FLAGS_64BIT)
entry->mask_pos = dev->msi_cap + PCI_MSI_MASK_64;
@@ -678,10 +681,18 @@ static void __iomem *msix_map_region(struct pci_dev *dev, unsigned nr_entries)
static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
struct msix_entry *entries, int nvec)
{
+ const struct cpumask *mask = NULL;
struct msi_desc *entry;
- int i;
+ int cpu = -1, i;
for (i = 0; i < nvec; i++) {
+ if (dev->irq_affinity) {
+ cpu = cpumask_next(cpu, dev->irq_affinity);
+ if (cpu >= nr_cpu_ids)
+ cpu = cpumask_first(dev->irq_affinity);
+ mask = cpumask_of(cpu);
+ }
+
entry = alloc_msi_entry(&dev->dev);
if (!entry) {
if (!i)
@@ -694,10 +705,14 @@ static int msix_setup_entries(struct pci_dev *dev, void __iomem *base,
entry->msi_attrib.is_msix = 1;
entry->msi_attrib.is_64 = 1;
- entry->msi_attrib.entry_nr = entries[i].entry;
+ if (entries)
+ entry->msi_attrib.entry_nr = entries[i].entry;
+ else
+ entry->msi_attrib.entry_nr = i;
entry->msi_attrib.default_irq = dev->irq;
entry->mask_base = base;
entry->nvec_used = 1;
+ entry->affinity = mask;
list_add_tail(&entry->list, dev_to_msi_list(&dev->dev));
}
@@ -712,13 +727,11 @@ static void msix_program_entries(struct pci_dev *dev,
int i = 0;
for_each_pci_msi_entry(entry, dev) {
- int offset = entries[i].entry * PCI_MSIX_ENTRY_SIZE +
- PCI_MSIX_ENTRY_VECTOR_CTRL;
-
- entries[i].vector = entry->irq;
- entry->masked = readl(entry->mask_base + offset);
+ if (entries)
+ entries[i++].vector = entry->irq;
+ entry->masked = readl(pci_msix_desc_addr(entry) +
+ PCI_MSIX_ENTRY_VECTOR_CTRL);
msix_mask_irq(entry, 1);
- i++;
}
}
@@ -931,7 +944,7 @@ EXPORT_SYMBOL(pci_msix_vec_count);
/**
* pci_enable_msix - configure device's MSI-X capability structure
* @dev: pointer to the pci_dev data structure of MSI-X device function
- * @entries: pointer to an array of MSI-X entries
+ * @entries: pointer to an array of MSI-X entries (optional)
* @nvec: number of MSI-X irqs requested for allocation by device driver
*
* Setup the MSI-X capability structure of device function with the number
@@ -951,22 +964,21 @@ int pci_enable_msix(struct pci_dev *dev, struct msix_entry *entries, int nvec)
if (!pci_msi_supported(dev, nvec))
return -EINVAL;
- if (!entries)
- return -EINVAL;
-
nr_entries = pci_msix_vec_count(dev);
if (nr_entries < 0)
return nr_entries;
if (nvec > nr_entries)
return nr_entries;
- /* Check for any invalid entries */
- for (i = 0; i < nvec; i++) {
- if (entries[i].entry >= nr_entries)
- return -EINVAL; /* invalid entry */
- for (j = i + 1; j < nvec; j++) {
- if (entries[i].entry == entries[j].entry)
- return -EINVAL; /* duplicate entry */
+ if (entries) {
+ /* Check for any invalid entries */
+ for (i = 0; i < nvec; i++) {
+ if (entries[i].entry >= nr_entries)
+ return -EINVAL; /* invalid entry */
+ for (j = i + 1; j < nvec; j++) {
+ if (entries[i].entry == entries[j].entry)
+ return -EINVAL; /* duplicate entry */
+ }
}
}
WARN_ON(!!dev->msix_enabled);
@@ -1026,19 +1038,8 @@ int pci_msi_enabled(void)
}
EXPORT_SYMBOL(pci_msi_enabled);
-/**
- * pci_enable_msi_range - configure device's MSI capability structure
- * @dev: device to configure
- * @minvec: minimal number of interrupts to configure
- * @maxvec: maximum number of interrupts to configure
- *
- * This function tries to allocate a maximum possible number of interrupts in a
- * range between @minvec and @maxvec. It returns a negative errno if an error
- * occurs. If it succeeds, it returns the actual number of interrupts allocated
- * and updates the @dev's irq member to the lowest new interrupt number;
- * the other interrupt numbers allocated to this device are consecutive.
- **/
-int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+static int __pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec,
+ unsigned int flags)
{
int nvec;
int rc;
@@ -1061,25 +1062,85 @@ int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
nvec = pci_msi_vec_count(dev);
if (nvec < 0)
return nvec;
- else if (nvec < minvec)
+ if (nvec < minvec)
return -EINVAL;
- else if (nvec > maxvec)
+
+ if (nvec > maxvec)
nvec = maxvec;
- do {
+ for (;;) {
+ if (!(flags & PCI_IRQ_NOAFFINITY)) {
+ dev->irq_affinity = irq_create_affinity_mask(&nvec);
+ if (nvec < minvec)
+ return -ENOSPC;
+ }
+
rc = msi_capability_init(dev, nvec);
- if (rc < 0) {
+ if (rc == 0)
+ return nvec;
+
+ kfree(dev->irq_affinity);
+ dev->irq_affinity = NULL;
+
+ if (rc < 0)
return rc;
- } else if (rc > 0) {
- if (rc < minvec)
+ if (rc < minvec)
+ return -ENOSPC;
+
+ nvec = rc;
+ }
+}
+
+/**
+ * pci_enable_msi_range - configure device's MSI capability structure
+ * @dev: device to configure
+ * @minvec: minimal number of interrupts to configure
+ * @maxvec: maximum number of interrupts to configure
+ *
+ * This function tries to allocate a maximum possible number of interrupts in a
+ * range between @minvec and @maxvec. It returns a negative errno if an error
+ * occurs. If it succeeds, it returns the actual number of interrupts allocated
+ * and updates the @dev's irq member to the lowest new interrupt number;
+ * the other interrupt numbers allocated to this device are consecutive.
+ **/
+int pci_enable_msi_range(struct pci_dev *dev, int minvec, int maxvec)
+{
+ return __pci_enable_msi_range(dev, minvec, maxvec, PCI_IRQ_NOAFFINITY);
+}
+EXPORT_SYMBOL(pci_enable_msi_range);
+
+static int __pci_enable_msix_range(struct pci_dev *dev,
+ struct msix_entry *entries, int minvec, int maxvec,
+ unsigned int flags)
+{
+ int nvec = maxvec;
+ int rc;
+
+ if (maxvec < minvec)
+ return -ERANGE;
+
+ for (;;) {
+ if (!(flags & PCI_IRQ_NOAFFINITY)) {
+ dev->irq_affinity = irq_create_affinity_mask(&nvec);
+ if (nvec < minvec)
return -ENOSPC;
- nvec = rc;
}
- } while (rc);
- return nvec;
+ rc = pci_enable_msix(dev, entries, nvec);
+ if (rc == 0)
+ return nvec;
+
+ kfree(dev->irq_affinity);
+ dev->irq_affinity = NULL;
+
+ if (rc < 0)
+ return rc;
+ if (rc < minvec)
+ return -ENOSPC;
+
+ nvec = rc;
+ }
}
-EXPORT_SYMBOL(pci_enable_msi_range);
/**
* pci_enable_msix_range - configure device's MSI-X capability structure
@@ -1097,28 +1158,101 @@ EXPORT_SYMBOL(pci_enable_msi_range);
* with new allocated MSI-X interrupts.
**/
int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries,
- int minvec, int maxvec)
+ int minvec, int maxvec)
{
- int nvec = maxvec;
- int rc;
+ return __pci_enable_msix_range(dev, entries, minvec, maxvec,
+ PCI_IRQ_NOAFFINITY);
+}
+EXPORT_SYMBOL(pci_enable_msix_range);
- if (maxvec < minvec)
- return -ERANGE;
+/**
+ * pci_alloc_irq_vectors - allocate multiple IRQs for a device
+ * @dev: PCI device to operate on
+ * @min_vecs: minimum number of vectors required (must be >= 1)
+ * @max_vecs: maximum (desired) number of vectors
+ * @flags: flags or quirks for the allocation
+ *
+ * Allocate up to @max_vecs interrupt vectors for @dev, using MSI-X or MSI
+ * vectors if available, and fall back to a single legacy vector
+ * if neither is available. Return the number of vectors allocated,
+ * (which might be smaller than @max_vecs) if successful, or a negative
+ * error code on error. If less than @min_vecs interrupt vectors are
+ * available for @dev the function will fail with -ENOSPC.
+ *
+ * To get the Linux IRQ number used for a vector that can be passed to
+ * request_irq() use the pci_irq_vector() helper.
+ */
+int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+ unsigned int max_vecs, unsigned int flags)
+{
+ int vecs = -ENOSPC;
- do {
- rc = pci_enable_msix(dev, entries, nvec);
- if (rc < 0) {
- return rc;
- } else if (rc > 0) {
- if (rc < minvec)
- return -ENOSPC;
- nvec = rc;
+ if (!(flags & PCI_IRQ_NOMSIX)) {
+ vecs = __pci_enable_msix_range(dev, NULL, min_vecs, max_vecs,
+ flags);
+ if (vecs > 0)
+ return vecs;
+ }
+
+ if (!(flags & PCI_IRQ_NOMSI)) {
+ vecs = __pci_enable_msi_range(dev, min_vecs, max_vecs, flags);
+ if (vecs > 0)
+ return vecs;
+ }
+
+ /* use legacy irq if allowed */
+ if (!(flags & PCI_IRQ_NOLEGACY) && min_vecs == 1)
+ return 1;
+ return vecs;
+}
+EXPORT_SYMBOL(pci_alloc_irq_vectors);
+
+/**
+ * pci_free_irq_vectors - free previously allocated IRQs for a device
+ * @dev: PCI device to operate on
+ *
+ * Undoes the allocations and enabling in pci_alloc_irq_vectors().
+ */
+void pci_free_irq_vectors(struct pci_dev *dev)
+{
+ pci_disable_msix(dev);
+ pci_disable_msi(dev);
+}
+EXPORT_SYMBOL(pci_free_irq_vectors);
+
+/**
+ * pci_irq_vector - return Linux IRQ number of a device vector
+ * @dev: PCI device to operate on
+ * @nr: device-relative interrupt vector index (0-based).
+ */
+int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
+{
+ if (dev->msix_enabled) {
+ struct msi_desc *entry;
+ int i = 0;
+
+ for_each_pci_msi_entry(entry, dev) {
+ if (i == nr)
+ return entry->irq;
+ i++;
}
- } while (rc);
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
- return nvec;
+ if (dev->msi_enabled) {
+ struct msi_desc *entry = first_pci_msi_entry(dev);
+
+ if (WARN_ON_ONCE(nr >= entry->nvec_used))
+ return -EINVAL;
+ } else {
+ if (WARN_ON_ONCE(nr > 0))
+ return -EINVAL;
+ }
+
+ return dev->irq + nr;
}
-EXPORT_SYMBOL(pci_enable_msix_range);
+EXPORT_SYMBOL(pci_irq_vector);
struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc)
{
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index d7ffd66814bb..e39a67c8ef39 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c
@@ -777,7 +777,7 @@ static int pci_pm_suspend_noirq(struct device *dev)
if (!pci_dev->state_saved) {
pci_save_state(pci_dev);
- if (!pci_has_subordinate(pci_dev))
+ if (pci_power_manageable(pci_dev))
pci_prepare_to_sleep(pci_dev);
}
@@ -1144,7 +1144,6 @@ static int pci_pm_runtime_suspend(struct device *dev)
return -ENOSYS;
pci_dev->state_saved = false;
- pci_dev->no_d3cold = false;
error = pm->runtime_suspend(dev);
if (error) {
/*
@@ -1161,8 +1160,6 @@ static int pci_pm_runtime_suspend(struct device *dev)
return error;
}
- if (!pci_dev->d3cold_allowed)
- pci_dev->no_d3cold = true;
pci_fixup_device(pci_fixup_suspend, pci_dev);
diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c
index d319a9ca9b7b..bcd10c795284 100644
--- a/drivers/pci/pci-sysfs.c
+++ b/drivers/pci/pci-sysfs.c
@@ -406,6 +406,11 @@ static ssize_t d3cold_allowed_store(struct device *dev,
return -EINVAL;
pdev->d3cold_allowed = !!val;
+ if (pdev->d3cold_allowed)
+ pci_d3cold_enable(pdev);
+ else
+ pci_d3cold_disable(pdev);
+
pm_runtime_resume(dev);
return count;
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index badbddc683f0..aab9d5115a5f 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -7,8 +7,10 @@
* Copyright 1997 -- 2000 Martin Mares <mj@ucw.cz>
*/
+#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/delay.h>
+#include <linux/dmi.h>
#include <linux/init.h>
#include <linux/of.h>
#include <linux/of_pci.h>
@@ -25,7 +27,9 @@
#include <linux/device.h>
#include <linux/pm_runtime.h>
#include <linux/pci_hotplug.h>
+#include <linux/vmalloc.h>
#include <asm/setup.h>
+#include <asm/dma.h>
#include <linux/aer.h>
#include "pci.h"
@@ -81,6 +85,9 @@ unsigned long pci_cardbus_mem_size = DEFAULT_CARDBUS_MEM_SIZE;
unsigned long pci_hotplug_io_size = DEFAULT_HOTPLUG_IO_SIZE;
unsigned long pci_hotplug_mem_size = DEFAULT_HOTPLUG_MEM_SIZE;
+#define DEFAULT_HOTPLUG_BUS_SIZE 1
+unsigned long pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE;
+
enum pcie_bus_config_types pcie_bus_config = PCIE_BUS_DEFAULT;
/*
@@ -101,6 +108,21 @@ unsigned int pcibios_max_latency = 255;
/* If set, the PCIe ARI capability will not be used. */
static bool pcie_ari_disabled;
+/* Disable bridge_d3 for all PCIe ports */
+static bool pci_bridge_d3_disable;
+/* Force bridge_d3 for all PCIe ports */
+static bool pci_bridge_d3_force;
+
+static int __init pcie_port_pm_setup(char *str)
+{
+ if (!strcmp(str, "off"))
+ pci_bridge_d3_disable = true;
+ else if (!strcmp(str, "force"))
+ pci_bridge_d3_force = true;
+ return 1;
+}
+__setup("pcie_port_pm=", pcie_port_pm_setup);
+
/**
* pci_bus_max_busnr - returns maximum PCI bus number of given bus' children
* @bus: pointer to PCI bus structure to search
@@ -2156,6 +2178,164 @@ void pci_config_pm_runtime_put(struct pci_dev *pdev)
}
/**
+ * pci_bridge_d3_possible - Is it possible to put the bridge into D3
+ * @bridge: Bridge to check
+ *
+ * This function checks if it is possible to move the bridge to D3.
+ * Currently we only allow D3 for recent enough PCIe ports.
+ */
+static bool pci_bridge_d3_possible(struct pci_dev *bridge)
+{
+ unsigned int year;
+
+ if (!pci_is_pcie(bridge))
+ return false;
+
+ switch (pci_pcie_type(bridge)) {
+ case PCI_EXP_TYPE_ROOT_PORT:
+ case PCI_EXP_TYPE_UPSTREAM:
+ case PCI_EXP_TYPE_DOWNSTREAM:
+ if (pci_bridge_d3_disable)
+ return false;
+ if (pci_bridge_d3_force)
+ return true;
+
+ /*
+ * It should be safe to put PCIe ports from 2015 or newer
+ * to D3.
+ */
+ if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
+ year >= 2015) {
+ return true;
+ }
+ break;
+ }
+
+ return false;
+}
+
+static int pci_dev_check_d3cold(struct pci_dev *dev, void *data)
+{
+ bool *d3cold_ok = data;
+ bool no_d3cold;
+
+ /*
+ * The device needs to be allowed to go D3cold and if it is wake
+ * capable to do so from D3cold.
+ */
+ no_d3cold = dev->no_d3cold || !dev->d3cold_allowed ||
+ (device_may_wakeup(&dev->dev) && !pci_pme_capable(dev, PCI_D3cold)) ||
+ !pci_power_manageable(dev);
+
+ *d3cold_ok = !no_d3cold;
+
+ return no_d3cold;
+}
+
+/*
+ * pci_bridge_d3_update - Update bridge D3 capabilities
+ * @dev: PCI device which is changed
+ * @remove: Is the device being removed
+ *
+ * Update upstream bridge PM capabilities accordingly depending on if the
+ * device PM configuration was changed or the device is being removed. The
+ * change is also propagated upstream.
+ */
+static void pci_bridge_d3_update(struct pci_dev *dev, bool remove)
+{
+ struct pci_dev *bridge;
+ bool d3cold_ok = true;
+
+ bridge = pci_upstream_bridge(dev);
+ if (!bridge || !pci_bridge_d3_possible(bridge))
+ return;
+
+ pci_dev_get(bridge);
+ /*
+ * If the device is removed we do not care about its D3cold
+ * capabilities.
+ */
+ if (!remove)
+ pci_dev_check_d3cold(dev, &d3cold_ok);
+
+ if (d3cold_ok) {
+ /*
+ * We need to go through all children to find out if all of
+ * them can still go to D3cold.
+ */
+ pci_walk_bus(bridge->subordinate, pci_dev_check_d3cold,
+ &d3cold_ok);
+ }
+
+ if (bridge->bridge_d3 != d3cold_ok) {
+ bridge->bridge_d3 = d3cold_ok;
+ /* Propagate change to upstream bridges */
+ pci_bridge_d3_update(bridge, false);
+ }
+
+ pci_dev_put(bridge);
+}
+
+/**
+ * pci_bridge_d3_device_changed - Update bridge D3 capabilities on change
+ * @dev: PCI device that was changed
+ *
+ * If a device is added or its PM configuration, such as is it allowed to
+ * enter D3cold, is changed this function updates upstream bridge PM
+ * capabilities accordingly.
+ */
+void pci_bridge_d3_device_changed(struct pci_dev *dev)
+{
+ pci_bridge_d3_update(dev, false);
+}
+
+/**
+ * pci_bridge_d3_device_removed - Update bridge D3 capabilities on remove
+ * @dev: PCI device being removed
+ *
+ * Function updates upstream bridge PM capabilities based on other devices
+ * still left on the bus.
+ */
+void pci_bridge_d3_device_removed(struct pci_dev *dev)
+{
+ pci_bridge_d3_update(dev, true);
+}
+
+/**
+ * pci_d3cold_enable - Enable D3cold for device
+ * @dev: PCI device to handle
+ *
+ * This function can be used in drivers to enable D3cold from the device
+ * they handle. It also updates upstream PCI bridge PM capabilities
+ * accordingly.
+ */
+void pci_d3cold_enable(struct pci_dev *dev)
+{
+ if (dev->no_d3cold) {
+ dev->no_d3cold = false;
+ pci_bridge_d3_device_changed(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(pci_d3cold_enable);
+
+/**
+ * pci_d3cold_disable - Disable D3cold for device
+ * @dev: PCI device to handle
+ *
+ * This function can be used in drivers to disable D3cold from the device
+ * they handle. It also updates upstream PCI bridge PM capabilities
+ * accordingly.
+ */
+void pci_d3cold_disable(struct pci_dev *dev)
+{
+ if (!dev->no_d3cold) {
+ dev->no_d3cold = true;
+ pci_bridge_d3_device_changed(dev);
+ }
+}
+EXPORT_SYMBOL_GPL(pci_d3cold_disable);
+
+/**
* pci_pm_init - Initialize PM functions of given PCI device
* @dev: PCI device to handle.
*/
@@ -2189,6 +2369,7 @@ void pci_pm_init(struct pci_dev *dev)
dev->pm_cap = pm;
dev->d3_delay = PCI_PM_D3_WAIT;
dev->d3cold_delay = PCI_PM_D3COLD_WAIT;
+ dev->bridge_d3 = pci_bridge_d3_possible(dev);
dev->d3cold_allowed = true;
dev->d1_support = false;
@@ -3165,6 +3346,23 @@ int __weak pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr)
#endif
}
+/**
+ * pci_unmap_iospace - Unmap the memory mapped I/O space
+ * @res: resource to be unmapped
+ *
+ * Unmap the CPU virtual address @res from virtual address space.
+ * Only architectures that have memory mapped IO functions defined
+ * (and the PCI_IOBASE value defined) should call this function.
+ */
+void pci_unmap_iospace(struct resource *res)
+{
+#if defined(PCI_IOBASE) && defined(CONFIG_MMU)
+ unsigned long vaddr = (unsigned long)PCI_IOBASE + res->start;
+
+ unmap_kernel_range(vaddr, resource_size(res));
+#endif
+}
+
static void __pci_set_master(struct pci_dev *dev, bool enable)
{
u16 old_cmd, cmd;
@@ -4755,6 +4953,7 @@ static DEFINE_SPINLOCK(resource_alignment_lock);
static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
{
int seg, bus, slot, func, align_order, count;
+ unsigned short vendor, device, subsystem_vendor, subsystem_device;
resource_size_t align = 0;
char *p;
@@ -4768,28 +4967,55 @@ static resource_size_t pci_specified_resource_alignment(struct pci_dev *dev)
} else {
align_order = -1;
}
- if (sscanf(p, "%x:%x:%x.%x%n",
- &seg, &bus, &slot, &func, &count) != 4) {
- seg = 0;
- if (sscanf(p, "%x:%x.%x%n",
- &bus, &slot, &func, &count) != 3) {
- /* Invalid format */
- printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
- p);
+ if (strncmp(p, "pci:", 4) == 0) {
+ /* PCI vendor/device (subvendor/subdevice) ids are specified */
+ p += 4;
+ if (sscanf(p, "%hx:%hx:%hx:%hx%n",
+ &vendor, &device, &subsystem_vendor, &subsystem_device, &count) != 4) {
+ if (sscanf(p, "%hx:%hx%n", &vendor, &device, &count) != 2) {
+ printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: pci:%s\n",
+ p);
+ break;
+ }
+ subsystem_vendor = subsystem_device = 0;
+ }
+ p += count;
+ if ((!vendor || (vendor == dev->vendor)) &&
+ (!device || (device == dev->device)) &&
+ (!subsystem_vendor || (subsystem_vendor == dev->subsystem_vendor)) &&
+ (!subsystem_device || (subsystem_device == dev->subsystem_device))) {
+ if (align_order == -1)
+ align = PAGE_SIZE;
+ else
+ align = 1 << align_order;
+ /* Found */
break;
}
}
- p += count;
- if (seg == pci_domain_nr(dev->bus) &&
- bus == dev->bus->number &&
- slot == PCI_SLOT(dev->devfn) &&
- func == PCI_FUNC(dev->devfn)) {
- if (align_order == -1)
- align = PAGE_SIZE;
- else
- align = 1 << align_order;
- /* Found */
- break;
+ else {
+ if (sscanf(p, "%x:%x:%x.%x%n",
+ &seg, &bus, &slot, &func, &count) != 4) {
+ seg = 0;
+ if (sscanf(p, "%x:%x.%x%n",
+ &bus, &slot, &func, &count) != 3) {
+ /* Invalid format */
+ printk(KERN_ERR "PCI: Can't parse resource_alignment parameter: %s\n",
+ p);
+ break;
+ }
+ }
+ p += count;
+ if (seg == pci_domain_nr(dev->bus) &&
+ bus == dev->bus->number &&
+ slot == PCI_SLOT(dev->devfn) &&
+ func == PCI_FUNC(dev->devfn)) {
+ if (align_order == -1)
+ align = PAGE_SIZE;
+ else
+ align = 1 << align_order;
+ /* Found */
+ break;
+ }
}
if (*p != ';' && *p != ',') {
/* End of param or invalid format */
@@ -4897,7 +5123,7 @@ static ssize_t pci_resource_alignment_store(struct bus_type *bus,
return pci_set_resource_alignment_param(buf, count);
}
-BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
+static BUS_ATTR(resource_alignment, 0644, pci_resource_alignment_show,
pci_resource_alignment_store);
static int __init pci_resource_alignment_sysfs_init(void)
@@ -4923,7 +5149,7 @@ int pci_get_new_domain_nr(void)
}
#ifdef CONFIG_PCI_DOMAINS_GENERIC
-void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
+static int of_pci_bus_find_domain_nr(struct device *parent)
{
static int use_dt_domains = -1;
int domain = -1;
@@ -4967,7 +5193,13 @@ void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent)
domain = -1;
}
- bus->domain_nr = domain;
+ return domain;
+}
+
+int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent)
+{
+ return acpi_disabled ? of_pci_bus_find_domain_nr(parent) :
+ acpi_pci_bus_find_domain_nr(bus);
}
#endif
#endif
@@ -5021,6 +5253,11 @@ static int __init pci_setup(char *str)
pci_hotplug_io_size = memparse(str + 9, &str);
} else if (!strncmp(str, "hpmemsize=", 10)) {
pci_hotplug_mem_size = memparse(str + 10, &str);
+ } else if (!strncmp(str, "hpbussize=", 10)) {
+ pci_hotplug_bus_size =
+ simple_strtoul(str + 10, &str, 0);
+ if (pci_hotplug_bus_size > 0xff)
+ pci_hotplug_bus_size = DEFAULT_HOTPLUG_BUS_SIZE;
} else if (!strncmp(str, "pcie_bus_tune_off", 17)) {
pcie_bus_config = PCIE_BUS_TUNE_OFF;
} else if (!strncmp(str, "pcie_bus_safe", 13)) {
diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index a814bbb80fcb..9730c474b016 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -82,6 +82,8 @@ void pci_pm_init(struct pci_dev *dev);
void pci_ea_init(struct pci_dev *dev);
void pci_allocate_cap_save_buffers(struct pci_dev *dev);
void pci_free_cap_save_buffers(struct pci_dev *dev);
+void pci_bridge_d3_device_changed(struct pci_dev *dev);
+void pci_bridge_d3_device_removed(struct pci_dev *dev);
static inline void pci_wakeup_event(struct pci_dev *dev)
{
@@ -94,6 +96,15 @@ static inline bool pci_has_subordinate(struct pci_dev *pci_dev)
return !!(pci_dev->subordinate);
}
+static inline bool pci_power_manageable(struct pci_dev *pci_dev)
+{
+ /*
+ * Currently we allow normal PCI devices and PCI bridges transition
+ * into D3 if their bridge_d3 is set.
+ */
+ return !pci_has_subordinate(pci_dev) || pci_dev->bridge_d3;
+}
+
struct pci_vpd_ops {
ssize_t (*read)(struct pci_dev *dev, loff_t pos, size_t count, void *buf);
ssize_t (*write)(struct pci_dev *dev, loff_t pos, size_t count, const void *buf);
diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
index 22ca6412bd15..7fcea75afa4c 100644
--- a/drivers/pci/pcie/Kconfig
+++ b/drivers/pci/pcie/Kconfig
@@ -83,7 +83,7 @@ config PCIE_PME
depends on PCIEPORTBUS && PM
config PCIE_DPC
- tristate "PCIe Downstream Port Containment support"
+ bool "PCIe Downstream Port Containment support"
depends on PCIEPORTBUS
default n
help
@@ -92,6 +92,3 @@ config PCIE_DPC
will be handled by the DPC driver. If your system doesn't
have this capability or you do not want to use this feature,
it is safe to answer N.
-
- To compile this driver as a module, choose M here: the module
- will be called pcie-dpc.
diff --git a/drivers/pci/pcie/aspm.c b/drivers/pci/pcie/aspm.c
index 2dfe7fdb77e7..0ec649d961d7 100644
--- a/drivers/pci/pcie/aspm.c
+++ b/drivers/pci/pcie/aspm.c
@@ -139,7 +139,7 @@ static void pcie_set_clkpm_nocheck(struct pcie_link_state *link, int enable)
static void pcie_set_clkpm(struct pcie_link_state *link, int enable)
{
/* Don't enable Clock PM if the link is not Clock PM capable */
- if (!link->clkpm_capable && enable)
+ if (!link->clkpm_capable)
enable = 0;
/* Need nothing if the specified equals to current state */
if (link->clkpm_enabled == enable)
diff --git a/drivers/pci/pcie/pcie-dpc.c b/drivers/pci/pcie/pcie-dpc.c
index ab552f1bc08f..250f87861786 100644
--- a/drivers/pci/pcie/pcie-dpc.c
+++ b/drivers/pci/pcie/pcie-dpc.c
@@ -15,8 +15,8 @@
struct dpc_dev {
struct pcie_device *dev;
- struct work_struct work;
- int cap_pos;
+ struct work_struct work;
+ int cap_pos;
};
static void dpc_wait_link_inactive(struct pci_dev *pdev)
@@ -89,7 +89,7 @@ static int dpc_probe(struct pcie_device *dev)
int status;
u16 ctl, cap;
- dpc = kzalloc(sizeof(*dpc), GFP_KERNEL);
+ dpc = devm_kzalloc(&dev->device, sizeof(*dpc), GFP_KERNEL);
if (!dpc)
return -ENOMEM;
@@ -98,11 +98,12 @@ static int dpc_probe(struct pcie_device *dev)
INIT_WORK(&dpc->work, interrupt_event_handler);
set_service_data(dev, dpc);
- status = request_irq(dev->irq, dpc_irq, IRQF_SHARED, "pcie-dpc", dpc);
+ status = devm_request_irq(&dev->device, dev->irq, dpc_irq, IRQF_SHARED,
+ "pcie-dpc", dpc);
if (status) {
dev_warn(&dev->device, "request IRQ%d failed: %d\n", dev->irq,
status);
- goto out;
+ return status;
}
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap);
@@ -117,9 +118,6 @@ static int dpc_probe(struct pcie_device *dev)
FLAG(cap, PCI_EXP_DPC_CAP_SW_TRIGGER), (cap >> 8) & 0xf,
FLAG(cap, PCI_EXP_DPC_CAP_DL_ACTIVE));
return status;
- out:
- kfree(dpc);
- return status;
}
static void dpc_remove(struct pcie_device *dev)
@@ -131,14 +129,11 @@ static void dpc_remove(struct pcie_device *dev)
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, &ctl);
ctl &= ~(PCI_EXP_DPC_CTL_EN_NONFATAL | PCI_EXP_DPC_CTL_INT_EN);
pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
-
- free_irq(dev->irq, dpc);
- kfree(dpc);
}
static struct pcie_port_service_driver dpcdriver = {
.name = "dpc",
- .port_type = PCI_EXP_TYPE_ROOT_PORT | PCI_EXP_TYPE_DOWNSTREAM,
+ .port_type = PCIE_ANY_PORT,
.service = PCIE_PORT_SERVICE_DPC,
.probe = dpc_probe,
.remove = dpc_remove,
diff --git a/drivers/pci/pcie/portdrv_core.c b/drivers/pci/pcie/portdrv_core.c
index 32d4d0a3d20e..e9270b4026f3 100644
--- a/drivers/pci/pcie/portdrv_core.c
+++ b/drivers/pci/pcie/portdrv_core.c
@@ -11,6 +11,7 @@
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/pm.h>
+#include <linux/pm_runtime.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/pcieport_if.h>
@@ -342,6 +343,8 @@ static int pcie_device_init(struct pci_dev *pdev, int service, int irq)
return retval;
}
+ pm_runtime_no_callbacks(device);
+
return 0;
}
diff --git a/drivers/pci/pcie/portdrv_pci.c b/drivers/pci/pcie/portdrv_pci.c
index be35da2e105e..70d7ad8c6d17 100644
--- a/drivers/pci/pcie/portdrv_pci.c
+++ b/drivers/pci/pcie/portdrv_pci.c
@@ -93,6 +93,26 @@ static int pcie_port_resume_noirq(struct device *dev)
return 0;
}
+static int pcie_port_runtime_suspend(struct device *dev)
+{
+ return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY;
+}
+
+static int pcie_port_runtime_resume(struct device *dev)
+{
+ return 0;
+}
+
+static int pcie_port_runtime_idle(struct device *dev)
+{
+ /*
+ * Assume the PCI core has set bridge_d3 whenever it thinks the port
+ * should be good to go to D3. Everything else, including moving
+ * the port to D3, is handled by the PCI core.
+ */
+ return to_pci_dev(dev)->bridge_d3 ? 0 : -EBUSY;
+}
+
static const struct dev_pm_ops pcie_portdrv_pm_ops = {
.suspend = pcie_port_device_suspend,
.resume = pcie_port_device_resume,
@@ -101,6 +121,9 @@ static const struct dev_pm_ops pcie_portdrv_pm_ops = {
.poweroff = pcie_port_device_suspend,
.restore = pcie_port_device_resume,
.resume_noirq = pcie_port_resume_noirq,
+ .runtime_suspend = pcie_port_runtime_suspend,
+ .runtime_resume = pcie_port_runtime_resume,
+ .runtime_idle = pcie_port_runtime_idle,
};
#define PCIE_PORTDRV_PM_OPS (&pcie_portdrv_pm_ops)
@@ -134,16 +157,39 @@ static int pcie_portdrv_probe(struct pci_dev *dev,
return status;
pci_save_state(dev);
+
/*
- * D3cold may not work properly on some PCIe port, so disable
- * it by default.
+ * Prevent runtime PM if the port is advertising support for PCIe
+ * hotplug. Otherwise the BIOS hotplug SMI code might not be able
+ * to enumerate devices behind this port properly (the port is
+ * powered down preventing all config space accesses to the
+ * subordinate devices). We can't be sure for native PCIe hotplug
+ * either so prevent that as well.
*/
- dev->d3cold_allowed = false;
+ if (!dev->is_hotplug_bridge) {
+ /*
+ * Keep the port resumed 100ms to make sure things like
+ * config space accesses from userspace (lspci) will not
+ * cause the port to repeatedly suspend and resume.
+ */
+ pm_runtime_set_autosuspend_delay(&dev->dev, 100);
+ pm_runtime_use_autosuspend(&dev->dev);
+ pm_runtime_mark_last_busy(&dev->dev);
+ pm_runtime_put_autosuspend(&dev->dev);
+ pm_runtime_allow(&dev->dev);
+ }
+
return 0;
}
static void pcie_portdrv_remove(struct pci_dev *dev)
{
+ if (!dev->is_hotplug_bridge) {
+ pm_runtime_forbid(&dev->dev);
+ pm_runtime_get_noresume(&dev->dev);
+ pm_runtime_dont_use_autosuspend(&dev->dev);
+ }
+
pcie_port_device_remove(dev);
}
diff --git a/drivers/pci/probe.c b/drivers/pci/probe.c
index 8e3ef720997d..93f280df3428 100644
--- a/drivers/pci/probe.c
+++ b/drivers/pci/probe.c
@@ -16,6 +16,7 @@
#include <linux/aer.h>
#include <linux/acpi.h>
#include <linux/irqdomain.h>
+#include <linux/pm_runtime.h>
#include "pci.h"
#define CARDBUS_LATENCY_TIMER 176 /* secondary latency timer */
@@ -832,6 +833,12 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
u8 primary, secondary, subordinate;
int broken = 0;
+ /*
+ * Make sure the bridge is powered on to be able to access config
+ * space of devices below it.
+ */
+ pm_runtime_get_sync(&dev->dev);
+
pci_read_config_dword(dev, PCI_PRIMARY_BUS, &buses);
primary = buses & 0xFF;
secondary = (buses >> 8) & 0xFF;
@@ -1012,6 +1019,8 @@ int pci_scan_bridge(struct pci_bus *bus, struct pci_dev *dev, int max, int pass)
out:
pci_write_config_word(dev, PCI_BRIDGE_CONTROL, bctl);
+ pm_runtime_put(&dev->dev);
+
return max;
}
EXPORT_SYMBOL(pci_scan_bridge);
@@ -2077,6 +2086,15 @@ unsigned int pci_scan_child_bus(struct pci_bus *bus)
}
/*
+ * Make sure a hotplug bridge has at least the minimum requested
+ * number of buses.
+ */
+ if (bus->self && bus->self->is_hotplug_bridge && pci_hotplug_bus_size) {
+ if (max - bus->busn_res.start < pci_hotplug_bus_size - 1)
+ max = bus->busn_res.start + pci_hotplug_bus_size - 1;
+ }
+
+ /*
* We've scanned the bus and so we know all about what's on
* the other side of any bridges that may be on this bus plus
* any devices.
@@ -2127,7 +2145,9 @@ struct pci_bus *pci_create_root_bus(struct device *parent, int bus,
b->sysdata = sysdata;
b->ops = ops;
b->number = b->busn_res.start = bus;
- pci_bus_assign_domain_nr(b, parent);
+#ifdef CONFIG_PCI_DOMAINS_GENERIC
+ b->domain_nr = pci_bus_find_domain_nr(b, parent);
+#endif
b2 = pci_find_bus(pci_domain_nr(b), bus);
if (b2) {
/* If we already got to this bus through a different bridge, ignore it */
diff --git a/drivers/pci/proc.c b/drivers/pci/proc.c
index 3f155e78513f..2408abe4ee8c 100644
--- a/drivers/pci/proc.c
+++ b/drivers/pci/proc.c
@@ -231,7 +231,7 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
{
struct pci_dev *dev = PDE_DATA(file_inode(file));
struct pci_filp_private *fpriv = file->private_data;
- int i, ret;
+ int i, ret, write_combine;
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
@@ -245,9 +245,12 @@ static int proc_bus_pci_mmap(struct file *file, struct vm_area_struct *vma)
if (i >= PCI_ROM_RESOURCE)
return -ENODEV;
+ if (fpriv->mmap_state == pci_mmap_mem)
+ write_combine = fpriv->write_combine;
+ else
+ write_combine = 0;
ret = pci_mmap_page_range(dev, vma,
- fpriv->mmap_state,
- fpriv->write_combine);
+ fpriv->mmap_state, write_combine);
if (ret < 0)
return ret;
diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c
index ee72ebe18f4b..37ff0158e45f 100644
--- a/drivers/pci/quirks.c
+++ b/drivers/pci/quirks.c
@@ -3189,13 +3189,15 @@ static void quirk_no_bus_reset(struct pci_dev *dev)
}
/*
- * Atheros AR93xx chips do not behave after a bus reset. The device will
- * throw a Link Down error on AER-capable systems and regardless of AER,
- * config space of the device is never accessible again and typically
- * causes the system to hang or reset when access is attempted.
+ * Some Atheros AR9xxx and QCA988x chips do not behave after a bus reset.
+ * The device will throw a Link Down error on AER-capable systems and
+ * regardless of AER, config space of the device is never accessible again
+ * and typically causes the system to hang or reset when access is attempted.
* http://www.spinics.net/lists/linux-pci/msg34797.html
*/
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0030, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x0032, quirk_no_bus_reset);
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATHEROS, 0x003c, quirk_no_bus_reset);
static void quirk_no_pm_reset(struct pci_dev *dev)
{
@@ -3711,6 +3713,9 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9172,
/* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c59 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x917a,
quirk_dma_func1_alias);
+/* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c78 */
+DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x9182,
+ quirk_dma_func1_alias);
/* https://bugzilla.kernel.org/show_bug.cgi?id=42679#c46 */
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MARVELL_EXT, 0x91a0,
quirk_dma_func1_alias);
@@ -3747,6 +3752,9 @@ static const struct pci_device_id fixed_dma_alias_tbl[] = {
{ PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x0285,
PCI_VENDOR_ID_ADAPTEC2, 0x02bb), /* Adaptec 3405 */
.driver_data = PCI_DEVFN(1, 0) },
+ { PCI_DEVICE_SUB(PCI_VENDOR_ID_ADAPTEC2, 0x0285,
+ PCI_VENDOR_ID_ADAPTEC2, 0x02bc), /* Adaptec 3805 */
+ .driver_data = PCI_DEVFN(1, 0) },
{ 0 }
};
@@ -4087,6 +4095,7 @@ static const struct pci_dev_acs_enabled {
{ PCI_VENDOR_ID_AMD, 0x7809, pci_quirk_amd_sb_acs },
{ PCI_VENDOR_ID_SOLARFLARE, 0x0903, pci_quirk_mf_endpoint_acs },
{ PCI_VENDOR_ID_SOLARFLARE, 0x0923, pci_quirk_mf_endpoint_acs },
+ { PCI_VENDOR_ID_SOLARFLARE, 0x0A03, pci_quirk_mf_endpoint_acs },
{ PCI_VENDOR_ID_INTEL, 0x10C6, pci_quirk_mf_endpoint_acs },
{ PCI_VENDOR_ID_INTEL, 0x10DB, pci_quirk_mf_endpoint_acs },
{ PCI_VENDOR_ID_INTEL, 0x10DD, pci_quirk_mf_endpoint_acs },
diff --git a/drivers/pci/remove.c b/drivers/pci/remove.c
index 8982026637d5..d1ef7acf6930 100644
--- a/drivers/pci/remove.c
+++ b/drivers/pci/remove.c
@@ -96,6 +96,8 @@ static void pci_remove_bus_device(struct pci_dev *dev)
dev->subordinate = NULL;
}
+ pci_bridge_d3_device_removed(dev);
+
pci_destroy_dev(dev);
}
diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c
index d678c46e5f03..c74059e10a6d 100644
--- a/drivers/pci/setup-bus.c
+++ b/drivers/pci/setup-bus.c
@@ -1428,6 +1428,74 @@ void pci_bus_assign_resources(const struct pci_bus *bus)
}
EXPORT_SYMBOL(pci_bus_assign_resources);
+static void pci_claim_device_resources(struct pci_dev *dev)
+{
+ int i;
+
+ for (i = 0; i < PCI_BRIDGE_RESOURCES; i++) {
+ struct resource *r = &dev->resource[i];
+
+ if (!r->flags || r->parent)
+ continue;
+
+ pci_claim_resource(dev, i);
+ }
+}
+
+static void pci_claim_bridge_resources(struct pci_dev *dev)
+{
+ int i;
+
+ for (i = PCI_BRIDGE_RESOURCES; i < PCI_NUM_RESOURCES; i++) {
+ struct resource *r = &dev->resource[i];
+
+ if (!r->flags || r->parent)
+ continue;
+
+ pci_claim_bridge_resource(dev, i);
+ }
+}
+
+static void pci_bus_allocate_dev_resources(struct pci_bus *b)
+{
+ struct pci_dev *dev;
+ struct pci_bus *child;
+
+ list_for_each_entry(dev, &b->devices, bus_list) {
+ pci_claim_device_resources(dev);
+
+ child = dev->subordinate;
+ if (child)
+ pci_bus_allocate_dev_resources(child);
+ }
+}
+
+static void pci_bus_allocate_resources(struct pci_bus *b)
+{
+ struct pci_bus *child;
+
+ /*
+ * Carry out a depth-first search on the PCI bus
+ * tree to allocate bridge apertures. Read the
+ * programmed bridge bases and recursively claim
+ * the respective bridge resources.
+ */
+ if (b->self) {
+ pci_read_bridge_bases(b);
+ pci_claim_bridge_resources(b->self);
+ }
+
+ list_for_each_entry(child, &b->children, node)
+ pci_bus_allocate_resources(child);
+}
+
+void pci_bus_claim_resources(struct pci_bus *b)
+{
+ pci_bus_allocate_resources(b);
+ pci_bus_allocate_dev_resources(b);
+}
+EXPORT_SYMBOL(pci_bus_claim_resources);
+
static void __pci_bridge_assign_resources(const struct pci_dev *bridge,
struct list_head *add_head,
struct list_head *fail_head)
diff --git a/drivers/s390/char/sclp_early.c b/drivers/s390/char/sclp_early.c
index 0ac520dd1b21..c71df0c7dedc 100644
--- a/drivers/s390/char/sclp_early.c
+++ b/drivers/s390/char/sclp_early.c
@@ -46,7 +46,8 @@ struct read_info_sccb {
u64 rnmax2; /* 104-111 */
u8 _pad_112[116 - 112]; /* 112-115 */
u8 fac116; /* 116 */
- u8 _pad_117[119 - 117]; /* 117-118 */
+ u8 fac117; /* 117 */
+ u8 _pad_118; /* 118 */
u8 fac119; /* 119 */
u16 hcpua; /* 120-121 */
u8 _pad_122[124 - 122]; /* 122-123 */
@@ -114,7 +115,12 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
sclp.facilities = sccb->facilities;
sclp.has_sprp = !!(sccb->fac84 & 0x02);
sclp.has_core_type = !!(sccb->fac84 & 0x01);
+ sclp.has_gsls = !!(sccb->fac85 & 0x80);
+ sclp.has_64bscao = !!(sccb->fac116 & 0x80);
+ sclp.has_cmma = !!(sccb->fac116 & 0x40);
sclp.has_esca = !!(sccb->fac116 & 0x08);
+ sclp.has_pfmfi = !!(sccb->fac117 & 0x40);
+ sclp.has_ibs = !!(sccb->fac117 & 0x20);
sclp.has_hvs = !!(sccb->fac119 & 0x80);
if (sccb->fac85 & 0x02)
S390_lowcore.machine_flags |= MACHINE_FLAG_ESOP;
@@ -145,6 +151,10 @@ static void __init sclp_facilities_detect(struct read_info_sccb *sccb)
sclp.has_siif = cpue->siif;
sclp.has_sigpif = cpue->sigpif;
sclp.has_sief2 = cpue->sief2;
+ sclp.has_gpere = cpue->gpere;
+ sclp.has_ib = cpue->ib;
+ sclp.has_cei = cpue->cei;
+ sclp.has_skey = cpue->skey;
break;
}
diff --git a/drivers/s390/char/sclp_ocf.c b/drivers/s390/char/sclp_ocf.c
index 2553db0fdb52..f59b71776bbd 100644
--- a/drivers/s390/char/sclp_ocf.c
+++ b/drivers/s390/char/sclp_ocf.c
@@ -26,7 +26,7 @@
#define OCF_LENGTH_CPC_NAME 8UL
static char hmc_network[OCF_LENGTH_HMC_NETWORK + 1];
-static char cpc_name[OCF_LENGTH_CPC_NAME + 1];
+static char cpc_name[OCF_LENGTH_CPC_NAME]; /* in EBCDIC */
static DEFINE_SPINLOCK(sclp_ocf_lock);
static struct work_struct sclp_ocf_change_work;
@@ -72,9 +72,8 @@ static void sclp_ocf_handler(struct evbuf_header *evbuf)
}
if (cpc) {
size = min(OCF_LENGTH_CPC_NAME, (size_t) cpc->length);
+ memset(cpc_name, 0, OCF_LENGTH_CPC_NAME);
memcpy(cpc_name, cpc + 1, size);
- EBCASC(cpc_name, size);
- cpc_name[size] = 0;
}
spin_unlock(&sclp_ocf_lock);
schedule_work(&sclp_ocf_change_work);
@@ -85,15 +84,23 @@ static struct sclp_register sclp_ocf_event = {
.receiver_fn = sclp_ocf_handler,
};
+void sclp_ocf_cpc_name_copy(char *dst)
+{
+ spin_lock_irq(&sclp_ocf_lock);
+ memcpy(dst, cpc_name, OCF_LENGTH_CPC_NAME);
+ spin_unlock_irq(&sclp_ocf_lock);
+}
+EXPORT_SYMBOL(sclp_ocf_cpc_name_copy);
+
static ssize_t cpc_name_show(struct kobject *kobj,
struct kobj_attribute *attr, char *page)
{
- int rc;
+ char name[OCF_LENGTH_CPC_NAME + 1];
- spin_lock_irq(&sclp_ocf_lock);
- rc = snprintf(page, PAGE_SIZE, "%s\n", cpc_name);
- spin_unlock_irq(&sclp_ocf_lock);
- return rc;
+ sclp_ocf_cpc_name_copy(name);
+ name[OCF_LENGTH_CPC_NAME] = 0;
+ EBCASC(name, OCF_LENGTH_CPC_NAME);
+ return snprintf(page, PAGE_SIZE, "%s\n", name);
}
static struct kobj_attribute cpc_name_attr =
diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c
index adf61b43eb70..734a0428ef0e 100644
--- a/drivers/scsi/lpfc/lpfc_init.c
+++ b/drivers/scsi/lpfc/lpfc_init.c
@@ -4854,20 +4854,17 @@ static int
lpfc_enable_pci_dev(struct lpfc_hba *phba)
{
struct pci_dev *pdev;
- int bars = 0;
/* Obtain PCI device reference */
if (!phba->pcidev)
goto out_error;
else
pdev = phba->pcidev;
- /* Select PCI BARs */
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
/* Enable PCI device */
if (pci_enable_device_mem(pdev))
goto out_error;
/* Request PCI resource for the device */
- if (pci_request_selected_regions(pdev, bars, LPFC_DRIVER_NAME))
+ if (pci_request_mem_regions(pdev, LPFC_DRIVER_NAME))
goto out_disable_device;
/* Set up device as PCI master and save state for EEH */
pci_set_master(pdev);
@@ -4884,7 +4881,7 @@ out_disable_device:
pci_disable_device(pdev);
out_error:
lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
- "1401 Failed to enable pci device, bars:x%x\n", bars);
+ "1401 Failed to enable pci device\n");
return -ENODEV;
}
@@ -4899,17 +4896,14 @@ static void
lpfc_disable_pci_dev(struct lpfc_hba *phba)
{
struct pci_dev *pdev;
- int bars;
/* Obtain PCI device reference */
if (!phba->pcidev)
return;
else
pdev = phba->pcidev;
- /* Select PCI BARs */
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
/* Release PCI resource and disable PCI device */
- pci_release_selected_regions(pdev, bars);
+ pci_release_mem_regions(pdev);
pci_disable_device(pdev);
return;
@@ -9811,7 +9805,6 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev)
struct lpfc_vport **vports;
struct lpfc_hba *phba = vport->phba;
int i;
- int bars = pci_select_bars(pdev, IORESOURCE_MEM);
spin_lock_irq(&phba->hbalock);
vport->load_flag |= FC_UNLOADING;
@@ -9886,7 +9879,7 @@ lpfc_pci_remove_one_s3(struct pci_dev *pdev)
lpfc_hba_free(phba);
- pci_release_selected_regions(pdev, bars);
+ pci_release_mem_regions(pdev);
pci_disable_device(pdev);
}
diff --git a/drivers/usb/host/xhci-pci.c b/drivers/usb/host/xhci-pci.c
index c10972fcc8e4..4fd041bec332 100644
--- a/drivers/usb/host/xhci-pci.c
+++ b/drivers/usb/host/xhci-pci.c
@@ -387,7 +387,7 @@ static int xhci_pci_suspend(struct usb_hcd *hcd, bool do_wakeup)
* need to have the registers polled during D3, so avoid D3cold.
*/
if (xhci->quirks & XHCI_COMP_MODE_QUIRK)
- pdev->no_d3cold = true;
+ pci_d3cold_disable(pdev);
if (xhci->quirks & XHCI_PME_STUCK_QUIRK)
xhci_pme_quirk(hcd);
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index 26a9d10d75e9..d5b6f959a3c3 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1730,7 +1730,8 @@ enum {
POOL_WRITE = 2,
};
-static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
+static int __ceph_pool_perm_get(struct ceph_inode_info *ci,
+ s64 pool, struct ceph_string *pool_ns)
{
struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode);
struct ceph_mds_client *mdsc = fsc->mdsc;
@@ -1738,6 +1739,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
struct rb_node **p, *parent;
struct ceph_pool_perm *perm;
struct page **pages;
+ size_t pool_ns_len;
int err = 0, err2 = 0, have = 0;
down_read(&mdsc->pool_perm_rwsem);
@@ -1749,17 +1751,31 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
up_read(&mdsc->pool_perm_rwsem);
if (*p)
goto out;
- dout("__ceph_pool_perm_get pool %u no perm cached\n", pool);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s no perm cached\n",
+ pool, (int)pool_ns->len, pool_ns->str);
+ else
+ dout("__ceph_pool_perm_get pool %lld no perm cached\n", pool);
down_write(&mdsc->pool_perm_rwsem);
+ p = &mdsc->pool_perm_tree.rb_node;
parent = NULL;
while (*p) {
parent = *p;
@@ -1769,8 +1785,17 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
else if (pool > perm->pool)
p = &(*p)->rb_right;
else {
- have = perm->perm;
- break;
+ int ret = ceph_compare_string(pool_ns,
+ perm->pool_ns,
+ perm->pool_ns_len);
+ if (ret < 0)
+ p = &(*p)->rb_left;
+ else if (ret > 0)
+ p = &(*p)->rb_right;
+ else {
+ have = perm->perm;
+ break;
+ }
}
}
if (*p) {
@@ -1788,6 +1813,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
rd_req->r_flags = CEPH_OSD_FLAG_READ;
osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0);
rd_req->r_base_oloc.pool = pool;
+ if (pool_ns)
+ rd_req->r_base_oloc.pool_ns = ceph_get_string(pool_ns);
ceph_oid_printf(&rd_req->r_base_oid, "%llx.00000000", ci->i_vino.ino);
err = ceph_osdc_alloc_messages(rd_req, GFP_NOFS);
@@ -1841,7 +1868,8 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out_unlock;
}
- perm = kmalloc(sizeof(*perm), GFP_NOFS);
+ pool_ns_len = pool_ns ? pool_ns->len : 0;
+ perm = kmalloc(sizeof(*perm) + pool_ns_len + 1, GFP_NOFS);
if (!perm) {
err = -ENOMEM;
goto out_unlock;
@@ -1849,6 +1877,11 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
perm->pool = pool;
perm->perm = have;
+ perm->pool_ns_len = pool_ns_len;
+ if (pool_ns_len > 0)
+ memcpy(perm->pool_ns, pool_ns->str, pool_ns_len);
+ perm->pool_ns[pool_ns_len] = 0;
+
rb_link_node(&perm->node, parent, p);
rb_insert_color(&perm->node, &mdsc->pool_perm_tree);
err = 0;
@@ -1860,43 +1893,46 @@ out_unlock:
out:
if (!err)
err = have;
- dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err);
+ if (pool_ns)
+ dout("__ceph_pool_perm_get pool %lld ns %.*s result = %d\n",
+ pool, (int)pool_ns->len, pool_ns->str, err);
+ else
+ dout("__ceph_pool_perm_get pool %lld result = %d\n", pool, err);
return err;
}
int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
{
- u32 pool;
+ s64 pool;
+ struct ceph_string *pool_ns;
int ret, flags;
- /* does not support pool namespace yet */
- if (ci->i_pool_ns_len)
- return -EIO;
-
if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
NOPOOLPERM))
return 0;
spin_lock(&ci->i_ceph_lock);
flags = ci->i_ceph_flags;
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
spin_unlock(&ci->i_ceph_lock);
check:
if (flags & CEPH_I_POOL_PERM) {
if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) {
- dout("ceph_pool_perm_check pool %u no read perm\n",
+ dout("ceph_pool_perm_check pool %lld no read perm\n",
pool);
return -EPERM;
}
if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) {
- dout("ceph_pool_perm_check pool %u no write perm\n",
+ dout("ceph_pool_perm_check pool %lld no write perm\n",
pool);
return -EPERM;
}
return 0;
}
- ret = __ceph_pool_perm_get(ci, pool);
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ ret = __ceph_pool_perm_get(ci, pool, pool_ns);
+ ceph_put_string(pool_ns);
if (ret < 0)
return ret;
@@ -1907,10 +1943,11 @@ check:
flags |= CEPH_I_POOL_WR;
spin_lock(&ci->i_ceph_lock);
- if (pool == ceph_file_layout_pg_pool(ci->i_layout)) {
- ci->i_ceph_flags = flags;
+ if (pool == ci->i_layout.pool_id &&
+ pool_ns == rcu_dereference_raw(ci->i_layout.pool_ns)) {
+ ci->i_ceph_flags |= flags;
} else {
- pool = ceph_file_layout_pg_pool(ci->i_layout);
+ pool = ci->i_layout.pool_id;
flags = ci->i_ceph_flags;
}
spin_unlock(&ci->i_ceph_lock);
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index 238c55b01723..5bc5d37b1217 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -71,7 +71,7 @@ int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
&ceph_fscache_fsid_object_def,
fsc, true);
if (!fsc->fscache)
- pr_err("Unable to resgister fsid: %p fscache cookie", fsc);
+ pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
return 0;
}
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 6f60d0a3d0f9..99115cae1652 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -40,6 +40,11 @@
* cluster to release server state.
*/
+static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc);
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid);
/*
* Generate readable cap strings for debugging output.
@@ -849,12 +854,14 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
*/
int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
{
- int want = 0;
- int mode;
- for (mode = 0; mode < CEPH_FILE_MODE_NUM; mode++)
- if (ci->i_nr_by_mode[mode])
- want |= ceph_caps_for_mode(mode);
- return want;
+ int i, bits = 0;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (ci->i_nr_by_mode[i])
+ bits |= 1 << i;
+ }
+ if (bits == 0)
+ return 0;
+ return ceph_caps_for_mode(bits >> 1);
}
/*
@@ -991,7 +998,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime,
- struct timespec *ctime, u64 time_warp_seq,
+ struct timespec *ctime, u32 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version,
struct ceph_buffer *xattrs_buf,
@@ -1116,8 +1123,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
struct inode *inode = &ci->vfs_inode;
u64 cap_id = cap->cap_id;
int held, revoking, dropping, keep;
- u64 seq, issue_seq, mseq, time_warp_seq, follows;
- u64 size, max_size;
+ u64 follows, size, max_size;
+ u32 seq, issue_seq, mseq, time_warp_seq;
struct timespec mtime, atime, ctime;
int wake = 0;
umode_t mode;
@@ -1215,6 +1222,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
return delayed;
}
+static inline int __send_flush_snap(struct inode *inode,
+ struct ceph_mds_session *session,
+ struct ceph_cap_snap *capsnap,
+ u32 mseq, u64 oldest_flush_tid)
+{
+ return send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->cap_flush.tid,
+ oldest_flush_tid, 0, mseq, capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ &capsnap->ctime, capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ capsnap->xattr_version, capsnap->xattr_blob,
+ capsnap->follows, capsnap->inline_data);
+}
+
/*
* When a snapshot is taken, clients accumulate dirty metadata on
* inodes with capabilities in ceph_cap_snaps to describe the file
@@ -1222,37 +1245,22 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
* asynchronously back to the MDS once sync writes complete and dirty
* data is written out.
*
- * Unless @kick is true, skip cap_snaps that were already sent to
- * the MDS (i.e., during this session).
- *
* Called under i_ceph_lock. Takes s_mutex as needed.
*/
-void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int kick)
+static void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session *session)
__releases(ci->i_ceph_lock)
__acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
- int mds;
+ struct ceph_mds_client *mdsc = session->s_mdsc;
struct ceph_cap_snap *capsnap;
- u32 mseq;
- struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
- struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
- session->s_mutex */
- u64 next_follows = 0; /* keep track of how far we've gotten through the
- i_cap_snaps list, and skip these entries next time
- around to avoid an infinite loop */
+ u64 oldest_flush_tid = 0;
+ u64 first_tid = 1, last_tid = 0;
- if (psession)
- session = *psession;
+ dout("__flush_snaps %p session %p\n", inode, session);
- dout("__flush_snaps %p\n", inode);
-retry:
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
- /* avoid an infiniute loop after retry */
- if (capsnap->follows < next_follows)
- continue;
/*
* we need to wait for sync writes to complete and for dirty
* pages to be written out.
@@ -1263,97 +1271,129 @@ retry:
/* should be removed by ceph_try_drop_cap_snap() */
BUG_ON(!capsnap->need_flush);
- /* pick mds, take s_mutex */
- if (ci->i_auth_cap == NULL) {
- dout("no auth cap (migrating?), doing nothing\n");
- goto out;
- }
-
/* only flush each capsnap once */
- if (!kick && !list_empty(&capsnap->flushing_item)) {
- dout("already flushed %p, skipping\n", capsnap);
+ if (capsnap->cap_flush.tid > 0) {
+ dout(" already flushed %p, skipping\n", capsnap);
continue;
}
- mds = ci->i_auth_cap->session->s_mds;
- mseq = ci->i_auth_cap->mseq;
+ spin_lock(&mdsc->cap_dirty_lock);
+ capsnap->cap_flush.tid = ++mdsc->last_cap_flush_tid;
+ list_add_tail(&capsnap->cap_flush.g_list,
+ &mdsc->cap_flush_list);
+ if (oldest_flush_tid == 0)
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item,
+ &session->s_cap_flushing);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ list_add_tail(&capsnap->cap_flush.i_list,
+ &ci->i_cap_flush_list);
- if (session && session->s_mds != mds) {
- dout("oops, wrong session %p mutex\n", session);
- if (kick)
- goto out;
+ if (first_tid == 1)
+ first_tid = capsnap->cap_flush.tid;
+ last_tid = capsnap->cap_flush.tid;
+ }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
- session = NULL;
+ ci->i_ceph_flags &= ~CEPH_I_FLUSH_SNAPS;
+
+ while (first_tid <= last_tid) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ struct ceph_cap_flush *cf;
+ int ret;
+
+ if (!(cap && cap->session == session)) {
+ dout("__flush_snaps %p auth cap %p not mds%d, "
+ "stop\n", inode, cap, session->s_mds);
+ break;
}
- if (!session) {
- spin_unlock(&ci->i_ceph_lock);
- mutex_lock(&mdsc->mutex);
- session = __ceph_lookup_mds_session(mdsc, mds);
- mutex_unlock(&mdsc->mutex);
- if (session) {
- dout("inverting session/ino locks on %p\n",
- session);
- mutex_lock(&session->s_mutex);
+
+ ret = -ENOENT;
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid >= first_tid) {
+ ret = 0;
+ break;
}
- /*
- * if session == NULL, we raced against a cap
- * deletion or migration. retry, and we'll
- * get a better @mds value next time.
- */
- spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+ if (ret < 0)
+ break;
- spin_lock(&mdsc->cap_dirty_lock);
- capsnap->flush_tid = ++mdsc->last_cap_flush_tid;
- spin_unlock(&mdsc->cap_dirty_lock);
+ first_tid = cf->tid + 1;
+ capsnap = container_of(cf, struct ceph_cap_snap, cap_flush);
atomic_inc(&capsnap->nref);
- if (list_empty(&capsnap->flushing_item))
- list_add_tail(&capsnap->flushing_item,
- &session->s_cap_snaps_flushing);
spin_unlock(&ci->i_ceph_lock);
- dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n",
- inode, capsnap, capsnap->follows, capsnap->flush_tid);
- send_cap_msg(session, ceph_vino(inode).ino, 0,
- CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
- capsnap->dirty, 0, capsnap->flush_tid, 0,
- 0, mseq, capsnap->size, 0,
- &capsnap->mtime, &capsnap->atime,
- &capsnap->ctime, capsnap->time_warp_seq,
- capsnap->uid, capsnap->gid, capsnap->mode,
- capsnap->xattr_version, capsnap->xattr_blob,
- capsnap->follows, capsnap->inline_data);
-
- next_follows = capsnap->follows + 1;
- ceph_put_cap_snap(capsnap);
+ dout("__flush_snaps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid, ceph_cap_string(capsnap->dirty));
+
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("__flush_snaps: error sending cap flushsnap, "
+ "ino (%llx.%llx) tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid, capsnap->follows);
+ }
+ ceph_put_cap_snap(capsnap);
spin_lock(&ci->i_ceph_lock);
- goto retry;
}
+}
- /* we flushed them all; remove this inode from the queue */
- spin_lock(&mdsc->snap_flush_lock);
- list_del_init(&ci->i_snap_flush_item);
- spin_unlock(&mdsc->snap_flush_lock);
+void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = *psession;
+ int mds;
+ dout("ceph_flush_snaps %p\n", inode);
+retry:
+ spin_lock(&ci->i_ceph_lock);
+ if (!(ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)) {
+ dout(" no capsnap needs flush, doing nothing\n");
+ goto out;
+ }
+ if (!ci->i_auth_cap) {
+ dout(" no auth cap (migrating?), doing nothing\n");
+ goto out;
+ }
-out:
- if (psession)
- *psession = session;
- else if (session) {
+ mds = ci->i_auth_cap->session->s_mds;
+ if (session && session->s_mds != mds) {
+ dout(" oops, wrong session %p mutex\n", session);
mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout(" inverting session/ino locks on %p\n", session);
+ mutex_lock(&session->s_mutex);
+ }
+ goto retry;
}
-}
-static void ceph_flush_snaps(struct ceph_inode_info *ci)
-{
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, NULL, 0);
+ __ceph_flush_snaps(ci, session);
+out:
spin_unlock(&ci->i_ceph_lock);
+
+ if (psession) {
+ *psession = session;
+ } else {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
}
/*
@@ -1411,52 +1451,6 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
return dirty;
}
-static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, i_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->i_node, parent, p);
- rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
-}
-
-static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
- struct ceph_cap_flush *cf)
-{
- struct rb_node **p = &mdsc->cap_flush_tree.rb_node;
- struct rb_node *parent = NULL;
- struct ceph_cap_flush *other = NULL;
-
- while (*p) {
- parent = *p;
- other = rb_entry(parent, struct ceph_cap_flush, g_node);
-
- if (cf->tid < other->tid)
- p = &(*p)->rb_left;
- else if (cf->tid > other->tid)
- p = &(*p)->rb_right;
- else
- BUG();
- }
-
- rb_link_node(&cf->g_node, parent, p);
- rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
-}
-
struct ceph_cap_flush *ceph_alloc_cap_flush(void)
{
return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
@@ -1470,23 +1464,54 @@ void ceph_free_cap_flush(struct ceph_cap_flush *cf)
static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc)
{
- struct rb_node *n = rb_first(&mdsc->cap_flush_tree);
- if (n) {
+ if (!list_empty(&mdsc->cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, g_node);
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
return cf->tid;
}
return 0;
}
/*
+ * Remove cap_flush from the mdsc's or inode's flushing cap list.
+ * Return true if caller needs to wake up flush waiters.
+ */
+static bool __finish_cap_flush(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci,
+ struct ceph_cap_flush *cf)
+{
+ struct ceph_cap_flush *prev;
+ bool wake = cf->wake;
+ if (mdsc) {
+ /* are there older pending cap flushes? */
+ if (wake && cf->g_list.prev != &mdsc->cap_flush_list) {
+ prev = list_prev_entry(cf, g_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->g_list);
+ } else if (ci) {
+ if (wake && cf->i_list.prev != &ci->i_cap_flush_list) {
+ prev = list_prev_entry(cf, i_list);
+ prev->wake = true;
+ wake = false;
+ }
+ list_del(&cf->i_list);
+ } else {
+ BUG_ON(1);
+ }
+ return wake;
+}
+
+/*
* Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving.
*
* Called under i_ceph_lock.
*/
static int __mark_caps_flushing(struct inode *inode,
- struct ceph_mds_session *session,
+ struct ceph_mds_session *session, bool wake,
u64 *flush_tid, u64 *oldest_flush_tid)
{
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
@@ -1509,26 +1534,22 @@ static int __mark_caps_flushing(struct inode *inode,
swap(cf, ci->i_prealloc_cap_flush);
cf->caps = flushing;
+ cf->wake = wake;
spin_lock(&mdsc->cap_dirty_lock);
list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid;
- __add_cap_flushing_to_mdsc(mdsc, cf);
+ list_add_tail(&cf->g_list, &mdsc->cap_flush_list);
*oldest_flush_tid = __get_oldest_flush_tid(mdsc);
if (list_empty(&ci->i_flushing_item)) {
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++;
- dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
- } else {
- list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
- dout(" inode %p now flushing (more) tid %llu\n",
- inode, cf->tid);
}
spin_unlock(&mdsc->cap_dirty_lock);
- __add_cap_flushing_to_inode(ci, cf);
+ list_add_tail(&cf->i_list, &ci->i_cap_flush_list);
*flush_tid = cf->tid;
return flushing;
@@ -1583,10 +1604,11 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
int mds = -1; /* keep track of how far we've gone through i_caps list
to avoid an infinite loop on retry */
struct rb_node *p;
- int tried_invalidate = 0;
- int delayed = 0, sent = 0, force_requeue = 0, num;
- int queue_invalidate = 0;
- int is_delayed = flags & CHECK_CAPS_NODELAY;
+ int delayed = 0, sent = 0, num;
+ bool is_delayed = flags & CHECK_CAPS_NODELAY;
+ bool queue_invalidate = false;
+ bool force_requeue = false;
+ bool tried_invalidate = false;
/* if we are unmounting, flush any unused caps immediately. */
if (mdsc->stopping)
@@ -1597,9 +1619,6 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
- /* flush snaps first time around only */
- if (!list_empty(&ci->i_cap_snaps))
- __ceph_flush_snaps(ci, &session, 0);
goto retry_locked;
retry:
spin_lock(&ci->i_ceph_lock);
@@ -1666,17 +1685,17 @@ retry_locked:
if (revoking & (CEPH_CAP_FILE_CACHE|
CEPH_CAP_FILE_LAZYIO)) {
dout("check_caps queuing invalidate\n");
- queue_invalidate = 1;
+ queue_invalidate = true;
ci->i_rdcache_revoking = ci->i_rdcache_gen;
} else {
dout("check_caps failed to invalidate pages\n");
/* we failed to invalidate pages. check these
caps again later. */
- force_requeue = 1;
+ force_requeue = true;
__cap_set_timeouts(mdsc, ci);
}
}
- tried_invalidate = 1;
+ tried_invalidate = true;
goto retry_locked;
}
@@ -1720,10 +1739,15 @@ retry_locked:
}
}
/* flush anything dirty? */
- if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
- ci->i_dirty_caps) {
- dout("flushing dirty caps\n");
- goto ack;
+ if (cap == ci->i_auth_cap) {
+ if ((flags & CHECK_CAPS_FLUSH) && ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS) {
+ dout("flushing snap caps\n");
+ goto ack;
+ }
}
/* completed revocation? going down and there are no caps? */
@@ -1782,6 +1806,26 @@ ack:
goto retry;
}
}
+
+ /* kick flushing and flush snaps before sending normal
+ * cap message */
+ if (cap == ci->i_auth_cap &&
+ (ci->i_ceph_flags &
+ (CEPH_I_KICK_FLUSH | CEPH_I_FLUSH_SNAPS))) {
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ }
+ if (ci->i_ceph_flags & CEPH_I_FLUSH_SNAPS)
+ __ceph_flush_snaps(ci, session);
+
+ goto retry_locked;
+ }
+
/* take snap_rwsem after session mutex */
if (!took_snap_rwsem) {
if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
@@ -1796,7 +1840,7 @@ ack:
}
if (cap == ci->i_auth_cap && ci->i_dirty_caps) {
- flushing = __mark_caps_flushing(inode, session,
+ flushing = __mark_caps_flushing(inode, session, false,
&flush_tid,
&oldest_flush_tid);
} else {
@@ -1822,7 +1866,7 @@ ack:
* otherwise cancel.
*/
if (delayed && is_delayed)
- force_requeue = 1; /* __send_cap delayed release; requeue */
+ force_requeue = true; /* __send_cap delayed release; requeue */
if (!delayed && !is_delayed)
__cap_delay_cancel(mdsc, ci);
else if (!is_delayed || force_requeue)
@@ -1873,8 +1917,8 @@ retry:
if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
goto out;
- flushing = __mark_caps_flushing(inode, session, &flush_tid,
- &oldest_flush_tid);
+ flushing = __mark_caps_flushing(inode, session, true,
+ &flush_tid, &oldest_flush_tid);
/* __send_cap drops i_ceph_lock */
delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
@@ -1887,10 +1931,11 @@ retry:
spin_unlock(&ci->i_ceph_lock);
}
} else {
- struct rb_node *n = rb_last(&ci->i_cap_flush_tree);
- if (n) {
+ if (!list_empty(&ci->i_cap_flush_list)) {
struct ceph_cap_flush *cf =
- rb_entry(n, struct ceph_cap_flush, i_node);
+ list_last_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ cf->wake = true;
flush_tid = cf->tid;
}
flushing = ci->i_flushing_caps;
@@ -1910,14 +1955,13 @@ out:
static int caps_are_flushed(struct inode *inode, u64 flush_tid)
{
struct ceph_inode_info *ci = ceph_inode(inode);
- struct ceph_cap_flush *cf;
- struct rb_node *n;
int ret = 1;
spin_lock(&ci->i_ceph_lock);
- n = rb_first(&ci->i_cap_flush_tree);
- if (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ struct ceph_cap_flush * cf =
+ list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
if (cf->tid <= flush_tid)
ret = 0;
}
@@ -1926,53 +1970,6 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}
/*
- * Wait on any unsafe replies for the given inode. First wait on the
- * newest request, and make that the upper bound. Then, if there are
- * more requests, keep waiting on the oldest as long as it is still older
- * than the original request.
- */
-static void sync_write_wait(struct inode *inode)
-{
- struct ceph_inode_info *ci = ceph_inode(inode);
- struct list_head *head = &ci->i_unsafe_writes;
- struct ceph_osd_request *req;
- u64 last_tid;
-
- if (!S_ISREG(inode->i_mode))
- return;
-
- spin_lock(&ci->i_unsafe_lock);
- if (list_empty(head))
- goto out;
-
- /* set upper bound as _last_ entry in chain */
- req = list_last_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- last_tid = req->r_tid;
-
- do {
- ceph_osdc_get_request(req);
- spin_unlock(&ci->i_unsafe_lock);
- dout("sync_write_wait on tid %llu (until %llu)\n",
- req->r_tid, last_tid);
- wait_for_completion(&req->r_safe_completion);
- spin_lock(&ci->i_unsafe_lock);
- ceph_osdc_put_request(req);
-
- /*
- * from here on look at first entry in chain, since we
- * only want to wait for anything older than last_tid
- */
- if (list_empty(head))
- break;
- req = list_first_entry(head, struct ceph_osd_request,
- r_unsafe_item);
- } while (req->r_tid < last_tid);
-out:
- spin_unlock(&ci->i_unsafe_lock);
-}
-
-/*
* wait for any unsafe requests to complete.
*/
static int unsafe_request_wait(struct inode *inode)
@@ -2024,7 +2021,8 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
int dirty;
dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
- sync_write_wait(inode);
+
+ ceph_sync_write_wait(inode);
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
@@ -2087,87 +2085,74 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
return err;
}
-/*
- * After a recovering MDS goes active, we need to resend any caps
- * we were flushing.
- *
- * Caller holds session->s_mutex.
- */
-static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session)
-{
- struct ceph_cap_snap *capsnap;
-
- dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
- list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
- flushing_item) {
- struct ceph_inode_info *ci = capsnap->ci;
- struct inode *inode = &ci->vfs_inode;
- struct ceph_cap *cap;
-
- spin_lock(&ci->i_ceph_lock);
- cap = ci->i_auth_cap;
- if (cap && cap->session == session) {
- dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
- cap, capsnap);
- __ceph_flush_snaps(ci, &session, 1);
- } else {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- }
- spin_unlock(&ci->i_ceph_lock);
- }
-}
-
-static int __kick_flushing_caps(struct ceph_mds_client *mdsc,
- struct ceph_mds_session *session,
- struct ceph_inode_info *ci)
+static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_inode_info *ci,
+ u64 oldest_flush_tid)
+ __releases(ci->i_ceph_lock)
+ __acquires(ci->i_ceph_lock)
{
struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap;
struct ceph_cap_flush *cf;
- struct rb_node *n;
- int delayed = 0;
+ int ret;
u64 first_tid = 0;
- u64 oldest_flush_tid;
- spin_lock(&mdsc->cap_dirty_lock);
- oldest_flush_tid = __get_oldest_flush_tid(mdsc);
- spin_unlock(&mdsc->cap_dirty_lock);
+ list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
+ if (cf->tid < first_tid)
+ continue;
- while (true) {
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
if (!(cap && cap->session == session)) {
- pr_err("%p auth cap %p not mds%d ???\n", inode,
- cap, session->s_mds);
- spin_unlock(&ci->i_ceph_lock);
+ pr_err("%p auth cap %p not mds%d ???\n",
+ inode, cap, session->s_mds);
break;
}
- for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- if (cf->tid >= first_tid)
- break;
- }
- if (!n) {
+ first_tid = cf->tid + 1;
+
+ if (cf->caps) {
+ dout("kick_flushing_caps %p cap %p tid %llu %s\n",
+ inode, cap, cf->tid, ceph_cap_string(cf->caps));
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ cf->caps, cf->tid, oldest_flush_tid);
+ if (ret) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flush, ino (%llx.%llx) "
+ "tid %llu flushing %s\n",
+ ceph_vinop(inode), cf->tid,
+ ceph_cap_string(cf->caps));
+ }
+ } else {
+ struct ceph_cap_snap *capsnap =
+ container_of(cf, struct ceph_cap_snap,
+ cap_flush);
+ dout("kick_flushing_caps %p capsnap %p tid %llu %s\n",
+ inode, capsnap, cf->tid,
+ ceph_cap_string(capsnap->dirty));
+
+ atomic_inc(&capsnap->nref);
spin_unlock(&ci->i_ceph_lock);
- break;
- }
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
+ ret = __send_flush_snap(inode, session, capsnap, cap->mseq,
+ oldest_flush_tid);
+ if (ret < 0) {
+ pr_err("kick_flushing_caps: error sending "
+ "cap flushsnap, ino (%llx.%llx) "
+ "tid %llu follows %llu\n",
+ ceph_vinop(inode), cf->tid,
+ capsnap->follows);
+ }
- first_tid = cf->tid + 1;
+ ceph_put_cap_snap(capsnap);
+ }
- dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode,
- cap, cf->tid, ceph_cap_string(cf->caps));
- delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
- __ceph_caps_used(ci),
- __ceph_caps_wanted(ci),
- cap->issued | cap->implemented,
- cf->caps, cf->tid, oldest_flush_tid);
+ spin_lock(&ci->i_ceph_lock);
}
- return delayed;
}
void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
@@ -2175,8 +2160,14 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
{
struct ceph_inode_info *ci;
struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("early_kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
@@ -2196,10 +2187,11 @@ void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc,
*/
if ((cap->issued & ci->i_flushing_caps) !=
ci->i_flushing_caps) {
- spin_unlock(&ci->i_ceph_lock);
- if (!__kick_flushing_caps(mdsc, session, ci))
- continue;
- spin_lock(&ci->i_ceph_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
+ } else {
+ ci->i_ceph_flags |= CEPH_I_KICK_FLUSH;
}
spin_unlock(&ci->i_ceph_lock);
@@ -2210,50 +2202,56 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci;
-
- kick_flushing_capsnaps(mdsc, session);
+ struct ceph_cap *cap;
+ u64 oldest_flush_tid;
dout("kick_flushing_caps mds%d\n", session->s_mds);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
+ spin_unlock(&mdsc->cap_dirty_lock);
+
list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
- int delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
+ spin_lock(&ci->i_ceph_lock);
+ cap = ci->i_auth_cap;
+ if (!(cap && cap->session == session)) {
+ pr_err("%p auth cap %p not mds%d ???\n",
+ &ci->vfs_inode, cap, session->s_mds);
spin_unlock(&ci->i_ceph_lock);
+ continue;
+ }
+ if (ci->i_ceph_flags & CEPH_I_KICK_FLUSH) {
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci,
+ oldest_flush_tid);
}
+ spin_unlock(&ci->i_ceph_lock);
}
}
static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
struct ceph_mds_session *session,
struct inode *inode)
+ __releases(ci->i_ceph_lock)
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_cap *cap;
- spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s\n", inode,
ceph_cap_string(ci->i_flushing_caps));
- __ceph_flush_snaps(ci, &session, 1);
-
- if (ci->i_flushing_caps) {
- int delayed;
-
+ if (!list_empty(&ci->i_cap_flush_list)) {
+ u64 oldest_flush_tid;
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&cap->session->s_cap_flushing);
+ oldest_flush_tid = __get_oldest_flush_tid(mdsc);
spin_unlock(&mdsc->cap_dirty_lock);
+ ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
+ __kick_flushing_caps(mdsc, session, ci, oldest_flush_tid);
spin_unlock(&ci->i_ceph_lock);
-
- delayed = __kick_flushing_caps(mdsc, session, ci);
- if (delayed) {
- spin_lock(&ci->i_ceph_lock);
- __cap_delay_requeue(mdsc, ci);
- spin_unlock(&ci->i_ceph_lock);
- }
} else {
spin_unlock(&ci->i_ceph_lock);
}
@@ -2580,16 +2578,19 @@ void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
* drop cap_snap that is not associated with any snapshot.
* we don't need to send FLUSHSNAP message for it.
*/
-static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap)
+static int ceph_try_drop_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)
{
if (!capsnap->need_flush &&
!capsnap->writing && !capsnap->dirty_pages) {
-
dout("dropping cap_snap %p follows %llu\n",
capsnap, capsnap->follows);
+ BUG_ON(capsnap->cap_flush.tid > 0);
ceph_put_snap_context(capsnap->context);
+ if (!list_is_last(&capsnap->ci_item, &ci->i_cap_snaps))
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+
list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
ceph_put_cap_snap(capsnap);
return 1;
}
@@ -2636,7 +2637,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
struct ceph_cap_snap,
ci_item);
capsnap->writing = 0;
- if (ceph_try_drop_cap_snap(capsnap))
+ if (ceph_try_drop_cap_snap(ci, capsnap))
put++;
else if (__ceph_finish_cap_snap(ci, capsnap))
flushsnaps = 1;
@@ -2661,7 +2662,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
if (last && !flushsnaps)
ceph_check_caps(ci, 0, NULL);
else if (flushsnaps)
- ceph_flush_snaps(ci);
+ ceph_flush_snaps(ci, NULL);
if (wake)
wake_up_all(&ci->i_cap_wq);
while (put-- > 0)
@@ -2679,15 +2680,19 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc)
{
struct inode *inode = &ci->vfs_inode;
- int last = 0;
- int complete_capsnap = 0;
- int drop_capsnap = 0;
- int found = 0;
struct ceph_cap_snap *capsnap = NULL;
+ int put = 0;
+ bool last = false;
+ bool found = false;
+ bool flush_snaps = false;
+ bool complete_capsnap = false;
spin_lock(&ci->i_ceph_lock);
ci->i_wrbuffer_ref -= nr;
- last = !ci->i_wrbuffer_ref;
+ if (ci->i_wrbuffer_ref == 0) {
+ last = true;
+ put++;
+ }
if (ci->i_head_snapc == snapc) {
ci->i_wrbuffer_ref_head -= nr;
@@ -2707,15 +2712,22 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
} else {
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->context == snapc) {
- found = 1;
+ found = true;
break;
}
}
BUG_ON(!found);
capsnap->dirty_pages -= nr;
if (capsnap->dirty_pages == 0) {
- complete_capsnap = 1;
- drop_capsnap = ceph_try_drop_cap_snap(capsnap);
+ complete_capsnap = true;
+ if (!capsnap->writing) {
+ if (ceph_try_drop_cap_snap(ci, capsnap)) {
+ put++;
+ } else {
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
+ flush_snaps = true;
+ }
+ }
}
dout("put_wrbuffer_cap_refs on %p cap_snap %p "
" snap %lld %d/%d -> %d/%d %s%s\n",
@@ -2730,12 +2742,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
if (last) {
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
- iput(inode);
- } else if (complete_capsnap) {
- ceph_flush_snaps(ci);
- wake_up_all(&ci->i_cap_wq);
+ } else if (flush_snaps) {
+ ceph_flush_snaps(ci, NULL);
}
- if (drop_capsnap)
+ if (complete_capsnap)
+ wake_up_all(&ci->i_cap_wq);
+ while (put-- > 0)
iput(inode);
}
@@ -2779,12 +2791,11 @@ static void invalidate_aliases(struct inode *inode)
*/
static void handle_cap_grant(struct ceph_mds_client *mdsc,
struct inode *inode, struct ceph_mds_caps *grant,
- u64 inline_version,
- void *inline_data, int inline_len,
+ struct ceph_string **pns, u64 inline_version,
+ void *inline_data, u32 inline_len,
struct ceph_buffer *xattr_buf,
struct ceph_mds_session *session,
- struct ceph_cap *cap, int issued,
- u32 pool_ns_len)
+ struct ceph_cap *cap, int issued)
__releases(ci->i_ceph_lock)
__releases(mdsc->snap_rwsem)
{
@@ -2895,8 +2906,18 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
/* file layout may have changed */
- ci->i_layout = grant->layout;
- ci->i_pool_ns_len = pool_ns_len;
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &grant->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, *pns);
+
+ if (ci->i_layout.pool_id != old_pool || *pns != old_ns)
+ ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
+
+ *pns = old_ns;
/* size/truncate_seq? */
queue_trunc = ceph_fill_file_size(inode, issued,
@@ -2979,13 +3000,13 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
fill_inline = true;
}
- spin_unlock(&ci->i_ceph_lock);
-
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- kick_flushing_inode_caps(mdsc, session, inode);
- up_read(&mdsc->snap_rwsem);
if (newcaps & ~issued)
wake = true;
+ kick_flushing_inode_caps(mdsc, session, inode);
+ up_read(&mdsc->snap_rwsem);
+ } else {
+ spin_unlock(&ci->i_ceph_lock);
}
if (fill_inline)
@@ -3029,23 +3050,24 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
- struct ceph_cap_flush *cf;
- struct rb_node *n;
+ struct ceph_cap_flush *cf, *tmp_cf;
LIST_HEAD(to_remove);
unsigned seq = le32_to_cpu(m->seq);
int dirty = le32_to_cpu(m->dirty);
int cleaned = 0;
- int drop = 0;
+ bool drop = false;
+ bool wake_ci = 0;
+ bool wake_mdsc = 0;
- n = rb_first(&ci->i_cap_flush_tree);
- while (n) {
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- n = rb_next(&cf->i_node);
+ list_for_each_entry_safe(cf, tmp_cf, &ci->i_cap_flush_list, i_list) {
if (cf->tid == flush_tid)
cleaned = cf->caps;
+ if (cf->caps == 0) /* capsnap */
+ continue;
if (cf->tid <= flush_tid) {
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add_tail(&cf->list, &to_remove);
+ if (__finish_cap_flush(NULL, ci, cf))
+ wake_ci = true;
+ list_add_tail(&cf->i_list, &to_remove);
} else {
cleaned &= ~cf->caps;
if (!cleaned)
@@ -3066,31 +3088,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
spin_lock(&mdsc->cap_dirty_lock);
- if (!list_empty(&to_remove)) {
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
-
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (!cf || cf->tid > flush_tid)
- wake_up_all(&mdsc->cap_flushing_wq);
+ list_for_each_entry(cf, &to_remove, i_list) {
+ if (__finish_cap_flush(mdsc, NULL, cf))
+ wake_mdsc = true;
}
if (ci->i_flushing_caps == 0) {
- list_del_init(&ci->i_flushing_item);
- if (!list_empty(&session->s_cap_flushing))
- dout(" mds%d still flushing cap on %p\n",
- session->s_mds,
- &list_entry(session->s_cap_flushing.next,
- struct ceph_inode_info,
- i_flushing_item)->vfs_inode);
+ if (list_empty(&ci->i_cap_flush_list)) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing)) {
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_first_entry(&session->s_cap_flushing,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ }
+ }
mdsc->num_cap_flushing--;
dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) {
dout(" inode %p now clean\n", inode);
BUG_ON(!list_empty(&ci->i_dirty_item));
- drop = 1;
+ drop = true;
if (ci->i_wr_ref == 0 &&
ci->i_wrbuffer_ref_head == 0) {
BUG_ON(!ci->i_head_snapc);
@@ -3102,17 +3122,21 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
}
}
spin_unlock(&mdsc->cap_dirty_lock);
- wake_up_all(&ci->i_cap_wq);
out:
spin_unlock(&ci->i_ceph_lock);
while (!list_empty(&to_remove)) {
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
+
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
if (drop)
iput(inode);
}
@@ -3131,7 +3155,9 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
u64 follows = le64_to_cpu(m->snap_follows);
struct ceph_cap_snap *capsnap;
- int drop = 0;
+ bool flushed = false;
+ bool wake_ci = false;
+ bool wake_mdsc = false;
dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
inode, ci, session->s_mds, follows);
@@ -3139,30 +3165,47 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
spin_lock(&ci->i_ceph_lock);
list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
if (capsnap->follows == follows) {
- if (capsnap->flush_tid != flush_tid) {
+ if (capsnap->cap_flush.tid != flush_tid) {
dout(" cap_snap %p follows %lld tid %lld !="
" %lld\n", capsnap, follows,
- flush_tid, capsnap->flush_tid);
+ flush_tid, capsnap->cap_flush.tid);
break;
}
- WARN_ON(capsnap->dirty_pages || capsnap->writing);
- dout(" removing %p cap_snap %p follows %lld\n",
- inode, capsnap, follows);
- ceph_put_snap_context(capsnap->context);
- list_del(&capsnap->ci_item);
- list_del(&capsnap->flushing_item);
- ceph_put_cap_snap(capsnap);
- wake_up_all(&mdsc->cap_flushing_wq);
- drop = 1;
+ flushed = true;
break;
} else {
dout(" skipping cap_snap %p follows %lld\n",
capsnap, capsnap->follows);
}
}
+ if (flushed) {
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing %p cap_snap %p follows %lld\n",
+ inode, capsnap, follows);
+ list_del(&capsnap->ci_item);
+ if (__finish_cap_flush(NULL, ci, &capsnap->cap_flush))
+ wake_ci = true;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+
+ if (list_empty(&ci->i_cap_flush_list))
+ list_del_init(&ci->i_flushing_item);
+
+ if (__finish_cap_flush(mdsc, NULL, &capsnap->cap_flush))
+ wake_mdsc = true;
+
+ spin_unlock(&mdsc->cap_dirty_lock);
+ }
spin_unlock(&ci->i_ceph_lock);
- if (drop)
+ if (flushed) {
+ ceph_put_snap_context(capsnap->context);
+ ceph_put_cap_snap(capsnap);
+ if (wake_ci)
+ wake_up_all(&ci->i_cap_wq);
+ if (wake_mdsc)
+ wake_up_all(&mdsc->cap_flushing_wq);
iput(inode);
+ }
}
/*
@@ -3267,7 +3310,8 @@ retry:
tcap->implemented |= issued;
if (cap == ci->i_auth_cap)
ci->i_auth_cap = tcap;
- if (ci->i_flushing_caps && ci->i_auth_cap == tcap) {
+ if (!list_empty(&ci->i_cap_flush_list) &&
+ ci->i_auth_cap == tcap) {
spin_lock(&mdsc->cap_dirty_lock);
list_move_tail(&ci->i_flushing_item,
&tcap->session->s_cap_flushing);
@@ -3420,20 +3464,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
struct ceph_cap *cap;
struct ceph_mds_caps *h;
struct ceph_mds_cap_peer *peer = NULL;
- struct ceph_snap_realm *realm;
+ struct ceph_snap_realm *realm = NULL;
+ struct ceph_string *pool_ns = NULL;
int mds = session->s_mds;
int op, issued;
u32 seq, mseq;
struct ceph_vino vino;
- u64 cap_id;
- u64 size, max_size;
u64 tid;
u64 inline_version = 0;
void *inline_data = NULL;
u32 inline_len = 0;
void *snaptrace;
size_t snaptrace_len;
- u32 pool_ns_len = 0;
void *p, *end;
dout("handle_caps from mds%d\n", mds);
@@ -3447,11 +3489,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
op = le32_to_cpu(h->op);
vino.ino = le64_to_cpu(h->ino);
vino.snap = CEPH_NOSNAP;
- cap_id = le64_to_cpu(h->cap_id);
seq = le32_to_cpu(h->seq);
mseq = le32_to_cpu(h->migrate_seq);
- size = le64_to_cpu(h->size);
- max_size = le64_to_cpu(h->max_size);
snaptrace = h + 1;
snaptrace_len = le32_to_cpu(h->snap_trace_len);
@@ -3490,6 +3529,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
u64 flush_tid;
u32 caller_uid, caller_gid;
u32 osd_epoch_barrier;
+ u32 pool_ns_len;
/* version >= 5 */
ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
/* version >= 6 */
@@ -3499,6 +3539,11 @@ void ceph_handle_caps(struct ceph_mds_session *session,
ceph_decode_32_safe(&p, end, caller_gid, bad);
/* version >= 8 */
ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+ if (pool_ns_len > 0) {
+ ceph_decode_need(&p, end, pool_ns_len, bad);
+ pool_ns = ceph_find_or_create_string(p, pool_ns_len);
+ p += pool_ns_len;
+ }
}
/* lookup ino */
@@ -3519,7 +3564,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
cap = ceph_get_cap(mdsc, NULL);
cap->cap_ino = vino.ino;
cap->queue_release = 1;
- cap->cap_id = cap_id;
+ cap->cap_id = le64_to_cpu(h->cap_id);
cap->mseq = mseq;
cap->seq = seq;
spin_lock(&session->s_cap_lock);
@@ -3554,10 +3599,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
handle_cap_import(mdsc, inode, h, peer, session,
&cap, &issued);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
if (realm)
ceph_put_snap_realm(mdsc, realm);
goto done_unlocked;
@@ -3579,10 +3623,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
case CEPH_CAP_OP_GRANT:
__ceph_caps_issued(ci, &issued);
issued |= __ceph_caps_dirty(ci);
- handle_cap_grant(mdsc, inode, h,
+ handle_cap_grant(mdsc, inode, h, &pool_ns,
inline_version, inline_data, inline_len,
- msg->middle, session, cap, issued,
- pool_ns_len);
+ msg->middle, session, cap, issued);
goto done_unlocked;
case CEPH_CAP_OP_FLUSH_ACK:
@@ -3613,6 +3656,7 @@ done:
mutex_unlock(&session->s_mutex);
done_unlocked:
iput(inode);
+ ceph_put_string(pool_ns);
return;
bad:
@@ -3673,6 +3717,16 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
dout("flush_dirty_caps done\n");
}
+void __ceph_get_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ int i;
+ int bits = (fmode << 1) | 1;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i))
+ ci->i_nr_by_mode[i]++;
+ }
+}
+
/*
* Drop open file reference. If we were the last open file,
* we may need to release capabilities to the MDS (or schedule
@@ -3680,15 +3734,20 @@ void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
*/
void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
{
- struct inode *inode = &ci->vfs_inode;
- int last = 0;
-
+ int i, last = 0;
+ int bits = (fmode << 1) | 1;
spin_lock(&ci->i_ceph_lock);
- dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
- ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
- BUG_ON(ci->i_nr_by_mode[fmode] == 0);
- if (--ci->i_nr_by_mode[fmode] == 0)
- last++;
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++) {
+ if (bits & (1 << i)) {
+ BUG_ON(ci->i_nr_by_mode[i] == 0);
+ if (--ci->i_nr_by_mode[i] == 0)
+ last++;
+ }
+ }
+ dout("put_fmode %p fmode %d {%d,%d,%d,%d}\n",
+ &ci->vfs_inode, fmode,
+ ci->i_nr_by_mode[0], ci->i_nr_by_mode[1],
+ ci->i_nr_by_mode[2], ci->i_nr_by_mode[3]);
spin_unlock(&ci->i_ceph_lock);
if (last && ci->i_vino.snap == CEPH_NOSNAP)
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 6e0fedf6713b..c64a0b794d49 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -59,7 +59,7 @@ int ceph_init_dentry(struct dentry *dentry)
di->dentry = dentry;
di->lease_session = NULL;
- dentry->d_time = jiffies;
+ di->time = jiffies;
/* avoid reordering d_fsdata setup so that the check above is safe */
smp_mb();
dentry->d_fsdata = di;
@@ -1124,7 +1124,7 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
void ceph_invalidate_dentry_lease(struct dentry *dentry)
{
spin_lock(&dentry->d_lock);
- dentry->d_time = jiffies;
+ ceph_dentry(dentry)->time = jiffies;
ceph_dentry(dentry)->lease_shared_gen = 0;
spin_unlock(&dentry->d_lock);
}
@@ -1133,7 +1133,8 @@ void ceph_invalidate_dentry_lease(struct dentry *dentry)
* Check if dentry lease is valid. If not, delete the lease. Try to
* renew if the least is more than half up.
*/
-static int dentry_lease_is_valid(struct dentry *dentry)
+static int dentry_lease_is_valid(struct dentry *dentry, unsigned int flags,
+ struct inode *dir)
{
struct ceph_dentry_info *di;
struct ceph_mds_session *s;
@@ -1141,12 +1142,11 @@ static int dentry_lease_is_valid(struct dentry *dentry)
u32 gen;
unsigned long ttl;
struct ceph_mds_session *session = NULL;
- struct inode *dir = NULL;
u32 seq = 0;
spin_lock(&dentry->d_lock);
di = ceph_dentry(dentry);
- if (di->lease_session) {
+ if (di && di->lease_session) {
s = di->lease_session;
spin_lock(&s->s_gen_ttl_lock);
gen = s->s_cap_gen;
@@ -1154,17 +1154,24 @@ static int dentry_lease_is_valid(struct dentry *dentry)
spin_unlock(&s->s_gen_ttl_lock);
if (di->lease_gen == gen &&
- time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, di->time) &&
time_before(jiffies, ttl)) {
valid = 1;
if (di->lease_renew_after &&
time_after(jiffies, di->lease_renew_after)) {
- /* we should renew */
- dir = d_inode(dentry->d_parent);
- session = ceph_get_mds_session(s);
- seq = di->lease_seq;
- di->lease_renew_after = 0;
- di->lease_renew_from = jiffies;
+ /*
+ * We should renew. If we're in RCU walk mode
+ * though, we can't do that so just return
+ * -ECHILD.
+ */
+ if (flags & LOOKUP_RCU) {
+ valid = -ECHILD;
+ } else {
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
}
}
}
@@ -1207,15 +1214,19 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct dentry *parent;
struct inode *dir;
- if (flags & LOOKUP_RCU)
- return -ECHILD;
+ if (flags & LOOKUP_RCU) {
+ parent = ACCESS_ONCE(dentry->d_parent);
+ dir = d_inode_rcu(parent);
+ if (!dir)
+ return -ECHILD;
+ } else {
+ parent = dget_parent(dentry);
+ dir = d_inode(parent);
+ }
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
- parent = dget_parent(dentry);
- dir = d_inode(parent);
-
/* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) {
dout("d_revalidate %p '%pd' inode %p is SNAPPED\n", dentry,
@@ -1224,12 +1235,16 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
} else if (d_really_is_positive(dentry) &&
ceph_snap(d_inode(dentry)) == CEPH_SNAPDIR) {
valid = 1;
- } else if (dentry_lease_is_valid(dentry) ||
- dir_lease_is_valid(dir, dentry)) {
- if (d_really_is_positive(dentry))
- valid = ceph_is_any_caps(d_inode(dentry));
- else
- valid = 1;
+ } else {
+ valid = dentry_lease_is_valid(dentry, flags, dir);
+ if (valid == -ECHILD)
+ return valid;
+ if (valid || dir_lease_is_valid(dir, dentry)) {
+ if (d_really_is_positive(dentry))
+ valid = ceph_is_any_caps(d_inode(dentry));
+ else
+ valid = 1;
+ }
}
if (!valid) {
@@ -1238,6 +1253,9 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
struct ceph_mds_request *req;
int op, mask, err;
+ if (flags & LOOKUP_RCU)
+ return -ECHILD;
+
op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
@@ -1273,7 +1291,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
ceph_dir_clear_complete(dir);
}
- dput(parent);
+ if (!(flags & LOOKUP_RCU))
+ dput(parent);
return valid;
}
@@ -1286,10 +1305,14 @@ static void ceph_d_release(struct dentry *dentry)
dout("d_release %p\n", dentry);
ceph_dentry_lru_del(dentry);
+
+ spin_lock(&dentry->d_lock);
+ dentry->d_fsdata = NULL;
+ spin_unlock(&dentry->d_lock);
+
if (di->lease_session)
ceph_put_mds_session(di->lease_session);
kmem_cache_free(ceph_dentry_cachep, di);
- dentry->d_fsdata = NULL;
}
static int ceph_snapdir_d_revalidate(struct dentry *dentry,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 0daaf7ceedc5..0f5375d8e030 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -708,7 +708,7 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req)
}
}
- ceph_put_page_vector(osd_data->pages, num_pages, false);
+ ceph_put_page_vector(osd_data->pages, num_pages, !aio_req->write);
ceph_osdc_put_request(req);
if (rc < 0)
@@ -821,6 +821,54 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
}
}
+/*
+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+void ceph_sync_write_wait(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ if (!S_ISREG(inode->i_mode))
+ return;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+
+ req = list_last_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+
+ dout("sync_write_wait on tid %llu (until %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ ceph_osdc_put_request(req);
+
+ spin_lock(&ci->i_unsafe_lock);
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_first_entry(head, struct ceph_osd_request,
+ r_unsafe_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}
static ssize_t
ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
@@ -964,7 +1012,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
len = ret;
}
- ceph_put_page_vector(pages, num_pages, false);
+ ceph_put_page_vector(pages, num_pages, !write);
ceph_osdc_put_request(req);
if (ret < 0)
@@ -985,6 +1033,8 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
}
if (aio_req) {
+ LIST_HEAD(osd_reqs);
+
if (aio_req->num_reqs == 0) {
kfree(aio_req);
return ret;
@@ -993,8 +1043,9 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
CEPH_CAP_FILE_RD);
- while (!list_empty(&aio_req->osd_reqs)) {
- req = list_first_entry(&aio_req->osd_reqs,
+ list_splice(&aio_req->osd_reqs, &osd_reqs);
+ while (!list_empty(&osd_reqs)) {
+ req = list_first_entry(&osd_reqs,
struct ceph_osd_request,
r_unsafe_item);
list_del_init(&req->r_unsafe_item);
@@ -1448,16 +1499,14 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
loff_t i_size;
- int ret;
+ loff_t ret;
inode_lock(inode);
if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
- if (ret < 0) {
- offset = ret;
+ if (ret < 0)
goto out;
- }
}
i_size = i_size_read(inode);
@@ -1473,7 +1522,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
* write() or lseek() might have altered it
*/
if (offset == 0) {
- offset = file->f_pos;
+ ret = file->f_pos;
goto out;
}
offset += file->f_pos;
@@ -1493,11 +1542,11 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
break;
}
- offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
+ ret = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
out:
inode_unlock(inode);
- return offset;
+ return ret;
}
static inline void ceph_zero_partial_page(
@@ -1583,9 +1632,9 @@ static int ceph_zero_objects(struct inode *inode, loff_t offset, loff_t length)
{
int ret = 0;
struct ceph_inode_info *ci = ceph_inode(inode);
- s32 stripe_unit = ceph_file_layout_su(ci->i_layout);
- s32 stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- s32 object_size = ceph_file_layout_object_size(ci->i_layout);
+ s32 stripe_unit = ci->i_layout.stripe_unit;
+ s32 stripe_count = ci->i_layout.stripe_count;
+ s32 object_size = ci->i_layout.object_size;
u64 object_set_size = object_size * stripe_count;
u64 nearly, t;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 99bdef66213a..dd3a6dbf71eb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -446,7 +446,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
- ci->i_pool_ns_len = 0;
+ RCU_INIT_POINTER(ci->i_layout.pool_ns, NULL);
ci->i_fragtree = RB_ROOT;
mutex_init(&ci->i_fragtree_mutex);
@@ -468,7 +468,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_prealloc_cap_flush = NULL;
- ci->i_cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&ci->i_cap_flush_list);
init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0;
ci->i_hold_caps_max = 0;
@@ -477,7 +477,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_head_snapc = NULL;
ci->i_snap_caps = 0;
- for (i = 0; i < CEPH_FILE_MODE_NUM; i++)
+ for (i = 0; i < CEPH_FILE_MODE_BITS; i++)
ci->i_nr_by_mode[i] = 0;
mutex_init(&ci->i_truncate_mutex);
@@ -570,6 +570,8 @@ void ceph_destroy_inode(struct inode *inode)
if (ci->i_xattrs.prealloc_blob)
ceph_buffer_put(ci->i_xattrs.prealloc_blob);
+ ceph_put_string(rcu_dereference_raw(ci->i_layout.pool_ns));
+
call_rcu(&inode->i_rcu, ceph_i_callback);
}
@@ -583,6 +585,14 @@ int ceph_drop_inode(struct inode *inode)
return 1;
}
+void ceph_evict_inode(struct inode *inode)
+{
+ /* wait unsafe sync writes */
+ ceph_sync_write_wait(inode);
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+}
+
static inline blkcnt_t calc_inode_blocks(u64 size)
{
return (size + (1<<9) - 1) >> 9;
@@ -733,6 +743,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
int issued = 0, implemented, new_issued;
struct timespec mtime, atime, ctime;
struct ceph_buffer *xattr_blob = NULL;
+ struct ceph_string *pool_ns = NULL;
struct ceph_cap *new_cap = NULL;
int err = 0;
bool wake = false;
@@ -760,6 +771,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
iinfo->xattr_len);
}
+ if (iinfo->pool_ns_len > 0)
+ pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
+ iinfo->pool_ns_len);
+
spin_lock(&ci->i_ceph_lock);
/*
@@ -814,10 +829,18 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
if (new_version ||
(new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) {
- if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
+ s64 old_pool = ci->i_layout.pool_id;
+ struct ceph_string *old_ns;
+
+ ceph_file_layout_from_legacy(&ci->i_layout, &info->layout);
+ old_ns = rcu_dereference_protected(ci->i_layout.pool_ns,
+ lockdep_is_held(&ci->i_ceph_lock));
+ rcu_assign_pointer(ci->i_layout.pool_ns, pool_ns);
+
+ if (ci->i_layout.pool_id != old_pool || pool_ns != old_ns)
ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
- ci->i_layout = info->layout;
- ci->i_pool_ns_len = iinfo->pool_ns_len;
+
+ pool_ns = old_ns;
queue_trunc = ceph_fill_file_size(inode, issued,
le32_to_cpu(info->truncate_seq),
@@ -985,6 +1008,7 @@ out:
ceph_put_cap(mdsc, new_cap);
if (xattr_blob)
ceph_buffer_put(xattr_blob);
+ ceph_put_string(pool_ns);
return err;
}
@@ -1018,7 +1042,7 @@ static void update_dentry_lease(struct dentry *dentry,
goto out_unlock;
if (di->lease_gen == session->s_cap_gen &&
- time_before(ttl, dentry->d_time))
+ time_before(ttl, di->time))
goto out_unlock; /* we already have a newer lease. */
if (di->lease_session && di->lease_session != session)
@@ -1032,7 +1056,7 @@ static void update_dentry_lease(struct dentry *dentry,
di->lease_seq = le32_to_cpu(lease->seq);
di->lease_renew_after = half_ttl;
di->lease_renew_from = 0;
- dentry->d_time = ttl;
+ di->time = ttl;
out_unlock:
spin_unlock(&dentry->d_lock);
return;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index be6b1657b1af..7d752d53353a 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -21,10 +21,10 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false);
if (!err) {
- l.stripe_unit = ceph_file_layout_su(ci->i_layout);
- l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
- l.object_size = ceph_file_layout_object_size(ci->i_layout);
- l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ l.stripe_unit = ci->i_layout.stripe_unit;
+ l.stripe_count = ci->i_layout.stripe_count;
+ l.object_size = ci->i_layout.object_size;
+ l.data_pool = ci->i_layout.pool_id;
l.preferred_osd = (s32)-1;
if (copy_to_user(arg, &l, sizeof(l)))
return -EFAULT;
@@ -82,19 +82,19 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
if (l.stripe_count)
nl.stripe_count = l.stripe_count;
else
- nl.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ nl.stripe_count = ci->i_layout.stripe_count;
if (l.stripe_unit)
nl.stripe_unit = l.stripe_unit;
else
- nl.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ nl.stripe_unit = ci->i_layout.stripe_unit;
if (l.object_size)
nl.object_size = l.object_size;
else
- nl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ nl.object_size = ci->i_layout.object_size;
if (l.data_pool)
nl.data_pool = l.data_pool;
else
- nl.data_pool = ceph_file_layout_pg_pool(ci->i_layout);
+ nl.data_pool = ci->i_layout.pool_id;
/* this is obsolete, and always -1 */
nl.preferred_osd = le64_to_cpu(-1);
@@ -183,7 +183,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
struct ceph_osd_client *osdc =
&ceph_sb_to_client(inode->i_sb)->client->osdc;
struct ceph_object_locator oloc;
- struct ceph_object_id oid;
+ CEPH_DEFINE_OID_ONSTACK(oid);
u64 len = 1, olen;
u64 tmp;
struct ceph_pg pgid;
@@ -202,8 +202,8 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EIO;
}
dl.file_offset -= dl.object_offset;
- dl.object_size = ceph_file_layout_object_size(ci->i_layout);
- dl.block_size = ceph_file_layout_su(ci->i_layout);
+ dl.object_size = ci->i_layout.object_size;
+ dl.block_size = ci->i_layout.stripe_unit;
/* block_offset = object_offset % block_size */
tmp = dl.object_offset;
@@ -212,10 +212,13 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no);
- oloc.pool = ceph_file_layout_pg_pool(ci->i_layout);
+ oloc.pool = ci->i_layout.pool_id;
+ oloc.pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
ceph_oid_printf(&oid, "%s", dl.object_name);
r = ceph_object_locator_to_pg(osdc->osdmap, &oid, &oloc, &pgid);
+
+ ceph_oloc_destroy(&oloc);
if (r < 0) {
up_read(&osdc->lock);
return r;
@@ -247,9 +250,8 @@ static long ceph_ioctl_lazyio(struct file *file)
if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) {
spin_lock(&ci->i_ceph_lock);
- ci->i_nr_by_mode[fi->fmode]--;
fi->fmode |= CEPH_FILE_MODE_LAZY;
- ci->i_nr_by_mode[fi->fmode]++;
+ ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++;
spin_unlock(&ci->i_ceph_lock);
dout("ioctl_layzio: file %p marked lazy\n", file);
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 4e8678a612b6..fa59a85226b2 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -48,7 +48,7 @@
struct ceph_reconnect_state {
int nr_caps;
struct ceph_pagelist *pagelist;
- bool flock;
+ unsigned msg_version;
};
static void __wake_requests(struct ceph_mds_client *mdsc,
@@ -100,12 +100,15 @@ static int parse_reply_info_in(void **p, void *end,
} else
info->inline_version = CEPH_INLINE_NONE;
+ info->pool_ns_len = 0;
+ info->pool_ns_data = NULL;
if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
- ceph_decode_need(p, end, info->pool_ns_len, bad);
- *p += info->pool_ns_len;
- } else {
- info->pool_ns_len = 0;
+ if (info->pool_ns_len > 0) {
+ ceph_decode_need(p, end, info->pool_ns_len, bad);
+ info->pool_ns_data = *p;
+ *p += info->pool_ns_len;
+ }
}
return 0;
@@ -469,7 +472,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
s->s_cap_iterator = NULL;
INIT_LIST_HEAD(&s->s_cap_releases);
INIT_LIST_HEAD(&s->s_cap_flushing);
- INIT_LIST_HEAD(&s->s_cap_snaps_flushing);
dout("register_session mds%d\n", mds);
if (mds >= mdsc->max_sessions) {
@@ -1145,19 +1147,17 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
ACCESS_ONCE(fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
invalidate = true;
- while (true) {
- struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
- if (!n)
- break;
- cf = rb_entry(n, struct ceph_cap_flush, i_node);
- rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
- list_add(&cf->list, &to_remove);
+ while (!list_empty(&ci->i_cap_flush_list)) {
+ cf = list_first_entry(&ci->i_cap_flush_list,
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
+ list_add(&cf->i_list, &to_remove);
}
spin_lock(&mdsc->cap_dirty_lock);
- list_for_each_entry(cf, &to_remove, list)
- rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
+ list_for_each_entry(cf, &to_remove, i_list)
+ list_del(&cf->g_list);
if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited(
@@ -1181,7 +1181,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
spin_unlock(&mdsc->cap_dirty_lock);
if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) {
- list_add(&ci->i_prealloc_cap_flush->list, &to_remove);
+ list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove);
ci->i_prealloc_cap_flush = NULL;
}
}
@@ -1189,8 +1189,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
while (!list_empty(&to_remove)) {
struct ceph_cap_flush *cf;
cf = list_first_entry(&to_remove,
- struct ceph_cap_flush, list);
- list_del(&cf->list);
+ struct ceph_cap_flush, i_list);
+ list_del(&cf->i_list);
ceph_free_cap_flush(cf);
}
@@ -1212,6 +1212,8 @@ static void remove_session_caps(struct ceph_mds_session *session)
dout("remove_session_caps on %p\n", session);
iterate_session_caps(session, remove_session_caps_cb, fsc);
+ wake_up_all(&fsc->mdsc->cap_flushing_wq);
+
spin_lock(&session->s_cap_lock);
if (session->s_nr_caps > 0) {
struct inode *inode;
@@ -1478,35 +1480,21 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0;
}
-static int check_capsnap_flush(struct ceph_inode_info *ci,
- u64 want_snap_seq)
-{
- int ret = 1;
- spin_lock(&ci->i_ceph_lock);
- if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&ci->i_cap_snaps,
- struct ceph_cap_snap, ci_item);
- ret = capsnap->follows >= want_snap_seq;
- }
- spin_unlock(&ci->i_ceph_lock);
- return ret;
-}
-
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
- struct rb_node *n;
- struct ceph_cap_flush *cf;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
- n = rb_first(&mdsc->cap_flush_tree);
- cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
- if (cf && cf->tid <= want_flush_tid) {
- dout("check_caps_flush still flushing tid %llu <= %llu\n",
- cf->tid, want_flush_tid);
- ret = 0;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_first_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ if (cf->tid <= want_flush_tid) {
+ dout("check_caps_flush still flushing tid "
+ "%llu <= %llu\n", cf->tid, want_flush_tid);
+ ret = 0;
+ }
}
spin_unlock(&mdsc->cap_dirty_lock);
return ret;
@@ -1518,54 +1506,9 @@ static int check_caps_flush(struct ceph_mds_client *mdsc,
* returns true if we've flushed through want_flush_tid
*/
static void wait_caps_flush(struct ceph_mds_client *mdsc,
- u64 want_flush_tid, u64 want_snap_seq)
+ u64 want_flush_tid)
{
- int mds;
-
- dout("check_caps_flush want %llu snap want %llu\n",
- want_flush_tid, want_snap_seq);
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ) {
- struct ceph_mds_session *session = mdsc->sessions[mds];
- struct inode *inode = NULL;
-
- if (!session) {
- mds++;
- continue;
- }
- get_session(session);
- mutex_unlock(&mdsc->mutex);
-
- mutex_lock(&session->s_mutex);
- if (!list_empty(&session->s_cap_snaps_flushing)) {
- struct ceph_cap_snap *capsnap =
- list_first_entry(&session->s_cap_snaps_flushing,
- struct ceph_cap_snap,
- flushing_item);
- struct ceph_inode_info *ci = capsnap->ci;
- if (!check_capsnap_flush(ci, want_snap_seq)) {
- dout("check_cap_flush still flushing snap %p "
- "follows %lld <= %lld to mds%d\n",
- &ci->vfs_inode, capsnap->follows,
- want_snap_seq, mds);
- inode = igrab(&ci->vfs_inode);
- }
- }
- mutex_unlock(&session->s_mutex);
- ceph_put_mds_session(session);
-
- if (inode) {
- wait_event(mdsc->cap_flushing_wq,
- check_capsnap_flush(ceph_inode(inode),
- want_snap_seq));
- iput(inode);
- } else {
- mds++;
- }
-
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ dout("check_caps_flush want %llu\n", want_flush_tid);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
@@ -2163,6 +2106,11 @@ static int __do_request(struct ceph_mds_client *mdsc,
mds = __choose_mds(mdsc, req);
if (mds < 0 ||
ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
+ if (mdsc->mdsmap_err) {
+ err = mdsc->mdsmap_err;
+ dout("do_request mdsmap err %d\n", err);
+ goto finish;
+ }
dout("do_request no mds or not active, waiting for map\n");
list_add(&req->r_wait, &mdsc->waiting_for_map);
goto out;
@@ -2292,14 +2240,6 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
CEPH_CAP_PIN);
- /* deny access to directories with pool_ns layouts */
- if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
- ceph_inode(req->r_inode)->i_pool_ns_len)
- return -EIO;
- if (req->r_locked_dir &&
- ceph_inode(req->r_locked_dir)->i_pool_ns_len)
- return -EIO;
-
/* issue */
mutex_lock(&mdsc->mutex);
__register_request(mdsc, req, dir);
@@ -2791,13 +2731,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
struct ceph_mds_cap_reconnect v2;
struct ceph_mds_cap_reconnect_v1 v1;
} rec;
- size_t reclen;
struct ceph_inode_info *ci;
struct ceph_reconnect_state *recon_state = arg;
struct ceph_pagelist *pagelist = recon_state->pagelist;
char *path;
int pathlen, err;
u64 pathbase;
+ u64 snap_follows;
struct dentry *dentry;
ci = cap->ci;
@@ -2820,9 +2760,6 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
path = NULL;
pathlen = 0;
}
- err = ceph_pagelist_encode_string(pagelist, path, pathlen);
- if (err)
- goto out_free;
spin_lock(&ci->i_ceph_lock);
cap->seq = 0; /* reset cap seq */
@@ -2830,14 +2767,13 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
cap->mseq = 0; /* and migrate_seq */
cap->cap_gen = cap->session->s_cap_gen;
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
rec.v2.cap_id = cpu_to_le64(cap->cap_id);
rec.v2.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
rec.v2.issued = cpu_to_le32(cap->issued);
rec.v2.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v2.pathbase = cpu_to_le64(pathbase);
rec.v2.flock_len = 0;
- reclen = sizeof(rec.v2);
} else {
rec.v1.cap_id = cpu_to_le64(cap->cap_id);
rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci));
@@ -2847,13 +2783,23 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
ceph_encode_timespec(&rec.v1.atime, &inode->i_atime);
rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino);
rec.v1.pathbase = cpu_to_le64(pathbase);
- reclen = sizeof(rec.v1);
+ }
+
+ if (list_empty(&ci->i_cap_snaps)) {
+ snap_follows = 0;
+ } else {
+ struct ceph_cap_snap *capsnap =
+ list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap, ci_item);
+ snap_follows = capsnap->follows;
}
spin_unlock(&ci->i_ceph_lock);
- if (recon_state->flock) {
+ if (recon_state->msg_version >= 2) {
int num_fcntl_locks, num_flock_locks;
struct ceph_filelock *flocks;
+ size_t struct_len, total_len = 0;
+ u8 struct_v = 0;
encode_again:
ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
@@ -2872,20 +2818,51 @@ encode_again:
goto encode_again;
goto out_free;
}
+
+ if (recon_state->msg_version >= 3) {
+ /* version, compat_version and struct_len */
+ total_len = 2 * sizeof(u8) + sizeof(u32);
+ struct_v = 2;
+ }
/*
* number of encoded locks is stable, so copy to pagelist
*/
- rec.v2.flock_len = cpu_to_le32(2*sizeof(u32) +
- (num_fcntl_locks+num_flock_locks) *
- sizeof(struct ceph_filelock));
- err = ceph_pagelist_append(pagelist, &rec, reclen);
- if (!err)
- err = ceph_locks_to_pagelist(flocks, pagelist,
- num_fcntl_locks,
- num_flock_locks);
+ struct_len = 2 * sizeof(u32) +
+ (num_fcntl_locks + num_flock_locks) *
+ sizeof(struct ceph_filelock);
+ rec.v2.flock_len = cpu_to_le32(struct_len);
+
+ struct_len += sizeof(rec.v2);
+ struct_len += sizeof(u32) + pathlen;
+
+ if (struct_v >= 2)
+ struct_len += sizeof(u64); /* snap_follows */
+
+ total_len += struct_len;
+ err = ceph_pagelist_reserve(pagelist, total_len);
+
+ if (!err) {
+ if (recon_state->msg_version >= 3) {
+ ceph_pagelist_encode_8(pagelist, struct_v);
+ ceph_pagelist_encode_8(pagelist, 1);
+ ceph_pagelist_encode_32(pagelist, struct_len);
+ }
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v2));
+ ceph_locks_to_pagelist(flocks, pagelist,
+ num_fcntl_locks,
+ num_flock_locks);
+ if (struct_v >= 2)
+ ceph_pagelist_encode_64(pagelist, snap_follows);
+ }
kfree(flocks);
} else {
- err = ceph_pagelist_append(pagelist, &rec, reclen);
+ size_t size = sizeof(u32) + pathlen + sizeof(rec.v1);
+ err = ceph_pagelist_reserve(pagelist, size);
+ if (!err) {
+ ceph_pagelist_encode_string(pagelist, path, pathlen);
+ ceph_pagelist_append(pagelist, &rec, sizeof(rec.v1));
+ }
}
recon_state->nr_caps++;
@@ -2976,7 +2953,12 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
recon_state.nr_caps = 0;
recon_state.pagelist = pagelist;
- recon_state.flock = session->s_con.peer_features & CEPH_FEATURE_FLOCK;
+ if (session->s_con.peer_features & CEPH_FEATURE_MDSENC)
+ recon_state.msg_version = 3;
+ else if (session->s_con.peer_features & CEPH_FEATURE_FLOCK)
+ recon_state.msg_version = 2;
+ else
+ recon_state.msg_version = 1;
err = iterate_session_caps(session, encode_caps_cb, &recon_state);
if (err < 0)
goto fail;
@@ -3005,8 +2987,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
goto fail;
}
- if (recon_state.flock)
- reply->hdr.version = cpu_to_le16(2);
+ reply->hdr.version = cpu_to_le16(recon_state.msg_version);
/* raced with cap release? */
if (s_nr_caps != recon_state.nr_caps) {
@@ -3231,7 +3212,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
msecs_to_jiffies(le32_to_cpu(h->duration_ms));
di->lease_seq = seq;
- dentry->d_time = di->lease_renew_from + duration;
+ di->time = di->lease_renew_from + duration;
di->lease_renew_after = di->lease_renew_from +
(duration >> 1);
di->lease_renew_from = 0;
@@ -3297,47 +3278,6 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
}
/*
- * Preemptively release a lease we expect to invalidate anyway.
- * Pass @inode always, @dentry is optional.
- */
-void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
- struct dentry *dentry)
-{
- struct ceph_dentry_info *di;
- struct ceph_mds_session *session;
- u32 seq;
-
- BUG_ON(inode == NULL);
- BUG_ON(dentry == NULL);
-
- /* is dentry lease valid? */
- spin_lock(&dentry->d_lock);
- di = ceph_dentry(dentry);
- if (!di || !di->lease_session ||
- di->lease_session->s_mds < 0 ||
- di->lease_gen != di->lease_session->s_cap_gen ||
- !time_before(jiffies, dentry->d_time)) {
- dout("lease_release inode %p dentry %p -- "
- "no lease\n",
- inode, dentry);
- spin_unlock(&dentry->d_lock);
- return;
- }
-
- /* we do have a lease on this dentry; note mds and seq */
- session = ceph_get_mds_session(di->lease_session);
- seq = di->lease_seq;
- __ceph_mdsc_drop_dentry_lease(dentry);
- spin_unlock(&dentry->d_lock);
-
- dout("lease_release inode %p dentry %p to mds%d\n",
- inode, dentry, session->s_mds);
- ceph_mdsc_lease_send_msg(session, inode, dentry,
- CEPH_MDS_LEASE_RELEASE, seq);
- ceph_put_mds_session(session);
-}
-
-/*
* drop all leases (and dentry refs) in preparation for umount
*/
static void drop_leases(struct ceph_mds_client *mdsc)
@@ -3470,7 +3410,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock);
mdsc->last_cap_flush_tid = 1;
- mdsc->cap_flush_tree = RB_ROOT;
+ INIT_LIST_HEAD(&mdsc->cap_flush_list);
INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0;
@@ -3585,7 +3525,7 @@ restart:
void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
{
- u64 want_tid, want_flush, want_snap;
+ u64 want_tid, want_flush;
if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_SHUTDOWN)
return;
@@ -3598,17 +3538,19 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->last_cap_flush_tid;
+ if (!list_empty(&mdsc->cap_flush_list)) {
+ struct ceph_cap_flush *cf =
+ list_last_entry(&mdsc->cap_flush_list,
+ struct ceph_cap_flush, g_list);
+ cf->wake = true;
+ }
spin_unlock(&mdsc->cap_dirty_lock);
- down_read(&mdsc->snap_rwsem);
- want_snap = mdsc->last_snap_seq;
- up_read(&mdsc->snap_rwsem);
-
- dout("sync want tid %lld flush_seq %lld snap_seq %lld\n",
- want_tid, want_flush, want_snap);
+ dout("sync want tid %lld flush_seq %lld\n",
+ want_tid, want_flush);
wait_unsafe_requests(mdsc, want_tid);
- wait_caps_flush(mdsc, want_flush, want_snap);
+ wait_caps_flush(mdsc, want_flush);
}
/*
@@ -3729,11 +3671,86 @@ void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
dout("mdsc_destroy %p done\n", mdsc);
}
+void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+{
+ struct ceph_fs_client *fsc = mdsc->fsc;
+ const char *mds_namespace = fsc->mount_options->mds_namespace;
+ void *p = msg->front.iov_base;
+ void *end = p + msg->front.iov_len;
+ u32 epoch;
+ u32 map_len;
+ u32 num_fs;
+ u32 mount_fscid = (u32)-1;
+ u8 struct_v, struct_cv;
+ int err = -EINVAL;
+
+ ceph_decode_need(&p, end, sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+
+ dout("handle_fsmap epoch %u\n", epoch);
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ struct_v = ceph_decode_8(&p);
+ struct_cv = ceph_decode_8(&p);
+ map_len = ceph_decode_32(&p);
+
+ ceph_decode_need(&p, end, sizeof(u32) * 3, bad);
+ p += sizeof(u32) * 2; /* skip epoch and legacy_client_fscid */
+
+ num_fs = ceph_decode_32(&p);
+ while (num_fs-- > 0) {
+ void *info_p, *info_end;
+ u32 info_len;
+ u8 info_v, info_cv;
+ u32 fscid, namelen;
+
+ ceph_decode_need(&p, end, 2 + sizeof(u32), bad);
+ info_v = ceph_decode_8(&p);
+ info_cv = ceph_decode_8(&p);
+ info_len = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, info_len, bad);
+ info_p = p;
+ info_end = p + info_len;
+ p = info_end;
+
+ ceph_decode_need(&info_p, info_end, sizeof(u32) * 2, bad);
+ fscid = ceph_decode_32(&info_p);
+ namelen = ceph_decode_32(&info_p);
+ ceph_decode_need(&info_p, info_end, namelen, bad);
+
+ if (mds_namespace &&
+ strlen(mds_namespace) == namelen &&
+ !strncmp(mds_namespace, (char *)info_p, namelen)) {
+ mount_fscid = fscid;
+ break;
+ }
+ }
+
+ ceph_monc_got_map(&fsc->client->monc, CEPH_SUB_FSMAP, epoch);
+ if (mount_fscid != (u32)-1) {
+ fsc->client->monc.fs_cluster_id = mount_fscid;
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ ceph_monc_renew_subs(&fsc->client->monc);
+ } else {
+ err = -ENOENT;
+ goto err_out;
+ }
+ return;
+bad:
+ pr_err("error decoding fsmap\n");
+err_out:
+ mutex_lock(&mdsc->mutex);
+ mdsc->mdsmap_err = -ENOENT;
+ __wake_requests(mdsc, &mdsc->waiting_for_map);
+ mutex_unlock(&mdsc->mutex);
+ return;
+}
/*
* handle mds map update.
*/
-void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
+void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
{
u32 epoch;
u32 maplen;
@@ -3840,7 +3857,10 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(mdsc, msg);
+ ceph_mdsc_handle_mdsmap(mdsc, msg);
+ break;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(mdsc, msg);
break;
case CEPH_MSG_CLIENT_SESSION:
handle_session(s, msg);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e7d38aac7109..6b3679737d4a 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -45,6 +45,7 @@ struct ceph_mds_reply_info_in {
u32 inline_len;
char *inline_data;
u32 pool_ns_len;
+ char *pool_ns_data;
};
struct ceph_mds_reply_dir_entry {
@@ -151,7 +152,6 @@ struct ceph_mds_session {
/* protected by mutex */
struct list_head s_cap_flushing; /* inodes w/ flushing caps */
- struct list_head s_cap_snaps_flushing;
unsigned long s_renew_requested; /* last time we sent a renew req */
u64 s_renew_seq;
@@ -275,8 +275,10 @@ struct ceph_mds_request {
struct ceph_pool_perm {
struct rb_node node;
- u32 pool;
int perm;
+ s64 pool;
+ size_t pool_ns_len;
+ char pool_ns[];
};
/*
@@ -290,6 +292,7 @@ struct ceph_mds_client {
struct completion safe_umount_waiters;
wait_queue_head_t session_close_wq;
struct list_head waiting_for_map;
+ int mdsmap_err;
struct ceph_mds_session **sessions; /* NULL for mds if no session */
atomic_t num_sessions;
@@ -321,7 +324,7 @@ struct ceph_mds_client {
spinlock_t snap_flush_lock;
u64 last_cap_flush_tid;
- struct rb_root cap_flush_tree;
+ struct list_head cap_flush_list;
struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */
@@ -382,10 +385,6 @@ extern void ceph_mdsc_destroy(struct ceph_fs_client *fsc);
extern void ceph_mdsc_sync(struct ceph_mds_client *mdsc);
-extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
- struct inode *inode,
- struct dentry *dn);
-
extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
extern int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
struct inode *dir);
@@ -420,8 +419,10 @@ extern void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
struct dentry *dentry, char action,
u32 seq);
-extern void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc,
- struct ceph_msg *msg);
+extern void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
+extern void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc,
+ struct ceph_msg *msg);
extern struct ceph_mds_session *
ceph_mdsc_open_export_target_session(struct ceph_mds_client *mdsc, int target);
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 9caaa7ffc93f..9ff5219d849e 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -520,9 +520,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ihold(inode);
atomic_set(&capsnap->nref, 1);
- capsnap->ci = ci;
INIT_LIST_HEAD(&capsnap->ci_item);
- INIT_LIST_HEAD(&capsnap->flushing_item);
capsnap->follows = old_snapc->seq;
capsnap->issued = __ceph_caps_issued(ci, NULL);
@@ -551,7 +549,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
ci->i_wrbuffer_ref_head = 0;
capsnap->context = old_snapc;
list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
- old_snapc = NULL;
if (used & CEPH_CAP_FILE_WR) {
dout("queue_cap_snap %p cap_snap %p snapc %p"
@@ -563,6 +560,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci)
__ceph_finish_cap_snap(ci, capsnap);
}
capsnap = NULL;
+ old_snapc = NULL;
update_snapc:
if (ci->i_head_snapc) {
@@ -603,6 +601,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
capsnap->dirty_pages);
return 0;
}
+
+ ci->i_ceph_flags |= CEPH_I_FLUSH_SNAPS;
dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n",
inode, capsnap, capsnap->context,
capsnap->context->seq, ceph_cap_string(capsnap->dirty),
@@ -799,9 +799,7 @@ static void flush_snaps(struct ceph_mds_client *mdsc)
inode = &ci->vfs_inode;
ihold(inode);
spin_unlock(&mdsc->snap_flush_lock);
- spin_lock(&ci->i_ceph_lock);
- __ceph_flush_snaps(ci, &session, 0);
- spin_unlock(&ci->i_ceph_lock);
+ ceph_flush_snaps(ci, &session);
iput(inode);
spin_lock(&mdsc->snap_flush_lock);
}
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 91e02481ce06..e247f6f0feb7 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -108,7 +108,6 @@ static int ceph_sync_fs(struct super_block *sb, int wait)
* mount options
*/
enum {
- Opt_mds_namespace,
Opt_wsize,
Opt_rsize,
Opt_rasize,
@@ -121,6 +120,7 @@ enum {
Opt_last_int,
/* int args above */
Opt_snapdirname,
+ Opt_mds_namespace,
Opt_last_string,
/* string args above */
Opt_dirstat,
@@ -144,7 +144,6 @@ enum {
};
static match_table_t fsopt_tokens = {
- {Opt_mds_namespace, "mds_namespace=%d"},
{Opt_wsize, "wsize=%d"},
{Opt_rsize, "rsize=%d"},
{Opt_rasize, "rasize=%d"},
@@ -156,6 +155,7 @@ static match_table_t fsopt_tokens = {
{Opt_congestion_kb, "write_congestion_kb=%d"},
/* int args above */
{Opt_snapdirname, "snapdirname=%s"},
+ {Opt_mds_namespace, "mds_namespace=%s"},
/* string args above */
{Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"},
@@ -212,11 +212,14 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->snapdir_name)
return -ENOMEM;
break;
-
- /* misc */
case Opt_mds_namespace:
- fsopt->mds_namespace = intval;
+ fsopt->mds_namespace = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ if (!fsopt->mds_namespace)
+ return -ENOMEM;
break;
+ /* misc */
case Opt_wsize:
fsopt->wsize = intval;
break;
@@ -302,6 +305,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
{
dout("destroy_mount_options %p\n", args);
kfree(args->snapdir_name);
+ kfree(args->mds_namespace);
kfree(args->server_path);
kfree(args);
}
@@ -333,6 +337,9 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->snapdir_name, fsopt2->snapdir_name);
if (ret)
return ret;
+ ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
+ if (ret)
+ return ret;
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
@@ -376,7 +383,6 @@ static int parse_mount_options(struct ceph_mount_options **pfsopt,
fsopt->max_readdir = CEPH_MAX_READDIR_DEFAULT;
fsopt->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
fsopt->congestion_kb = default_congestion_kb();
- fsopt->mds_namespace = CEPH_FS_CLUSTER_ID_NONE;
/*
* Distinguish the server list from the path in "dev_name".
@@ -469,8 +475,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
- if (fsopt->mds_namespace != CEPH_FS_CLUSTER_ID_NONE)
- seq_printf(m, ",mds_namespace=%d", fsopt->mds_namespace);
+ if (fsopt->mds_namespace)
+ seq_printf(m, ",mds_namespace=%s", fsopt->mds_namespace);
if (fsopt->wsize)
seq_printf(m, ",wsize=%d", fsopt->wsize);
if (fsopt->rsize != CEPH_RSIZE_DEFAULT)
@@ -509,9 +515,11 @@ static int extra_mon_dispatch(struct ceph_client *client, struct ceph_msg *msg)
switch (type) {
case CEPH_MSG_MDS_MAP:
- ceph_mdsc_handle_map(fsc->mdsc, msg);
+ ceph_mdsc_handle_mdsmap(fsc->mdsc, msg);
+ return 0;
+ case CEPH_MSG_FS_MAP_USER:
+ ceph_mdsc_handle_fsmap(fsc->mdsc, msg);
return 0;
-
default:
return -1;
}
@@ -543,8 +551,14 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail;
}
fsc->client->extra_mon_dispatch = extra_mon_dispatch;
- fsc->client->monc.fs_cluster_id = fsopt->mds_namespace;
- ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
+
+ if (fsopt->mds_namespace == NULL) {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP,
+ 0, true);
+ } else {
+ ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_FSMAP,
+ 0, false);
+ }
fsc->mount_options = fsopt;
@@ -672,8 +686,8 @@ static int __init init_caches(void)
if (ceph_dentry_cachep == NULL)
goto bad_dentry;
- ceph_file_cachep = KMEM_CACHE(ceph_file_info,
- SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info, SLAB_MEM_SPREAD);
+
if (ceph_file_cachep == NULL)
goto bad_file;
@@ -731,6 +745,7 @@ static const struct super_operations ceph_super_ops = {
.destroy_inode = ceph_destroy_inode,
.write_inode = ceph_write_inode,
.drop_inode = ceph_drop_inode,
+ .evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
.show_options = ceph_show_options,
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 0168b49fb6ad..3e3fa9163059 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -62,7 +62,6 @@ struct ceph_mount_options {
int cap_release_safety;
int max_readdir; /* max readdir result (entires) */
int max_readdir_bytes; /* max readdir result (bytes) */
- int mds_namespace;
/*
* everything above this point can be memcmp'd; everything below
@@ -70,6 +69,7 @@ struct ceph_mount_options {
*/
char *snapdir_name; /* default ".snap" */
+ char *mds_namespace; /* default NULL */
char *server_path; /* default "/" */
};
@@ -147,6 +147,14 @@ struct ceph_cap {
#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+struct ceph_cap_flush {
+ u64 tid;
+ int caps; /* 0 means capsnap */
+ bool wake; /* wake up flush waiters when finish ? */
+ struct list_head g_list; // global
+ struct list_head i_list; // per inode
+};
+
/*
* Snapped cap state that is pending flush to mds. When a snapshot occurs,
* we first complete any in-process sync writes and writeback any dirty
@@ -154,10 +162,11 @@ struct ceph_cap {
*/
struct ceph_cap_snap {
atomic_t nref;
- struct ceph_inode_info *ci;
- struct list_head ci_item, flushing_item;
+ struct list_head ci_item;
+
+ struct ceph_cap_flush cap_flush;
- u64 follows, flush_tid;
+ u64 follows;
int issued, dirty;
struct ceph_snap_context *context;
@@ -186,16 +195,6 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
}
}
-struct ceph_cap_flush {
- u64 tid;
- int caps;
- struct rb_node g_node; // global
- union {
- struct rb_node i_node; // inode
- struct list_head list;
- };
-};
-
/*
* The frag tree describes how a directory is fragmented, potentially across
* multiple metadata servers. It is also used to indicate points where
@@ -246,7 +245,7 @@ struct ceph_dentry_info {
unsigned long lease_renew_after, lease_renew_from;
struct list_head lru;
struct dentry *dentry;
- u64 time;
+ unsigned long time;
u64 offset;
};
@@ -287,7 +286,6 @@ struct ceph_inode_info {
struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout;
- size_t i_pool_ns_len;
char *i_symlink;
/* for dirs */
@@ -311,7 +309,7 @@ struct ceph_inode_info {
* overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */
struct ceph_cap_flush *i_prealloc_cap_flush;
- struct rb_root i_cap_flush_tree;
+ struct list_head i_cap_flush_list;
wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
unsigned long i_hold_caps_min; /* jiffies */
unsigned long i_hold_caps_max; /* jiffies */
@@ -322,7 +320,7 @@ struct ceph_inode_info {
dirty|flushing caps */
unsigned i_snap_caps; /* cap bits for snapped files */
- int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+ int i_nr_by_mode[CEPH_FILE_MODE_BITS]; /* open file counts */
struct mutex i_truncate_mutex;
u32 i_truncate_seq; /* last truncate to smaller size */
@@ -471,6 +469,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
#define CEPH_I_CAP_DROPPED (1 << 8) /* caps were forcibly dropped */
+#define CEPH_I_KICK_FLUSH (1 << 9) /* kick flushing caps */
+#define CEPH_I_FLUSH_SNAPS (1 << 10) /* need flush snapss */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count,
@@ -750,6 +750,7 @@ extern const struct inode_operations ceph_file_iops;
extern struct inode *ceph_alloc_inode(struct super_block *sb);
extern void ceph_destroy_inode(struct inode *inode);
extern int ceph_drop_inode(struct inode *inode);
+extern void ceph_evict_inode(struct inode *inode);
extern struct inode *ceph_get_inode(struct super_block *sb,
struct ceph_vino vino);
@@ -890,9 +891,8 @@ extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc);
-extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
- struct ceph_mds_session **psession,
- int again);
+extern void ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
@@ -907,10 +907,7 @@ extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
/* for counting open files by mode */
-static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
-{
- ci->i_nr_by_mode[mode]++;
-}
+extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
/* addr.c */
@@ -931,6 +928,7 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
extern int ceph_release(struct inode *inode, struct file *filp);
extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
char *data, size_t len);
+extern void ceph_sync_write_wait(struct inode *inode);
/* dir.c */
extern const struct file_operations ceph_dir_fops;
extern const struct file_operations ceph_snapdir_fops;
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 4870b29df224..adc231892b0d 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -57,81 +57,88 @@ struct ceph_vxattr {
static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
{
- size_t s;
- char *p = (char *)&ci->i_layout;
-
- for (s = 0; s < sizeof(ci->i_layout); s++, p++)
- if (*p)
- return true;
- return false;
+ struct ceph_file_layout *fl = &ci->i_layout;
+ return (fl->stripe_unit > 0 || fl->stripe_count > 0 ||
+ fl->object_size > 0 || fl->pool_id >= 0 ||
+ rcu_dereference_raw(fl->pool_ns) != NULL);
}
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
size_t size)
{
- int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ struct ceph_string *pool_ns;
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
+ const char *ns_field = " pool_namespace=";
char buf[128];
+ size_t len, total_len = 0;
+ int ret;
+
+ pool_ns = ceph_try_get_string(ci->i_layout.pool_ns);
dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
down_read(&osdc->lock);
pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
if (pool_name) {
- size_t len = strlen(pool_name);
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
- if (!size) {
- ret += len;
- } else if (ret + len > size) {
- ret = -ERANGE;
- } else {
- memcpy(val, buf, ret);
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size);
+ total_len = len + strlen(pool_name);
+ } else {
+ len = snprintf(buf, sizeof(buf),
+ "stripe_unit=%u stripe_count=%u object_size=%u pool=%lld",
+ ci->i_layout.stripe_unit, ci->i_layout.stripe_count,
+ ci->i_layout.object_size, (unsigned long long)pool);
+ total_len = len;
+ }
+
+ if (pool_ns)
+ total_len += strlen(ns_field) + pool_ns->len;
+
+ if (!size) {
+ ret = total_len;
+ } else if (total_len > size) {
+ ret = -ERANGE;
+ } else {
+ memcpy(val, buf, len);
+ ret = len;
+ if (pool_name) {
+ len = strlen(pool_name);
memcpy(val + ret, pool_name, len);
ret += len;
}
- } else {
- ret = snprintf(buf, sizeof(buf),
- "stripe_unit=%lld stripe_count=%lld object_size=%lld pool=%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout),
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout),
- (unsigned long long)pool);
- if (size) {
- if (ret <= size)
- memcpy(val, buf, ret);
- else
- ret = -ERANGE;
+ if (pool_ns) {
+ len = strlen(ns_field);
+ memcpy(val + ret, ns_field, len);
+ ret += len;
+ memcpy(val + ret, pool_ns->str, pool_ns->len);
+ ret += pool_ns->len;
}
}
up_read(&osdc->lock);
+ ceph_put_string(pool_ns);
return ret;
}
static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_su(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_unit);
}
static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.stripe_count);
}
static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
char *val, size_t size)
{
- return snprintf(val, size, "%lld",
- (unsigned long long)ceph_file_layout_object_size(ci->i_layout));
+ return snprintf(val, size, "%u", ci->i_layout.object_size);
}
static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
@@ -140,7 +147,7 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
int ret;
struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
struct ceph_osd_client *osdc = &fsc->client->osdc;
- s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
+ s64 pool = ci->i_layout.pool_id;
const char *pool_name;
down_read(&osdc->lock);
@@ -153,6 +160,18 @@ static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
return ret;
}
+static size_t ceph_vxattrcb_layout_pool_namespace(struct ceph_inode_info *ci,
+ char *val, size_t size)
+{
+ int ret = 0;
+ struct ceph_string *ns = ceph_try_get_string(ci->i_layout.pool_ns);
+ if (ns) {
+ ret = snprintf(val, size, "%.*s", (int)ns->len, ns->str);
+ ceph_put_string(ns);
+ }
+ return ret;
+}
+
/* directories */
static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
@@ -241,6 +260,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
+ XATTR_LAYOUT_FIELD(dir, layout, pool_namespace),
XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs),
@@ -268,6 +288,7 @@ static struct ceph_vxattr ceph_file_vxattrs[] = {
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
+ XATTR_LAYOUT_FIELD(file, layout, pool_namespace),
{ .name = NULL, 0 } /* Required table terminator */
};
static size_t ceph_file_vxattrs_name_size; /* total size of all names */
diff --git a/fs/orangefs/downcall.h b/fs/orangefs/downcall.h
index 66b99210f1f9..db6e8722a89e 100644
--- a/fs/orangefs/downcall.h
+++ b/fs/orangefs/downcall.h
@@ -83,7 +83,10 @@ struct orangefs_listxattr_response {
};
struct orangefs_param_response {
- __s64 value;
+ union {
+ __s64 value64;
+ __s32 value32[2];
+ } u;
};
#define PERF_COUNT_BUF_SIZE 4096
diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c
index 526040e09f78..43c08b5c7168 100644
--- a/fs/orangefs/file.c
+++ b/fs/orangefs/file.c
@@ -14,6 +14,32 @@
#include <linux/fs.h>
#include <linux/pagemap.h>
+static int flush_racache(struct inode *inode)
+{
+ struct orangefs_inode_s *orangefs_inode = ORANGEFS_I(inode);
+ struct orangefs_kernel_op_s *new_op;
+ int ret;
+
+ gossip_debug(GOSSIP_UTILS_DEBUG,
+ "%s: %pU: Handle is %pU | fs_id %d\n", __func__,
+ get_khandle_from_ino(inode), &orangefs_inode->refn.khandle,
+ orangefs_inode->refn.fs_id);
+
+ new_op = op_alloc(ORANGEFS_VFS_OP_RA_FLUSH);
+ if (!new_op)
+ return -ENOMEM;
+ new_op->upcall.req.ra_cache_flush.refn = orangefs_inode->refn;
+
+ ret = service_operation(new_op, "orangefs_flush_racache",
+ get_interruptible_flag(inode));
+
+ gossip_debug(GOSSIP_UTILS_DEBUG, "%s: got return value of %d\n",
+ __func__, ret);
+
+ op_release(new_op);
+ return ret;
+}
+
/*
* Copy to client-core's address space from the buffers specified
* by the iovec upto total_size bytes.
@@ -591,15 +617,21 @@ static int orangefs_file_release(struct inode *inode, struct file *file)
orangefs_flush_inode(inode);
/*
- * remove all associated inode pages from the page cache and mmap
+ * remove all associated inode pages from the page cache and
* readahead cache (if any); this forces an expensive refresh of
* data for the next caller of mmap (or 'get_block' accesses)
*/
if (file->f_path.dentry->d_inode &&
file->f_path.dentry->d_inode->i_mapping &&
- mapping_nrpages(&file->f_path.dentry->d_inode->i_data))
+ mapping_nrpages(&file->f_path.dentry->d_inode->i_data)) {
+ gossip_debug(GOSSIP_INODE_DEBUG,
+ "calling flush_racache on %pU\n",
+ get_khandle_from_ino(inode));
+ flush_racache(inode);
+ gossip_debug(GOSSIP_INODE_DEBUG, "flush_racache finished\n");
truncate_inode_pages(file->f_path.dentry->d_inode->i_mapping,
0);
+ }
return 0;
}
diff --git a/fs/orangefs/orangefs-cache.c b/fs/orangefs/orangefs-cache.c
index b6edbe9fb309..eb0b6e00b519 100644
--- a/fs/orangefs/orangefs-cache.c
+++ b/fs/orangefs/orangefs-cache.c
@@ -73,8 +73,8 @@ char *get_opname_string(struct orangefs_kernel_op_s *new_op)
return "OP_STATFS";
else if (type == ORANGEFS_VFS_OP_TRUNCATE)
return "OP_TRUNCATE";
- else if (type == ORANGEFS_VFS_OP_MMAP_RA_FLUSH)
- return "OP_MMAP_RA_FLUSH";
+ else if (type == ORANGEFS_VFS_OP_RA_FLUSH)
+ return "OP_RA_FLUSH";
else if (type == ORANGEFS_VFS_OP_FS_MOUNT)
return "OP_FS_MOUNT";
else if (type == ORANGEFS_VFS_OP_FS_UMOUNT)
diff --git a/fs/orangefs/orangefs-dev-proto.h b/fs/orangefs/orangefs-dev-proto.h
index 9eac9d9a3f3a..71902899b1da 100644
--- a/fs/orangefs/orangefs-dev-proto.h
+++ b/fs/orangefs/orangefs-dev-proto.h
@@ -28,7 +28,7 @@
#define ORANGEFS_VFS_OP_RENAME 0xFF00000A
#define ORANGEFS_VFS_OP_STATFS 0xFF00000B
#define ORANGEFS_VFS_OP_TRUNCATE 0xFF00000C
-#define ORANGEFS_VFS_OP_MMAP_RA_FLUSH 0xFF00000D
+#define ORANGEFS_VFS_OP_RA_FLUSH 0xFF00000D
#define ORANGEFS_VFS_OP_FS_MOUNT 0xFF00000E
#define ORANGEFS_VFS_OP_FS_UMOUNT 0xFF00000F
#define ORANGEFS_VFS_OP_GETXATTR 0xFF000010
diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c
index 375708c2db87..2fe9a3a2117b 100644
--- a/fs/orangefs/orangefs-sysfs.c
+++ b/fs/orangefs/orangefs-sysfs.c
@@ -73,6 +73,24 @@
* Description:
* Time getattr is valid in milliseconds.
*
+ * What: /sys/fs/orangefs/readahead_count
+ * Date: Aug 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Readahead cache buffer count.
+ *
+ * What: /sys/fs/orangefs/readahead_size
+ * Date: Aug 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Readahead cache buffer size.
+ *
+ * What: /sys/fs/orangefs/readahead_count_size
+ * Date: Aug 2016
+ * Contact: Martin Brandenburg <martin@omnibond.com>
+ * Description:
+ * Readahead cache buffer count and size.
+ *
* What: /sys/fs/orangefs/acache/...
* Date: Jun 2015
* Contact: Martin Brandenburg <martin@omnibond.com>
@@ -836,6 +854,20 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
new_op->upcall.req.param.op =
ORANGEFS_PARAM_REQUEST_OP_PERF_RESET;
+ else if (!strcmp(orangefs_attr->attr.name,
+ "readahead_count"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+
+ else if (!strcmp(orangefs_attr->attr.name,
+ "readahead_size"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
+
+ else if (!strcmp(orangefs_attr->attr.name,
+ "readahead_count_size"))
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
} else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
acache_attr = (struct acache_orangefs_attribute *)attr;
@@ -949,10 +981,17 @@ static int sysfs_service_op_show(char *kobj_id, char *buf, void *attr)
out:
if (!rc) {
if (strcmp(kobj_id, PC_KOBJ_ID)) {
- rc = scnprintf(buf,
- PAGE_SIZE,
- "%d\n",
- (int)new_op->downcall.resp.param.value);
+ if (new_op->upcall.req.param.op ==
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE) {
+ rc = scnprintf(buf, PAGE_SIZE, "%d %d\n",
+ (int)new_op->downcall.resp.param.u.
+ value32[0],
+ (int)new_op->downcall.resp.param.u.
+ value32[1]);
+ } else {
+ rc = scnprintf(buf, PAGE_SIZE, "%d\n",
+ (int)new_op->downcall.resp.param.u.value64);
+ }
} else {
rc = scnprintf(
buf,
@@ -1079,11 +1118,18 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
}
/*
- * The value we want to send back to userspace is in buf.
+ * The value we want to send back to userspace is in buf, unless this
+ * there are two parameters, which is specially handled below.
*/
- rc = kstrtoint(buf, 0, &val);
- if (rc)
- goto out;
+ if (strcmp(kobj_id, ORANGEFS_KOBJ_ID) ||
+ strcmp(((struct orangefs_attribute *)attr)->attr.name,
+ "readahead_count_size")) {
+ rc = kstrtoint(buf, 0, &val);
+ if (rc)
+ goto out;
+ }
+
+ new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
if (!strcmp(kobj_id, ORANGEFS_KOBJ_ID)) {
orangefs_attr = (struct orangefs_attribute *)attr;
@@ -1114,6 +1160,51 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
rc = 0;
goto out;
}
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "readahead_count")) {
+ if ((val >= 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "readahead_size")) {
+ if ((val >= 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "readahead_count_size")) {
+ int val1, val2;
+ rc = sscanf(buf, "%d %d", &val1, &val2);
+ if (rc < 2) {
+ rc = 0;
+ goto out;
+ }
+ if ((val1 >= 0) && (val2 >= 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
+ new_op->upcall.req.param.u.value32[0] = val1;
+ new_op->upcall.req.param.u.value32[1] = val2;
+ goto value_set;
+ } else if (!strcmp(orangefs_attr->attr.name,
+ "perf_counter_reset")) {
+ if ((val > 0)) {
+ new_op->upcall.req.param.op =
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE;
+ } else {
+ rc = 0;
+ goto out;
+ }
}
} else if (!strcmp(kobj_id, ACACHE_KOBJ_ID)) {
@@ -1275,9 +1366,8 @@ static int sysfs_service_op_store(char *kobj_id, const char *buf, void *attr)
goto out;
}
- new_op->upcall.req.param.type = ORANGEFS_PARAM_REQUEST_SET;
-
- new_op->upcall.req.param.value = val;
+ new_op->upcall.req.param.u.value64 = val;
+value_set:
/*
* The service_operation will return a errno return code on
@@ -1400,6 +1490,18 @@ static struct orangefs_attribute dcache_timeout_msecs_attribute =
static struct orangefs_attribute getattr_timeout_msecs_attribute =
__ATTR(getattr_timeout_msecs, 0664, int_orangefs_show, int_store);
+static struct orangefs_attribute readahead_count_attribute =
+ __ATTR(readahead_count, 0664, service_orangefs_show,
+ service_orangefs_store);
+
+static struct orangefs_attribute readahead_size_attribute =
+ __ATTR(readahead_size, 0664, service_orangefs_show,
+ service_orangefs_store);
+
+static struct orangefs_attribute readahead_count_size_attribute =
+ __ATTR(readahead_count_size, 0664, service_orangefs_show,
+ service_orangefs_store);
+
static struct orangefs_attribute perf_counter_reset_attribute =
__ATTR(perf_counter_reset,
0664,
@@ -1423,6 +1525,9 @@ static struct attribute *orangefs_default_attrs[] = {
&slot_timeout_secs_attribute.attr,
&dcache_timeout_msecs_attribute.attr,
&getattr_timeout_msecs_attribute.attr,
+ &readahead_count_attribute.attr,
+ &readahead_size_attribute.attr,
+ &readahead_count_size_attribute.attr,
&perf_counter_reset_attribute.attr,
&perf_history_size_attribute.attr,
&perf_time_interval_secs_attribute.attr,
diff --git a/fs/orangefs/orangefs-utils.c b/fs/orangefs/orangefs-utils.c
index d13c7291fd05..9a99285fe310 100644
--- a/fs/orangefs/orangefs-utils.c
+++ b/fs/orangefs/orangefs-utils.c
@@ -50,7 +50,7 @@ __s32 fsid_of_op(struct orangefs_kernel_op_s *op)
case ORANGEFS_VFS_OP_TRUNCATE:
fsid = op->upcall.req.truncate.refn.fs_id;
break;
- case ORANGEFS_VFS_OP_MMAP_RA_FLUSH:
+ case ORANGEFS_VFS_OP_RA_FLUSH:
fsid = op->upcall.req.ra_cache_flush.refn.fs_id;
break;
case ORANGEFS_VFS_OP_FS_UMOUNT:
diff --git a/fs/orangefs/upcall.h b/fs/orangefs/upcall.h
index 001b20239407..7c29fdf08ec5 100644
--- a/fs/orangefs/upcall.h
+++ b/fs/orangefs/upcall.h
@@ -98,7 +98,7 @@ struct orangefs_truncate_request_s {
__s64 size;
};
-struct orangefs_mmap_ra_cache_flush_request_s {
+struct orangefs_ra_cache_flush_request_s {
struct orangefs_object_kref refn;
};
@@ -179,12 +179,18 @@ enum orangefs_param_request_op {
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_SOFT_LIMIT = 23,
ORANGEFS_PARAM_REQUEST_OP_CAPCACHE_RECLAIM_PERCENTAGE = 24,
ORANGEFS_PARAM_REQUEST_OP_TWO_MASK_VALUES = 25,
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_SIZE = 26,
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT = 27,
+ ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE = 28,
};
struct orangefs_param_request_s {
enum orangefs_param_request_type type;
enum orangefs_param_request_op op;
- __s64 value;
+ union {
+ __s64 value64;
+ __s32 value32[2];
+ } u;
char s_value[ORANGEFS_MAX_DEBUG_STRING_LEN];
};
@@ -228,7 +234,7 @@ struct orangefs_upcall_s {
struct orangefs_rename_request_s rename;
struct orangefs_statfs_request_s statfs;
struct orangefs_truncate_request_s truncate;
- struct orangefs_mmap_ra_cache_flush_request_s ra_cache_flush;
+ struct orangefs_ra_cache_flush_request_s ra_cache_flush;
struct orangefs_fs_mount_request_s fs_mount;
struct orangefs_fs_umount_request_s fs_umount;
struct orangefs_getxattr_request_s getxattr;
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 54643d1f5af4..24563970ff7b 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -164,7 +164,7 @@
#define ___OF_TABLE(cfg, name) _OF_TABLE_##cfg(name)
#define __OF_TABLE(cfg, name) ___OF_TABLE(cfg, name)
-#define OF_TABLE(cfg, name) __OF_TABLE(config_enabled(cfg), name)
+#define OF_TABLE(cfg, name) __OF_TABLE(IS_ENABLED(cfg), name)
#define _OF_TABLE_0(name)
#define _OF_TABLE_1(name) \
. = ALIGN(8); \
diff --git a/include/drm/drm_dp_helper.h b/include/drm/drm_dp_helper.h
index 72dee1213268..63b8bd502444 100644
--- a/include/drm/drm_dp_helper.h
+++ b/include/drm/drm_dp_helper.h
@@ -657,6 +657,8 @@ struct edp_vsc_psr {
#define EDP_VSC_PSR_UPDATE_RFB (1<<1)
#define EDP_VSC_PSR_CRC_VALUES_VALID (1<<2)
+int drm_dp_psr_setup_time(const u8 psr_cap[EDP_PSR_RECEIVER_CAP_SIZE]);
+
static inline int
drm_dp_max_link_rate(const u8 dpcd[DP_RECEIVER_CAP_SIZE])
{
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index da0a524802cb..540da5149ba7 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -1,6 +1,5 @@
/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
+ * Copyright (C) 2015, 2016 ARM Ltd.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -12,16 +11,10 @@
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
-
-#ifndef __ASM_ARM_KVM_VGIC_H
-#define __ASM_ARM_KVM_VGIC_H
-
-#ifdef CONFIG_KVM_NEW_VGIC
-#include <kvm/vgic/vgic.h>
-#else
+#ifndef __KVM_ARM_VGIC_H
+#define __KVM_ARM_VGIC_H
#include <linux/kernel.h>
#include <linux/kvm.h>
@@ -29,248 +22,187 @@
#include <linux/spinlock.h>
#include <linux/types.h>
#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
+#include <linux/list.h>
-#define VGIC_NR_IRQS_LEGACY 256
+#define VGIC_V3_MAX_CPUS 255
+#define VGIC_V2_MAX_CPUS 8
+#define VGIC_NR_IRQS_LEGACY 256
#define VGIC_NR_SGIS 16
#define VGIC_NR_PPIS 16
#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS)
+#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1)
+#define VGIC_MAX_SPI 1019
+#define VGIC_MAX_RESERVED 1023
+#define VGIC_MIN_LPI 8192
-#define VGIC_V2_MAX_LRS (1 << 6)
-#define VGIC_V3_MAX_LRS 16
-#define VGIC_MAX_IRQS 1024
-#define VGIC_V2_MAX_CPUS 8
-#define VGIC_V3_MAX_CPUS 255
+enum vgic_type {
+ VGIC_V2, /* Good ol' GICv2 */
+ VGIC_V3, /* New fancy GICv3 */
+};
-#if (VGIC_NR_IRQS_LEGACY & 31)
-#error "VGIC_NR_IRQS must be a multiple of 32"
-#endif
+/* same for all guests, as depending only on the _host's_ GIC model */
+struct vgic_global {
+ /* type of the host GIC */
+ enum vgic_type type;
-#if (VGIC_NR_IRQS_LEGACY > VGIC_MAX_IRQS)
-#error "VGIC_NR_IRQS must be <= 1024"
-#endif
+ /* Physical address of vgic virtual cpu interface */
+ phys_addr_t vcpu_base;
-/*
- * The GIC distributor registers describing interrupts have two parts:
- * - 32 per-CPU interrupts (SGI + PPI)
- * - a bunch of shared interrupts (SPI)
- */
-struct vgic_bitmap {
- /*
- * - One UL per VCPU for private interrupts (assumes UL is at
- * least 32 bits)
- * - As many UL as necessary for shared interrupts.
- *
- * The private interrupts are accessed via the "private"
- * field, one UL per vcpu (the state for vcpu n is in
- * private[n]). The shared interrupts are accessed via the
- * "shared" pointer (IRQn state is at bit n-32 in the bitmap).
- */
- unsigned long *private;
- unsigned long *shared;
-};
+ /* virtual control interface mapping */
+ void __iomem *vctrl_base;
-struct vgic_bytemap {
- /*
- * - 8 u32 per VCPU for private interrupts
- * - As many u32 as necessary for shared interrupts.
- *
- * The private interrupts are accessed via the "private"
- * field, (the state for vcpu n is in private[n*8] to
- * private[n*8 + 7]). The shared interrupts are accessed via
- * the "shared" pointer (IRQn state is at byte (n-32)%4 of the
- * shared[(n-32)/4] word).
- */
- u32 *private;
- u32 *shared;
-};
+ /* Number of implemented list registers */
+ int nr_lr;
-struct kvm_vcpu;
+ /* Maintenance IRQ number */
+ unsigned int maint_irq;
-enum vgic_type {
- VGIC_V2, /* Good ol' GICv2 */
- VGIC_V3, /* New fancy GICv3 */
+ /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
+ int max_gic_vcpus;
+
+ /* Only needed for the legacy KVM_CREATE_IRQCHIP */
+ bool can_emulate_gicv2;
};
-#define LR_STATE_PENDING (1 << 0)
-#define LR_STATE_ACTIVE (1 << 1)
-#define LR_STATE_MASK (3 << 0)
-#define LR_EOI_INT (1 << 2)
-#define LR_HW (1 << 3)
+extern struct vgic_global kvm_vgic_global_state;
-struct vgic_lr {
- unsigned irq:10;
- union {
- unsigned hwirq:10;
- unsigned source:3;
- };
- unsigned state:4;
-};
+#define VGIC_V2_MAX_LRS (1 << 6)
+#define VGIC_V3_MAX_LRS 16
+#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr)
-struct vgic_vmcr {
- u32 ctlr;
- u32 abpr;
- u32 bpr;
- u32 pmr;
+enum vgic_irq_config {
+ VGIC_CONFIG_EDGE = 0,
+ VGIC_CONFIG_LEVEL
};
-struct vgic_ops {
- struct vgic_lr (*get_lr)(const struct kvm_vcpu *, int);
- void (*set_lr)(struct kvm_vcpu *, int, struct vgic_lr);
- u64 (*get_elrsr)(const struct kvm_vcpu *vcpu);
- u64 (*get_eisr)(const struct kvm_vcpu *vcpu);
- void (*clear_eisr)(struct kvm_vcpu *vcpu);
- u32 (*get_interrupt_status)(const struct kvm_vcpu *vcpu);
- void (*enable_underflow)(struct kvm_vcpu *vcpu);
- void (*disable_underflow)(struct kvm_vcpu *vcpu);
- void (*get_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
- void (*set_vmcr)(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
- void (*enable)(struct kvm_vcpu *vcpu);
+struct vgic_irq {
+ spinlock_t irq_lock; /* Protects the content of the struct */
+ struct list_head lpi_list; /* Used to link all LPIs together */
+ struct list_head ap_list;
+
+ struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU
+ * SPIs and LPIs: The VCPU whose ap_list
+ * this is queued on.
+ */
+
+ struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should
+ * be sent to, as a result of the
+ * targets reg (v2) or the
+ * affinity reg (v3).
+ */
+
+ u32 intid; /* Guest visible INTID */
+ bool pending;
+ bool line_level; /* Level only */
+ bool soft_pending; /* Level only */
+ bool active; /* not used for LPIs */
+ bool enabled;
+ bool hw; /* Tied to HW IRQ */
+ struct kref refcount; /* Used for LPIs */
+ u32 hwintid; /* HW INTID number */
+ union {
+ u8 targets; /* GICv2 target VCPUs mask */
+ u32 mpidr; /* GICv3 target VCPU */
+ };
+ u8 source; /* GICv2 SGIs only */
+ u8 priority;
+ enum vgic_irq_config config; /* Level or edge */
};
-struct vgic_params {
- /* vgic type */
- enum vgic_type type;
- /* Physical address of vgic virtual cpu interface */
- phys_addr_t vcpu_base;
- /* Number of list registers */
- u32 nr_lr;
- /* Interrupt number */
- unsigned int maint_irq;
- /* Virtual control interface base address */
- void __iomem *vctrl_base;
- int max_gic_vcpus;
- /* Only needed for the legacy KVM_CREATE_IRQCHIP */
- bool can_emulate_gicv2;
-};
+struct vgic_register_region;
+struct vgic_its;
-struct vgic_vm_ops {
- bool (*queue_sgi)(struct kvm_vcpu *, int irq);
- void (*add_sgi_source)(struct kvm_vcpu *, int irq, int source);
- int (*init_model)(struct kvm *);
- int (*map_resources)(struct kvm *, const struct vgic_params *);
+enum iodev_type {
+ IODEV_CPUIF,
+ IODEV_DIST,
+ IODEV_REDIST,
+ IODEV_ITS
};
struct vgic_io_device {
- gpa_t addr;
- int len;
- const struct vgic_io_range *reg_ranges;
- struct kvm_vcpu *redist_vcpu;
+ gpa_t base_addr;
+ union {
+ struct kvm_vcpu *redist_vcpu;
+ struct vgic_its *its;
+ };
+ const struct vgic_register_region *regions;
+ enum iodev_type iodev_type;
+ int nr_regions;
struct kvm_io_device dev;
};
-struct irq_phys_map {
- u32 virt_irq;
- u32 phys_irq;
-};
-
-struct irq_phys_map_entry {
- struct list_head entry;
- struct rcu_head rcu;
- struct irq_phys_map map;
+struct vgic_its {
+ /* The base address of the ITS control register frame */
+ gpa_t vgic_its_base;
+
+ bool enabled;
+ bool initialized;
+ struct vgic_io_device iodev;
+ struct kvm_device *dev;
+
+ /* These registers correspond to GITS_BASER{0,1} */
+ u64 baser_device_table;
+ u64 baser_coll_table;
+
+ /* Protects the command queue */
+ struct mutex cmd_lock;
+ u64 cbaser;
+ u32 creadr;
+ u32 cwriter;
+
+ /* Protects the device and collection lists */
+ struct mutex its_lock;
+ struct list_head device_list;
+ struct list_head collection_list;
};
struct vgic_dist {
- spinlock_t lock;
bool in_kernel;
bool ready;
+ bool initialized;
/* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
u32 vgic_model;
- int nr_cpus;
- int nr_irqs;
+ /* Do injected MSIs require an additional device ID? */
+ bool msis_require_devid;
+
+ int nr_spis;
+ /* TODO: Consider moving to global state */
/* Virtual control interface mapping */
void __iomem *vctrl_base;
- /* Distributor and vcpu interface mapping in the guest */
- phys_addr_t vgic_dist_base;
- /* GICv2 and GICv3 use different mapped register blocks */
+ /* base addresses in guest physical address space: */
+ gpa_t vgic_dist_base; /* distributor */
union {
- phys_addr_t vgic_cpu_base;
- phys_addr_t vgic_redist_base;
+ /* either a GICv2 CPU interface */
+ gpa_t vgic_cpu_base;
+ /* or a number of GICv3 redistributor regions */
+ gpa_t vgic_redist_base;
};
- /* Distributor enabled */
- u32 enabled;
-
- /* Interrupt enabled (one bit per IRQ) */
- struct vgic_bitmap irq_enabled;
-
- /* Level-triggered interrupt external input is asserted */
- struct vgic_bitmap irq_level;
-
- /*
- * Interrupt state is pending on the distributor
- */
- struct vgic_bitmap irq_pending;
-
- /*
- * Tracks writes to GICD_ISPENDRn and GICD_ICPENDRn for level-triggered
- * interrupts. Essentially holds the state of the flip-flop in
- * Figure 4-10 on page 4-101 in ARM IHI 0048B.b.
- * Once set, it is only cleared for level-triggered interrupts on
- * guest ACKs (when we queue it) or writes to GICD_ICPENDRn.
- */
- struct vgic_bitmap irq_soft_pend;
-
- /* Level-triggered interrupt queued on VCPU interface */
- struct vgic_bitmap irq_queued;
-
- /* Interrupt was active when unqueue from VCPU interface */
- struct vgic_bitmap irq_active;
-
- /* Interrupt priority. Not used yet. */
- struct vgic_bytemap irq_priority;
+ /* distributor enabled */
+ bool enabled;
- /* Level/edge triggered */
- struct vgic_bitmap irq_cfg;
+ struct vgic_irq *spis;
- /*
- * Source CPU per SGI and target CPU:
- *
- * Each byte represent a SGI observable on a VCPU, each bit of
- * this byte indicating if the corresponding VCPU has
- * generated this interrupt. This is a GICv2 feature only.
- *
- * For VCPUn (n < 8), irq_sgi_sources[n*16] to [n*16 + 15] are
- * the SGIs observable on VCPUn.
- */
- u8 *irq_sgi_sources;
+ struct vgic_io_device dist_iodev;
- /*
- * Target CPU for each SPI:
- *
- * Array of available SPI, each byte indicating the target
- * VCPU for SPI. IRQn (n >=32) is at irq_spi_cpu[n-32].
- */
- u8 *irq_spi_cpu;
+ bool has_its;
/*
- * Reverse lookup of irq_spi_cpu for faster compute pending:
- *
- * Array of bitmaps, one per VCPU, describing if IRQn is
- * routed to a particular VCPU.
+ * Contains the attributes and gpa of the LPI configuration table.
+ * Since we report GICR_TYPER.CommonLPIAff as 0b00, we can share
+ * one address across all redistributors.
+ * GICv3 spec: 6.1.2 "LPI Configuration tables"
*/
- struct vgic_bitmap *irq_spi_target;
-
- /* Target MPIDR for each IRQ (needed for GICv3 IROUTERn) only */
- u32 *irq_spi_mpidr;
+ u64 propbaser;
- /* Bitmap indicating which CPU has something pending */
- unsigned long *irq_pending_on_cpu;
-
- /* Bitmap indicating which CPU has active IRQs */
- unsigned long *irq_active_on_cpu;
-
- struct vgic_vm_ops vm_ops;
- struct vgic_io_device dist_iodev;
- struct vgic_io_device *redist_iodevs;
-
- /* Virtual irq to hwirq mapping */
- spinlock_t irq_phys_map_lock;
- struct list_head irq_phys_map_list;
+ /* Protects the lpi_list and the count value below. */
+ spinlock_t lpi_list_lock;
+ struct list_head lpi_list_head;
+ int lpi_list_count;
};
struct vgic_v2_cpu_if {
@@ -298,78 +230,88 @@ struct vgic_v3_cpu_if {
};
struct vgic_cpu {
- /* Pending/active/both interrupts on this VCPU */
- DECLARE_BITMAP(pending_percpu, VGIC_NR_PRIVATE_IRQS);
- DECLARE_BITMAP(active_percpu, VGIC_NR_PRIVATE_IRQS);
- DECLARE_BITMAP(pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
-
- /* Pending/active/both shared interrupts, dynamically sized */
- unsigned long *pending_shared;
- unsigned long *active_shared;
- unsigned long *pend_act_shared;
-
/* CPU vif control registers for world switch */
union {
struct vgic_v2_cpu_if vgic_v2;
struct vgic_v3_cpu_if vgic_v3;
};
- /* Protected by the distributor's irq_phys_map_lock */
- struct list_head irq_phys_map_list;
+ unsigned int used_lrs;
+ struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
- u64 live_lrs;
-};
+ spinlock_t ap_list_lock; /* Protects the ap_list */
+
+ /*
+ * List of IRQs that this VCPU should consider because they are either
+ * Active or Pending (hence the name; AP list), or because they recently
+ * were one of the two and need to be migrated off this list to another
+ * VCPU.
+ */
+ struct list_head ap_list_head;
-#define LR_EMPTY 0xff
+ u64 live_lrs;
-#define INT_STATUS_EOI (1 << 0)
-#define INT_STATUS_UNDERFLOW (1 << 1)
+ /*
+ * Members below are used with GICv3 emulation only and represent
+ * parts of the redistributor.
+ */
+ struct vgic_io_device rd_iodev;
+ struct vgic_io_device sgi_iodev;
-struct kvm;
-struct kvm_vcpu;
+ /* Contains the attributes and gpa of the LPI pending tables. */
+ u64 pendbaser;
+
+ bool lpis_enabled;
+};
int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-int kvm_vgic_hyp_init(void);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_get_max_vcpus(void);
void kvm_vgic_early_init(struct kvm *kvm);
int kvm_vgic_create(struct kvm *kvm, u32 type);
void kvm_vgic_destroy(struct kvm *kvm);
void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
+int kvm_vgic_map_resources(struct kvm *kvm);
+int kvm_vgic_hyp_init(void);
+
+int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
- unsigned int virt_irq, bool level);
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq);
+int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
+ bool level);
+int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
+int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+
#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k) (!!((k)->arch.vgic.nr_cpus))
+#define vgic_initialized(k) ((k)->arch.vgic.initialized)
#define vgic_ready(k) ((k)->arch.vgic.ready)
#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \
- ((i) < (k)->arch.vgic.nr_irqs))
+ ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
+
+bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
+void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
+void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params);
#ifdef CONFIG_KVM_ARM_VGIC_V3
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params);
+void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
#else
-static inline int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
+static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
{
- return -ENODEV;
}
#endif
-#endif /* old VGIC include */
-#endif
+/**
+ * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
+ *
+ * The host's GIC naturally limits the maximum amount of VCPUs a guest
+ * can use.
+ */
+static inline int kvm_vgic_get_max_vcpus(void)
+{
+ return kvm_vgic_global_state.max_gic_vcpus;
+}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi);
+
+#endif /* __KVM_ARM_VGIC_H */
diff --git a/include/kvm/vgic/vgic.h b/include/kvm/vgic/vgic.h
deleted file mode 100644
index 3fbd175265ae..000000000000
--- a/include/kvm/vgic/vgic.h
+++ /dev/null
@@ -1,246 +0,0 @@
-/*
- * Copyright (C) 2015, 2016 ARM Ltd.
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-#ifndef __ASM_ARM_KVM_VGIC_VGIC_H
-#define __ASM_ARM_KVM_VGIC_VGIC_H
-
-#include <linux/kernel.h>
-#include <linux/kvm.h>
-#include <linux/irqreturn.h>
-#include <linux/spinlock.h>
-#include <linux/types.h>
-#include <kvm/iodev.h>
-
-#define VGIC_V3_MAX_CPUS 255
-#define VGIC_V2_MAX_CPUS 8
-#define VGIC_NR_IRQS_LEGACY 256
-#define VGIC_NR_SGIS 16
-#define VGIC_NR_PPIS 16
-#define VGIC_NR_PRIVATE_IRQS (VGIC_NR_SGIS + VGIC_NR_PPIS)
-#define VGIC_MAX_PRIVATE (VGIC_NR_PRIVATE_IRQS - 1)
-#define VGIC_MAX_SPI 1019
-#define VGIC_MAX_RESERVED 1023
-#define VGIC_MIN_LPI 8192
-
-enum vgic_type {
- VGIC_V2, /* Good ol' GICv2 */
- VGIC_V3, /* New fancy GICv3 */
-};
-
-/* same for all guests, as depending only on the _host's_ GIC model */
-struct vgic_global {
- /* type of the host GIC */
- enum vgic_type type;
-
- /* Physical address of vgic virtual cpu interface */
- phys_addr_t vcpu_base;
-
- /* virtual control interface mapping */
- void __iomem *vctrl_base;
-
- /* Number of implemented list registers */
- int nr_lr;
-
- /* Maintenance IRQ number */
- unsigned int maint_irq;
-
- /* maximum number of VCPUs allowed (GICv2 limits us to 8) */
- int max_gic_vcpus;
-
- /* Only needed for the legacy KVM_CREATE_IRQCHIP */
- bool can_emulate_gicv2;
-};
-
-extern struct vgic_global kvm_vgic_global_state;
-
-#define VGIC_V2_MAX_LRS (1 << 6)
-#define VGIC_V3_MAX_LRS 16
-#define VGIC_V3_LR_INDEX(lr) (VGIC_V3_MAX_LRS - 1 - lr)
-
-enum vgic_irq_config {
- VGIC_CONFIG_EDGE = 0,
- VGIC_CONFIG_LEVEL
-};
-
-struct vgic_irq {
- spinlock_t irq_lock; /* Protects the content of the struct */
- struct list_head ap_list;
-
- struct kvm_vcpu *vcpu; /* SGIs and PPIs: The VCPU
- * SPIs and LPIs: The VCPU whose ap_list
- * this is queued on.
- */
-
- struct kvm_vcpu *target_vcpu; /* The VCPU that this interrupt should
- * be sent to, as a result of the
- * targets reg (v2) or the
- * affinity reg (v3).
- */
-
- u32 intid; /* Guest visible INTID */
- bool pending;
- bool line_level; /* Level only */
- bool soft_pending; /* Level only */
- bool active; /* not used for LPIs */
- bool enabled;
- bool hw; /* Tied to HW IRQ */
- u32 hwintid; /* HW INTID number */
- union {
- u8 targets; /* GICv2 target VCPUs mask */
- u32 mpidr; /* GICv3 target VCPU */
- };
- u8 source; /* GICv2 SGIs only */
- u8 priority;
- enum vgic_irq_config config; /* Level or edge */
-};
-
-struct vgic_register_region;
-
-struct vgic_io_device {
- gpa_t base_addr;
- struct kvm_vcpu *redist_vcpu;
- const struct vgic_register_region *regions;
- int nr_regions;
- struct kvm_io_device dev;
-};
-
-struct vgic_dist {
- bool in_kernel;
- bool ready;
- bool initialized;
-
- /* vGIC model the kernel emulates for the guest (GICv2 or GICv3) */
- u32 vgic_model;
-
- int nr_spis;
-
- /* TODO: Consider moving to global state */
- /* Virtual control interface mapping */
- void __iomem *vctrl_base;
-
- /* base addresses in guest physical address space: */
- gpa_t vgic_dist_base; /* distributor */
- union {
- /* either a GICv2 CPU interface */
- gpa_t vgic_cpu_base;
- /* or a number of GICv3 redistributor regions */
- gpa_t vgic_redist_base;
- };
-
- /* distributor enabled */
- bool enabled;
-
- struct vgic_irq *spis;
-
- struct vgic_io_device dist_iodev;
- struct vgic_io_device *redist_iodevs;
-};
-
-struct vgic_v2_cpu_if {
- u32 vgic_hcr;
- u32 vgic_vmcr;
- u32 vgic_misr; /* Saved only */
- u64 vgic_eisr; /* Saved only */
- u64 vgic_elrsr; /* Saved only */
- u32 vgic_apr;
- u32 vgic_lr[VGIC_V2_MAX_LRS];
-};
-
-struct vgic_v3_cpu_if {
-#ifdef CONFIG_KVM_ARM_VGIC_V3
- u32 vgic_hcr;
- u32 vgic_vmcr;
- u32 vgic_sre; /* Restored only, change ignored */
- u32 vgic_misr; /* Saved only */
- u32 vgic_eisr; /* Saved only */
- u32 vgic_elrsr; /* Saved only */
- u32 vgic_ap0r[4];
- u32 vgic_ap1r[4];
- u64 vgic_lr[VGIC_V3_MAX_LRS];
-#endif
-};
-
-struct vgic_cpu {
- /* CPU vif control registers for world switch */
- union {
- struct vgic_v2_cpu_if vgic_v2;
- struct vgic_v3_cpu_if vgic_v3;
- };
-
- unsigned int used_lrs;
- struct vgic_irq private_irqs[VGIC_NR_PRIVATE_IRQS];
-
- spinlock_t ap_list_lock; /* Protects the ap_list */
-
- /*
- * List of IRQs that this VCPU should consider because they are either
- * Active or Pending (hence the name; AP list), or because they recently
- * were one of the two and need to be migrated off this list to another
- * VCPU.
- */
- struct list_head ap_list_head;
-
- u64 live_lrs;
-};
-
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write);
-void kvm_vgic_early_init(struct kvm *kvm);
-int kvm_vgic_create(struct kvm *kvm, u32 type);
-void kvm_vgic_destroy(struct kvm *kvm);
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu);
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu);
-int kvm_vgic_map_resources(struct kvm *kvm);
-int kvm_vgic_hyp_init(void);
-
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int intid,
- bool level);
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid, unsigned int intid,
- bool level);
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq);
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq);
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
-
-#define irqchip_in_kernel(k) (!!((k)->arch.vgic.in_kernel))
-#define vgic_initialized(k) ((k)->arch.vgic.initialized)
-#define vgic_ready(k) ((k)->arch.vgic.ready)
-#define vgic_valid_spi(k, i) (((i) >= VGIC_NR_PRIVATE_IRQS) && \
- ((i) < (k)->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS))
-
-bool kvm_vcpu_has_pending_irqs(struct kvm_vcpu *vcpu);
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu);
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu);
-
-#ifdef CONFIG_KVM_ARM_VGIC_V3
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
-#else
-static inline void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
-}
-#endif
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-static inline int kvm_vgic_get_max_vcpus(void)
-{
- return kvm_vgic_global_state.max_gic_vcpus;
-}
-
-#endif /* __ASM_ARM_KVM_VGIC_VGIC_H */
diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h
index dfce616002ad..7868d602c0a0 100644
--- a/include/linux/ceph/ceph_fs.h
+++ b/include/linux/ceph/ceph_fs.h
@@ -34,9 +34,9 @@
#define CEPH_MAX_MON 31
/*
- * ceph_file_layout - describe data layout for a file/inode
+ * legacy ceph_file_layoute
*/
-struct ceph_file_layout {
+struct ceph_file_layout_legacy {
/* file -> object mapping */
__le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
of page size. */
@@ -53,33 +53,27 @@ struct ceph_file_layout {
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
} __attribute__ ((packed));
-#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
-#define ceph_file_layout_stripe_count(l) \
- ((__s32)le32_to_cpu((l).fl_stripe_count))
-#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
-#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
-#define ceph_file_layout_object_su(l) \
- ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
-#define ceph_file_layout_pg_pool(l) \
- ((__s32)le32_to_cpu((l).fl_pg_pool))
-
-static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_stripe_unit) *
- le32_to_cpu(l->fl_stripe_count);
-}
-
-/* "period" == bytes before i start on a new set of objects */
-static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
-{
- return le32_to_cpu(l->fl_object_size) *
- le32_to_cpu(l->fl_stripe_count);
-}
+struct ceph_string;
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ u32 stripe_unit; /* stripe unit, in bytes */
+ u32 stripe_count; /* over this many objects */
+ u32 object_size; /* until objects are this big */
+ s64 pool_id; /* rados pool id */
+ struct ceph_string __rcu *pool_ns; /* rados pool namespace */
+};
+
+extern int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+extern void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
+extern void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy);
#define CEPH_MIN_STRIPE_UNIT 65536
-int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
-
struct ceph_dir_layout {
__u8 dl_dir_hash; /* see ceph_hash.h for ids */
__u8 dl_unused1;
@@ -127,6 +121,7 @@ struct ceph_dir_layout {
/* client <-> mds */
#define CEPH_MSG_MDS_MAP 21
+#define CEPH_MSG_FS_MAP_USER 103
#define CEPH_MSG_CLIENT_SESSION 22
#define CEPH_MSG_CLIENT_RECONNECT 23
@@ -399,7 +394,7 @@ union ceph_mds_request_args {
__le32 flags;
} __attribute__ ((packed)) setxattr;
struct {
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
} __attribute__ ((packed)) setlayout;
struct {
__u8 rule; /* currently fcntl or flock */
@@ -478,7 +473,7 @@ struct ceph_mds_reply_inode {
__le64 version; /* inode version */
__le64 xattr_version; /* version for xattr blob */
struct ceph_mds_reply_cap cap; /* caps issued for this inode */
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
struct ceph_timespec ctime, mtime, atime;
__le32 time_warp_seq;
__le64 size, max_size, truncate_size;
@@ -531,7 +526,7 @@ struct ceph_filelock {
#define CEPH_FILE_MODE_WR 2
#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
-#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+#define CEPH_FILE_MODE_BITS 4
int ceph_flags_to_mode(int flags);
@@ -673,7 +668,7 @@ struct ceph_mds_caps {
__le64 size, max_size, truncate_size;
__le32 truncate_seq;
struct ceph_timespec mtime, atime, ctime;
- struct ceph_file_layout layout;
+ struct ceph_file_layout_legacy layout;
__le32 time_warp_seq;
} __attribute__ ((packed));
diff --git a/include/linux/ceph/decode.h b/include/linux/ceph/decode.h
index 19e9932f3e77..f990f2cc907a 100644
--- a/include/linux/ceph/decode.h
+++ b/include/linux/ceph/decode.h
@@ -3,6 +3,7 @@
#include <linux/err.h>
#include <linux/bug.h>
+#include <linux/slab.h>
#include <linux/time.h>
#include <asm/unaligned.h>
@@ -217,6 +218,60 @@ static inline void ceph_encode_string(void **p, void *end,
*p += len;
}
+/*
+ * version and length starting block encoders/decoders
+ */
+
+/* current code version (u8) + compat code version (u8) + len of struct (u32) */
+#define CEPH_ENCODING_START_BLK_LEN 6
+
+/**
+ * ceph_start_encoding - start encoding block
+ * @struct_v: current (code) version of the encoding
+ * @struct_compat: oldest code version that can decode it
+ * @struct_len: length of struct encoding
+ */
+static inline void ceph_start_encoding(void **p, u8 struct_v, u8 struct_compat,
+ u32 struct_len)
+{
+ ceph_encode_8(p, struct_v);
+ ceph_encode_8(p, struct_compat);
+ ceph_encode_32(p, struct_len);
+}
+
+/**
+ * ceph_start_decoding - start decoding block
+ * @v: current version of the encoding that the code supports
+ * @name: name of the struct (free-form)
+ * @struct_v: out param for the encoding version
+ * @struct_len: out param for the length of struct encoding
+ *
+ * Validates the length of struct encoding, so unsafe ceph_decode_*
+ * variants can be used for decoding.
+ */
+static inline int ceph_start_decoding(void **p, void *end, u8 v,
+ const char *name, u8 *struct_v,
+ u32 *struct_len)
+{
+ u8 struct_compat;
+
+ ceph_decode_need(p, end, CEPH_ENCODING_START_BLK_LEN, bad);
+ *struct_v = ceph_decode_8(p);
+ struct_compat = ceph_decode_8(p);
+ if (v < struct_compat) {
+ pr_warn("got struct_v %d struct_compat %d > %d of %s\n",
+ *struct_v, struct_compat, v, name);
+ return -EINVAL;
+ }
+
+ *struct_len = ceph_decode_32(p);
+ ceph_decode_need(p, end, *struct_len, bad);
+ return 0;
+
+bad:
+ return -ERANGE;
+}
+
#define ceph_encode_need(p, end, n, bad) \
do { \
if (!likely(ceph_has_room(p, end, n))) \
diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h
index 690985daad1c..83fc1fff7061 100644
--- a/include/linux/ceph/libceph.h
+++ b/include/linux/ceph/libceph.h
@@ -21,6 +21,7 @@
#include <linux/ceph/mon_client.h>
#include <linux/ceph/osd_client.h>
#include <linux/ceph/ceph_fs.h>
+#include <linux/ceph/string_table.h>
/*
* mount options
@@ -214,8 +215,9 @@ static void erase_##name(struct rb_root *root, type *t) \
}
#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
+extern type __lookup_##name##_key; \
static type *lookup_##name(struct rb_root *root, \
- typeof(((type *)0)->keyfld) key) \
+ typeof(__lookup_##name##_key.keyfld) key) \
{ \
struct rb_node *n = root->rb_node; \
\
diff --git a/include/linux/ceph/mon_client.h b/include/linux/ceph/mon_client.h
index e2a92df08b47..24d704d1ea5c 100644
--- a/include/linux/ceph/mon_client.h
+++ b/include/linux/ceph/mon_client.h
@@ -95,7 +95,7 @@ struct ceph_mon_client {
struct ceph_mon_subscribe_item item;
bool want;
u32 have; /* epoch */
- } subs[3];
+ } subs[4];
int fs_cluster_id; /* "mdsmap.<id>" sub */
#ifdef CONFIG_DEBUG_FS
@@ -111,9 +111,10 @@ extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
extern void ceph_monc_stop(struct ceph_mon_client *monc);
enum {
- CEPH_SUB_MDSMAP = 0,
- CEPH_SUB_MONMAP,
+ CEPH_SUB_MONMAP = 0,
CEPH_SUB_OSDMAP,
+ CEPH_SUB_FSMAP,
+ CEPH_SUB_MDSMAP,
};
extern const char *ceph_sub_str[];
diff --git a/include/linux/ceph/msgpool.h b/include/linux/ceph/msgpool.h
index 4b0d38960726..ddd0d48d0384 100644
--- a/include/linux/ceph/msgpool.h
+++ b/include/linux/ceph/msgpool.h
@@ -2,7 +2,6 @@
#define _FS_CEPH_MSGPOOL
#include <linux/mempool.h>
-#include <linux/ceph/messenger.h>
/*
* we use memory pools for preallocating messages we may receive, to
diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h
index 1b3b6e155392..858932304260 100644
--- a/include/linux/ceph/osd_client.h
+++ b/include/linux/ceph/osd_client.h
@@ -9,6 +9,7 @@
#include <linux/ceph/types.h>
#include <linux/ceph/osdmap.h>
#include <linux/ceph/messenger.h>
+#include <linux/ceph/msgpool.h>
#include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
diff --git a/include/linux/ceph/osdmap.h b/include/linux/ceph/osdmap.h
index 9ccf4dbe55f8..9a9041784dcf 100644
--- a/include/linux/ceph/osdmap.h
+++ b/include/linux/ceph/osdmap.h
@@ -63,11 +63,13 @@ static inline bool ceph_can_shift_osds(struct ceph_pg_pool_info *pool)
struct ceph_object_locator {
s64 pool;
+ struct ceph_string *pool_ns;
};
static inline void ceph_oloc_init(struct ceph_object_locator *oloc)
{
oloc->pool = -1;
+ oloc->pool_ns = NULL;
}
static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
@@ -75,11 +77,9 @@ static inline bool ceph_oloc_empty(const struct ceph_object_locator *oloc)
return oloc->pool == -1;
}
-static inline void ceph_oloc_copy(struct ceph_object_locator *dest,
- const struct ceph_object_locator *src)
-{
- dest->pool = src->pool;
-}
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src);
+void ceph_oloc_destroy(struct ceph_object_locator *oloc);
/*
* Maximum supported by kernel client object name length
@@ -115,6 +115,11 @@ static inline void ceph_oid_init(struct ceph_object_id *oid)
oid->name_len = 0;
}
+#define CEPH_OID_INIT_ONSTACK(oid) \
+ ({ ceph_oid_init(&oid); oid; })
+#define CEPH_DEFINE_OID_ONSTACK(oid) \
+ struct ceph_object_id oid = CEPH_OID_INIT_ONSTACK(oid)
+
static inline bool ceph_oid_empty(const struct ceph_object_id *oid)
{
return oid->name == oid->inline_name && !oid->name_len;
diff --git a/include/linux/ceph/string_table.h b/include/linux/ceph/string_table.h
new file mode 100644
index 000000000000..1b02c96daf75
--- /dev/null
+++ b/include/linux/ceph/string_table.h
@@ -0,0 +1,62 @@
+#ifndef _FS_CEPH_STRING_TABLE_H
+#define _FS_CEPH_STRING_TABLE_H
+
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/rbtree.h>
+#include <linux/rcupdate.h>
+
+struct ceph_string {
+ struct kref kref;
+ union {
+ struct rb_node node;
+ struct rcu_head rcu;
+ };
+ size_t len;
+ char str[];
+};
+
+extern void ceph_release_string(struct kref *ref);
+extern struct ceph_string *ceph_find_or_create_string(const char *str,
+ size_t len);
+extern bool ceph_strings_empty(void);
+
+static inline struct ceph_string *ceph_get_string(struct ceph_string *str)
+{
+ kref_get(&str->kref);
+ return str;
+}
+
+static inline void ceph_put_string(struct ceph_string *str)
+{
+ if (!str)
+ return;
+ kref_put(&str->kref, ceph_release_string);
+}
+
+static inline int ceph_compare_string(struct ceph_string *cs,
+ const char* str, size_t len)
+{
+ size_t cs_len = cs ? cs->len : 0;
+ if (cs_len != len)
+ return cs_len - len;
+ if (len == 0)
+ return 0;
+ return strncmp(cs->str, str, len);
+}
+
+#define ceph_try_get_string(x) \
+({ \
+ struct ceph_string *___str; \
+ rcu_read_lock(); \
+ for (;;) { \
+ ___str = rcu_dereference(x); \
+ if (!___str || \
+ kref_get_unless_zero(&___str->kref)) \
+ break; \
+ } \
+ rcu_read_unlock(); \
+ (___str); \
+})
+
+#endif
diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index d9aef2a0ec8e..c78fc27418f2 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -99,7 +99,8 @@ static inline void context_tracking_init(void) { }
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
-static inline void guest_enter(void)
+/* must be called with irqs disabled */
+static inline void guest_enter_irqoff(void)
{
if (vtime_accounting_cpu_enabled())
vtime_guest_enter(current);
@@ -108,9 +109,19 @@ static inline void guest_enter(void)
if (context_tracking_is_enabled())
__context_tracking_enter(CONTEXT_GUEST);
+
+ /* KVM does not hold any references to rcu protected data when it
+ * switches CPU into a guest mode. In fact switching to a guest mode
+ * is very similar to exiting to userspace from rcu point of view. In
+ * addition CPU may stay in a guest mode for quite a long time (up to
+ * one time slice). Lets treat guest mode as quiescent state, just like
+ * we do with user-mode execution.
+ */
+ if (!context_tracking_cpu_is_enabled())
+ rcu_virt_note_context_switch(smp_processor_id());
}
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
{
if (context_tracking_is_enabled())
__context_tracking_exit(CONTEXT_GUEST);
@@ -122,7 +133,7 @@ static inline void guest_exit(void)
}
#else
-static inline void guest_enter(void)
+static inline void guest_enter_irqoff(void)
{
/*
* This is running in ioctl context so its safe
@@ -131,9 +142,10 @@ static inline void guest_enter(void)
*/
vtime_account_system(current);
current->flags |= PF_VCPU;
+ rcu_virt_note_context_switch(smp_processor_id());
}
-static inline void guest_exit(void)
+static inline void guest_exit_irqoff(void)
{
/* Flush the guest cputime we spent on the guest */
vtime_account_system(current);
@@ -141,4 +153,22 @@ static inline void guest_exit(void)
}
#endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */
+static inline void guest_enter(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ guest_enter_irqoff();
+ local_irq_restore(flags);
+}
+
+static inline void guest_exit(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ guest_exit_irqoff();
+ local_irq_restore(flags);
+}
+
#endif
diff --git a/include/linux/export.h b/include/linux/export.h
index 2f9ccbe6a639..c565f87f005e 100644
--- a/include/linux/export.h
+++ b/include/linux/export.h
@@ -82,7 +82,7 @@ extern struct module __this_module;
#include <generated/autoksyms.h>
#define __EXPORT_SYMBOL(sym, sec) \
- __cond_export_sym(sym, sec, config_enabled(__KSYM_##sym))
+ __cond_export_sym(sym, sec, __is_defined(__KSYM_##sym))
#define __cond_export_sym(sym, sec, conf) \
___cond_export_sym(sym, sec, conf)
#define ___cond_export_sym(sym, sec, enabled) \
diff --git a/include/linux/irqchip/arm-gic-v3.h b/include/linux/irqchip/arm-gic-v3.h
index 107eed475b94..56b0b7ec66aa 100644
--- a/include/linux/irqchip/arm-gic-v3.h
+++ b/include/linux/irqchip/arm-gic-v3.h
@@ -112,34 +112,76 @@
#define GICR_WAKER_ProcessorSleep (1U << 1)
#define GICR_WAKER_ChildrenAsleep (1U << 2)
-#define GICR_PROPBASER_NonShareable (0U << 10)
-#define GICR_PROPBASER_InnerShareable (1U << 10)
-#define GICR_PROPBASER_OuterShareable (2U << 10)
-#define GICR_PROPBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PROPBASER_nCnB (0U << 7)
-#define GICR_PROPBASER_nC (1U << 7)
-#define GICR_PROPBASER_RaWt (2U << 7)
-#define GICR_PROPBASER_RaWb (3U << 7)
-#define GICR_PROPBASER_WaWt (4U << 7)
-#define GICR_PROPBASER_WaWb (5U << 7)
-#define GICR_PROPBASER_RaWaWt (6U << 7)
-#define GICR_PROPBASER_RaWaWb (7U << 7)
-#define GICR_PROPBASER_CACHEABILITY_MASK (7U << 7)
-#define GICR_PROPBASER_IDBITS_MASK (0x1f)
-
-#define GICR_PENDBASER_NonShareable (0U << 10)
-#define GICR_PENDBASER_InnerShareable (1U << 10)
-#define GICR_PENDBASER_OuterShareable (2U << 10)
-#define GICR_PENDBASER_SHAREABILITY_MASK (3UL << 10)
-#define GICR_PENDBASER_nCnB (0U << 7)
-#define GICR_PENDBASER_nC (1U << 7)
-#define GICR_PENDBASER_RaWt (2U << 7)
-#define GICR_PENDBASER_RaWb (3U << 7)
-#define GICR_PENDBASER_WaWt (4U << 7)
-#define GICR_PENDBASER_WaWb (5U << 7)
-#define GICR_PENDBASER_RaWaWt (6U << 7)
-#define GICR_PENDBASER_RaWaWb (7U << 7)
-#define GICR_PENDBASER_CACHEABILITY_MASK (7U << 7)
+#define GIC_BASER_CACHE_nCnB 0ULL
+#define GIC_BASER_CACHE_SameAsInner 0ULL
+#define GIC_BASER_CACHE_nC 1ULL
+#define GIC_BASER_CACHE_RaWt 2ULL
+#define GIC_BASER_CACHE_RaWb 3ULL
+#define GIC_BASER_CACHE_WaWt 4ULL
+#define GIC_BASER_CACHE_WaWb 5ULL
+#define GIC_BASER_CACHE_RaWaWt 6ULL
+#define GIC_BASER_CACHE_RaWaWb 7ULL
+#define GIC_BASER_CACHE_MASK 7ULL
+#define GIC_BASER_NonShareable 0ULL
+#define GIC_BASER_InnerShareable 1ULL
+#define GIC_BASER_OuterShareable 2ULL
+#define GIC_BASER_SHAREABILITY_MASK 3ULL
+
+#define GIC_BASER_CACHEABILITY(reg, inner_outer, type) \
+ (GIC_BASER_CACHE_##type << reg##_##inner_outer##_CACHEABILITY_SHIFT)
+
+#define GIC_BASER_SHAREABILITY(reg, type) \
+ (GIC_BASER_##type << reg##_SHAREABILITY_SHIFT)
+
+#define GICR_PROPBASER_SHAREABILITY_SHIFT (10)
+#define GICR_PROPBASER_INNER_CACHEABILITY_SHIFT (7)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT (56)
+#define GICR_PROPBASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, SHAREABILITY_MASK)
+#define GICR_PROPBASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, MASK)
+#define GICR_PROPBASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, MASK)
+#define GICR_PROPBASER_CACHEABILITY_MASK GICR_PROPBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PROPBASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable)
+
+#define GICR_PROPBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nCnB)
+#define GICR_PROPBASER_nC GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, nC)
+#define GICR_PROPBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWt)
+#define GICR_PROPBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWt)
+#define GICR_PROPBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, WaWb)
+#define GICR_PROPBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWt)
+#define GICR_PROPBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWaWb)
+
+#define GICR_PROPBASER_IDBITS_MASK (0x1f)
+
+#define GICR_PENDBASER_SHAREABILITY_SHIFT (10)
+#define GICR_PENDBASER_INNER_CACHEABILITY_SHIFT (7)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT (56)
+#define GICR_PENDBASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, SHAREABILITY_MASK)
+#define GICR_PENDBASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, MASK)
+#define GICR_PENDBASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, MASK)
+#define GICR_PENDBASER_CACHEABILITY_MASK GICR_PENDBASER_INNER_CACHEABILITY_MASK
+
+#define GICR_PENDBASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable)
+
+#define GICR_PENDBASER_nCnB GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nCnB)
+#define GICR_PENDBASER_nC GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, nC)
+#define GICR_PENDBASER_RaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_RaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWt)
+#define GICR_PENDBASER_WaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWt)
+#define GICR_PENDBASER_WaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, WaWb)
+#define GICR_PENDBASER_RaWaWt GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWt)
+#define GICR_PENDBASER_RaWaWb GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWaWb)
+
+#define GICR_PENDBASER_PTZ BIT_ULL(62)
/*
* Re-Distributor registers, offsets from SGI_base
@@ -175,54 +217,83 @@
#define GITS_CWRITER 0x0088
#define GITS_CREADR 0x0090
#define GITS_BASER 0x0100
+#define GITS_IDREGS_BASE 0xffd0
+#define GITS_PIDR0 0xffe0
+#define GITS_PIDR1 0xffe4
#define GITS_PIDR2 GICR_PIDR2
+#define GITS_PIDR4 0xffd0
+#define GITS_CIDR0 0xfff0
+#define GITS_CIDR1 0xfff4
+#define GITS_CIDR2 0xfff8
+#define GITS_CIDR3 0xfffc
#define GITS_TRANSLATER 0x10040
#define GITS_CTLR_ENABLE (1U << 0)
#define GITS_CTLR_QUIESCENT (1U << 31)
+#define GITS_TYPER_PLPIS (1UL << 0)
+#define GITS_TYPER_IDBITS_SHIFT 8
#define GITS_TYPER_DEVBITS_SHIFT 13
#define GITS_TYPER_DEVBITS(r) ((((r) >> GITS_TYPER_DEVBITS_SHIFT) & 0x1f) + 1)
#define GITS_TYPER_PTA (1UL << 19)
-
-#define GITS_CBASER_VALID (1UL << 63)
-#define GITS_CBASER_nCnB (0UL << 59)
-#define GITS_CBASER_nC (1UL << 59)
-#define GITS_CBASER_RaWt (2UL << 59)
-#define GITS_CBASER_RaWb (3UL << 59)
-#define GITS_CBASER_WaWt (4UL << 59)
-#define GITS_CBASER_WaWb (5UL << 59)
-#define GITS_CBASER_RaWaWt (6UL << 59)
-#define GITS_CBASER_RaWaWb (7UL << 59)
-#define GITS_CBASER_CACHEABILITY_MASK (7UL << 59)
-#define GITS_CBASER_NonShareable (0UL << 10)
-#define GITS_CBASER_InnerShareable (1UL << 10)
-#define GITS_CBASER_OuterShareable (2UL << 10)
-#define GITS_CBASER_SHAREABILITY_MASK (3UL << 10)
+#define GITS_TYPER_HWCOLLCNT_SHIFT 24
+
+#define GITS_CBASER_VALID (1UL << 63)
+#define GITS_CBASER_SHAREABILITY_SHIFT (10)
+#define GITS_CBASER_INNER_CACHEABILITY_SHIFT (59)
+#define GITS_CBASER_OUTER_CACHEABILITY_SHIFT (53)
+#define GITS_CBASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GITS_CBASER, SHAREABILITY_MASK)
+#define GITS_CBASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, MASK)
+#define GITS_CBASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_CBASER, OUTER, MASK)
+#define GITS_CBASER_CACHEABILITY_MASK GITS_CBASER_INNER_CACHEABILITY_MASK
+
+#define GITS_CBASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GITS_CBASER, InnerShareable)
+
+#define GITS_CBASER_nCnB GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nCnB)
+#define GITS_CBASER_nC GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, nC)
+#define GITS_CBASER_RaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_RaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWt)
+#define GITS_CBASER_WaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWt)
+#define GITS_CBASER_WaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, WaWb)
+#define GITS_CBASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWt)
+#define GITS_CBASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_CBASER, INNER, RaWaWb)
#define GITS_BASER_NR_REGS 8
-#define GITS_BASER_VALID (1UL << 63)
-#define GITS_BASER_INDIRECT (1UL << 62)
-#define GITS_BASER_nCnB (0UL << 59)
-#define GITS_BASER_nC (1UL << 59)
-#define GITS_BASER_RaWt (2UL << 59)
-#define GITS_BASER_RaWb (3UL << 59)
-#define GITS_BASER_WaWt (4UL << 59)
-#define GITS_BASER_WaWb (5UL << 59)
-#define GITS_BASER_RaWaWt (6UL << 59)
-#define GITS_BASER_RaWaWb (7UL << 59)
-#define GITS_BASER_CACHEABILITY_MASK (7UL << 59)
-#define GITS_BASER_TYPE_SHIFT (56)
+#define GITS_BASER_VALID (1UL << 63)
+#define GITS_BASER_INDIRECT (1ULL << 62)
+
+#define GITS_BASER_INNER_CACHEABILITY_SHIFT (59)
+#define GITS_BASER_OUTER_CACHEABILITY_SHIFT (53)
+#define GITS_BASER_INNER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_BASER, INNER, MASK)
+#define GITS_BASER_CACHEABILITY_MASK GITS_BASER_INNER_CACHEABILITY_MASK
+#define GITS_BASER_OUTER_CACHEABILITY_MASK \
+ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, MASK)
+#define GITS_BASER_SHAREABILITY_MASK \
+ GIC_BASER_SHAREABILITY(GITS_BASER, SHAREABILITY_MASK)
+
+#define GITS_BASER_nCnB GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nCnB)
+#define GITS_BASER_nC GIC_BASER_CACHEABILITY(GITS_BASER, INNER, nC)
+#define GITS_BASER_RaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_RaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWt)
+#define GITS_BASER_WaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWt)
+#define GITS_BASER_WaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, WaWb)
+#define GITS_BASER_RaWaWt GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWt)
+#define GITS_BASER_RaWaWb GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWaWb)
+
+#define GITS_BASER_TYPE_SHIFT (56)
#define GITS_BASER_TYPE(r) (((r) >> GITS_BASER_TYPE_SHIFT) & 7)
-#define GITS_BASER_ENTRY_SIZE_SHIFT (48)
+#define GITS_BASER_ENTRY_SIZE_SHIFT (48)
#define GITS_BASER_ENTRY_SIZE(r) ((((r) >> GITS_BASER_ENTRY_SIZE_SHIFT) & 0xff) + 1)
-#define GITS_BASER_NonShareable (0UL << 10)
-#define GITS_BASER_InnerShareable (1UL << 10)
-#define GITS_BASER_OuterShareable (2UL << 10)
#define GITS_BASER_SHAREABILITY_SHIFT (10)
-#define GITS_BASER_SHAREABILITY_MASK (3UL << GITS_BASER_SHAREABILITY_SHIFT)
+#define GITS_BASER_InnerShareable \
+ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable)
#define GITS_BASER_PAGE_SIZE_SHIFT (8)
#define GITS_BASER_PAGE_SIZE_4K (0UL << GITS_BASER_PAGE_SIZE_SHIFT)
#define GITS_BASER_PAGE_SIZE_16K (1UL << GITS_BASER_PAGE_SIZE_SHIFT)
@@ -230,6 +301,7 @@
#define GITS_BASER_PAGE_SIZE_MASK (3UL << GITS_BASER_PAGE_SIZE_SHIFT)
#define GITS_BASER_PAGES_MAX 256
#define GITS_BASER_PAGES_SHIFT (0)
+#define GITS_BASER_NR_PAGES(r) (((r) & 0xff) + 1)
#define GITS_BASER_TYPE_NONE 0
#define GITS_BASER_TYPE_DEVICE 1
@@ -247,7 +319,10 @@
*/
#define GITS_CMD_MAPD 0x08
#define GITS_CMD_MAPC 0x09
-#define GITS_CMD_MAPVI 0x0a
+#define GITS_CMD_MAPTI 0x0a
+/* older GIC documentation used MAPVI for this command */
+#define GITS_CMD_MAPVI GITS_CMD_MAPTI
+#define GITS_CMD_MAPI 0x0b
#define GITS_CMD_MOVI 0x01
#define GITS_CMD_DISCARD 0x0f
#define GITS_CMD_INV 0x0c
@@ -258,6 +333,22 @@
#define GITS_CMD_SYNC 0x05
/*
+ * ITS error numbers
+ */
+#define E_ITS_MOVI_UNMAPPED_INTERRUPT 0x010107
+#define E_ITS_MOVI_UNMAPPED_COLLECTION 0x010109
+#define E_ITS_CLEAR_UNMAPPED_INTERRUPT 0x010507
+#define E_ITS_MAPD_DEVICE_OOR 0x010801
+#define E_ITS_MAPC_PROCNUM_OOR 0x010902
+#define E_ITS_MAPC_COLLECTION_OOR 0x010903
+#define E_ITS_MAPTI_UNMAPPED_DEVICE 0x010a04
+#define E_ITS_MAPTI_PHYSICALID_OOR 0x010a06
+#define E_ITS_INV_UNMAPPED_INTERRUPT 0x010c07
+#define E_ITS_INVALL_UNMAPPED_COLLECTION 0x010d09
+#define E_ITS_MOVALL_PROCNUM_OOR 0x010e01
+#define E_ITS_DISCARD_UNMAPPED_INTERRUPT 0x010f07
+
+/*
* CPU interface registers
*/
#define ICC_CTLR_EL1_EOImode_drop_dir (0U << 1)
diff --git a/include/linux/kconfig.h b/include/linux/kconfig.h
index b33c7797eb57..15ec117ec537 100644
--- a/include/linux/kconfig.h
+++ b/include/linux/kconfig.h
@@ -3,6 +3,21 @@
#include <generated/autoconf.h>
+#define __ARG_PLACEHOLDER_1 0,
+#define __take_second_arg(__ignored, val, ...) val
+
+/*
+ * The use of "&&" / "||" is limited in certain expressions.
+ * The followings enable to calculate "and" / "or" with macro expansion only.
+ */
+#define __and(x, y) ___and(x, y)
+#define ___and(x, y) ____and(__ARG_PLACEHOLDER_##x, y)
+#define ____and(arg1_or_junk, y) __take_second_arg(arg1_or_junk y, 0)
+
+#define __or(x, y) ___or(x, y)
+#define ___or(x, y) ____or(__ARG_PLACEHOLDER_##x, y)
+#define ____or(arg1_or_junk, y) __take_second_arg(arg1_or_junk 1, y)
+
/*
* Helper macros to use CONFIG_ options in C/CPP expressions. Note that
* these only work with boolean and tristate options.
@@ -16,11 +31,10 @@
* When CONFIG_BOOGER is not defined, we generate a (... 1, 0) pair, and when
* the last step cherry picks the 2nd arg, we get a zero.
*/
-#define __ARG_PLACEHOLDER_1 0,
-#define config_enabled(cfg) _config_enabled(cfg)
-#define _config_enabled(value) __config_enabled(__ARG_PLACEHOLDER_##value)
-#define __config_enabled(arg1_or_junk) ___config_enabled(arg1_or_junk 1, 0)
-#define ___config_enabled(__ignored, val, ...) val
+#define config_enabled(cfg) ___is_defined(cfg)
+#define __is_defined(x) ___is_defined(x)
+#define ___is_defined(val) ____is_defined(__ARG_PLACEHOLDER_##val)
+#define ____is_defined(arg1_or_junk) __take_second_arg(arg1_or_junk 1, 0)
/*
* IS_BUILTIN(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y', 0
@@ -41,14 +55,13 @@
* This is similar to IS_ENABLED(), but returns false when invoked from
* built-in code when CONFIG_FOO is set to 'm'.
*/
-#define IS_REACHABLE(option) (config_enabled(option) || \
- (config_enabled(option##_MODULE) && config_enabled(MODULE)))
+#define IS_REACHABLE(option) __or(IS_BUILTIN(option), \
+ __and(IS_MODULE(option), __is_defined(MODULE)))
/*
* IS_ENABLED(CONFIG_FOO) evaluates to 1 if CONFIG_FOO is set to 'y' or 'm',
* 0 otherwise.
*/
-#define IS_ENABLED(option) \
- (IS_BUILTIN(option) || IS_MODULE(option))
+#define IS_ENABLED(option) __or(IS_BUILTIN(option), IS_MODULE(option))
#endif /* __LINUX_KCONFIG_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index 1c9c973a7dd9..aafd702f3e21 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -164,6 +164,8 @@ int kvm_io_bus_register_dev(struct kvm *kvm, enum kvm_bus bus_idx, gpa_t addr,
int len, struct kvm_io_device *dev);
int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
struct kvm_io_device *dev);
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ gpa_t addr);
#ifdef CONFIG_KVM_ASYNC_PF
struct kvm_async_pf {
@@ -371,7 +373,15 @@ struct kvm {
struct srcu_struct srcu;
struct srcu_struct irq_srcu;
struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+ /*
+ * created_vcpus is protected by kvm->lock, and is incremented
+ * at the beginning of KVM_CREATE_VCPU. online_vcpus is only
+ * incremented after storing the kvm_vcpu pointer in vcpus,
+ * and is accessed atomically.
+ */
atomic_t online_vcpus;
+ int created_vcpus;
int last_boosted_vcpu;
struct list_head vm_list;
struct mutex lock;
@@ -867,45 +877,6 @@ static inline void kvm_iommu_unmap_pages(struct kvm *kvm,
}
#endif
-/* must be called with irqs disabled */
-static inline void __kvm_guest_enter(void)
-{
- guest_enter();
- /* KVM does not hold any references to rcu protected data when it
- * switches CPU into a guest mode. In fact switching to a guest mode
- * is very similar to exiting to userspace from rcu point of view. In
- * addition CPU may stay in a guest mode for quite a long time (up to
- * one time slice). Lets treat guest mode as quiescent state, just like
- * we do with user-mode execution.
- */
- if (!context_tracking_cpu_is_enabled())
- rcu_virt_note_context_switch(smp_processor_id());
-}
-
-/* must be called with irqs disabled */
-static inline void __kvm_guest_exit(void)
-{
- guest_exit();
-}
-
-static inline void kvm_guest_enter(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __kvm_guest_enter();
- local_irq_restore(flags);
-}
-
-static inline void kvm_guest_exit(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- __kvm_guest_exit();
- local_irq_restore(flags);
-}
-
/*
* search_memslots() and __gfn_to_memslot() are here because they are
* used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c.
@@ -1042,7 +1013,8 @@ int kvm_set_irq_routing(struct kvm *kvm,
const struct kvm_irq_routing_entry *entries,
unsigned nr,
unsigned flags);
-int kvm_set_routing_entry(struct kvm_kernel_irq_routing_entry *e,
+int kvm_set_routing_entry(struct kvm *kvm,
+ struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue);
void kvm_free_irq_routing(struct kvm *kvm);
@@ -1097,12 +1069,6 @@ static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
#endif /* CONFIG_HAVE_KVM_EVENTFD */
-#ifdef CONFIG_KVM_APIC_ARCHITECTURE
-bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu);
-#else
-static inline bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) { return true; }
-#endif
-
static inline void kvm_make_request(int req, struct kvm_vcpu *vcpu)
{
/*
diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h
index fbe8e164a4ee..8dd6e01f45c0 100644
--- a/include/linux/mtd/nand.h
+++ b/include/linux/mtd/nand.h
@@ -783,6 +783,7 @@ static inline void nand_set_controller_data(struct nand_chip *chip, void *priv)
* NAND Flash Manufacturer ID Codes
*/
#define NAND_MFR_TOSHIBA 0x98
+#define NAND_MFR_ESMT 0xc8
#define NAND_MFR_SAMSUNG 0xec
#define NAND_MFR_FUJITSU 0x04
#define NAND_MFR_NATIONAL 0x8f
diff --git a/include/linux/mtd/spi-nor.h b/include/linux/mtd/spi-nor.h
index 7f041bd88b82..c425c7b4c2a0 100644
--- a/include/linux/mtd/spi-nor.h
+++ b/include/linux/mtd/spi-nor.h
@@ -173,10 +173,10 @@ struct spi_nor {
int (*read_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len);
int (*write_reg)(struct spi_nor *nor, u8 opcode, u8 *buf, int len);
- int (*read)(struct spi_nor *nor, loff_t from,
- size_t len, size_t *retlen, u_char *read_buf);
- void (*write)(struct spi_nor *nor, loff_t to,
- size_t len, size_t *retlen, const u_char *write_buf);
+ ssize_t (*read)(struct spi_nor *nor, loff_t from,
+ size_t len, u_char *read_buf);
+ ssize_t (*write)(struct spi_nor *nor, loff_t to,
+ size_t len, const u_char *write_buf);
int (*erase)(struct spi_nor *nor, loff_t offs);
int (*flash_lock)(struct spi_nor *nor, loff_t ofs, uint64_t len);
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 8b5e0a9f2431..610e13271918 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -124,6 +124,15 @@ static inline int page_ref_sub_and_test(struct page *page, int nr)
return ret;
}
+static inline int page_ref_inc_return(struct page *page)
+{
+ int ret = atomic_inc_return(&page->_refcount);
+
+ if (page_ref_tracepoint_active(__tracepoint_page_ref_mod_and_return))
+ __page_ref_mod_and_return(page, 1, ret);
+ return ret;
+}
+
static inline int page_ref_dec_and_test(struct page *page)
{
int ret = atomic_dec_and_test(&page->_refcount);
diff --git a/include/linux/pci-acpi.h b/include/linux/pci-acpi.h
index 89ab0572dbc6..7d63a66e8ed4 100644
--- a/include/linux/pci-acpi.h
+++ b/include/linux/pci-acpi.h
@@ -24,6 +24,8 @@ static inline acpi_status pci_acpi_remove_pm_notifier(struct acpi_device *dev)
}
extern phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle);
+extern phys_addr_t pci_mcfg_lookup(u16 domain, struct resource *bus_res);
+
static inline acpi_handle acpi_find_root_bridge_handle(struct pci_dev *pdev)
{
struct pci_bus *pbus = pdev->bus;
diff --git a/drivers/pci/ecam.h b/include/linux/pci-ecam.h
index 9878bebd45bb..7adad206b1f4 100644
--- a/drivers/pci/ecam.h
+++ b/include/linux/pci-ecam.h
@@ -27,8 +27,7 @@ struct pci_config_window;
struct pci_ecam_ops {
unsigned int bus_shift;
struct pci_ops pci_ops;
- int (*init)(struct device *,
- struct pci_config_window *);
+ int (*init)(struct pci_config_window *);
};
/*
@@ -45,6 +44,7 @@ struct pci_config_window {
void __iomem *win; /* 64-bit single mapping */
void __iomem **winp; /* 32-bit per-bus mapping */
};
+ struct device *parent;/* ECAM res was from this dev */
};
/* create and free pci_config_window */
diff --git a/include/linux/pci.h b/include/linux/pci.h
index c40ac910cce4..2599a980340f 100644
--- a/include/linux/pci.h
+++ b/include/linux/pci.h
@@ -101,6 +101,10 @@ enum {
DEVICE_COUNT_RESOURCE = PCI_NUM_RESOURCES,
};
+/*
+ * pci_power_t values must match the bits in the Capabilities PME_Support
+ * and Control/Status PowerState fields in the Power Management capability.
+ */
typedef int __bitwise pci_power_t;
#define PCI_D0 ((pci_power_t __force) 0)
@@ -116,7 +120,7 @@ extern const char *pci_power_names[];
static inline const char *pci_power_name(pci_power_t state)
{
- return pci_power_names[1 + (int) state];
+ return pci_power_names[1 + (__force int) state];
}
#define PCI_PM_D2_DELAY 200
@@ -294,6 +298,7 @@ struct pci_dev {
unsigned int d2_support:1; /* Low power state D2 is supported */
unsigned int no_d1d2:1; /* D1 and D2 are forbidden */
unsigned int no_d3cold:1; /* D3cold is forbidden */
+ unsigned int bridge_d3:1; /* Allow D3 for bridge */
unsigned int d3cold_allowed:1; /* D3cold is allowed by user */
unsigned int mmio_always_on:1; /* disallow turning off io/mem
decoding during bar sizing */
@@ -320,6 +325,7 @@ struct pci_dev {
* directly, use the values stored here. They might be different!
*/
unsigned int irq;
+ struct cpumask *irq_affinity;
struct resource resource[DEVICE_COUNT_RESOURCE]; /* I/O and memory regions + expansion ROMs */
bool match_driver; /* Skip attaching driver */
@@ -1084,6 +1090,8 @@ int pci_back_from_sleep(struct pci_dev *dev);
bool pci_dev_run_wake(struct pci_dev *dev);
bool pci_check_pme_status(struct pci_dev *dev);
void pci_pme_wakeup_bus(struct pci_bus *bus);
+void pci_d3cold_enable(struct pci_dev *dev);
+void pci_d3cold_disable(struct pci_dev *dev);
static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state,
bool enable)
@@ -1115,6 +1123,7 @@ int pci_set_vpd_size(struct pci_dev *dev, size_t len);
/* Helper functions for low-level code (drivers/pci/setup-[bus,res].c) */
resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx);
void pci_bus_assign_resources(const struct pci_bus *bus);
+void pci_bus_claim_resources(struct pci_bus *bus);
void pci_bus_size_bridges(struct pci_bus *bus);
int pci_claim_resource(struct pci_dev *, int);
int pci_claim_bridge_resource(struct pci_dev *bridge, int i);
@@ -1144,9 +1153,12 @@ void pci_add_resource(struct list_head *resources, struct resource *res);
void pci_add_resource_offset(struct list_head *resources, struct resource *res,
resource_size_t offset);
void pci_free_resource_list(struct list_head *resources);
-void pci_bus_add_resource(struct pci_bus *bus, struct resource *res, unsigned int flags);
+void pci_bus_add_resource(struct pci_bus *bus, struct resource *res,
+ unsigned int flags);
struct resource *pci_bus_resource_n(const struct pci_bus *bus, int n);
void pci_bus_remove_resources(struct pci_bus *bus);
+int devm_request_pci_bus_resources(struct device *dev,
+ struct list_head *resources);
#define pci_bus_for_each_resource(bus, res, i) \
for (i = 0; \
@@ -1168,6 +1180,7 @@ int pci_register_io_range(phys_addr_t addr, resource_size_t size);
unsigned long pci_address_to_pio(phys_addr_t addr);
phys_addr_t pci_pio_to_address(unsigned long pio);
int pci_remap_iospace(const struct resource *res, phys_addr_t phys_addr);
+void pci_unmap_iospace(struct resource *res);
static inline pci_bus_addr_t pci_bus_address(struct pci_dev *pdev, int bar)
{
@@ -1238,6 +1251,11 @@ resource_size_t pcibios_iov_resource_alignment(struct pci_dev *dev, int resno);
int pci_set_vga_state(struct pci_dev *pdev, bool decode,
unsigned int command_bits, u32 flags);
+#define PCI_IRQ_NOLEGACY (1 << 0) /* don't use legacy interrupts */
+#define PCI_IRQ_NOMSI (1 << 1) /* don't use MSI interrupts */
+#define PCI_IRQ_NOMSIX (1 << 2) /* don't use MSI-X interrupts */
+#define PCI_IRQ_NOAFFINITY (1 << 3) /* don't auto-assign affinity */
+
/* kmem_cache style wrapper around pci_alloc_consistent() */
#include <linux/pci-dma.h>
@@ -1285,6 +1303,11 @@ static inline int pci_enable_msix_exact(struct pci_dev *dev,
return rc;
return 0;
}
+int pci_alloc_irq_vectors(struct pci_dev *dev, unsigned int min_vecs,
+ unsigned int max_vecs, unsigned int flags);
+void pci_free_irq_vectors(struct pci_dev *dev);
+int pci_irq_vector(struct pci_dev *dev, unsigned int nr);
+
#else
static inline int pci_msi_vec_count(struct pci_dev *dev) { return -ENOSYS; }
static inline void pci_msi_shutdown(struct pci_dev *dev) { }
@@ -1308,6 +1331,24 @@ static inline int pci_enable_msix_range(struct pci_dev *dev,
static inline int pci_enable_msix_exact(struct pci_dev *dev,
struct msix_entry *entries, int nvec)
{ return -ENOSYS; }
+static inline int pci_alloc_irq_vectors(struct pci_dev *dev,
+ unsigned int min_vecs, unsigned int max_vecs,
+ unsigned int flags)
+{
+ if (min_vecs > 1)
+ return -EINVAL;
+ return 1;
+}
+static inline void pci_free_irq_vectors(struct pci_dev *dev)
+{
+}
+
+static inline int pci_irq_vector(struct pci_dev *dev, unsigned int nr)
+{
+ if (WARN_ON_ONCE(nr > 0))
+ return -EINVAL;
+ return dev->irq;
+}
#endif
#ifdef CONFIG_PCIEPORTBUS
@@ -1390,12 +1431,13 @@ static inline int pci_domain_nr(struct pci_bus *bus)
{
return bus->domain_nr;
}
-void pci_bus_assign_domain_nr(struct pci_bus *bus, struct device *parent);
+#ifdef CONFIG_ACPI
+int acpi_pci_bus_find_domain_nr(struct pci_bus *bus);
#else
-static inline void pci_bus_assign_domain_nr(struct pci_bus *bus,
- struct device *parent)
-{
-}
+static inline int acpi_pci_bus_find_domain_nr(struct pci_bus *bus)
+{ return 0; }
+#endif
+int pci_bus_find_domain_nr(struct pci_bus *bus, struct device *parent);
#endif
/* some architectures require additional setup to direct VGA traffic */
@@ -1403,6 +1445,34 @@ typedef int (*arch_set_vga_state_t)(struct pci_dev *pdev, bool decode,
unsigned int command_bits, u32 flags);
void pci_register_set_vga_state(arch_set_vga_state_t func);
+static inline int
+pci_request_io_regions(struct pci_dev *pdev, const char *name)
+{
+ return pci_request_selected_regions(pdev,
+ pci_select_bars(pdev, IORESOURCE_IO), name);
+}
+
+static inline void
+pci_release_io_regions(struct pci_dev *pdev)
+{
+ return pci_release_selected_regions(pdev,
+ pci_select_bars(pdev, IORESOURCE_IO));
+}
+
+static inline int
+pci_request_mem_regions(struct pci_dev *pdev, const char *name)
+{
+ return pci_request_selected_regions(pdev,
+ pci_select_bars(pdev, IORESOURCE_MEM), name);
+}
+
+static inline void
+pci_release_mem_regions(struct pci_dev *pdev)
+{
+ return pci_release_selected_regions(pdev,
+ pci_select_bars(pdev, IORESOURCE_MEM));
+}
+
#else /* CONFIG_PCI is not enabled */
static inline void pci_set_flags(int flags) { }
@@ -1555,7 +1625,11 @@ static inline const char *pci_name(const struct pci_dev *pdev)
/* Some archs don't want to expose struct resource to userland as-is
* in sysfs and /proc
*/
-#ifndef HAVE_ARCH_PCI_RESOURCE_TO_USER
+#ifdef HAVE_ARCH_PCI_RESOURCE_TO_USER
+void pci_resource_to_user(const struct pci_dev *dev, int bar,
+ const struct resource *rsrc,
+ resource_size_t *start, resource_size_t *end);
+#else
static inline void pci_resource_to_user(const struct pci_dev *dev, int bar,
const struct resource *rsrc, resource_size_t *start,
resource_size_t *end)
@@ -1707,6 +1781,7 @@ extern u8 pci_cache_line_size;
extern unsigned long pci_hotplug_io_size;
extern unsigned long pci_hotplug_mem_size;
+extern unsigned long pci_hotplug_bus_size;
/* Architecture-specific versions may override these (weak) */
void pcibios_disable_device(struct pci_dev *dev);
@@ -1723,7 +1798,7 @@ void pcibios_free_irq(struct pci_dev *dev);
extern struct dev_pm_ops pcibios_pm_ops;
#endif
-#ifdef CONFIG_PCI_MMCONFIG
+#if defined(CONFIG_PCI_MMCONFIG) || defined(CONFIG_ACPI_MCFG)
void __init pci_mmcfg_early_init(void);
void __init pci_mmcfg_late_init(void);
#else
diff --git a/include/trace/events/kvm.h b/include/trace/events/kvm.h
index f28292d73ddb..8ade3eb6c640 100644
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -151,8 +151,9 @@ TRACE_EVENT(kvm_msi_set_irq,
__entry->data = data;
),
- TP_printk("dst %u vec %u (%s|%s|%s%s)",
- (u8)(__entry->address >> 12), (u8)__entry->data,
+ TP_printk("dst %llx vec %u (%s|%s|%s%s)",
+ (u8)(__entry->address >> 12) | ((__entry->address >> 32) & 0xffffff00),
+ (u8)__entry->data,
__print_symbolic((__entry->data >> 8 & 0x7), kvm_deliver_mode),
(__entry->address & (1<<2)) ? "logical" : "physical",
(__entry->data & (1<<15)) ? "level" : "edge",
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 05ebf475104c..e98bb4cce639 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -866,6 +866,10 @@ struct kvm_ppc_smmu_info {
#define KVM_CAP_ARM_PMU_V3 126
#define KVM_CAP_VCPU_ATTRIBUTES 127
#define KVM_CAP_MAX_VCPU_ID 128
+#define KVM_CAP_X2APIC_API 129
+#define KVM_CAP_S390_USER_INSTR0 130
+#define KVM_CAP_MSI_DEVID 131
+#define KVM_CAP_PPC_HTM 132
#ifdef KVM_CAP_IRQ_ROUTING
@@ -1024,12 +1028,14 @@ struct kvm_one_reg {
__u64 addr;
};
+#define KVM_MSI_VALID_DEVID (1U << 0)
struct kvm_msi {
__u32 address_lo;
__u32 address_hi;
__u32 data;
__u32 flags;
- __u8 pad[16];
+ __u32 devid;
+ __u8 pad[12];
};
struct kvm_arm_device_addr {
@@ -1074,6 +1080,8 @@ enum kvm_device_type {
#define KVM_DEV_TYPE_FLIC KVM_DEV_TYPE_FLIC
KVM_DEV_TYPE_ARM_VGIC_V3,
#define KVM_DEV_TYPE_ARM_VGIC_V3 KVM_DEV_TYPE_ARM_VGIC_V3
+ KVM_DEV_TYPE_ARM_VGIC_ITS,
+#define KVM_DEV_TYPE_ARM_VGIC_ITS KVM_DEV_TYPE_ARM_VGIC_ITS
KVM_DEV_TYPE_MAX,
};
@@ -1313,4 +1321,7 @@ struct kvm_assigned_msix_entry {
__u16 padding[3];
};
+#define KVM_X2APIC_API_USE_32BIT_IDS (1ULL << 0)
+#define KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK (1ULL << 1)
+
#endif /* __LINUX_KVM_H */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index f07842e2d69f..eb8917a71489 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -709,6 +709,8 @@ config KCOV
bool "Code coverage for fuzzing"
depends on ARCH_HAS_KCOV
select DEBUG_FS
+ select GCC_PLUGINS if !COMPILE_TEST
+ select GCC_PLUGIN_SANCOV if !COMPILE_TEST
help
KCOV exposes kernel code coverage information in a form suitable
for coverage-guided fuzzing (randomized testing).
diff --git a/mm/gup.c b/mm/gup.c
index 547741f5f7a7..96b2b2fd0fbd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -723,6 +723,7 @@ retry:
}
return 0;
}
+EXPORT_SYMBOL_GPL(fixup_user_fault);
static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
struct mm_struct *mm,
diff --git a/net/ceph/Makefile b/net/ceph/Makefile
index 958d9856912c..84cbed630c4b 100644
--- a/net/ceph/Makefile
+++ b/net/ceph/Makefile
@@ -11,5 +11,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
crypto.o armor.o \
auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \
- pagevec.o snapshot.o
+ pagevec.o snapshot.o string_table.o
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 55d2bfee16d7..bddfcf6f09c2 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -747,6 +747,8 @@ out:
static void __exit exit_ceph_lib(void)
{
dout("exit_ceph_lib\n");
+ WARN_ON(!ceph_strings_empty());
+
ceph_osdc_cleanup();
ceph_msgr_exit();
ceph_crypto_shutdown();
diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c
index 41466ccb972a..7d54e944de5e 100644
--- a/net/ceph/ceph_fs.c
+++ b/net/ceph/ceph_fs.c
@@ -9,9 +9,9 @@
*/
int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
{
- __u32 su = le32_to_cpu(layout->fl_stripe_unit);
- __u32 sc = le32_to_cpu(layout->fl_stripe_count);
- __u32 os = le32_to_cpu(layout->fl_object_size);
+ __u32 su = layout->stripe_unit;
+ __u32 sc = layout->stripe_count;
+ __u32 os = layout->object_size;
/* stripe unit, object size must be non-zero, 64k increment */
if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
@@ -27,6 +27,30 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
return 1;
}
+void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ fl->stripe_unit = le32_to_cpu(legacy->fl_stripe_unit);
+ fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count);
+ fl->object_size = le32_to_cpu(legacy->fl_object_size);
+ fl->pool_id = le32_to_cpu(legacy->fl_pg_pool);
+ if (fl->pool_id == 0)
+ fl->pool_id = -1;
+}
+EXPORT_SYMBOL(ceph_file_layout_from_legacy);
+
+void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
+ struct ceph_file_layout_legacy *legacy)
+{
+ legacy->fl_stripe_unit = cpu_to_le32(fl->stripe_unit);
+ legacy->fl_stripe_count = cpu_to_le32(fl->stripe_count);
+ legacy->fl_object_size = cpu_to_le32(fl->object_size);
+ if (fl->pool_id >= 0)
+ legacy->fl_pg_pool = cpu_to_le32(fl->pool_id);
+ else
+ legacy->fl_pg_pool = 0;
+}
+EXPORT_SYMBOL(ceph_file_layout_to_legacy);
int ceph_flags_to_mode(int flags)
{
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index e77b04ca7802..c62b2b029a6e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -156,8 +156,16 @@ static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
- seq_printf(s, "]/%d\t%*pE\t0x%x", t->acting.primary,
- t->target_oid.name_len, t->target_oid.name, t->flags);
+ seq_printf(s, "]/%d\t", t->acting.primary);
+ if (t->target_oloc.pool_ns) {
+ seq_printf(s, "%*pE/%*pE\t0x%x",
+ (int)t->target_oloc.pool_ns->len,
+ t->target_oloc.pool_ns->str,
+ t->target_oid.name_len, t->target_oid.name, t->flags);
+ } else {
+ seq_printf(s, "%*pE\t0x%x", t->target_oid.name_len,
+ t->target_oid.name, t->flags);
+ }
if (t->paused)
seq_puts(s, "\tP");
}
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index 37c38a7fb5c5..c83326c5ba58 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -227,9 +227,10 @@ static void __schedule_delayed(struct ceph_mon_client *monc)
}
const char *ceph_sub_str[] = {
- [CEPH_SUB_MDSMAP] = "mdsmap",
[CEPH_SUB_MONMAP] = "monmap",
[CEPH_SUB_OSDMAP] = "osdmap",
+ [CEPH_SUB_FSMAP] = "fsmap.user",
+ [CEPH_SUB_MDSMAP] = "mdsmap",
};
/*
@@ -1193,6 +1194,7 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
case CEPH_MSG_MON_MAP:
case CEPH_MSG_MDS_MAP:
case CEPH_MSG_OSD_MAP:
+ case CEPH_MSG_FS_MAP_USER:
m = ceph_msg_new(type, front_len, GFP_NOFS, false);
if (!m)
return NULL; /* ENOMEM--return skip == 0 */
diff --git a/net/ceph/msgpool.c b/net/ceph/msgpool.c
index ddec1c10ac80..aaed59a47b1d 100644
--- a/net/ceph/msgpool.c
+++ b/net/ceph/msgpool.c
@@ -5,6 +5,7 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
+#include <linux/ceph/messenger.h>
#include <linux/ceph/msgpool.h>
static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index 89469592076c..b5ec09612ff7 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -387,7 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
static void target_destroy(struct ceph_osd_request_target *t)
{
ceph_oid_destroy(&t->base_oid);
+ ceph_oloc_destroy(&t->base_oloc);
ceph_oid_destroy(&t->target_oid);
+ ceph_oloc_destroy(&t->target_oloc);
}
/*
@@ -533,6 +535,11 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
}
EXPORT_SYMBOL(ceph_osdc_alloc_request);
+static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc)
+{
+ return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
+}
+
int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
{
struct ceph_osd_client *osdc = req->r_osdc;
@@ -540,11 +547,13 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
int msg_size;
WARN_ON(ceph_oid_empty(&req->r_base_oid));
+ WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
/* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
- msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+ msg_size += CEPH_ENCODING_START_BLK_LEN +
+ ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + req->r_base_oid.name_len; /* oid */
msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
@@ -932,7 +941,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
if (opcode == CEPH_OSD_OP_CREATE || opcode == CEPH_OSD_OP_DELETE) {
osd_req_op_init(req, which, opcode, 0);
} else {
- u32 object_size = le32_to_cpu(layout->fl_object_size);
+ u32 object_size = layout->object_size;
u32 object_base = off - objoff;
if (!(truncate_seq == 1 && truncate_size == -1ULL)) {
if (truncate_size <= object_base) {
@@ -948,7 +957,8 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
}
req->r_flags = flags;
- req->r_base_oloc.pool = ceph_file_layout_pg_pool(*layout);
+ req->r_base_oloc.pool = layout->pool_id;
+ req->r_base_oloc.pool_ns = ceph_try_get_string(layout->pool_ns);
ceph_oid_printf(&req->r_base_oid, "%llx.%08llx", vino.ino, objnum);
req->r_snapid = vino.snap;
@@ -1489,12 +1499,16 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
p += sizeof(req->r_replay_version);
/* oloc */
- ceph_encode_8(&p, 4);
- ceph_encode_8(&p, 4);
- ceph_encode_32(&p, 8 + 4 + 4);
+ ceph_start_encoding(&p, 5, 4,
+ ceph_oloc_encoding_size(&req->r_t.target_oloc));
ceph_encode_64(&p, req->r_t.target_oloc.pool);
ceph_encode_32(&p, -1); /* preferred */
ceph_encode_32(&p, 0); /* key len */
+ if (req->r_t.target_oloc.pool_ns)
+ ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
+ req->r_t.target_oloc.pool_ns->len);
+ else
+ ceph_encode_32(&p, 0);
/* pgid */
ceph_encode_8(&p, 1);
@@ -2594,9 +2608,22 @@ static int ceph_oloc_decode(void **p, void *end,
}
if (struct_v >= 5) {
+ bool changed = false;
+
len = ceph_decode_32(p);
if (len > 0) {
- pr_warn("ceph_object_locator::nspace is set\n");
+ ceph_decode_need(p, end, len, e_inval);
+ if (!oloc->pool_ns ||
+ ceph_compare_string(oloc->pool_ns, *p, len))
+ changed = true;
+ *p += len;
+ } else {
+ if (oloc->pool_ns)
+ changed = true;
+ }
+ if (changed) {
+ /* redirect changes namespace */
+ pr_warn("ceph_object_locator::nspace is changed\n");
goto e_inval;
}
}
@@ -2806,7 +2833,9 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
goto out_unlock_session;
}
+ m.redirect.oloc.pool_ns = req->r_t.target_oloc.pool_ns;
ret = decode_MOSDOpReply(msg, &m);
+ m.redirect.oloc.pool_ns = NULL;
if (ret) {
pr_err("failed to decode MOSDOpReply for tid %llu: %d\n",
req->r_tid, ret);
@@ -2835,7 +2864,11 @@ static void handle_reply(struct ceph_osd *osd, struct ceph_msg *msg)
unlink_request(osd, req);
mutex_unlock(&osd->lock);
- ceph_oloc_copy(&req->r_t.target_oloc, &m.redirect.oloc);
+ /*
+ * Not ceph_oloc_copy() - changing pool_ns is not
+ * supported.
+ */
+ req->r_t.target_oloc.pool = m.redirect.oloc.pool;
req->r_flags |= CEPH_OSD_FLAG_REDIRECTED;
req->r_tid = 0;
__submit_request(req, false);
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7e480bf75bcf..d2436880b305 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -1510,6 +1510,24 @@ bad:
return ERR_PTR(err);
}
+void ceph_oloc_copy(struct ceph_object_locator *dest,
+ const struct ceph_object_locator *src)
+{
+ WARN_ON(!ceph_oloc_empty(dest));
+ WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
+
+ dest->pool = src->pool;
+ if (src->pool_ns)
+ dest->pool_ns = ceph_get_string(src->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_copy);
+
+void ceph_oloc_destroy(struct ceph_object_locator *oloc)
+{
+ ceph_put_string(oloc->pool_ns);
+}
+EXPORT_SYMBOL(ceph_oloc_destroy);
+
void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src)
{
@@ -1770,9 +1788,9 @@ int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 *ono,
u64 *oxoff, u64 *oxlen)
{
- u32 osize = le32_to_cpu(layout->fl_object_size);
- u32 su = le32_to_cpu(layout->fl_stripe_unit);
- u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 osize = layout->object_size;
+ u32 su = layout->stripe_unit;
+ u32 sc = layout->stripe_count;
u32 bl, stripeno, stripepos, objsetno;
u32 su_per_object;
u64 t, su_offset;
@@ -1844,12 +1862,34 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
if (!pi)
return -ENOENT;
- raw_pgid->pool = oloc->pool;
- raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
- oid->name_len);
-
- dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
- raw_pgid->pool, raw_pgid->seed);
+ if (!oloc->pool_ns) {
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, oid->name,
+ oid->name_len);
+ dout("%s %s -> raw_pgid %llu.%x\n", __func__, oid->name,
+ raw_pgid->pool, raw_pgid->seed);
+ } else {
+ char stack_buf[256];
+ char *buf = stack_buf;
+ int nsl = oloc->pool_ns->len;
+ size_t total = nsl + 1 + oid->name_len;
+
+ if (total > sizeof(stack_buf)) {
+ buf = kmalloc(total, GFP_NOIO);
+ if (!buf)
+ return -ENOMEM;
+ }
+ memcpy(buf, oloc->pool_ns->str, nsl);
+ buf[nsl] = '\037';
+ memcpy(buf + nsl + 1, oid->name, oid->name_len);
+ raw_pgid->pool = oloc->pool;
+ raw_pgid->seed = ceph_str_hash(pi->object_hash, buf, total);
+ if (buf != stack_buf)
+ kfree(buf);
+ dout("%s %s ns %.*s -> raw_pgid %llu.%x\n", __func__,
+ oid->name, nsl, oloc->pool_ns->str,
+ raw_pgid->pool, raw_pgid->seed);
+ }
return 0;
}
EXPORT_SYMBOL(ceph_object_locator_to_pg);
diff --git a/net/ceph/string_table.c b/net/ceph/string_table.c
new file mode 100644
index 000000000000..ca53c8319209
--- /dev/null
+++ b/net/ceph/string_table.c
@@ -0,0 +1,111 @@
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/ceph/string_table.h>
+
+static DEFINE_SPINLOCK(string_tree_lock);
+static struct rb_root string_tree = RB_ROOT;
+
+struct ceph_string *ceph_find_or_create_string(const char* str, size_t len)
+{
+ struct ceph_string *cs, *exist;
+ struct rb_node **p, *parent;
+ int ret;
+
+ exist = NULL;
+ spin_lock(&string_tree_lock);
+ p = &string_tree.rb_node;
+ while (*p) {
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ if (exist && !kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ exist = NULL;
+ }
+ spin_unlock(&string_tree_lock);
+ if (exist)
+ return exist;
+
+ cs = kmalloc(sizeof(*cs) + len + 1, GFP_NOFS);
+ if (!cs)
+ return NULL;
+
+ kref_init(&cs->kref);
+ cs->len = len;
+ memcpy(cs->str, str, len);
+ cs->str[len] = 0;
+
+retry:
+ exist = NULL;
+ parent = NULL;
+ p = &string_tree.rb_node;
+ spin_lock(&string_tree_lock);
+ while (*p) {
+ parent = *p;
+ exist = rb_entry(*p, struct ceph_string, node);
+ ret = ceph_compare_string(exist, str, len);
+ if (ret > 0)
+ p = &(*p)->rb_left;
+ else if (ret < 0)
+ p = &(*p)->rb_right;
+ else
+ break;
+ exist = NULL;
+ }
+ ret = 0;
+ if (!exist) {
+ rb_link_node(&cs->node, parent, p);
+ rb_insert_color(&cs->node, &string_tree);
+ } else if (!kref_get_unless_zero(&exist->kref)) {
+ rb_erase(&exist->node, &string_tree);
+ RB_CLEAR_NODE(&exist->node);
+ ret = -EAGAIN;
+ }
+ spin_unlock(&string_tree_lock);
+ if (ret == -EAGAIN)
+ goto retry;
+
+ if (exist) {
+ kfree(cs);
+ cs = exist;
+ }
+
+ return cs;
+}
+EXPORT_SYMBOL(ceph_find_or_create_string);
+
+static void ceph_free_string(struct rcu_head *head)
+{
+ struct ceph_string *cs = container_of(head, struct ceph_string, rcu);
+ kfree(cs);
+}
+
+void ceph_release_string(struct kref *ref)
+{
+ struct ceph_string *cs = container_of(ref, struct ceph_string, kref);
+
+ spin_lock(&string_tree_lock);
+ if (!RB_EMPTY_NODE(&cs->node)) {
+ rb_erase(&cs->node, &string_tree);
+ RB_CLEAR_NODE(&cs->node);
+ }
+ spin_unlock(&string_tree_lock);
+
+ call_rcu(&cs->rcu, ceph_free_string);
+}
+EXPORT_SYMBOL(ceph_release_string);
+
+bool ceph_strings_empty(void)
+{
+ return RB_EMPTY_ROOT(&string_tree);
+}
diff --git a/scripts/Kbuild.include b/scripts/Kbuild.include
index 0f82314621f2..15b196fc2f49 100644
--- a/scripts/Kbuild.include
+++ b/scripts/Kbuild.include
@@ -202,7 +202,7 @@ hdr-inst := -f $(srctree)/scripts/Makefile.headersinst obj
# Prefix -I with $(srctree) if it is not an absolute path.
# skip if -I has no parameter
addtree = $(if $(patsubst -I%,%,$(1)), \
-$(if $(filter-out -I/%,$(1)),$(patsubst -I%,-I$(srctree)/%,$(1))) $(1))
+$(if $(filter-out -I/% -I./% -I../%,$(1)),$(patsubst -I%,-I$(srctree)/%,$(1)),$(1)))
# Find all -I options and call addtree
flags = $(foreach o,$($(1)),$(if $(filter -I%,$(o)),$(call addtree,$(o)),$(o)))
diff --git a/scripts/Makefile b/scripts/Makefile
index 822ab4a6a4aa..1d80897a9644 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -47,4 +47,4 @@ subdir-$(CONFIG_DTC) += dtc
subdir-$(CONFIG_GDB_SCRIPTS) += gdb
# Let clean descend into subdirs
-subdir- += basic kconfig package
+subdir- += basic kconfig package gcc-plugins
diff --git a/scripts/Makefile.build b/scripts/Makefile.build
index 0d1ca5bf42fb..11602e5efb3b 100644
--- a/scripts/Makefile.build
+++ b/scripts/Makefile.build
@@ -60,7 +60,7 @@ endif
endif
# Do not include host rules unless needed
-ifneq ($(hostprogs-y)$(hostprogs-m),)
+ifneq ($(hostprogs-y)$(hostprogs-m)$(hostlibs-y)$(hostlibs-m)$(hostcxxlibs-y)$(hostcxxlibs-m),)
include scripts/Makefile.host
endif
diff --git a/scripts/Makefile.clean b/scripts/Makefile.clean
index 55c96cb8070f..50616ea25131 100644
--- a/scripts/Makefile.clean
+++ b/scripts/Makefile.clean
@@ -38,7 +38,9 @@ subdir-ymn := $(addprefix $(obj)/,$(subdir-ymn))
__clean-files := $(extra-y) $(extra-m) $(extra-) \
$(always) $(targets) $(clean-files) \
$(host-progs) \
- $(hostprogs-y) $(hostprogs-m) $(hostprogs-)
+ $(hostprogs-y) $(hostprogs-m) $(hostprogs-) \
+ $(hostlibs-y) $(hostlibs-m) $(hostlibs-) \
+ $(hostcxxlibs-y) $(hostcxxlibs-m)
__clean-files := $(filter-out $(no-clean-files), $(__clean-files))
diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins
new file mode 100644
index 000000000000..5e22b60589c1
--- /dev/null
+++ b/scripts/Makefile.gcc-plugins
@@ -0,0 +1,43 @@
+ifdef CONFIG_GCC_PLUGINS
+ __PLUGINCC := $(call cc-ifversion, -ge, 0408, $(HOSTCXX), $(HOSTCC))
+ PLUGINCC := $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)")
+
+ SANCOV_PLUGIN := -fplugin=$(objtree)/scripts/gcc-plugins/sancov_plugin.so
+
+ gcc-plugin-$(CONFIG_GCC_PLUGIN_CYC_COMPLEXITY) += cyc_complexity_plugin.so
+
+ ifdef CONFIG_GCC_PLUGIN_SANCOV
+ ifeq ($(CFLAGS_KCOV),)
+ # It is needed because of the gcc-plugin.sh and gcc version checks.
+ gcc-plugin-$(CONFIG_GCC_PLUGIN_SANCOV) += sancov_plugin.so
+
+ ifneq ($(PLUGINCC),)
+ CFLAGS_KCOV := $(SANCOV_PLUGIN)
+ else
+ $(warning warning: cannot use CONFIG_KCOV: -fsanitize-coverage=trace-pc is not supported by compiler)
+ endif
+ endif
+ endif
+
+ GCC_PLUGINS_CFLAGS := $(addprefix -fplugin=$(objtree)/scripts/gcc-plugins/, $(gcc-plugin-y))
+
+ export PLUGINCC GCC_PLUGINS_CFLAGS GCC_PLUGIN SANCOV_PLUGIN
+
+ ifeq ($(PLUGINCC),)
+ ifneq ($(GCC_PLUGINS_CFLAGS),)
+ ifeq ($(call cc-ifversion, -ge, 0405, y), y)
+ PLUGINCC := $(shell $(CONFIG_SHELL) -x $(srctree)/scripts/gcc-plugin.sh "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)")
+ $(warning warning: your gcc installation does not support plugins, perhaps the necessary headers are missing?)
+ else
+ $(warning warning: your gcc version does not support plugins, you should upgrade it to gcc 4.5 at least)
+ endif
+ endif
+ else
+ # SANCOV_PLUGIN can be only in CFLAGS_KCOV because avoid duplication.
+ GCC_PLUGINS_CFLAGS := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGINS_CFLAGS))
+ endif
+
+ KBUILD_CFLAGS += $(GCC_PLUGINS_CFLAGS)
+ GCC_PLUGIN := $(gcc-plugin-y)
+
+endif
diff --git a/scripts/Makefile.host b/scripts/Makefile.host
index 133edfae5b8a..45b5b1aaedbd 100644
--- a/scripts/Makefile.host
+++ b/scripts/Makefile.host
@@ -20,7 +20,15 @@
# Will compile qconf as a C++ program, and menu as a C program.
# They are linked as C++ code to the executable qconf
+# hostcc-option
+# Usage: cflags-y += $(call hostcc-option,-march=winchip-c6,-march=i586)
+
+hostcc-option = $(call try-run,\
+ $(HOSTCC) $(HOSTCFLAGS) $(HOST_EXTRACFLAGS) $(1) -c -x c /dev/null -o "$$TMP",$(1),$(2))
+
__hostprogs := $(sort $(hostprogs-y) $(hostprogs-m))
+host-cshlib := $(sort $(hostlibs-y) $(hostlibs-m))
+host-cxxshlib := $(sort $(hostcxxlibs-y) $(hostcxxlibs-m))
# C code
# Executables compiled from a single .c file
@@ -42,6 +50,10 @@ host-cxxmulti := $(foreach m,$(__hostprogs),$(if $($(m)-cxxobjs),$(m)))
# C++ Object (.o) files compiled from .cc files
host-cxxobjs := $(sort $(foreach m,$(host-cxxmulti),$($(m)-cxxobjs)))
+# Object (.o) files used by the shared libaries
+host-cshobjs := $(sort $(foreach m,$(host-cshlib),$($(m:.so=-objs))))
+host-cxxshobjs := $(sort $(foreach m,$(host-cxxshlib),$($(m:.so=-objs))))
+
# output directory for programs/.o files
# hostprogs-y := tools/build may have been specified.
# Retrieve also directory of .o files from prog-objs or prog-cxxobjs notation
@@ -56,6 +68,10 @@ host-cmulti := $(addprefix $(obj)/,$(host-cmulti))
host-cobjs := $(addprefix $(obj)/,$(host-cobjs))
host-cxxmulti := $(addprefix $(obj)/,$(host-cxxmulti))
host-cxxobjs := $(addprefix $(obj)/,$(host-cxxobjs))
+host-cshlib := $(addprefix $(obj)/,$(host-cshlib))
+host-cxxshlib := $(addprefix $(obj)/,$(host-cxxshlib))
+host-cshobjs := $(addprefix $(obj)/,$(host-cshobjs))
+host-cxxshobjs := $(addprefix $(obj)/,$(host-cxxshobjs))
host-objdirs := $(addprefix $(obj)/,$(host-objdirs))
obj-dirs += $(host-objdirs)
@@ -124,5 +140,42 @@ quiet_cmd_host-cxxobjs = HOSTCXX $@
$(host-cxxobjs): $(obj)/%.o: $(src)/%.cc FORCE
$(call if_changed_dep,host-cxxobjs)
+# Compile .c file, create position independent .o file
+# host-cshobjs -> .o
+quiet_cmd_host-cshobjs = HOSTCC -fPIC $@
+ cmd_host-cshobjs = $(HOSTCC) $(hostc_flags) -fPIC -c -o $@ $<
+$(host-cshobjs): $(obj)/%.o: $(src)/%.c FORCE
+ $(call if_changed_dep,host-cshobjs)
+
+# Compile .c file, create position independent .o file
+# Note that plugin capable gcc versions can be either C or C++ based
+# therefore plugin source files have to be compilable in both C and C++ mode.
+# This is why a C++ compiler is invoked on a .c file.
+# host-cxxshobjs -> .o
+quiet_cmd_host-cxxshobjs = HOSTCXX -fPIC $@
+ cmd_host-cxxshobjs = $(HOSTCXX) $(hostcxx_flags) -fPIC -c -o $@ $<
+$(host-cxxshobjs): $(obj)/%.o: $(src)/%.c FORCE
+ $(call if_changed_dep,host-cxxshobjs)
+
+# Link a shared library, based on position independent .o files
+# *.o -> .so shared library (host-cshlib)
+quiet_cmd_host-cshlib = HOSTLLD -shared $@
+ cmd_host-cshlib = $(HOSTCC) $(HOSTLDFLAGS) -shared -o $@ \
+ $(addprefix $(obj)/,$($(@F:.so=-objs))) \
+ $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
+$(host-cshlib): FORCE
+ $(call if_changed,host-cshlib)
+$(call multi_depend, $(host-cshlib), .so, -objs)
+
+# Link a shared library, based on position independent .o files
+# *.o -> .so shared library (host-cxxshlib)
+quiet_cmd_host-cxxshlib = HOSTLLD -shared $@
+ cmd_host-cxxshlib = $(HOSTCXX) $(HOSTLDFLAGS) -shared -o $@ \
+ $(addprefix $(obj)/,$($(@F:.so=-objs))) \
+ $(HOST_LOADLIBES) $(HOSTLOADLIBES_$(@F))
+$(host-cxxshlib): FORCE
+ $(call if_changed,host-cxxshlib)
+$(call multi_depend, $(host-cxxshlib), .so, -objs)
+
targets += $(host-csingle) $(host-cmulti) $(host-cobjs)\
- $(host-cxxmulti) $(host-cxxobjs)
+ $(host-cxxmulti) $(host-cxxobjs) $(host-cshlib) $(host-cshobjs) $(host-cxxshlib) $(host-cxxshobjs)
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index e7df0f5db7ec..e89c214745eb 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -155,9 +155,10 @@ else
# $(call addtree,-I$(obj)) locates .h files in srctree, from generated .c files
# and locates generated .h files
# FIXME: Replace both with specific CFLAGS* statements in the makefiles
-__c_flags = $(call addtree,-I$(obj)) $(call flags,_c_flags)
-__a_flags = $(call flags,_a_flags)
-__cpp_flags = $(call flags,_cpp_flags)
+__c_flags = $(if $(obj),-I$(srctree)/$(src) -I$(obj)) \
+ $(call flags,_c_flags)
+__a_flags = $(call flags,_a_flags)
+__cpp_flags = $(call flags,_cpp_flags)
endif
c_flags = -Wp,-MD,$(depfile) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) \
diff --git a/scripts/basic/bin2c.c b/scripts/basic/bin2c.c
index af187e695345..c3d7eef3ad06 100644
--- a/scripts/basic/bin2c.c
+++ b/scripts/basic/bin2c.c
@@ -29,7 +29,8 @@ int main(int argc, char *argv[])
} while (ch != EOF);
if (argc > 1)
- printf("\t;\n\nconst int %s_size = %d;\n", argv[1], total);
+ printf("\t;\n\n#include <linux/types.h>\n\nconst size_t %s_size = %d;\n",
+ argv[1], total);
return 0;
}
diff --git a/scripts/coccicheck b/scripts/coccicheck
index dd85a455b2ba..c92c1528a54d 100755
--- a/scripts/coccicheck
+++ b/scripts/coccicheck
@@ -1,14 +1,24 @@
#!/bin/bash
-
+# Linux kernel coccicheck
+#
+# Read Documentation/coccinelle.txt
#
# This script requires at least spatch
# version 1.0.0-rc11.
-#
+DIR="$(dirname $(readlink -f $0))/.."
SPATCH="`which ${SPATCH:=spatch}`"
-trap kill_running SIGTERM SIGINT
-declare -a SPATCH_PID
+if [ ! -x "$SPATCH" ]; then
+ echo 'spatch is part of the Coccinelle project and is available at http://coccinelle.lip6.fr/'
+ exit 1
+fi
+
+SPATCH_VERSION=$($SPATCH --version | head -1 | awk '{print $3}')
+SPATCH_VERSION_NUM=$(echo $SPATCH_VERSION | ${DIR}/scripts/ld-version.sh)
+
+USE_JOBS="no"
+$SPATCH --help | grep "\-\-jobs" > /dev/null && USE_JOBS="yes"
# The verbosity may be set by the environmental parameter V=
# as for example with 'make V=1 coccicheck'
@@ -25,7 +35,28 @@ else
NPROC="$J"
fi
-FLAGS="$SPFLAGS --very-quiet"
+FLAGS="--very-quiet"
+
+# You can use SPFLAGS to append extra arguments to coccicheck or override any
+# heuristics done in this file as Coccinelle accepts the last options when
+# options conflict.
+#
+# A good example for use of SPFLAGS is if you want to debug your cocci script,
+# you can for instance use the following:
+#
+# $ export COCCI=scripts/coccinelle/misc/irqf_oneshot.cocci
+# $ make coccicheck MODE=report DEBUG_FILE="all.err" SPFLAGS="--profile --show-trying" M=./drivers/mfd/arizona-irq.c
+#
+# "--show-trying" should show you what rule is being processed as it goes to
+# stdout, you do not need a debug file for that. The profile output will be
+# be sent to stdout, if you provide a DEBUG_FILE the profiling data can be
+# inspected there.
+#
+# --profile will not output if --very-quiet is used, so avoid it.
+echo $SPFLAGS | egrep -e "--profile|--show-trying" 2>&1 > /dev/null
+if [ $? -eq 0 ]; then
+ FLAGS="--quiet"
+fi
# spatch only allows include directories with the syntax "-I include"
# while gcc also allows "-Iinclude" and "-include include"
@@ -51,9 +82,14 @@ if [ "$KBUILD_EXTMOD" != "" ] ; then
OPTIONS="--patch $srctree $OPTIONS"
fi
-if [ ! -x "$SPATCH" ]; then
- echo 'spatch is part of the Coccinelle project and is available at http://coccinelle.lip6.fr/'
- exit 1
+# You can override by using SPFLAGS
+if [ "$USE_JOBS" = "no" ]; then
+ trap kill_running SIGTERM SIGINT
+ declare -a SPATCH_PID
+elif [ "$NPROC" != "1" ]; then
+ # Using 0 should work as well, refer to _SC_NPROCESSORS_ONLN use on
+ # https://github.com/rdicosmo/parmap/blob/master/setcore_stubs.c
+ OPTIONS="$OPTIONS --jobs $NPROC --chunksize 1"
fi
if [ "$MODE" = "" ] ; then
@@ -72,7 +108,7 @@ if [ "$MODE" = "chain" ] ; then
echo 'All available modes will be tried (in that order): patch, report, context, org'
fi
elif [ "$MODE" = "report" -o "$MODE" = "org" ] ; then
- FLAGS="$FLAGS --no-show-diff"
+ FLAGS="--no-show-diff $FLAGS"
fi
if [ "$ONLINE" = "0" ] ; then
@@ -82,7 +118,26 @@ if [ "$ONLINE" = "0" ] ; then
echo ''
fi
-run_cmd() {
+run_cmd_parmap() {
+ if [ $VERBOSE -ne 0 ] ; then
+ echo "Running ($NPROC in parallel): $@"
+ fi
+ if [ "$DEBUG_FILE" != "/dev/null" -a "$DEBUG_FILE" != "" ]; then
+ if [ -f $DEBUG_FILE ]; then
+ echo "Debug file $DEBUG_FILE exists, bailing"
+ exit
+ fi
+ else
+ DEBUG_FILE="/dev/null"
+ fi
+ $@ 2>$DEBUG_FILE
+ if [[ $? -ne 0 ]]; then
+ echo "coccicheck failed"
+ exit $?
+ fi
+}
+
+run_cmd_old() {
local i
if [ $VERBOSE -ne 0 ] ; then
echo "Running ($NPROC in parallel): $@"
@@ -97,6 +152,14 @@ run_cmd() {
wait
}
+run_cmd() {
+ if [ "$USE_JOBS" = "yes" ]; then
+ run_cmd_parmap $@
+ else
+ run_cmd_old $@
+ fi
+}
+
kill_running() {
for i in $(seq 0 $(( NPROC - 1 )) ); do
if [ $VERBOSE -eq 2 ] ; then
@@ -106,10 +169,23 @@ kill_running() {
done
}
+# You can override heuristics with SPFLAGS, these must always go last
+OPTIONS="$OPTIONS $SPFLAGS"
+
coccinelle () {
COCCI="$1"
OPT=`grep "Option" $COCCI | cut -d':' -f2`
+ REQ=`grep "Requires" $COCCI | cut -d':' -f2 | sed "s| ||"`
+ REQ_NUM=$(echo $REQ | ${DIR}/scripts/ld-version.sh)
+ if [ "$REQ_NUM" != "0" ] ; then
+ if [ "$SPATCH_VERSION_NUM" -lt "$REQ_NUM" ] ; then
+ echo "Skipping coccinele SmPL patch: $COCCI"
+ echo "You have coccinelle: $SPATCH_VERSION"
+ echo "This SmPL patch requires: $REQ"
+ return
+ fi
+ fi
# The option '--parse-cocci' can be used to syntactically check the SmPL files.
#
diff --git a/scripts/coccinelle/free/devm_free.cocci b/scripts/coccinelle/free/devm_free.cocci
index 3d9349012bb3..c990d2c7ee16 100644
--- a/scripts/coccinelle/free/devm_free.cocci
+++ b/scripts/coccinelle/free/devm_free.cocci
@@ -29,8 +29,24 @@ expression x;
@@
(
+ x = devm_kmalloc(...)
+|
+ x = devm_kvasprintf(...)
+|
+ x = devm_kasprintf(...)
+|
x = devm_kzalloc(...)
|
+ x = devm_kmalloc_array(...)
+|
+ x = devm_kcalloc(...)
+|
+ x = devm_kstrdup(...)
+|
+ x = devm_kmemdup(...)
+|
+ x = devm_get_free_pages(...)
+|
x = devm_request_irq(...)
|
x = devm_ioremap(...)
@@ -48,6 +64,16 @@ position p;
(
* kfree@p(x)
|
+* kzfree@p(x)
+|
+* __krealloc@p(x, ...)
+|
+* krealloc@p(x, ...)
+|
+* free_pages@p(x, ...)
+|
+* free_page@p(x)
+|
* free_irq@p(x)
|
* iounmap@p(x)
diff --git a/scripts/coccinelle/free/ifnullfree.cocci b/scripts/coccinelle/free/ifnullfree.cocci
index 52bd235286fa..14a4cd98e83b 100644
--- a/scripts/coccinelle/free/ifnullfree.cocci
+++ b/scripts/coccinelle/free/ifnullfree.cocci
@@ -20,6 +20,8 @@ expression E;
(
kfree(E);
|
+ kzfree(E);
+|
debugfs_remove(E);
|
debugfs_remove_recursive(E);
@@ -39,7 +41,7 @@ position p;
@@
* if (E != NULL)
-* \(kfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
+* \(kfree@p\|kzfree@p\|debugfs_remove@p\|debugfs_remove_recursive@p\|
* usb_free_urb@p\|kmem_cache_destroy@p\|mempool_destroy@p\|
* dma_pool_destroy@p\)(E);
diff --git a/scripts/coccinelle/free/kfree.cocci b/scripts/coccinelle/free/kfree.cocci
index 577b78056990..ac438da4fd7b 100644
--- a/scripts/coccinelle/free/kfree.cocci
+++ b/scripts/coccinelle/free/kfree.cocci
@@ -20,7 +20,11 @@ expression E;
position p1;
@@
-kfree@p1(E)
+(
+* kfree@p1(E)
+|
+* kzfree@p1(E)
+)
@print expression@
constant char [] c;
@@ -60,7 +64,11 @@ position ok;
@@
while (1) { ...
- kfree@ok(E)
+(
+* kfree@ok(E)
+|
+* kzfree@ok(E)
+)
... when != break;
when != goto l;
when forall
@@ -74,7 +82,11 @@ statement S;
position free.p1!=loop.ok,p2!={print.p,sz.p};
@@
-kfree@p1(E,...)
+(
+* kfree@p1(E,...)
+|
+* kzfree@p1(E,...)
+)
...
(
iter(...,subE,...) S // no use
diff --git a/scripts/coccinelle/free/kfreeaddr.cocci b/scripts/coccinelle/free/kfreeaddr.cocci
index ce8aacc314cb..d46063b1db8b 100644
--- a/scripts/coccinelle/free/kfreeaddr.cocci
+++ b/scripts/coccinelle/free/kfreeaddr.cocci
@@ -16,7 +16,11 @@ identifier f;
position p;
@@
+(
* kfree@p(&e->f)
+|
+* kzfree@p(&e->f)
+)
@script:python depends on org@
p << r.p;
@@ -28,5 +32,5 @@ cocci.print_main("kfree",p)
p << r.p;
@@
-msg = "ERROR: kfree of structure field"
+msg = "ERROR: invalid free of structure field"
coccilib.report.print_report(p[0],msg)
diff --git a/scripts/coccinelle/iterators/device_node_continue.cocci b/scripts/coccinelle/iterators/device_node_continue.cocci
index 38ab744a4037..a36c16db171b 100644
--- a/scripts/coccinelle/iterators/device_node_continue.cocci
+++ b/scripts/coccinelle/iterators/device_node_continue.cocci
@@ -5,8 +5,11 @@
// Copyright: (C) 2015 Julia Lawall, Inria. GPLv2.
// URL: http://coccinelle.lip6.fr/
// Options: --no-includes --include-headers
+// Requires: 1.0.4
// Keywords: for_each_child_of_node, etc.
+// This uses a conjunction, which requires at least coccinelle >= 1.0.4
+
virtual patch
virtual context
virtual org
diff --git a/scripts/coccinelle/misc/noderef.cocci b/scripts/coccinelle/misc/noderef.cocci
index 80a831c91161..007f0de0c715 100644
--- a/scripts/coccinelle/misc/noderef.cocci
+++ b/scripts/coccinelle/misc/noderef.cocci
@@ -16,6 +16,7 @@ virtual patch
@depends on patch@
expression *x;
expression f;
+expression i;
type T;
@@
@@ -30,15 +31,26 @@ f(...,(T)(x),...,sizeof(
+ *x
),...)
|
-f(...,sizeof(x),...,(T)(
+f(...,sizeof(
+- x
++ *x
+ ),...,(T)(x),...)
+|
+f(...,(T)(x),...,i*sizeof(
- x
+ *x
),...)
+|
+f(...,i*sizeof(
+- x
++ *x
+ ),...,(T)(x),...)
)
@r depends on !patch@
expression *x;
expression f;
+expression i;
position p;
type T;
@@
@@ -49,6 +61,10 @@ type T;
*f(...,(T)(x),...,sizeof@p(x),...)
|
*f(...,sizeof@p(x),...,(T)(x),...)
+|
+*f(...,(T)(x),...,i*sizeof@p(x),...)
+|
+*f(...,i*sizeof@p(x),...,(T)(x),...)
)
@script:python depends on org@
diff --git a/scripts/gcc-plugin.sh b/scripts/gcc-plugin.sh
new file mode 100755
index 000000000000..fb9207565471
--- /dev/null
+++ b/scripts/gcc-plugin.sh
@@ -0,0 +1,51 @@
+#!/bin/sh
+srctree=$(dirname "$0")
+gccplugins_dir=$($3 -print-file-name=plugin)
+plugincc=$($1 -E -x c++ - -o /dev/null -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
+#include "gcc-common.h"
+#if BUILDING_GCC_VERSION >= 4008 || defined(ENABLE_BUILD_WITH_CXX)
+#warning $2 CXX
+#else
+#warning $1 CC
+#endif
+EOF
+)
+
+if [ $? -ne 0 ]
+then
+ exit 1
+fi
+
+case "$plugincc" in
+ *"$1 CC"*)
+ echo "$1"
+ exit 0
+ ;;
+
+ *"$2 CXX"*)
+ # the c++ compiler needs another test, see below
+ ;;
+
+ *)
+ exit 1
+ ;;
+esac
+
+# we need a c++ compiler that supports the designated initializer GNU extension
+plugincc=$($2 -c -x c++ -std=gnu++98 - -fsyntax-only -I"${srctree}"/gcc-plugins -I"${gccplugins_dir}"/include 2>&1 <<EOF
+#include "gcc-common.h"
+class test {
+public:
+ int test;
+} test = {
+ .test = 1
+};
+EOF
+)
+
+if [ $? -eq 0 ]
+then
+ echo "$2"
+ exit 0
+fi
+exit 1
diff --git a/scripts/gcc-plugins/Makefile b/scripts/gcc-plugins/Makefile
new file mode 100644
index 000000000000..88c8ec47232b
--- /dev/null
+++ b/scripts/gcc-plugins/Makefile
@@ -0,0 +1,27 @@
+GCC_PLUGINS_DIR := $(shell $(CC) -print-file-name=plugin)
+
+ifeq ($(PLUGINCC),$(HOSTCC))
+ HOSTLIBS := hostlibs
+ HOST_EXTRACFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu99 -ggdb
+ export HOST_EXTRACFLAGS
+else
+ HOSTLIBS := hostcxxlibs
+ HOST_EXTRACXXFLAGS += -I$(GCC_PLUGINS_DIR)/include -I$(src) -std=gnu++98 -fno-rtti
+ HOST_EXTRACXXFLAGS += -fno-exceptions -fasynchronous-unwind-tables -ggdb
+ HOST_EXTRACXXFLAGS += -Wno-narrowing -Wno-unused-variable
+ export HOST_EXTRACXXFLAGS
+endif
+
+export GCCPLUGINS_DIR HOSTLIBS
+
+ifneq ($(CFLAGS_KCOV), $(SANCOV_PLUGIN))
+ GCC_PLUGIN := $(filter-out $(SANCOV_PLUGIN), $(GCC_PLUGIN))
+endif
+
+$(HOSTLIBS)-y := $(GCC_PLUGIN)
+always := $($(HOSTLIBS)-y)
+
+cyc_complexity_plugin-objs := cyc_complexity_plugin.o
+sancov_plugin-objs := sancov_plugin.o
+
+clean-files += *.so
diff --git a/scripts/gcc-plugins/cyc_complexity_plugin.c b/scripts/gcc-plugins/cyc_complexity_plugin.c
new file mode 100644
index 000000000000..34df974c6ba3
--- /dev/null
+++ b/scripts/gcc-plugins/cyc_complexity_plugin.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011-2016 by Emese Revfy <re.emese@gmail.com>
+ * Licensed under the GPL v2, or (at your option) v3
+ *
+ * Homepage:
+ * https://github.com/ephox-gcc-plugins/cyclomatic_complexity
+ *
+ * http://en.wikipedia.org/wiki/Cyclomatic_complexity
+ * The complexity M is then defined as:
+ * M = E - N + 2P
+ * where
+ *
+ * E = the number of edges of the graph
+ * N = the number of nodes of the graph
+ * P = the number of connected components (exit nodes).
+ *
+ * Usage (4.5 - 5):
+ * $ make clean; make run
+ */
+
+#include "gcc-common.h"
+
+int plugin_is_GPL_compatible;
+
+static struct plugin_info cyc_complexity_plugin_info = {
+ .version = "20160225",
+ .help = "Cyclomatic Complexity\n",
+};
+
+static unsigned int cyc_complexity_execute(void)
+{
+ int complexity;
+ expanded_location xloc;
+
+ /* M = E - N + 2P */
+ complexity = n_edges_for_fn(cfun) - n_basic_blocks_for_fn(cfun) + 2;
+
+ xloc = expand_location(DECL_SOURCE_LOCATION(current_function_decl));
+ fprintf(stderr, "Cyclomatic Complexity %d %s:%s\n", complexity,
+ xloc.file, DECL_NAME_POINTER(current_function_decl));
+
+ return 0;
+}
+
+#define PASS_NAME cyc_complexity
+
+#define NO_GATE
+#define TODO_FLAGS_FINISH TODO_dump_func
+
+#include "gcc-generate-gimple-pass.h"
+
+int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version)
+{
+ const char * const plugin_name = plugin_info->base_name;
+ struct register_pass_info cyc_complexity_pass_info;
+
+ cyc_complexity_pass_info.pass = make_cyc_complexity_pass();
+ cyc_complexity_pass_info.reference_pass_name = "ssa";
+ cyc_complexity_pass_info.ref_pass_instance_number = 1;
+ cyc_complexity_pass_info.pos_op = PASS_POS_INSERT_AFTER;
+
+ if (!plugin_default_version_check(version, &gcc_version)) {
+ error(G_("incompatible gcc/plugin versions"));
+ return 1;
+ }
+
+ register_callback(plugin_name, PLUGIN_INFO, NULL,
+ &cyc_complexity_plugin_info);
+ register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL,
+ &cyc_complexity_pass_info);
+
+ return 0;
+}
diff --git a/scripts/gcc-plugins/gcc-common.h b/scripts/gcc-plugins/gcc-common.h
new file mode 100644
index 000000000000..172850bcd0d9
--- /dev/null
+++ b/scripts/gcc-plugins/gcc-common.h
@@ -0,0 +1,830 @@
+#ifndef GCC_COMMON_H_INCLUDED
+#define GCC_COMMON_H_INCLUDED
+
+#include "bversion.h"
+#if BUILDING_GCC_VERSION >= 6000
+#include "gcc-plugin.h"
+#else
+#include "plugin.h"
+#endif
+#include "plugin-version.h"
+#include "config.h"
+#include "system.h"
+#include "coretypes.h"
+#include "tm.h"
+#include "line-map.h"
+#include "input.h"
+#include "tree.h"
+
+#include "tree-inline.h"
+#include "version.h"
+#include "rtl.h"
+#include "tm_p.h"
+#include "flags.h"
+#include "hard-reg-set.h"
+#include "output.h"
+#include "except.h"
+#include "function.h"
+#include "toplev.h"
+#include "basic-block.h"
+#include "intl.h"
+#include "ggc.h"
+#include "timevar.h"
+
+#include "params.h"
+
+#if BUILDING_GCC_VERSION <= 4009
+#include "pointer-set.h"
+#else
+#include "hash-map.h"
+#endif
+
+#include "emit-rtl.h"
+#include "debug.h"
+#include "target.h"
+#include "langhooks.h"
+#include "cfgloop.h"
+#include "cgraph.h"
+#include "opts.h"
+
+#if BUILDING_GCC_VERSION == 4005
+#include <sys/mman.h>
+#endif
+
+#if BUILDING_GCC_VERSION >= 4007
+#include "tree-pretty-print.h"
+#include "gimple-pretty-print.h"
+#endif
+
+#if BUILDING_GCC_VERSION >= 4006
+#include "c-family/c-common.h"
+#else
+#include "c-common.h"
+#endif
+
+#if BUILDING_GCC_VERSION <= 4008
+#include "tree-flow.h"
+#else
+#include "tree-cfgcleanup.h"
+#include "tree-ssa-operands.h"
+#include "tree-into-ssa.h"
+#endif
+
+#if BUILDING_GCC_VERSION >= 4008
+#include "is-a.h"
+#endif
+
+#include "diagnostic.h"
+#include "tree-dump.h"
+#include "tree-pass.h"
+#include "predict.h"
+#include "ipa-utils.h"
+
+#if BUILDING_GCC_VERSION >= 4009
+#include "attribs.h"
+#include "varasm.h"
+#include "stor-layout.h"
+#include "internal-fn.h"
+#include "gimple-expr.h"
+#include "gimple-fold.h"
+#include "context.h"
+#include "tree-ssa-alias.h"
+#include "tree-ssa.h"
+#include "stringpool.h"
+#include "tree-ssanames.h"
+#include "print-tree.h"
+#include "tree-eh.h"
+#include "stmt.h"
+#include "gimplify.h"
+#endif
+
+#include "gimple.h"
+
+#if BUILDING_GCC_VERSION >= 4009
+#include "tree-ssa-operands.h"
+#include "tree-phinodes.h"
+#include "tree-cfg.h"
+#include "gimple-iterator.h"
+#include "gimple-ssa.h"
+#include "ssa-iterators.h"
+#endif
+
+#if BUILDING_GCC_VERSION >= 5000
+#include "builtins.h"
+#endif
+
+/* #include "expr.h" where are you... */
+extern rtx emit_move_insn(rtx x, rtx y);
+
+/* missing from basic_block.h... */
+extern void debug_dominance_info(enum cdi_direction dir);
+extern void debug_dominance_tree(enum cdi_direction dir, basic_block root);
+
+#if BUILDING_GCC_VERSION == 4006
+extern void debug_gimple_stmt(gimple);
+extern void debug_gimple_seq(gimple_seq);
+extern void print_gimple_seq(FILE *, gimple_seq, int, int);
+extern void print_gimple_stmt(FILE *, gimple, int, int);
+extern void print_gimple_expr(FILE *, gimple, int, int);
+extern void dump_gimple_stmt(pretty_printer *, gimple, int, int);
+#endif
+
+#define __unused __attribute__((__unused__))
+
+#define DECL_NAME_POINTER(node) IDENTIFIER_POINTER(DECL_NAME(node))
+#define DECL_NAME_LENGTH(node) IDENTIFIER_LENGTH(DECL_NAME(node))
+#define TYPE_NAME_POINTER(node) IDENTIFIER_POINTER(TYPE_NAME(node))
+#define TYPE_NAME_LENGTH(node) IDENTIFIER_LENGTH(TYPE_NAME(node))
+
+/* should come from c-tree.h if only it were installed for gcc 4.5... */
+#define C_TYPE_FIELDS_READONLY(TYPE) TREE_LANG_FLAG_1(TYPE)
+
+#if BUILDING_GCC_VERSION == 4005
+#define FOR_EACH_LOCAL_DECL(FUN, I, D) \
+ for (tree vars = (FUN)->local_decls, (I) = 0; \
+ vars && ((D) = TREE_VALUE(vars)); \
+ vars = TREE_CHAIN(vars), (I)++)
+#define DECL_CHAIN(NODE) (TREE_CHAIN(DECL_MINIMAL_CHECK(NODE)))
+#define FOR_EACH_VEC_ELT(T, V, I, P) \
+ for (I = 0; VEC_iterate(T, (V), (I), (P)); ++(I))
+#define TODO_rebuild_cgraph_edges 0
+#define SCOPE_FILE_SCOPE_P(EXP) (!(EXP))
+
+#ifndef O_BINARY
+#define O_BINARY 0
+#endif
+
+typedef struct varpool_node *varpool_node_ptr;
+
+static inline bool gimple_call_builtin_p(gimple stmt, enum built_in_function code)
+{
+ tree fndecl;
+
+ if (!is_gimple_call(stmt))
+ return false;
+ fndecl = gimple_call_fndecl(stmt);
+ if (!fndecl || DECL_BUILT_IN_CLASS(fndecl) != BUILT_IN_NORMAL)
+ return false;
+ return DECL_FUNCTION_CODE(fndecl) == code;
+}
+
+static inline bool is_simple_builtin(tree decl)
+{
+ if (decl && DECL_BUILT_IN_CLASS(decl) != BUILT_IN_NORMAL)
+ return false;
+
+ switch (DECL_FUNCTION_CODE(decl)) {
+ /* Builtins that expand to constants. */
+ case BUILT_IN_CONSTANT_P:
+ case BUILT_IN_EXPECT:
+ case BUILT_IN_OBJECT_SIZE:
+ case BUILT_IN_UNREACHABLE:
+ /* Simple register moves or loads from stack. */
+ case BUILT_IN_RETURN_ADDRESS:
+ case BUILT_IN_EXTRACT_RETURN_ADDR:
+ case BUILT_IN_FROB_RETURN_ADDR:
+ case BUILT_IN_RETURN:
+ case BUILT_IN_AGGREGATE_INCOMING_ADDRESS:
+ case BUILT_IN_FRAME_ADDRESS:
+ case BUILT_IN_VA_END:
+ case BUILT_IN_STACK_SAVE:
+ case BUILT_IN_STACK_RESTORE:
+ /* Exception state returns or moves registers around. */
+ case BUILT_IN_EH_FILTER:
+ case BUILT_IN_EH_POINTER:
+ case BUILT_IN_EH_COPY_VALUES:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
+static inline void add_local_decl(struct function *fun, tree d)
+{
+ gcc_assert(TREE_CODE(d) == VAR_DECL);
+ fun->local_decls = tree_cons(NULL_TREE, d, fun->local_decls);
+}
+#endif
+
+#if BUILDING_GCC_VERSION <= 4006
+#define ANY_RETURN_P(rtx) (GET_CODE(rtx) == RETURN)
+#define C_DECL_REGISTER(EXP) DECL_LANG_FLAG_4(EXP)
+#define EDGE_PRESERVE 0ULL
+#define HOST_WIDE_INT_PRINT_HEX_PURE "%" HOST_WIDE_INT_PRINT "x"
+#define flag_fat_lto_objects true
+
+#define get_random_seed(noinit) ({ \
+ unsigned HOST_WIDE_INT seed; \
+ sscanf(get_random_seed(noinit), "%" HOST_WIDE_INT_PRINT "x", &seed); \
+ seed * seed; })
+
+#define int_const_binop(code, arg1, arg2) \
+ int_const_binop((code), (arg1), (arg2), 0)
+
+static inline bool gimple_clobber_p(gimple s __unused)
+{
+ return false;
+}
+
+static inline bool gimple_asm_clobbers_memory_p(const_gimple stmt)
+{
+ unsigned i;
+
+ for (i = 0; i < gimple_asm_nclobbers(stmt); i++) {
+ tree op = gimple_asm_clobber_op(stmt, i);
+
+ if (!strcmp(TREE_STRING_POINTER(TREE_VALUE(op)), "memory"))
+ return true;
+ }
+
+ return false;
+}
+
+static inline tree builtin_decl_implicit(enum built_in_function fncode)
+{
+ return implicit_built_in_decls[fncode];
+}
+
+static inline int ipa_reverse_postorder(struct cgraph_node **order)
+{
+ return cgraph_postorder(order);
+}
+
+static inline struct cgraph_node *cgraph_create_node(tree decl)
+{
+ return cgraph_node(decl);
+}
+
+static inline struct cgraph_node *cgraph_get_create_node(tree decl)
+{
+ struct cgraph_node *node = cgraph_get_node(decl);
+
+ return node ? node : cgraph_node(decl);
+}
+
+static inline bool cgraph_function_with_gimple_body_p(struct cgraph_node *node)
+{
+ return node->analyzed && !node->thunk.thunk_p && !node->alias;
+}
+
+static inline struct cgraph_node *cgraph_first_function_with_gimple_body(void)
+{
+ struct cgraph_node *node;
+
+ for (node = cgraph_nodes; node; node = node->next)
+ if (cgraph_function_with_gimple_body_p(node))
+ return node;
+ return NULL;
+}
+
+static inline struct cgraph_node *cgraph_next_function_with_gimple_body(struct cgraph_node *node)
+{
+ for (node = node->next; node; node = node->next)
+ if (cgraph_function_with_gimple_body_p(node))
+ return node;
+ return NULL;
+}
+
+#define FOR_EACH_FUNCTION_WITH_GIMPLE_BODY(node) \
+ for ((node) = cgraph_first_function_with_gimple_body(); (node); \
+ (node) = cgraph_next_function_with_gimple_body(node))
+
+static inline void varpool_add_new_variable(tree decl)
+{
+ varpool_finalize_decl(decl);
+}
+#endif
+
+#if BUILDING_GCC_VERSION <= 4007
+#define FOR_EACH_FUNCTION(node) \
+ for (node = cgraph_nodes; node; node = node->next)
+#define FOR_EACH_VARIABLE(node) \
+ for (node = varpool_nodes; node; node = node->next)
+#define PROP_loops 0
+#define NODE_SYMBOL(node) (node)
+#define NODE_DECL(node) (node)->decl
+#define INSN_LOCATION(INSN) RTL_LOCATION(INSN)
+#define vNULL NULL
+
+static inline int bb_loop_depth(const_basic_block bb)
+{
+ return bb->loop_father ? loop_depth(bb->loop_father) : 0;
+}
+
+static inline bool gimple_store_p(gimple gs)
+{
+ tree lhs = gimple_get_lhs(gs);
+
+ return lhs && !is_gimple_reg(lhs);
+}
+
+static inline void gimple_init_singleton(gimple g __unused)
+{
+}
+#endif
+
+#if BUILDING_GCC_VERSION == 4007 || BUILDING_GCC_VERSION == 4008
+static inline struct cgraph_node *cgraph_alias_target(struct cgraph_node *n)
+{
+ return cgraph_alias_aliased_node(n);
+}
+#endif
+
+#if BUILDING_GCC_VERSION >= 4007 && BUILDING_GCC_VERSION <= 4009
+#define cgraph_create_edge(caller, callee, call_stmt, count, freq, nest) \
+ cgraph_create_edge((caller), (callee), (call_stmt), (count), (freq))
+#define cgraph_create_edge_including_clones(caller, callee, old_call_stmt, call_stmt, count, freq, nest, reason) \
+ cgraph_create_edge_including_clones((caller), (callee), (old_call_stmt), (call_stmt), (count), (freq), (reason))
+#endif
+
+#if BUILDING_GCC_VERSION <= 4008
+#define ENTRY_BLOCK_PTR_FOR_FN(FN) ENTRY_BLOCK_PTR_FOR_FUNCTION(FN)
+#define EXIT_BLOCK_PTR_FOR_FN(FN) EXIT_BLOCK_PTR_FOR_FUNCTION(FN)
+#define basic_block_info_for_fn(FN) ((FN)->cfg->x_basic_block_info)
+#define n_basic_blocks_for_fn(FN) ((FN)->cfg->x_n_basic_blocks)
+#define n_edges_for_fn(FN) ((FN)->cfg->x_n_edges)
+#define last_basic_block_for_fn(FN) ((FN)->cfg->x_last_basic_block)
+#define label_to_block_map_for_fn(FN) ((FN)->cfg->x_label_to_block_map)
+#define profile_status_for_fn(FN) ((FN)->cfg->x_profile_status)
+#define BASIC_BLOCK_FOR_FN(FN, N) BASIC_BLOCK_FOR_FUNCTION((FN), (N))
+#define NODE_IMPLICIT_ALIAS(node) (node)->same_body_alias
+#define VAR_P(NODE) (TREE_CODE(NODE) == VAR_DECL)
+
+static inline bool tree_fits_shwi_p(const_tree t)
+{
+ if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST)
+ return false;
+
+ if (TREE_INT_CST_HIGH(t) == 0 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) >= 0)
+ return true;
+
+ if (TREE_INT_CST_HIGH(t) == -1 && (HOST_WIDE_INT)TREE_INT_CST_LOW(t) < 0 && !TYPE_UNSIGNED(TREE_TYPE(t)))
+ return true;
+
+ return false;
+}
+
+static inline bool tree_fits_uhwi_p(const_tree t)
+{
+ if (t == NULL_TREE || TREE_CODE(t) != INTEGER_CST)
+ return false;
+
+ return TREE_INT_CST_HIGH(t) == 0;
+}
+
+static inline HOST_WIDE_INT tree_to_shwi(const_tree t)
+{
+ gcc_assert(tree_fits_shwi_p(t));
+ return TREE_INT_CST_LOW(t);
+}
+
+static inline unsigned HOST_WIDE_INT tree_to_uhwi(const_tree t)
+{
+ gcc_assert(tree_fits_uhwi_p(t));
+ return TREE_INT_CST_LOW(t);
+}
+
+static inline const char *get_tree_code_name(enum tree_code code)
+{
+ gcc_assert(code < MAX_TREE_CODES);
+ return tree_code_name[code];
+}
+
+#define ipa_remove_stmt_references(cnode, stmt)
+
+typedef union gimple_statement_d gasm;
+typedef union gimple_statement_d gassign;
+typedef union gimple_statement_d gcall;
+typedef union gimple_statement_d gcond;
+typedef union gimple_statement_d gdebug;
+typedef union gimple_statement_d gphi;
+typedef union gimple_statement_d greturn;
+
+static inline gasm *as_a_gasm(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gasm *as_a_const_gasm(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gassign *as_a_gassign(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gassign *as_a_const_gassign(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gcall *as_a_gcall(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gcall *as_a_const_gcall(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gcond *as_a_gcond(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gcond *as_a_const_gcond(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gdebug *as_a_gdebug(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gdebug *as_a_const_gdebug(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gphi *as_a_gphi(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gphi *as_a_const_gphi(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline greturn *as_a_greturn(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const greturn *as_a_const_greturn(const_gimple stmt)
+{
+ return stmt;
+}
+#endif
+
+#if BUILDING_GCC_VERSION == 4008
+#define NODE_SYMBOL(node) (&(node)->symbol)
+#define NODE_DECL(node) (node)->symbol.decl
+#endif
+
+#if BUILDING_GCC_VERSION >= 4008
+#define add_referenced_var(var)
+#define mark_sym_for_renaming(var)
+#define varpool_mark_needed_node(node)
+#define create_var_ann(var)
+#define TODO_dump_func 0
+#define TODO_dump_cgraph 0
+#endif
+
+#if BUILDING_GCC_VERSION <= 4009
+#define TODO_verify_il 0
+#define AVAIL_INTERPOSABLE AVAIL_OVERWRITABLE
+
+#define section_name_prefix LTO_SECTION_NAME_PREFIX
+#define fatal_error(loc, gmsgid, ...) fatal_error((gmsgid), __VA_ARGS__)
+
+typedef struct rtx_def rtx_insn;
+
+static inline void set_decl_section_name(tree node, const char *value)
+{
+ if (value)
+ DECL_SECTION_NAME(node) = build_string(strlen(value) + 1, value);
+ else
+ DECL_SECTION_NAME(node) = NULL;
+}
+#endif
+
+#if BUILDING_GCC_VERSION == 4009
+typedef struct gimple_statement_asm gasm;
+typedef struct gimple_statement_base gassign;
+typedef struct gimple_statement_call gcall;
+typedef struct gimple_statement_base gcond;
+typedef struct gimple_statement_base gdebug;
+typedef struct gimple_statement_phi gphi;
+typedef struct gimple_statement_base greturn;
+
+static inline gasm *as_a_gasm(gimple stmt)
+{
+ return as_a<gasm>(stmt);
+}
+
+static inline const gasm *as_a_const_gasm(const_gimple stmt)
+{
+ return as_a<const gasm>(stmt);
+}
+
+static inline gassign *as_a_gassign(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gassign *as_a_const_gassign(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gcall *as_a_gcall(gimple stmt)
+{
+ return as_a<gcall>(stmt);
+}
+
+static inline const gcall *as_a_const_gcall(const_gimple stmt)
+{
+ return as_a<const gcall>(stmt);
+}
+
+static inline gcond *as_a_gcond(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gcond *as_a_const_gcond(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gdebug *as_a_gdebug(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const gdebug *as_a_const_gdebug(const_gimple stmt)
+{
+ return stmt;
+}
+
+static inline gphi *as_a_gphi(gimple stmt)
+{
+ return as_a<gphi>(stmt);
+}
+
+static inline const gphi *as_a_const_gphi(const_gimple stmt)
+{
+ return as_a<const gphi>(stmt);
+}
+
+static inline greturn *as_a_greturn(gimple stmt)
+{
+ return stmt;
+}
+
+static inline const greturn *as_a_const_greturn(const_gimple stmt)
+{
+ return stmt;
+}
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+#define TODO_ggc_collect 0
+#define NODE_SYMBOL(node) (node)
+#define NODE_DECL(node) (node)->decl
+#define cgraph_node_name(node) (node)->name()
+#define NODE_IMPLICIT_ALIAS(node) (node)->cpp_implicit_alias
+#endif
+
+#if BUILDING_GCC_VERSION >= 5000 && BUILDING_GCC_VERSION < 6000
+/* gimple related */
+template <>
+template <>
+inline bool is_a_helper<const gassign *>::test(const_gimple gs)
+{
+ return gs->code == GIMPLE_ASSIGN;
+}
+#endif
+
+#if BUILDING_GCC_VERSION >= 5000
+#define TODO_verify_ssa TODO_verify_il
+#define TODO_verify_flow TODO_verify_il
+#define TODO_verify_stmts TODO_verify_il
+#define TODO_verify_rtl_sharing TODO_verify_il
+
+#define INSN_DELETED_P(insn) (insn)->deleted()
+
+/* symtab/cgraph related */
+#define debug_cgraph_node(node) (node)->debug()
+#define cgraph_get_node(decl) cgraph_node::get(decl)
+#define cgraph_get_create_node(decl) cgraph_node::get_create(decl)
+#define cgraph_create_node(decl) cgraph_node::create(decl)
+#define cgraph_n_nodes symtab->cgraph_count
+#define cgraph_max_uid symtab->cgraph_max_uid
+#define varpool_get_node(decl) varpool_node::get(decl)
+
+#define cgraph_create_edge(caller, callee, call_stmt, count, freq, nest) \
+ (caller)->create_edge((callee), (call_stmt), (count), (freq))
+#define cgraph_create_edge_including_clones(caller, callee, old_call_stmt, call_stmt, count, freq, nest, reason) \
+ (caller)->create_edge_including_clones((callee), (old_call_stmt), (call_stmt), (count), (freq), (reason))
+
+typedef struct cgraph_node *cgraph_node_ptr;
+typedef struct cgraph_edge *cgraph_edge_p;
+typedef struct varpool_node *varpool_node_ptr;
+
+static inline void change_decl_assembler_name(tree decl, tree name)
+{
+ symtab->change_decl_assembler_name(decl, name);
+}
+
+static inline void varpool_finalize_decl(tree decl)
+{
+ varpool_node::finalize_decl(decl);
+}
+
+static inline void varpool_add_new_variable(tree decl)
+{
+ varpool_node::add(decl);
+}
+
+static inline unsigned int rebuild_cgraph_edges(void)
+{
+ return cgraph_edge::rebuild_edges();
+}
+
+static inline cgraph_node_ptr cgraph_function_node(cgraph_node_ptr node, enum availability *availability)
+{
+ return node->function_symbol(availability);
+}
+
+static inline cgraph_node_ptr cgraph_function_or_thunk_node(cgraph_node_ptr node, enum availability *availability = NULL)
+{
+ return node->ultimate_alias_target(availability);
+}
+
+static inline bool cgraph_only_called_directly_p(cgraph_node_ptr node)
+{
+ return node->only_called_directly_p();
+}
+
+static inline enum availability cgraph_function_body_availability(cgraph_node_ptr node)
+{
+ return node->get_availability();
+}
+
+static inline cgraph_node_ptr cgraph_alias_target(cgraph_node_ptr node)
+{
+ return node->get_alias_target();
+}
+
+static inline struct cgraph_node_hook_list *cgraph_add_function_insertion_hook(cgraph_node_hook hook, void *data)
+{
+ return symtab->add_cgraph_insertion_hook(hook, data);
+}
+
+static inline void cgraph_remove_function_insertion_hook(struct cgraph_node_hook_list *entry)
+{
+ symtab->remove_cgraph_insertion_hook(entry);
+}
+
+static inline struct cgraph_node_hook_list *cgraph_add_node_removal_hook(cgraph_node_hook hook, void *data)
+{
+ return symtab->add_cgraph_removal_hook(hook, data);
+}
+
+static inline void cgraph_remove_node_removal_hook(struct cgraph_node_hook_list *entry)
+{
+ symtab->remove_cgraph_removal_hook(entry);
+}
+
+static inline struct cgraph_2node_hook_list *cgraph_add_node_duplication_hook(cgraph_2node_hook hook, void *data)
+{
+ return symtab->add_cgraph_duplication_hook(hook, data);
+}
+
+static inline void cgraph_remove_node_duplication_hook(struct cgraph_2node_hook_list *entry)
+{
+ symtab->remove_cgraph_duplication_hook(entry);
+}
+
+static inline void cgraph_call_node_duplication_hooks(cgraph_node_ptr node, cgraph_node_ptr node2)
+{
+ symtab->call_cgraph_duplication_hooks(node, node2);
+}
+
+static inline void cgraph_call_edge_duplication_hooks(cgraph_edge *cs1, cgraph_edge *cs2)
+{
+ symtab->call_edge_duplication_hooks(cs1, cs2);
+}
+
+#if BUILDING_GCC_VERSION >= 6000
+typedef gimple *gimple_ptr;
+typedef const gimple *const_gimple_ptr;
+#define gimple gimple_ptr
+#define const_gimple const_gimple_ptr
+#undef CONST_CAST_GIMPLE
+#define CONST_CAST_GIMPLE(X) CONST_CAST(gimple, (X))
+#endif
+
+/* gimple related */
+static inline gimple gimple_build_assign_with_ops(enum tree_code subcode, tree lhs, tree op1, tree op2 MEM_STAT_DECL)
+{
+ return gimple_build_assign(lhs, subcode, op1, op2 PASS_MEM_STAT);
+}
+
+template <>
+template <>
+inline bool is_a_helper<const greturn *>::test(const_gimple gs)
+{
+ return gs->code == GIMPLE_RETURN;
+}
+
+static inline gasm *as_a_gasm(gimple stmt)
+{
+ return as_a<gasm *>(stmt);
+}
+
+static inline const gasm *as_a_const_gasm(const_gimple stmt)
+{
+ return as_a<const gasm *>(stmt);
+}
+
+static inline gassign *as_a_gassign(gimple stmt)
+{
+ return as_a<gassign *>(stmt);
+}
+
+static inline const gassign *as_a_const_gassign(const_gimple stmt)
+{
+ return as_a<const gassign *>(stmt);
+}
+
+static inline gcall *as_a_gcall(gimple stmt)
+{
+ return as_a<gcall *>(stmt);
+}
+
+static inline const gcall *as_a_const_gcall(const_gimple stmt)
+{
+ return as_a<const gcall *>(stmt);
+}
+
+static inline gphi *as_a_gphi(gimple stmt)
+{
+ return as_a<gphi *>(stmt);
+}
+
+static inline const gphi *as_a_const_gphi(const_gimple stmt)
+{
+ return as_a<const gphi *>(stmt);
+}
+
+static inline greturn *as_a_greturn(gimple stmt)
+{
+ return as_a<greturn *>(stmt);
+}
+
+static inline const greturn *as_a_const_greturn(const_gimple stmt)
+{
+ return as_a<const greturn *>(stmt);
+}
+
+/* IPA/LTO related */
+#define ipa_ref_list_referring_iterate(L, I, P) \
+ (L)->referring.iterate((I), &(P))
+#define ipa_ref_list_reference_iterate(L, I, P) \
+ (L)->reference.iterate((I), &(P))
+
+static inline cgraph_node_ptr ipa_ref_referring_node(struct ipa_ref *ref)
+{
+ return dyn_cast<cgraph_node_ptr>(ref->referring);
+}
+
+static inline void ipa_remove_stmt_references(symtab_node *referring_node, gimple stmt)
+{
+ referring_node->remove_stmt_references(stmt);
+}
+#endif
+
+#if BUILDING_GCC_VERSION < 6000
+#define get_inner_reference(exp, pbitsize, pbitpos, poffset, pmode, punsignedp, preversep, pvolatilep, keep_aligning) \
+ get_inner_reference(exp, pbitsize, pbitpos, poffset, pmode, punsignedp, pvolatilep, keep_aligning)
+#define gen_rtx_set(ARG0, ARG1) gen_rtx_SET(VOIDmode, (ARG0), (ARG1))
+#endif
+
+#if BUILDING_GCC_VERSION >= 6000
+#define gen_rtx_set(ARG0, ARG1) gen_rtx_SET((ARG0), (ARG1))
+#endif
+
+#ifdef __cplusplus
+static inline void debug_tree(const_tree t)
+{
+ debug_tree(CONST_CAST_TREE(t));
+}
+
+static inline void debug_gimple_stmt(const_gimple s)
+{
+ debug_gimple_stmt(CONST_CAST_GIMPLE(s));
+}
+#else
+#define debug_tree(t) debug_tree(CONST_CAST_TREE(t))
+#define debug_gimple_stmt(s) debug_gimple_stmt(CONST_CAST_GIMPLE(s))
+#endif
+
+#endif
diff --git a/scripts/gcc-plugins/gcc-generate-gimple-pass.h b/scripts/gcc-plugins/gcc-generate-gimple-pass.h
new file mode 100644
index 000000000000..526c3c79b68e
--- /dev/null
+++ b/scripts/gcc-plugins/gcc-generate-gimple-pass.h
@@ -0,0 +1,175 @@
+/*
+ * Generator for GIMPLE pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ * NO_GATE
+ * NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override
+ * the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n) #n
+#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y) x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct gimple_opt_pass _PASS_NAME_PASS = {
+ .pass = {
+#endif
+ .type = GIMPLE_PASS,
+ .name = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+ .optinfo_flags = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+ .has_gate = _HAS_GATE,
+ .has_execute = _HAS_EXECUTE,
+#else
+ .gate = _GATE,
+ .execute = _EXECUTE,
+ .sub = NULL,
+ .next = NULL,
+ .static_pass_number = 0,
+#endif
+ .tv_id = TV_NONE,
+ .properties_required = PROPERTIES_REQUIRED,
+ .properties_provided = PROPERTIES_PROVIDED,
+ .properties_destroyed = PROPERTIES_DESTROYED,
+ .todo_flags_start = TODO_FLAGS_START,
+ .todo_flags_finish = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+ }
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public gimple_opt_pass {
+public:
+ _PASS_NAME_PASS() : gimple_opt_pass(_PASS_NAME_PASS_DATA, g) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual bool gate(function *) { return _GATE(); }
+#else
+ virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+ virtual opt_pass * clone () { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+ virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/gcc-generate-ipa-pass.h b/scripts/gcc-plugins/gcc-generate-ipa-pass.h
new file mode 100644
index 000000000000..9bd926e072f0
--- /dev/null
+++ b/scripts/gcc-plugins/gcc-generate-ipa-pass.h
@@ -0,0 +1,289 @@
+/*
+ * Generator for IPA pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ * NO_GENERATE_SUMMARY
+ * NO_READ_SUMMARY
+ * NO_WRITE_SUMMARY
+ * NO_READ_OPTIMIZATION_SUMMARY
+ * NO_WRITE_OPTIMIZATION_SUMMARY
+ * NO_STMT_FIXUP
+ * NO_FUNCTION_TRANSFORM
+ * NO_VARIABLE_TRANSFORM
+ * NO_GATE
+ * NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and *TODO_FLAGS_* to override
+ * the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n) #n
+#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y) x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GENERATE_SUMMARY
+#define _GENERATE_SUMMARY NULL
+#else
+#define __GENERATE_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _generate_summary)
+#define _GENERATE_SUMMARY __GENERATE_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_READ_SUMMARY
+#define _READ_SUMMARY NULL
+#else
+#define __READ_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _read_summary)
+#define _READ_SUMMARY __READ_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_WRITE_SUMMARY
+#define _WRITE_SUMMARY NULL
+#else
+#define __WRITE_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _write_summary)
+#define _WRITE_SUMMARY __WRITE_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_READ_OPTIMIZATION_SUMMARY
+#define _READ_OPTIMIZATION_SUMMARY NULL
+#else
+#define __READ_OPTIMIZATION_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _read_optimization_summary)
+#define _READ_OPTIMIZATION_SUMMARY __READ_OPTIMIZATION_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_WRITE_OPTIMIZATION_SUMMARY
+#define _WRITE_OPTIMIZATION_SUMMARY NULL
+#else
+#define __WRITE_OPTIMIZATION_SUMMARY(n) _GCC_PLUGIN_CONCAT2(n, _write_optimization_summary)
+#define _WRITE_OPTIMIZATION_SUMMARY __WRITE_OPTIMIZATION_SUMMARY(PASS_NAME)
+#endif
+
+#ifdef NO_STMT_FIXUP
+#define _STMT_FIXUP NULL
+#else
+#define __STMT_FIXUP(n) _GCC_PLUGIN_CONCAT2(n, _stmt_fixup)
+#define _STMT_FIXUP __STMT_FIXUP(PASS_NAME)
+#endif
+
+#ifdef NO_FUNCTION_TRANSFORM
+#define _FUNCTION_TRANSFORM NULL
+#else
+#define __FUNCTION_TRANSFORM(n) _GCC_PLUGIN_CONCAT2(n, _function_transform)
+#define _FUNCTION_TRANSFORM __FUNCTION_TRANSFORM(PASS_NAME)
+#endif
+
+#ifdef NO_VARIABLE_TRANSFORM
+#define _VARIABLE_TRANSFORM NULL
+#else
+#define __VARIABLE_TRANSFORM(n) _GCC_PLUGIN_CONCAT2(n, _variable_transform)
+#define _VARIABLE_TRANSFORM __VARIABLE_TRANSFORM(PASS_NAME)
+#endif
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#ifndef FUNCTION_TRANSFORM_TODO_FLAGS_START
+#define FUNCTION_TRANSFORM_TODO_FLAGS_START 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct ipa_opt_pass_d _PASS_NAME_PASS = {
+ .pass = {
+#endif
+ .type = IPA_PASS,
+ .name = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+ .optinfo_flags = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+ .has_gate = _HAS_GATE,
+ .has_execute = _HAS_EXECUTE,
+#else
+ .gate = _GATE,
+ .execute = _EXECUTE,
+ .sub = NULL,
+ .next = NULL,
+ .static_pass_number = 0,
+#endif
+ .tv_id = TV_NONE,
+ .properties_required = PROPERTIES_REQUIRED,
+ .properties_provided = PROPERTIES_PROVIDED,
+ .properties_destroyed = PROPERTIES_DESTROYED,
+ .todo_flags_start = TODO_FLAGS_START,
+ .todo_flags_finish = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+ },
+ .generate_summary = _GENERATE_SUMMARY,
+ .write_summary = _WRITE_SUMMARY,
+ .read_summary = _READ_SUMMARY,
+#if BUILDING_GCC_VERSION >= 4006
+ .write_optimization_summary = _WRITE_OPTIMIZATION_SUMMARY,
+ .read_optimization_summary = _READ_OPTIMIZATION_SUMMARY,
+#endif
+ .stmt_fixup = _STMT_FIXUP,
+ .function_transform_todo_flags_start = FUNCTION_TRANSFORM_TODO_FLAGS_START,
+ .function_transform = _FUNCTION_TRANSFORM,
+ .variable_transform = _VARIABLE_TRANSFORM,
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public ipa_opt_pass_d {
+public:
+ _PASS_NAME_PASS() : ipa_opt_pass_d(_PASS_NAME_PASS_DATA,
+ g,
+ _GENERATE_SUMMARY,
+ _WRITE_SUMMARY,
+ _READ_SUMMARY,
+ _WRITE_OPTIMIZATION_SUMMARY,
+ _READ_OPTIMIZATION_SUMMARY,
+ _STMT_FIXUP,
+ FUNCTION_TRANSFORM_TODO_FLAGS_START,
+ _FUNCTION_TRANSFORM,
+ _VARIABLE_TRANSFORM) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual bool gate(function *) { return _GATE(); }
+#else
+ virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+ virtual opt_pass *clone() { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+ virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GENERATE_SUMMARY
+#undef NO_WRITE_SUMMARY
+#undef NO_READ_SUMMARY
+#undef NO_WRITE_OPTIMIZATION_SUMMARY
+#undef NO_READ_OPTIMIZATION_SUMMARY
+#undef NO_STMT_FIXUP
+#undef NO_FUNCTION_TRANSFORM
+#undef NO_VARIABLE_TRANSFORM
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef FUNCTION_TRANSFORM_TODO_FLAGS_START
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _FUNCTION_TRANSFORM
+#undef __FUNCTION_TRANSFORM
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _GENERATE_SUMMARY
+#undef __GENERATE_SUMMARY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+#undef _READ_OPTIMIZATION_SUMMARY
+#undef __READ_OPTIMIZATION_SUMMARY
+#undef _READ_SUMMARY
+#undef __READ_SUMMARY
+#undef _STMT_FIXUP
+#undef __STMT_FIXUP
+#undef _VARIABLE_TRANSFORM
+#undef __VARIABLE_TRANSFORM
+#undef _WRITE_OPTIMIZATION_SUMMARY
+#undef __WRITE_OPTIMIZATION_SUMMARY
+#undef _WRITE_SUMMARY
+#undef __WRITE_SUMMARY
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/gcc-generate-rtl-pass.h b/scripts/gcc-plugins/gcc-generate-rtl-pass.h
new file mode 100644
index 000000000000..1dc67a5aeadf
--- /dev/null
+++ b/scripts/gcc-plugins/gcc-generate-rtl-pass.h
@@ -0,0 +1,175 @@
+/*
+ * Generator for RTL pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ * NO_GATE
+ * NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override
+ * the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n) #n
+#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y) x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct rtl_opt_pass _PASS_NAME_PASS = {
+ .pass = {
+#endif
+ .type = RTL_PASS,
+ .name = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+ .optinfo_flags = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+ .has_gate = _HAS_GATE,
+ .has_execute = _HAS_EXECUTE,
+#else
+ .gate = _GATE,
+ .execute = _EXECUTE,
+ .sub = NULL,
+ .next = NULL,
+ .static_pass_number = 0,
+#endif
+ .tv_id = TV_NONE,
+ .properties_required = PROPERTIES_REQUIRED,
+ .properties_provided = PROPERTIES_PROVIDED,
+ .properties_destroyed = PROPERTIES_DESTROYED,
+ .todo_flags_start = TODO_FLAGS_START,
+ .todo_flags_finish = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+ }
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public rtl_opt_pass {
+public:
+ _PASS_NAME_PASS() : rtl_opt_pass(_PASS_NAME_PASS_DATA, g) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual bool gate(function *) { return _GATE(); }
+#else
+ virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+ virtual opt_pass *clone() { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+ virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h
new file mode 100644
index 000000000000..a27e2b36afaa
--- /dev/null
+++ b/scripts/gcc-plugins/gcc-generate-simple_ipa-pass.h
@@ -0,0 +1,175 @@
+/*
+ * Generator for SIMPLE_IPA pass related boilerplate code/data
+ *
+ * Supports gcc 4.5-6
+ *
+ * Usage:
+ *
+ * 1. before inclusion define PASS_NAME
+ * 2. before inclusion define NO_* for unimplemented callbacks
+ * NO_GATE
+ * NO_EXECUTE
+ * 3. before inclusion define PROPERTIES_* and TODO_FLAGS_* to override
+ * the default 0 values
+ * 4. for convenience, all the above will be undefined after inclusion!
+ * 5. the only exported name is make_PASS_NAME_pass() to register with gcc
+ */
+
+#ifndef PASS_NAME
+#error at least PASS_NAME must be defined
+#else
+#define __GCC_PLUGIN_STRINGIFY(n) #n
+#define _GCC_PLUGIN_STRINGIFY(n) __GCC_PLUGIN_STRINGIFY(n)
+#define _GCC_PLUGIN_CONCAT2(x, y) x ## y
+#define _GCC_PLUGIN_CONCAT3(x, y, z) x ## y ## z
+
+#define __PASS_NAME_PASS_DATA(n) _GCC_PLUGIN_CONCAT2(n, _pass_data)
+#define _PASS_NAME_PASS_DATA __PASS_NAME_PASS_DATA(PASS_NAME)
+
+#define __PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT2(n, _pass)
+#define _PASS_NAME_PASS __PASS_NAME_PASS(PASS_NAME)
+
+#define _PASS_NAME_NAME _GCC_PLUGIN_STRINGIFY(PASS_NAME)
+
+#define __MAKE_PASS_NAME_PASS(n) _GCC_PLUGIN_CONCAT3(make_, n, _pass)
+#define _MAKE_PASS_NAME_PASS __MAKE_PASS_NAME_PASS(PASS_NAME)
+
+#ifdef NO_GATE
+#define _GATE NULL
+#define _HAS_GATE false
+#else
+#define __GATE(n) _GCC_PLUGIN_CONCAT2(n, _gate)
+#define _GATE __GATE(PASS_NAME)
+#define _HAS_GATE true
+#endif
+
+#ifdef NO_EXECUTE
+#define _EXECUTE NULL
+#define _HAS_EXECUTE false
+#else
+#define __EXECUTE(n) _GCC_PLUGIN_CONCAT2(n, _execute)
+#define _EXECUTE __EXECUTE(PASS_NAME)
+#define _HAS_EXECUTE true
+#endif
+
+#ifndef PROPERTIES_REQUIRED
+#define PROPERTIES_REQUIRED 0
+#endif
+
+#ifndef PROPERTIES_PROVIDED
+#define PROPERTIES_PROVIDED 0
+#endif
+
+#ifndef PROPERTIES_DESTROYED
+#define PROPERTIES_DESTROYED 0
+#endif
+
+#ifndef TODO_FLAGS_START
+#define TODO_FLAGS_START 0
+#endif
+
+#ifndef TODO_FLAGS_FINISH
+#define TODO_FLAGS_FINISH 0
+#endif
+
+#if BUILDING_GCC_VERSION >= 4009
+namespace {
+static const pass_data _PASS_NAME_PASS_DATA = {
+#else
+static struct simple_ipa_opt_pass _PASS_NAME_PASS = {
+ .pass = {
+#endif
+ .type = SIMPLE_IPA_PASS,
+ .name = _PASS_NAME_NAME,
+#if BUILDING_GCC_VERSION >= 4008
+ .optinfo_flags = OPTGROUP_NONE,
+#endif
+#if BUILDING_GCC_VERSION >= 5000
+#elif BUILDING_GCC_VERSION == 4009
+ .has_gate = _HAS_GATE,
+ .has_execute = _HAS_EXECUTE,
+#else
+ .gate = _GATE,
+ .execute = _EXECUTE,
+ .sub = NULL,
+ .next = NULL,
+ .static_pass_number = 0,
+#endif
+ .tv_id = TV_NONE,
+ .properties_required = PROPERTIES_REQUIRED,
+ .properties_provided = PROPERTIES_PROVIDED,
+ .properties_destroyed = PROPERTIES_DESTROYED,
+ .todo_flags_start = TODO_FLAGS_START,
+ .todo_flags_finish = TODO_FLAGS_FINISH,
+#if BUILDING_GCC_VERSION < 4009
+ }
+#endif
+};
+
+#if BUILDING_GCC_VERSION >= 4009
+class _PASS_NAME_PASS : public simple_ipa_opt_pass {
+public:
+ _PASS_NAME_PASS() : simple_ipa_opt_pass(_PASS_NAME_PASS_DATA, g) {}
+
+#ifndef NO_GATE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual bool gate(function *) { return _GATE(); }
+#else
+ virtual bool gate(void) { return _GATE(); }
+#endif
+#endif
+
+ virtual opt_pass *clone() { return new _PASS_NAME_PASS(); }
+
+#ifndef NO_EXECUTE
+#if BUILDING_GCC_VERSION >= 5000
+ virtual unsigned int execute(function *) { return _EXECUTE(); }
+#else
+ virtual unsigned int execute(void) { return _EXECUTE(); }
+#endif
+#endif
+};
+}
+
+opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return new _PASS_NAME_PASS();
+}
+#else
+struct opt_pass *_MAKE_PASS_NAME_PASS(void)
+{
+ return &_PASS_NAME_PASS.pass;
+}
+#endif
+
+/* clean up user provided defines */
+#undef PASS_NAME
+#undef NO_GATE
+#undef NO_EXECUTE
+
+#undef PROPERTIES_DESTROYED
+#undef PROPERTIES_PROVIDED
+#undef PROPERTIES_REQUIRED
+#undef TODO_FLAGS_FINISH
+#undef TODO_FLAGS_START
+
+/* clean up generated defines */
+#undef _EXECUTE
+#undef __EXECUTE
+#undef _GATE
+#undef __GATE
+#undef _GCC_PLUGIN_CONCAT2
+#undef _GCC_PLUGIN_CONCAT3
+#undef _GCC_PLUGIN_STRINGIFY
+#undef __GCC_PLUGIN_STRINGIFY
+#undef _HAS_EXECUTE
+#undef _HAS_GATE
+#undef _MAKE_PASS_NAME_PASS
+#undef __MAKE_PASS_NAME_PASS
+#undef _PASS_NAME_NAME
+#undef _PASS_NAME_PASS
+#undef __PASS_NAME_PASS
+#undef _PASS_NAME_PASS_DATA
+#undef __PASS_NAME_PASS_DATA
+
+#endif /* PASS_NAME */
diff --git a/scripts/gcc-plugins/sancov_plugin.c b/scripts/gcc-plugins/sancov_plugin.c
new file mode 100644
index 000000000000..aedd6113cb73
--- /dev/null
+++ b/scripts/gcc-plugins/sancov_plugin.c
@@ -0,0 +1,144 @@
+/*
+ * Copyright 2011-2016 by Emese Revfy <re.emese@gmail.com>
+ * Licensed under the GPL v2, or (at your option) v3
+ *
+ * Homepage:
+ * https://github.com/ephox-gcc-plugins/sancov
+ *
+ * This plugin inserts a __sanitizer_cov_trace_pc() call at the start of basic blocks.
+ * It supports all gcc versions with plugin support (from gcc-4.5 on).
+ * It is based on the commit "Add fuzzing coverage support" by Dmitry Vyukov <dvyukov@google.com>.
+ *
+ * You can read about it more here:
+ * https://gcc.gnu.org/viewcvs/gcc?limit_changes=0&view=revision&revision=231296
+ * http://lwn.net/Articles/674854/
+ * https://github.com/google/syzkaller
+ * https://lwn.net/Articles/677764/
+ *
+ * Usage:
+ * make run
+ */
+
+#include "gcc-common.h"
+
+int plugin_is_GPL_compatible;
+
+tree sancov_fndecl;
+
+static struct plugin_info sancov_plugin_info = {
+ .version = "20160402",
+ .help = "sancov plugin\n",
+};
+
+static unsigned int sancov_execute(void)
+{
+ basic_block bb;
+
+ /* Remove this line when this plugin and kcov will be in the kernel.
+ if (!strcmp(DECL_NAME_POINTER(current_function_decl), DECL_NAME_POINTER(sancov_fndecl)))
+ return 0;
+ */
+
+ FOR_EACH_BB_FN(bb, cfun) {
+ const_gimple stmt;
+ gcall *gcall;
+ gimple_stmt_iterator gsi = gsi_after_labels(bb);
+
+ if (gsi_end_p(gsi))
+ continue;
+
+ stmt = gsi_stmt(gsi);
+ gcall = as_a_gcall(gimple_build_call(sancov_fndecl, 0));
+ gimple_set_location(gcall, gimple_location(stmt));
+ gsi_insert_before(&gsi, gcall, GSI_SAME_STMT);
+ }
+ return 0;
+}
+
+#define PASS_NAME sancov
+
+#define NO_GATE
+#define TODO_FLAGS_FINISH TODO_dump_func | TODO_verify_stmts | TODO_update_ssa_no_phi | TODO_verify_flow
+
+#include "gcc-generate-gimple-pass.h"
+
+static void sancov_start_unit(void __unused *gcc_data, void __unused *user_data)
+{
+ tree leaf_attr, nothrow_attr;
+ tree BT_FN_VOID = build_function_type_list(void_type_node, NULL_TREE);
+
+ sancov_fndecl = build_fn_decl("__sanitizer_cov_trace_pc", BT_FN_VOID);
+
+ DECL_ASSEMBLER_NAME(sancov_fndecl);
+ TREE_PUBLIC(sancov_fndecl) = 1;
+ DECL_EXTERNAL(sancov_fndecl) = 1;
+ DECL_ARTIFICIAL(sancov_fndecl) = 1;
+ DECL_PRESERVE_P(sancov_fndecl) = 1;
+ DECL_UNINLINABLE(sancov_fndecl) = 1;
+ TREE_USED(sancov_fndecl) = 1;
+
+ nothrow_attr = tree_cons(get_identifier("nothrow"), NULL, NULL);
+ decl_attributes(&sancov_fndecl, nothrow_attr, 0);
+ gcc_assert(TREE_NOTHROW(sancov_fndecl));
+#if BUILDING_GCC_VERSION > 4005
+ leaf_attr = tree_cons(get_identifier("leaf"), NULL, NULL);
+ decl_attributes(&sancov_fndecl, leaf_attr, 0);
+#endif
+}
+
+int plugin_init(struct plugin_name_args *plugin_info, struct plugin_gcc_version *version)
+{
+ int i;
+ struct register_pass_info sancov_plugin_pass_info;
+ const char * const plugin_name = plugin_info->base_name;
+ const int argc = plugin_info->argc;
+ const struct plugin_argument * const argv = plugin_info->argv;
+ bool enable = true;
+
+ static const struct ggc_root_tab gt_ggc_r_gt_sancov[] = {
+ {
+ .base = &sancov_fndecl,
+ .nelt = 1,
+ .stride = sizeof(sancov_fndecl),
+ .cb = &gt_ggc_mx_tree_node,
+ .pchw = &gt_pch_nx_tree_node
+ },
+ LAST_GGC_ROOT_TAB
+ };
+
+ /* BBs can be split afterwards?? */
+ sancov_plugin_pass_info.pass = make_sancov_pass();
+#if BUILDING_GCC_VERSION >= 4009
+ sancov_plugin_pass_info.reference_pass_name = "asan";
+#else
+ sancov_plugin_pass_info.reference_pass_name = "nrv";
+#endif
+ sancov_plugin_pass_info.ref_pass_instance_number = 0;
+ sancov_plugin_pass_info.pos_op = PASS_POS_INSERT_BEFORE;
+
+ if (!plugin_default_version_check(version, &gcc_version)) {
+ error(G_("incompatible gcc/plugin versions"));
+ return 1;
+ }
+
+ for (i = 0; i < argc; ++i) {
+ if (!strcmp(argv[i].key, "no-sancov")) {
+ enable = false;
+ continue;
+ }
+ error(G_("unkown option '-fplugin-arg-%s-%s'"), plugin_name, argv[i].key);
+ }
+
+ register_callback(plugin_name, PLUGIN_INFO, NULL, &sancov_plugin_info);
+
+ if (!enable)
+ return 0;
+
+#if BUILDING_GCC_VERSION < 6000
+ register_callback(plugin_name, PLUGIN_START_UNIT, &sancov_start_unit, NULL);
+ register_callback(plugin_name, PLUGIN_REGISTER_GGC_ROOTS, NULL, (void *)&gt_ggc_r_gt_sancov);
+ register_callback(plugin_name, PLUGIN_PASS_MANAGER_SETUP, NULL, &sancov_plugin_pass_info);
+#endif
+
+ return 0;
+}
diff --git a/scripts/link-vmlinux.sh b/scripts/link-vmlinux.sh
index f0f6d9d75435..4f727eb5ec43 100755
--- a/scripts/link-vmlinux.sh
+++ b/scripts/link-vmlinux.sh
@@ -180,7 +180,7 @@ else
fi;
# final build of init/
-${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init
+${MAKE} -f "${srctree}/scripts/Makefile.build" obj=init GCC_PLUGINS_CFLAGS="${GCC_PLUGINS_CFLAGS}"
kallsymso=""
kallsyms_vmlinux=""
diff --git a/scripts/package/builddeb b/scripts/package/builddeb
index 86e56fef7473..e1c09e2f9be7 100755
--- a/scripts/package/builddeb
+++ b/scripts/package/builddeb
@@ -26,6 +26,8 @@ create_package() {
# Fix ownership and permissions
chown -R root:root "$pdir"
chmod -R go-w "$pdir"
+ # in case we are in a restrictive umask environment like 0077
+ chmod -R a+rX "$pdir"
# Create the package
dpkg-gencontrol $forcearch -Vkernel:debarch="${debarch}" -p$pname -P"$pdir"
@@ -238,7 +240,8 @@ maintainer="$name <$email>"
# Try to determine distribution
if [ -n "$KDEB_CHANGELOG_DIST" ]; then
distribution=$KDEB_CHANGELOG_DIST
-elif distribution=$(lsb_release -cs 2>/dev/null) && [ -n "$distribution" ]; then
+# In some cases lsb_release returns the codename as n/a, which breaks dpkg-parsechangelog
+elif distribution=$(lsb_release -cs 2>/dev/null) && [ -n "$distribution" ] && [ "$distribution" != "n/a" ]; then
: # nothing to do in this case
else
distribution="unstable"
@@ -322,13 +325,14 @@ fi
# Build kernel header package
(cd $srctree; find . -name Makefile\* -o -name Kconfig\* -o -name \*.pl) > "$objtree/debian/hdrsrcfiles"
-if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then
- (cd $srctree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrsrcfiles"
-fi
(cd $srctree; find arch/*/include include scripts -type f) >> "$objtree/debian/hdrsrcfiles"
(cd $srctree; find arch/$SRCARCH -name module.lds -o -name Kbuild.platforms -o -name Platform) >> "$objtree/debian/hdrsrcfiles"
(cd $srctree; find $(find arch/$SRCARCH -name include -o -name scripts -type d) -type f) >> "$objtree/debian/hdrsrcfiles"
+if grep -q '^CONFIG_STACK_VALIDATION=y' $KCONFIG_CONFIG ; then
+ (cd $objtree; find tools/objtool -type f -executable) >> "$objtree/debian/hdrobjfiles"
+fi
(cd $objtree; find arch/$SRCARCH/include Module.symvers include scripts -type f) >> "$objtree/debian/hdrobjfiles"
+(cd $objtree; find scripts/gcc-plugins -name \*.so -o -name gcc-common.h) >> "$objtree/debian/hdrobjfiles"
destdir=$kernel_headers_dir/usr/src/linux-headers-$version
mkdir -p "$destdir"
(cd $srctree; tar -c -f - -T -) < "$objtree/debian/hdrsrcfiles" | (cd $destdir; tar -xf -)
diff --git a/scripts/setlocalversion b/scripts/setlocalversion
index 63d91e22ed7c..966dd3924ea9 100755
--- a/scripts/setlocalversion
+++ b/scripts/setlocalversion
@@ -143,7 +143,7 @@ fi
if test -e include/config/auto.conf; then
. include/config/auto.conf
else
- echo "Error: kernelrelease not valid - run 'make prepare' to update it"
+ echo "Error: kernelrelease not valid - run 'make prepare' to update it" >&2
exit 1
fi
diff --git a/virt/kvm/Kconfig b/virt/kvm/Kconfig
index e5d6108f5e85..b0cc1a34db27 100644
--- a/virt/kvm/Kconfig
+++ b/virt/kvm/Kconfig
@@ -16,9 +16,6 @@ config HAVE_KVM_EVENTFD
bool
select EVENTFD
-config KVM_APIC_ARCHITECTURE
- bool
-
config KVM_MMIO
bool
diff --git a/virt/kvm/arm/hyp/vgic-v2-sr.c b/virt/kvm/arm/hyp/vgic-v2-sr.c
index 3a3a699b7489..7cffd9338c49 100644
--- a/virt/kvm/arm/hyp/vgic-v2-sr.c
+++ b/virt/kvm/arm/hyp/vgic-v2-sr.c
@@ -21,18 +21,11 @@
#include <asm/kvm_hyp.h>
-#ifdef CONFIG_KVM_NEW_VGIC
-extern struct vgic_global kvm_vgic_global_state;
-#define vgic_v2_params kvm_vgic_global_state
-#else
-extern struct vgic_params vgic_v2_params;
-#endif
-
static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
void __iomem *base)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
u32 eisr0, eisr1;
int i;
bool expect_mi;
@@ -74,7 +67,7 @@ static void __hyp_text save_maint_int_state(struct kvm_vcpu *vcpu,
static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
u32 elrsr0, elrsr1;
elrsr0 = readl_relaxed(base + GICH_ELRSR0);
@@ -93,7 +86,7 @@ static void __hyp_text save_elrsr(struct kvm_vcpu *vcpu, void __iomem *base)
static void __hyp_text save_lrs(struct kvm_vcpu *vcpu, void __iomem *base)
{
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
int i;
for (i = 0; i < nr_lr; i++) {
@@ -147,7 +140,7 @@ void __hyp_text __vgic_v2_restore_state(struct kvm_vcpu *vcpu)
struct vgic_v2_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v2;
struct vgic_dist *vgic = &kvm->arch.vgic;
void __iomem *base = kern_hyp_va(vgic->vctrl_base);
- int nr_lr = (kern_hyp_va(&vgic_v2_params))->nr_lr;
+ int nr_lr = (kern_hyp_va(&kvm_vgic_global_state))->nr_lr;
int i;
u64 live_lrs = 0;
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
deleted file mode 100644
index 1b0bee095427..000000000000
--- a/virt/kvm/arm/vgic-v2-emul.c
+++ /dev/null
@@ -1,856 +0,0 @@
-/*
- * Contains GICv2 specific emulation code, was in vgic.c before.
- *
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/uaccess.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-#define GICC_ARCH_VERSION_V2 0x2
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg);
-static u8 *vgic_get_sgi_sources(struct vgic_dist *dist, int vcpu_id, int sgi)
-{
- return dist->irq_sgi_sources + vcpu_id * VGIC_NR_SGIS + sgi;
-}
-
-static bool handle_mmio_misc(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
- u32 word_offset = offset & 3;
-
- switch (offset & ~3) {
- case 0: /* GICD_CTLR */
- reg = vcpu->kvm->arch.vgic.enabled;
- vgic_reg_access(mmio, &reg, word_offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vcpu->kvm->arch.vgic.enabled = reg & 1;
- vgic_update_state(vcpu->kvm);
- return true;
- }
- break;
-
- case 4: /* GICD_TYPER */
- reg = (atomic_read(&vcpu->kvm->online_vcpus) - 1) << 5;
- reg |= (vcpu->kvm->arch.vgic.nr_irqs >> 5) - 1;
- vgic_reg_access(mmio, &reg, word_offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- break;
-
- case 8: /* GICD_IIDR */
- reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
- vgic_reg_access(mmio, &reg, word_offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- break;
- }
-
- return false;
-}
-
-static bool handle_mmio_set_enable_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id, ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id, ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_pending_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 *reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
- vcpu->vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- return false;
-}
-
-#define GICD_ITARGETSR_SIZE 32
-#define GICD_CPUTARGETS_BITS 8
-#define GICD_IRQS_PER_ITARGETSR (GICD_ITARGETSR_SIZE / GICD_CPUTARGETS_BITS)
-static u32 vgic_get_target_reg(struct kvm *kvm, int irq)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- int i;
- u32 val = 0;
-
- irq -= VGIC_NR_PRIVATE_IRQS;
-
- for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++)
- val |= 1 << (dist->irq_spi_cpu[irq + i] + i * 8);
-
- return val;
-}
-
-static void vgic_set_target_reg(struct kvm *kvm, u32 val, int irq)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int i, c;
- unsigned long *bmap;
- u32 target;
-
- irq -= VGIC_NR_PRIVATE_IRQS;
-
- /*
- * Pick the LSB in each byte. This ensures we target exactly
- * one vcpu per IRQ. If the byte is null, assume we target
- * CPU0.
- */
- for (i = 0; i < GICD_IRQS_PER_ITARGETSR; i++) {
- int shift = i * GICD_CPUTARGETS_BITS;
-
- target = ffs((val >> shift) & 0xffU);
- target = target ? (target - 1) : 0;
- dist->irq_spi_cpu[irq + i] = target;
- kvm_for_each_vcpu(c, vcpu, kvm) {
- bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[c]);
- if (c == target)
- set_bit(irq + i, bmap);
- else
- clear_bit(irq + i, bmap);
- }
- }
-}
-
-static bool handle_mmio_target_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg;
-
- /* We treat the banked interrupts targets as read-only */
- if (offset < 32) {
- u32 roreg;
-
- roreg = 1 << vcpu->vcpu_id;
- roreg |= roreg << 8;
- roreg |= roreg << 16;
-
- vgic_reg_access(mmio, &roreg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = vgic_get_target_reg(vcpu->kvm, offset & ~3U);
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vgic_set_target_reg(vcpu->kvm, reg, offset & ~3U);
- vgic_update_state(vcpu->kvm);
- return true;
- }
-
- return false;
-}
-
-static bool handle_mmio_cfg_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 *reg;
-
- reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- vcpu->vcpu_id, offset >> 1);
-
- return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-static bool handle_mmio_sgi_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vgic_dispatch_sgi(vcpu, reg);
- vgic_update_state(vcpu->kvm);
- return true;
- }
-
- return false;
-}
-
-/* Handle reads of GICD_CPENDSGIRn and GICD_SPENDSGIRn */
-static bool read_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int sgi;
- int min_sgi = (offset & ~0x3);
- int max_sgi = min_sgi + 3;
- int vcpu_id = vcpu->vcpu_id;
- u32 reg = 0;
-
- /* Copy source SGIs from distributor side */
- for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
- u8 sources = *vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
- reg |= ((u32)sources) << (8 * (sgi - min_sgi));
- }
-
- mmio_data_write(mmio, ~0, reg);
- return false;
-}
-
-static bool write_set_clear_sgi_pend_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, bool set)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int sgi;
- int min_sgi = (offset & ~0x3);
- int max_sgi = min_sgi + 3;
- int vcpu_id = vcpu->vcpu_id;
- u32 reg;
- bool updated = false;
-
- reg = mmio_data_read(mmio, ~0);
-
- /* Clear pending SGIs on the distributor */
- for (sgi = min_sgi; sgi <= max_sgi; sgi++) {
- u8 mask = reg >> (8 * (sgi - min_sgi));
- u8 *src = vgic_get_sgi_sources(dist, vcpu_id, sgi);
-
- if (set) {
- if ((*src & mask) != mask)
- updated = true;
- *src |= mask;
- } else {
- if (*src & mask)
- updated = true;
- *src &= ~mask;
- }
- }
-
- if (updated)
- vgic_update_state(vcpu->kvm);
-
- return updated;
-}
-
-static bool handle_mmio_sgi_set(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (!mmio->is_write)
- return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
- else
- return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, true);
-}
-
-static bool handle_mmio_sgi_clear(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (!mmio->is_write)
- return read_set_clear_sgi_pend_reg(vcpu, mmio, offset);
- else
- return write_set_clear_sgi_pend_reg(vcpu, mmio, offset, false);
-}
-
-static const struct vgic_io_range vgic_dist_ranges[] = {
- {
- .base = GIC_DIST_SOFTINT,
- .len = 4,
- .handle_mmio = handle_mmio_sgi_reg,
- },
- {
- .base = GIC_DIST_CTRL,
- .len = 12,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_misc,
- },
- {
- .base = GIC_DIST_IGROUP,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GIC_DIST_ENABLE_SET,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_enable_reg,
- },
- {
- .base = GIC_DIST_ENABLE_CLEAR,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_enable_reg,
- },
- {
- .base = GIC_DIST_PENDING_SET,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_pending_reg,
- },
- {
- .base = GIC_DIST_PENDING_CLEAR,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_pending_reg,
- },
- {
- .base = GIC_DIST_ACTIVE_SET,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_active_reg,
- },
- {
- .base = GIC_DIST_ACTIVE_CLEAR,
- .len = VGIC_MAX_IRQS / 8,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_active_reg,
- },
- {
- .base = GIC_DIST_PRI,
- .len = VGIC_MAX_IRQS,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_priority_reg,
- },
- {
- .base = GIC_DIST_TARGET,
- .len = VGIC_MAX_IRQS,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_target_reg,
- },
- {
- .base = GIC_DIST_CONFIG,
- .len = VGIC_MAX_IRQS / 4,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_cfg_reg,
- },
- {
- .base = GIC_DIST_SGI_PENDING_CLEAR,
- .len = VGIC_NR_SGIS,
- .handle_mmio = handle_mmio_sgi_clear,
- },
- {
- .base = GIC_DIST_SGI_PENDING_SET,
- .len = VGIC_NR_SGIS,
- .handle_mmio = handle_mmio_sgi_set,
- },
- {}
-};
-
-static void vgic_dispatch_sgi(struct kvm_vcpu *vcpu, u32 reg)
-{
- struct kvm *kvm = vcpu->kvm;
- struct vgic_dist *dist = &kvm->arch.vgic;
- int nrcpus = atomic_read(&kvm->online_vcpus);
- u8 target_cpus;
- int sgi, mode, c, vcpu_id;
-
- vcpu_id = vcpu->vcpu_id;
-
- sgi = reg & 0xf;
- target_cpus = (reg >> 16) & 0xff;
- mode = (reg >> 24) & 3;
-
- switch (mode) {
- case 0:
- if (!target_cpus)
- return;
- break;
-
- case 1:
- target_cpus = ((1 << nrcpus) - 1) & ~(1 << vcpu_id) & 0xff;
- break;
-
- case 2:
- target_cpus = 1 << vcpu_id;
- break;
- }
-
- kvm_for_each_vcpu(c, vcpu, kvm) {
- if (target_cpus & 1) {
- /* Flag the SGI as pending */
- vgic_dist_irq_set_pending(vcpu, sgi);
- *vgic_get_sgi_sources(dist, c, sgi) |= 1 << vcpu_id;
- kvm_debug("SGI%d from CPU%d to CPU%d\n",
- sgi, vcpu_id, c);
- }
-
- target_cpus >>= 1;
- }
-}
-
-static bool vgic_v2_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long sources;
- int vcpu_id = vcpu->vcpu_id;
- int c;
-
- sources = *vgic_get_sgi_sources(dist, vcpu_id, irq);
-
- for_each_set_bit(c, &sources, dist->nr_cpus) {
- if (vgic_queue_irq(vcpu, c, irq))
- clear_bit(c, &sources);
- }
-
- *vgic_get_sgi_sources(dist, vcpu_id, irq) = sources;
-
- /*
- * If the sources bitmap has been cleared it means that we
- * could queue all the SGIs onto link registers (see the
- * clear_bit above), and therefore we are done with them in
- * our emulated gic and can get rid of them.
- */
- if (!sources) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- vgic_cpu_irq_clear(vcpu, irq);
- return true;
- }
-
- return false;
-}
-
-/**
- * kvm_vgic_map_resources - Configure global VGIC state before running any VCPUs
- * @kvm: pointer to the kvm struct
- *
- * Map the virtual CPU interface into the VM before running any VCPUs. We
- * can't do this at creation time, because user space must first set the
- * virtual CPU interface address in the guest physical address space.
- */
-static int vgic_v2_map_resources(struct kvm *kvm,
- const struct vgic_params *params)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- int ret = 0;
-
- if (!irqchip_in_kernel(kvm))
- return 0;
-
- mutex_lock(&kvm->lock);
-
- if (vgic_ready(kvm))
- goto out;
-
- if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
- IS_VGIC_ADDR_UNDEF(dist->vgic_cpu_base)) {
- kvm_err("Need to set vgic cpu and dist addresses first\n");
- ret = -ENXIO;
- goto out;
- }
-
- vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
- KVM_VGIC_V2_DIST_SIZE,
- vgic_dist_ranges, -1, &dist->dist_iodev);
-
- /*
- * Initialize the vgic if this hasn't already been done on demand by
- * accessing the vgic state from userspace.
- */
- ret = vgic_init(kvm);
- if (ret) {
- kvm_err("Unable to allocate maps\n");
- goto out_unregister;
- }
-
- ret = kvm_phys_addr_ioremap(kvm, dist->vgic_cpu_base,
- params->vcpu_base, KVM_VGIC_V2_CPU_SIZE,
- true);
- if (ret) {
- kvm_err("Unable to remap VGIC CPU to VCPU\n");
- goto out_unregister;
- }
-
- dist->ready = true;
- goto out;
-
-out_unregister:
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
-
-out:
- if (ret)
- kvm_vgic_destroy(kvm);
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
-static void vgic_v2_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- *vgic_get_sgi_sources(dist, vcpu->vcpu_id, irq) |= 1 << source;
-}
-
-static int vgic_v2_init_model(struct kvm *kvm)
-{
- int i;
-
- for (i = VGIC_NR_PRIVATE_IRQS; i < kvm->arch.vgic.nr_irqs; i += 4)
- vgic_set_target_reg(kvm, 0, i);
-
- return 0;
-}
-
-void vgic_v2_init_emulation(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- dist->vm_ops.queue_sgi = vgic_v2_queue_sgi;
- dist->vm_ops.add_sgi_source = vgic_v2_add_sgi_source;
- dist->vm_ops.init_model = vgic_v2_init_model;
- dist->vm_ops.map_resources = vgic_v2_map_resources;
-
- kvm->arch.max_vcpus = VGIC_V2_MAX_CPUS;
-}
-
-static bool handle_cpu_mmio_misc(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- bool updated = false;
- struct vgic_vmcr vmcr;
- u32 *vmcr_field;
- u32 reg;
-
- vgic_get_vmcr(vcpu, &vmcr);
-
- switch (offset & ~0x3) {
- case GIC_CPU_CTRL:
- vmcr_field = &vmcr.ctlr;
- break;
- case GIC_CPU_PRIMASK:
- vmcr_field = &vmcr.pmr;
- break;
- case GIC_CPU_BINPOINT:
- vmcr_field = &vmcr.bpr;
- break;
- case GIC_CPU_ALIAS_BINPOINT:
- vmcr_field = &vmcr.abpr;
- break;
- default:
- BUG();
- }
-
- if (!mmio->is_write) {
- reg = *vmcr_field;
- mmio_data_write(mmio, ~0, reg);
- } else {
- reg = mmio_data_read(mmio, ~0);
- if (reg != *vmcr_field) {
- *vmcr_field = reg;
- vgic_set_vmcr(vcpu, &vmcr);
- updated = true;
- }
- }
- return updated;
-}
-
-static bool handle_mmio_abpr(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- return handle_cpu_mmio_misc(vcpu, mmio, GIC_CPU_ALIAS_BINPOINT);
-}
-
-static bool handle_cpu_mmio_ident(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg;
-
- if (mmio->is_write)
- return false;
-
- /* GICC_IIDR */
- reg = (PRODUCT_ID_KVM << 20) |
- (GICC_ARCH_VERSION_V2 << 16) |
- (IMPLEMENTER_ARM << 0);
- mmio_data_write(mmio, ~0, reg);
- return false;
-}
-
-/*
- * CPU Interface Register accesses - these are not accessed by the VM, but by
- * user space for saving and restoring VGIC state.
- */
-static const struct vgic_io_range vgic_cpu_ranges[] = {
- {
- .base = GIC_CPU_CTRL,
- .len = 12,
- .handle_mmio = handle_cpu_mmio_misc,
- },
- {
- .base = GIC_CPU_ALIAS_BINPOINT,
- .len = 4,
- .handle_mmio = handle_mmio_abpr,
- },
- {
- .base = GIC_CPU_ACTIVEPRIO,
- .len = 16,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GIC_CPU_IDENT,
- .len = 4,
- .handle_mmio = handle_cpu_mmio_ident,
- },
-};
-
-static int vgic_attr_regs_access(struct kvm_device *dev,
- struct kvm_device_attr *attr,
- u32 *reg, bool is_write)
-{
- const struct vgic_io_range *r = NULL, *ranges;
- phys_addr_t offset;
- int ret, cpuid, c;
- struct kvm_vcpu *vcpu, *tmp_vcpu;
- struct vgic_dist *vgic;
- struct kvm_exit_mmio mmio;
- u32 data;
-
- offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- cpuid = (attr->attr & KVM_DEV_ARM_VGIC_CPUID_MASK) >>
- KVM_DEV_ARM_VGIC_CPUID_SHIFT;
-
- mutex_lock(&dev->kvm->lock);
-
- ret = vgic_init(dev->kvm);
- if (ret)
- goto out;
-
- if (cpuid >= atomic_read(&dev->kvm->online_vcpus)) {
- ret = -EINVAL;
- goto out;
- }
-
- vcpu = kvm_get_vcpu(dev->kvm, cpuid);
- vgic = &dev->kvm->arch.vgic;
-
- mmio.len = 4;
- mmio.is_write = is_write;
- mmio.data = &data;
- if (is_write)
- mmio_data_write(&mmio, ~0, *reg);
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- mmio.phys_addr = vgic->vgic_dist_base + offset;
- ranges = vgic_dist_ranges;
- break;
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- mmio.phys_addr = vgic->vgic_cpu_base + offset;
- ranges = vgic_cpu_ranges;
- break;
- default:
- BUG();
- }
- r = vgic_find_range(ranges, 4, offset);
-
- if (unlikely(!r || !r->handle_mmio)) {
- ret = -ENXIO;
- goto out;
- }
-
-
- spin_lock(&vgic->lock);
-
- /*
- * Ensure that no other VCPU is running by checking the vcpu->cpu
- * field. If no other VPCUs are running we can safely access the VGIC
- * state, because even if another VPU is run after this point, that
- * VCPU will not touch the vgic state, because it will block on
- * getting the vgic->lock in kvm_vgic_sync_hwstate().
- */
- kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm) {
- if (unlikely(tmp_vcpu->cpu != -1)) {
- ret = -EBUSY;
- goto out_vgic_unlock;
- }
- }
-
- /*
- * Move all pending IRQs from the LRs on all VCPUs so the pending
- * state can be properly represented in the register state accessible
- * through this API.
- */
- kvm_for_each_vcpu(c, tmp_vcpu, dev->kvm)
- vgic_unqueue_irqs(tmp_vcpu);
-
- offset -= r->base;
- r->handle_mmio(vcpu, &mmio, offset);
-
- if (!is_write)
- *reg = mmio_data_read(&mmio, ~0);
-
- ret = 0;
-out_vgic_unlock:
- spin_unlock(&vgic->lock);
-out:
- mutex_unlock(&dev->kvm->lock);
- return ret;
-}
-
-static int vgic_v2_create(struct kvm_device *dev, u32 type)
-{
- return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v2_destroy(struct kvm_device *dev)
-{
- kfree(dev);
-}
-
-static int vgic_v2_set_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_set_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
- u32 reg;
-
- if (get_user(reg, uaddr))
- return -EFAULT;
-
- return vgic_attr_regs_access(dev, attr, &reg, true);
- }
-
- }
-
- return -ENXIO;
-}
-
-static int vgic_v2_get_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_get_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
- u32 reg = 0;
-
- ret = vgic_attr_regs_access(dev, attr, &reg, false);
- if (ret)
- return ret;
- return put_user(reg, uaddr);
- }
-
- }
-
- return -ENXIO;
-}
-
-static int vgic_v2_has_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- phys_addr_t offset;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR:
- switch (attr->attr) {
- case KVM_VGIC_V2_ADDR_TYPE_DIST:
- case KVM_VGIC_V2_ADDR_TYPE_CPU:
- return 0;
- }
- break;
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- return vgic_has_attr_regs(vgic_dist_ranges, offset);
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- offset = attr->attr & KVM_DEV_ARM_VGIC_OFFSET_MASK;
- return vgic_has_attr_regs(vgic_cpu_ranges, offset);
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
- return 0;
- case KVM_DEV_ARM_VGIC_GRP_CTRL:
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- return 0;
- }
- }
- return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v2_ops = {
- .name = "kvm-arm-vgic-v2",
- .create = vgic_v2_create,
- .destroy = vgic_v2_destroy,
- .set_attr = vgic_v2_set_attr,
- .get_attr = vgic_v2_get_attr,
- .has_attr = vgic_v2_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
deleted file mode 100644
index 334cd7a89106..000000000000
--- a/virt/kvm/arm/vgic-v2.c
+++ /dev/null
@@ -1,274 +0,0 @@
-/*
- * Copyright (C) 2012,2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-static struct vgic_lr vgic_v2_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
- struct vgic_lr lr_desc;
- u32 val = vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr];
-
- lr_desc.irq = val & GICH_LR_VIRTUALID;
- if (lr_desc.irq <= 15)
- lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
- else
- lr_desc.source = 0;
- lr_desc.state = 0;
-
- if (val & GICH_LR_PENDING_BIT)
- lr_desc.state |= LR_STATE_PENDING;
- if (val & GICH_LR_ACTIVE_BIT)
- lr_desc.state |= LR_STATE_ACTIVE;
- if (val & GICH_LR_EOI)
- lr_desc.state |= LR_EOI_INT;
- if (val & GICH_LR_HW) {
- lr_desc.state |= LR_HW;
- lr_desc.hwirq = (val & GICH_LR_PHYSID_CPUID) >> GICH_LR_PHYSID_CPUID_SHIFT;
- }
-
- return lr_desc;
-}
-
-static void vgic_v2_set_lr(struct kvm_vcpu *vcpu, int lr,
- struct vgic_lr lr_desc)
-{
- u32 lr_val;
-
- lr_val = lr_desc.irq;
-
- if (lr_desc.state & LR_STATE_PENDING)
- lr_val |= GICH_LR_PENDING_BIT;
- if (lr_desc.state & LR_STATE_ACTIVE)
- lr_val |= GICH_LR_ACTIVE_BIT;
- if (lr_desc.state & LR_EOI_INT)
- lr_val |= GICH_LR_EOI;
-
- if (lr_desc.state & LR_HW) {
- lr_val |= GICH_LR_HW;
- lr_val |= (u32)lr_desc.hwirq << GICH_LR_PHYSID_CPUID_SHIFT;
- }
-
- if (lr_desc.irq < VGIC_NR_SGIS)
- lr_val |= (lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT);
-
- vcpu->arch.vgic_cpu.vgic_v2.vgic_lr[lr] = lr_val;
-
- if (!(lr_desc.state & LR_STATE_MASK))
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
- else
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
-}
-
-static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr;
-}
-
-static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
-}
-
-static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
-}
-
-static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
- u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
- u32 ret = 0;
-
- if (misr & GICH_MISR_EOI)
- ret |= INT_STATUS_EOI;
- if (misr & GICH_MISR_U)
- ret |= INT_STATUS_UNDERFLOW;
-
- return ret;
-}
-
-static void vgic_v2_enable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr |= GICH_HCR_UIE;
-}
-
-static void vgic_v2_disable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr &= ~GICH_HCR_UIE;
-}
-
-static void vgic_v2_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr = vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr;
-
- vmcrp->ctlr = (vmcr & GICH_VMCR_CTRL_MASK) >> GICH_VMCR_CTRL_SHIFT;
- vmcrp->abpr = (vmcr & GICH_VMCR_ALIAS_BINPOINT_MASK) >> GICH_VMCR_ALIAS_BINPOINT_SHIFT;
- vmcrp->bpr = (vmcr & GICH_VMCR_BINPOINT_MASK) >> GICH_VMCR_BINPOINT_SHIFT;
- vmcrp->pmr = (vmcr & GICH_VMCR_PRIMASK_MASK) >> GICH_VMCR_PRIMASK_SHIFT;
-}
-
-static void vgic_v2_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr;
-
- vmcr = (vmcrp->ctlr << GICH_VMCR_CTRL_SHIFT) & GICH_VMCR_CTRL_MASK;
- vmcr |= (vmcrp->abpr << GICH_VMCR_ALIAS_BINPOINT_SHIFT) & GICH_VMCR_ALIAS_BINPOINT_MASK;
- vmcr |= (vmcrp->bpr << GICH_VMCR_BINPOINT_SHIFT) & GICH_VMCR_BINPOINT_MASK;
- vmcr |= (vmcrp->pmr << GICH_VMCR_PRIMASK_SHIFT) & GICH_VMCR_PRIMASK_MASK;
-
- vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = vmcr;
-}
-
-static void vgic_v2_enable(struct kvm_vcpu *vcpu)
-{
- /*
- * By forcing VMCR to zero, the GIC will restore the binary
- * points to their reset values. Anything else resets to zero
- * anyway.
- */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_vmcr = 0;
- vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr = ~0;
-
- /* Get the show on the road... */
- vcpu->arch.vgic_cpu.vgic_v2.vgic_hcr = GICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v2_ops = {
- .get_lr = vgic_v2_get_lr,
- .set_lr = vgic_v2_set_lr,
- .get_elrsr = vgic_v2_get_elrsr,
- .get_eisr = vgic_v2_get_eisr,
- .clear_eisr = vgic_v2_clear_eisr,
- .get_interrupt_status = vgic_v2_get_interrupt_status,
- .enable_underflow = vgic_v2_enable_underflow,
- .disable_underflow = vgic_v2_disable_underflow,
- .get_vmcr = vgic_v2_get_vmcr,
- .set_vmcr = vgic_v2_set_vmcr,
- .enable = vgic_v2_enable,
-};
-
-struct vgic_params __section(.hyp.text) vgic_v2_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
- struct vgic_params *vgic = params;
- int i;
-
- for (i = 0; i < vgic->nr_lr; i++)
- writel_relaxed(0, vgic->vctrl_base + GICH_LR0 + (i * 4));
-}
-
-/**
- * vgic_v2_probe - probe for a GICv2 compatible interrupt controller
- * @gic_kvm_info: pointer to the GIC description
- * @ops: address of a pointer to the GICv2 operations
- * @params: address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv2 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v2_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
-{
- int ret;
- struct vgic_params *vgic = &vgic_v2_params;
- const struct resource *vctrl_res = &gic_kvm_info->vctrl;
- const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
- memset(vgic, 0, sizeof(*vgic));
-
- if (!gic_kvm_info->maint_irq) {
- kvm_err("error getting vgic maintenance irq\n");
- ret = -ENXIO;
- goto out;
- }
- vgic->maint_irq = gic_kvm_info->maint_irq;
-
- if (!gic_kvm_info->vctrl.start) {
- kvm_err("GICH not present in the firmware table\n");
- ret = -ENXIO;
- goto out;
- }
-
- vgic->vctrl_base = ioremap(gic_kvm_info->vctrl.start,
- resource_size(&gic_kvm_info->vctrl));
- if (!vgic->vctrl_base) {
- kvm_err("Cannot ioremap GICH\n");
- ret = -ENOMEM;
- goto out;
- }
-
- vgic->nr_lr = readl_relaxed(vgic->vctrl_base + GICH_VTR);
- vgic->nr_lr = (vgic->nr_lr & 0x3f) + 1;
-
- ret = create_hyp_io_mappings(vgic->vctrl_base,
- vgic->vctrl_base + resource_size(vctrl_res),
- vctrl_res->start);
- if (ret) {
- kvm_err("Cannot map VCTRL into hyp\n");
- goto out_unmap;
- }
-
- if (!PAGE_ALIGNED(vcpu_res->start)) {
- kvm_err("GICV physical address 0x%llx not page aligned\n",
- (unsigned long long)vcpu_res->start);
- ret = -ENXIO;
- goto out_unmap;
- }
-
- if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
- kvm_err("GICV size 0x%llx not a multiple of page size 0x%lx\n",
- (unsigned long long)resource_size(vcpu_res),
- PAGE_SIZE);
- ret = -ENXIO;
- goto out_unmap;
- }
-
- vgic->can_emulate_gicv2 = true;
- kvm_register_device_ops(&kvm_arm_vgic_v2_ops, KVM_DEV_TYPE_ARM_VGIC_V2);
-
- vgic->vcpu_base = vcpu_res->start;
-
- kvm_info("GICH base=0x%llx, GICV base=0x%llx, IRQ=%d\n",
- gic_kvm_info->vctrl.start, vgic->vcpu_base, vgic->maint_irq);
-
- vgic->type = VGIC_V2;
- vgic->max_gic_vcpus = VGIC_V2_MAX_CPUS;
-
- on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
- *ops = &vgic_v2_ops;
- *params = vgic;
- goto out;
-
-out_unmap:
- iounmap(vgic->vctrl_base);
-out:
- return ret;
-}
diff --git a/virt/kvm/arm/vgic-v3-emul.c b/virt/kvm/arm/vgic-v3-emul.c
deleted file mode 100644
index e661e7fb9d91..000000000000
--- a/virt/kvm/arm/vgic-v3-emul.c
+++ /dev/null
@@ -1,1074 +0,0 @@
-/*
- * GICv3 distributor and redistributor emulation
- *
- * GICv3 emulation is currently only supported on a GICv3 host (because
- * we rely on the hardware's CPU interface virtualization support), but
- * supports both hardware with or without the optional GICv2 backwards
- * compatibility features.
- *
- * Limitations of the emulation:
- * (RAZ/WI: read as zero, write ignore, RAO/WI: read as one, write ignore)
- * - We do not support LPIs (yet). TYPER.LPIS is reported as 0 and is RAZ/WI.
- * - We do not support the message based interrupts (MBIs) triggered by
- * writes to the GICD_{SET,CLR}SPI_* registers. TYPER.MBIS is reported as 0.
- * - We do not support the (optional) backwards compatibility feature.
- * GICD_CTLR.ARE resets to 1 and is RAO/WI. If the _host_ GIC supports
- * the compatiblity feature, you can use a GICv2 in the guest, though.
- * - We only support a single security state. GICD_CTLR.DS is 1 and is RAO/WI.
- * - Priorities are not emulated (same as the GICv2 emulation). Linux
- * as a guest is fine with this, because it does not use priorities.
- * - We only support Group1 interrupts. Again Linux uses only those.
- *
- * Copyright (C) 2014 ARM Ltd.
- * Author: Andre Przywara <andre.przywara@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <kvm/arm_vgic.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-
-#include "vgic.h"
-
-static bool handle_mmio_rao_wi(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg = 0xffffffff;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static bool handle_mmio_ctlr(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg = 0;
-
- /*
- * Force ARE and DS to 1, the guest cannot change this.
- * For the time being we only support Group1 interrupts.
- */
- if (vcpu->kvm->arch.vgic.enabled)
- reg = GICD_CTLR_ENABLE_SS_G1;
- reg |= GICD_CTLR_ARE_NS | GICD_CTLR_DS;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- vcpu->kvm->arch.vgic.enabled = !!(reg & GICD_CTLR_ENABLE_SS_G1);
- vgic_update_state(vcpu->kvm);
- return true;
- }
- return false;
-}
-
-/*
- * As this implementation does not provide compatibility
- * with GICv2 (ARE==1), we report zero CPUs in bits [5..7].
- * Also LPIs and MBIs are not supported, so we set the respective bits to 0.
- * Also we report at most 2**10=1024 interrupt IDs (to match 1024 SPIs).
- */
-#define INTERRUPT_ID_BITS 10
-static bool handle_mmio_typer(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
-
- reg = (min(vcpu->kvm->arch.vgic.nr_irqs, 1024) >> 5) - 1;
-
- reg |= (INTERRUPT_ID_BITS - 1) << 19;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static bool handle_mmio_iidr(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio, phys_addr_t offset)
-{
- u32 reg;
-
- reg = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static bool handle_mmio_set_enable_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id,
- ACCESS_WRITE_SETBIT);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_clear_enable_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id,
- ACCESS_WRITE_CLEARBIT);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_set_pending_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_clear_pending_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_set_active_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_clear_active_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- if (likely(offset >= VGIC_NR_PRIVATE_IRQS / 8))
- return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
- vcpu->vcpu_id);
-
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_priority_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 *reg;
-
- if (unlikely(offset < VGIC_NR_PRIVATE_IRQS)) {
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
- vcpu->vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- return false;
-}
-
-static bool handle_mmio_cfg_reg_dist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 *reg;
-
- if (unlikely(offset < VGIC_NR_PRIVATE_IRQS / 4)) {
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- vcpu->vcpu_id, offset >> 1);
-
- return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-/*
- * We use a compressed version of the MPIDR (all 32 bits in one 32-bit word)
- * when we store the target MPIDR written by the guest.
- */
-static u32 compress_mpidr(unsigned long mpidr)
-{
- u32 ret;
-
- ret = MPIDR_AFFINITY_LEVEL(mpidr, 0);
- ret |= MPIDR_AFFINITY_LEVEL(mpidr, 1) << 8;
- ret |= MPIDR_AFFINITY_LEVEL(mpidr, 2) << 16;
- ret |= MPIDR_AFFINITY_LEVEL(mpidr, 3) << 24;
-
- return ret;
-}
-
-static unsigned long uncompress_mpidr(u32 value)
-{
- unsigned long mpidr;
-
- mpidr = ((value >> 0) & 0xFF) << MPIDR_LEVEL_SHIFT(0);
- mpidr |= ((value >> 8) & 0xFF) << MPIDR_LEVEL_SHIFT(1);
- mpidr |= ((value >> 16) & 0xFF) << MPIDR_LEVEL_SHIFT(2);
- mpidr |= (u64)((value >> 24) & 0xFF) << MPIDR_LEVEL_SHIFT(3);
-
- return mpidr;
-}
-
-/*
- * Lookup the given MPIDR value to get the vcpu_id (if there is one)
- * and store that in the irq_spi_cpu[] array.
- * This limits the number of VCPUs to 255 for now, extending the data
- * type (or storing kvm_vcpu pointers) should lift the limit.
- * Store the original MPIDR value in an extra array to support read-as-written.
- * Unallocated MPIDRs are translated to a special value and caught
- * before any array accesses.
- */
-static bool handle_mmio_route_reg(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm *kvm = vcpu->kvm;
- struct vgic_dist *dist = &kvm->arch.vgic;
- int spi;
- u32 reg;
- int vcpu_id;
- unsigned long *bmap, mpidr;
-
- /*
- * The upper 32 bits of each 64 bit register are zero,
- * as we don't support Aff3.
- */
- if ((offset & 4)) {
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- /* This region only covers SPIs, so no handling of private IRQs here. */
- spi = offset / 8;
-
- /* get the stored MPIDR for this IRQ */
- mpidr = uncompress_mpidr(dist->irq_spi_mpidr[spi]);
- reg = mpidr;
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
-
- if (!mmio->is_write)
- return false;
-
- /*
- * Now clear the currently assigned vCPU from the map, making room
- * for the new one to be written below
- */
- vcpu = kvm_mpidr_to_vcpu(kvm, mpidr);
- if (likely(vcpu)) {
- vcpu_id = vcpu->vcpu_id;
- bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
- __clear_bit(spi, bmap);
- }
-
- dist->irq_spi_mpidr[spi] = compress_mpidr(reg);
- vcpu = kvm_mpidr_to_vcpu(kvm, reg & MPIDR_HWID_BITMASK);
-
- /*
- * The spec says that non-existent MPIDR values should not be
- * forwarded to any existent (v)CPU, but should be able to become
- * pending anyway. We simply keep the irq_spi_target[] array empty, so
- * the interrupt will never be injected.
- * irq_spi_cpu[irq] gets a magic value in this case.
- */
- if (likely(vcpu)) {
- vcpu_id = vcpu->vcpu_id;
- dist->irq_spi_cpu[spi] = vcpu_id;
- bmap = vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]);
- __set_bit(spi, bmap);
- } else {
- dist->irq_spi_cpu[spi] = VCPU_NOT_ALLOCATED;
- }
-
- vgic_update_state(kvm);
-
- return true;
-}
-
-/*
- * We should be careful about promising too much when a guest reads
- * this register. Don't claim to be like any hardware implementation,
- * but just report the GIC as version 3 - which is what a Linux guest
- * would check.
- */
-static bool handle_mmio_idregs(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg = 0;
-
- switch (offset + GICD_IDREGS) {
- case GICD_PIDR2:
- reg = 0x3b;
- break;
- }
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
-
- return false;
-}
-
-static const struct vgic_io_range vgic_v3_dist_ranges[] = {
- {
- .base = GICD_CTLR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_ctlr,
- },
- {
- .base = GICD_TYPER,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_typer,
- },
- {
- .base = GICD_IIDR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_iidr,
- },
- {
- /* this register is optional, it is RAZ/WI if not implemented */
- .base = GICD_STATUSR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this write only register is WI when TYPER.MBIS=0 */
- .base = GICD_SETSPI_NSR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this write only register is WI when TYPER.MBIS=0 */
- .base = GICD_CLRSPI_NSR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_SETSPI_SR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_CLRSPI_SR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICD_IGROUPR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_rao_wi,
- },
- {
- .base = GICD_ISENABLER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_enable_reg_dist,
- },
- {
- .base = GICD_ICENABLER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_enable_reg_dist,
- },
- {
- .base = GICD_ISPENDR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_pending_reg_dist,
- },
- {
- .base = GICD_ICPENDR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_pending_reg_dist,
- },
- {
- .base = GICD_ISACTIVER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_active_reg_dist,
- },
- {
- .base = GICD_ICACTIVER,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_active_reg_dist,
- },
- {
- .base = GICD_IPRIORITYR,
- .len = 0x400,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_priority_reg_dist,
- },
- {
- /* TARGETSRn is RES0 when ARE=1 */
- .base = GICD_ITARGETSR,
- .len = 0x400,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICD_ICFGR,
- .len = 0x100,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_cfg_reg_dist,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_IGRPMODR,
- .len = 0x80,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when DS=1 */
- .base = GICD_NSACR,
- .len = 0x100,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when ARE=1 */
- .base = GICD_SGIR,
- .len = 0x04,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when ARE=1 */
- .base = GICD_CPENDSGIR,
- .len = 0x10,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- /* this is RAZ/WI when ARE=1 */
- .base = GICD_SPENDSGIR,
- .len = 0x10,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICD_IROUTER + 0x100,
- .len = 0x1ee0,
- .bits_per_irq = 64,
- .handle_mmio = handle_mmio_route_reg,
- },
- {
- .base = GICD_IDREGS,
- .len = 0x30,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_idregs,
- },
- {},
-};
-
-static bool handle_mmio_ctlr_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- /* since we don't support LPIs, this register is zero for now */
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_typer_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 reg;
- u64 mpidr;
- struct kvm_vcpu *redist_vcpu = mmio->private;
- int target_vcpu_id = redist_vcpu->vcpu_id;
-
- /* the upper 32 bits contain the affinity value */
- if ((offset & ~3) == 4) {
- mpidr = kvm_vcpu_get_mpidr_aff(redist_vcpu);
- reg = compress_mpidr(mpidr);
-
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- return false;
- }
-
- reg = redist_vcpu->vcpu_id << 8;
- if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
- reg |= GICR_TYPER_LAST;
- vgic_reg_access(mmio, &reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-static bool handle_mmio_set_enable_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id,
- ACCESS_WRITE_SETBIT);
-}
-
-static bool handle_mmio_clear_enable_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_enable_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id,
- ACCESS_WRITE_CLEARBIT);
-}
-
-static bool handle_mmio_set_active_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_active_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_set_pending_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_set_pending_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_clear_pending_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- return vgic_handle_clear_pending_reg(vcpu->kvm, mmio, offset,
- redist_vcpu->vcpu_id);
-}
-
-static bool handle_mmio_priority_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
- u32 *reg;
-
- reg = vgic_bytemap_get_reg(&vcpu->kvm->arch.vgic.irq_priority,
- redist_vcpu->vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- return false;
-}
-
-static bool handle_mmio_cfg_reg_redist(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- struct kvm_vcpu *redist_vcpu = mmio->private;
-
- u32 *reg = vgic_bitmap_get_reg(&vcpu->kvm->arch.vgic.irq_cfg,
- redist_vcpu->vcpu_id, offset >> 1);
-
- return vgic_handle_cfg_reg(reg, mmio, offset);
-}
-
-#define SGI_base(x) ((x) + SZ_64K)
-
-static const struct vgic_io_range vgic_redist_ranges[] = {
- {
- .base = GICR_CTLR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_ctlr_redist,
- },
- {
- .base = GICR_TYPER,
- .len = 0x08,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_typer_redist,
- },
- {
- .base = GICR_IIDR,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_iidr,
- },
- {
- .base = GICR_WAKER,
- .len = 0x04,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = GICR_IDREGS,
- .len = 0x30,
- .bits_per_irq = 0,
- .handle_mmio = handle_mmio_idregs,
- },
- {
- .base = SGI_base(GICR_IGROUPR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_rao_wi,
- },
- {
- .base = SGI_base(GICR_ISENABLER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_enable_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICENABLER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_enable_reg_redist,
- },
- {
- .base = SGI_base(GICR_ISPENDR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_pending_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICPENDR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_pending_reg_redist,
- },
- {
- .base = SGI_base(GICR_ISACTIVER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_set_active_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICACTIVER0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_clear_active_reg_redist,
- },
- {
- .base = SGI_base(GICR_IPRIORITYR0),
- .len = 0x20,
- .bits_per_irq = 8,
- .handle_mmio = handle_mmio_priority_reg_redist,
- },
- {
- .base = SGI_base(GICR_ICFGR0),
- .len = 0x08,
- .bits_per_irq = 2,
- .handle_mmio = handle_mmio_cfg_reg_redist,
- },
- {
- .base = SGI_base(GICR_IGRPMODR0),
- .len = 0x04,
- .bits_per_irq = 1,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {
- .base = SGI_base(GICR_NSACR),
- .len = 0x04,
- .handle_mmio = handle_mmio_raz_wi,
- },
- {},
-};
-
-static bool vgic_v3_queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
- if (vgic_queue_irq(vcpu, 0, irq)) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- vgic_cpu_irq_clear(vcpu, irq);
- return true;
- }
-
- return false;
-}
-
-static int vgic_v3_map_resources(struct kvm *kvm,
- const struct vgic_params *params)
-{
- int ret = 0;
- struct vgic_dist *dist = &kvm->arch.vgic;
- gpa_t rdbase = dist->vgic_redist_base;
- struct vgic_io_device *iodevs = NULL;
- int i;
-
- if (!irqchip_in_kernel(kvm))
- return 0;
-
- mutex_lock(&kvm->lock);
-
- if (vgic_ready(kvm))
- goto out;
-
- if (IS_VGIC_ADDR_UNDEF(dist->vgic_dist_base) ||
- IS_VGIC_ADDR_UNDEF(dist->vgic_redist_base)) {
- kvm_err("Need to set vgic distributor addresses first\n");
- ret = -ENXIO;
- goto out;
- }
-
- /*
- * For a VGICv3 we require the userland to explicitly initialize
- * the VGIC before we need to use it.
- */
- if (!vgic_initialized(kvm)) {
- ret = -EBUSY;
- goto out;
- }
-
- ret = vgic_register_kvm_io_dev(kvm, dist->vgic_dist_base,
- GIC_V3_DIST_SIZE, vgic_v3_dist_ranges,
- -1, &dist->dist_iodev);
- if (ret)
- goto out;
-
- iodevs = kcalloc(dist->nr_cpus, sizeof(iodevs[0]), GFP_KERNEL);
- if (!iodevs) {
- ret = -ENOMEM;
- goto out_unregister;
- }
-
- for (i = 0; i < dist->nr_cpus; i++) {
- ret = vgic_register_kvm_io_dev(kvm, rdbase,
- SZ_128K, vgic_redist_ranges,
- i, &iodevs[i]);
- if (ret)
- goto out_unregister;
- rdbase += GIC_V3_REDIST_SIZE;
- }
-
- dist->redist_iodevs = iodevs;
- dist->ready = true;
- goto out;
-
-out_unregister:
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS, &dist->dist_iodev.dev);
- if (iodevs) {
- for (i = 0; i < dist->nr_cpus; i++) {
- if (iodevs[i].dev.ops)
- kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
- &iodevs[i].dev);
- }
- }
-
-out:
- if (ret)
- kvm_vgic_destroy(kvm);
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
-static int vgic_v3_init_model(struct kvm *kvm)
-{
- int i;
- u32 mpidr;
- struct vgic_dist *dist = &kvm->arch.vgic;
- int nr_spis = dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
- dist->irq_spi_mpidr = kcalloc(nr_spis, sizeof(dist->irq_spi_mpidr[0]),
- GFP_KERNEL);
-
- if (!dist->irq_spi_mpidr)
- return -ENOMEM;
-
- /* Initialize the target VCPUs for each IRQ to VCPU 0 */
- mpidr = compress_mpidr(kvm_vcpu_get_mpidr_aff(kvm_get_vcpu(kvm, 0)));
- for (i = VGIC_NR_PRIVATE_IRQS; i < dist->nr_irqs; i++) {
- dist->irq_spi_cpu[i - VGIC_NR_PRIVATE_IRQS] = 0;
- dist->irq_spi_mpidr[i - VGIC_NR_PRIVATE_IRQS] = mpidr;
- vgic_bitmap_set_irq_val(dist->irq_spi_target, 0, i, 1);
- }
-
- return 0;
-}
-
-/* GICv3 does not keep track of SGI sources anymore. */
-static void vgic_v3_add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
-}
-
-void vgic_v3_init_emulation(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- dist->vm_ops.queue_sgi = vgic_v3_queue_sgi;
- dist->vm_ops.add_sgi_source = vgic_v3_add_sgi_source;
- dist->vm_ops.init_model = vgic_v3_init_model;
- dist->vm_ops.map_resources = vgic_v3_map_resources;
-
- kvm->arch.max_vcpus = KVM_MAX_VCPUS;
-}
-
-/*
- * Compare a given affinity (level 1-3 and a level 0 mask, from the SGI
- * generation register ICC_SGI1R_EL1) with a given VCPU.
- * If the VCPU's MPIDR matches, return the level0 affinity, otherwise
- * return -1.
- */
-static int match_mpidr(u64 sgi_aff, u16 sgi_cpu_mask, struct kvm_vcpu *vcpu)
-{
- unsigned long affinity;
- int level0;
-
- /*
- * Split the current VCPU's MPIDR into affinity level 0 and the
- * rest as this is what we have to compare against.
- */
- affinity = kvm_vcpu_get_mpidr_aff(vcpu);
- level0 = MPIDR_AFFINITY_LEVEL(affinity, 0);
- affinity &= ~MPIDR_LEVEL_MASK;
-
- /* bail out if the upper three levels don't match */
- if (sgi_aff != affinity)
- return -1;
-
- /* Is this VCPU's bit set in the mask ? */
- if (!(sgi_cpu_mask & BIT(level0)))
- return -1;
-
- return level0;
-}
-
-#define SGI_AFFINITY_LEVEL(reg, level) \
- ((((reg) & ICC_SGI1R_AFFINITY_## level ##_MASK) \
- >> ICC_SGI1R_AFFINITY_## level ##_SHIFT) << MPIDR_LEVEL_SHIFT(level))
-
-/**
- * vgic_v3_dispatch_sgi - handle SGI requests from VCPUs
- * @vcpu: The VCPU requesting a SGI
- * @reg: The value written into the ICC_SGI1R_EL1 register by that VCPU
- *
- * With GICv3 (and ARE=1) CPUs trigger SGIs by writing to a system register.
- * This will trap in sys_regs.c and call this function.
- * This ICC_SGI1R_EL1 register contains the upper three affinity levels of the
- * target processors as well as a bitmask of 16 Aff0 CPUs.
- * If the interrupt routing mode bit is not set, we iterate over all VCPUs to
- * check for matching ones. If this bit is set, we signal all, but not the
- * calling VCPU.
- */
-void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
-{
- struct kvm *kvm = vcpu->kvm;
- struct kvm_vcpu *c_vcpu;
- struct vgic_dist *dist = &kvm->arch.vgic;
- u16 target_cpus;
- u64 mpidr;
- int sgi, c;
- int vcpu_id = vcpu->vcpu_id;
- bool broadcast;
- int updated = 0;
-
- sgi = (reg & ICC_SGI1R_SGI_ID_MASK) >> ICC_SGI1R_SGI_ID_SHIFT;
- broadcast = reg & BIT(ICC_SGI1R_IRQ_ROUTING_MODE_BIT);
- target_cpus = (reg & ICC_SGI1R_TARGET_LIST_MASK) >> ICC_SGI1R_TARGET_LIST_SHIFT;
- mpidr = SGI_AFFINITY_LEVEL(reg, 3);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 2);
- mpidr |= SGI_AFFINITY_LEVEL(reg, 1);
-
- /*
- * We take the dist lock here, because we come from the sysregs
- * code path and not from the MMIO one (which already takes the lock).
- */
- spin_lock(&dist->lock);
-
- /*
- * We iterate over all VCPUs to find the MPIDRs matching the request.
- * If we have handled one CPU, we clear it's bit to detect early
- * if we are already finished. This avoids iterating through all
- * VCPUs when most of the times we just signal a single VCPU.
- */
- kvm_for_each_vcpu(c, c_vcpu, kvm) {
-
- /* Exit early if we have dealt with all requested CPUs */
- if (!broadcast && target_cpus == 0)
- break;
-
- /* Don't signal the calling VCPU */
- if (broadcast && c == vcpu_id)
- continue;
-
- if (!broadcast) {
- int level0;
-
- level0 = match_mpidr(mpidr, target_cpus, c_vcpu);
- if (level0 == -1)
- continue;
-
- /* remove this matching VCPU from the mask */
- target_cpus &= ~BIT(level0);
- }
-
- /* Flag the SGI as pending */
- vgic_dist_irq_set_pending(c_vcpu, sgi);
- updated = 1;
- kvm_debug("SGI%d from CPU%d to CPU%d\n", sgi, vcpu_id, c);
- }
- if (updated)
- vgic_update_state(vcpu->kvm);
- spin_unlock(&dist->lock);
- if (updated)
- vgic_kick_vcpus(vcpu->kvm);
-}
-
-static int vgic_v3_create(struct kvm_device *dev, u32 type)
-{
- return kvm_vgic_create(dev->kvm, type);
-}
-
-static void vgic_v3_destroy(struct kvm_device *dev)
-{
- kfree(dev);
-}
-
-static int vgic_v3_set_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_set_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- return -ENXIO;
- }
-
- return -ENXIO;
-}
-
-static int vgic_v3_get_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- int ret;
-
- ret = vgic_get_common_attr(dev, attr);
- if (ret != -ENXIO)
- return ret;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- return -ENXIO;
- }
-
- return -ENXIO;
-}
-
-static int vgic_v3_has_attr(struct kvm_device *dev,
- struct kvm_device_attr *attr)
-{
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR:
- switch (attr->attr) {
- case KVM_VGIC_V2_ADDR_TYPE_DIST:
- case KVM_VGIC_V2_ADDR_TYPE_CPU:
- return -ENXIO;
- case KVM_VGIC_V3_ADDR_TYPE_DIST:
- case KVM_VGIC_V3_ADDR_TYPE_REDIST:
- return 0;
- }
- break;
- case KVM_DEV_ARM_VGIC_GRP_DIST_REGS:
- case KVM_DEV_ARM_VGIC_GRP_CPU_REGS:
- return -ENXIO;
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS:
- return 0;
- case KVM_DEV_ARM_VGIC_GRP_CTRL:
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- return 0;
- }
- }
- return -ENXIO;
-}
-
-struct kvm_device_ops kvm_arm_vgic_v3_ops = {
- .name = "kvm-arm-vgic-v3",
- .create = vgic_v3_create,
- .destroy = vgic_v3_destroy,
- .set_attr = vgic_v3_set_attr,
- .get_attr = vgic_v3_get_attr,
- .has_attr = vgic_v3_has_attr,
-};
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
deleted file mode 100644
index 75b02fa86436..000000000000
--- a/virt/kvm/arm/vgic-v3.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/*
- * Copyright (C) 2013 ARM Limited, All Rights Reserved.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-
-#include <linux/irqchip/arm-gic-v3.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_asm.h>
-#include <asm/kvm_mmu.h>
-
-static u32 ich_vtr_el2;
-
-static struct vgic_lr vgic_v3_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
- struct vgic_lr lr_desc;
- u64 val = vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr];
-
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
- lr_desc.irq = val & ICH_LR_VIRTUAL_ID_MASK;
- else
- lr_desc.irq = val & GICH_LR_VIRTUALID;
-
- lr_desc.source = 0;
- if (lr_desc.irq <= 15 &&
- vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
- lr_desc.source = (val >> GICH_LR_PHYSID_CPUID_SHIFT) & 0x7;
-
- lr_desc.state = 0;
-
- if (val & ICH_LR_PENDING_BIT)
- lr_desc.state |= LR_STATE_PENDING;
- if (val & ICH_LR_ACTIVE_BIT)
- lr_desc.state |= LR_STATE_ACTIVE;
- if (val & ICH_LR_EOI)
- lr_desc.state |= LR_EOI_INT;
- if (val & ICH_LR_HW) {
- lr_desc.state |= LR_HW;
- lr_desc.hwirq = (val >> ICH_LR_PHYS_ID_SHIFT) & GENMASK(9, 0);
- }
-
- return lr_desc;
-}
-
-static void vgic_v3_set_lr(struct kvm_vcpu *vcpu, int lr,
- struct vgic_lr lr_desc)
-{
- u64 lr_val;
-
- lr_val = lr_desc.irq;
-
- /*
- * Currently all guest IRQs are Group1, as Group0 would result
- * in a FIQ in the guest, which it wouldn't expect.
- * Eventually we want to make this configurable, so we may revisit
- * this in the future.
- */
- switch (vcpu->kvm->arch.vgic.vgic_model) {
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- lr_val |= ICH_LR_GROUP;
- break;
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- if (lr_desc.irq < VGIC_NR_SGIS)
- lr_val |= (u32)lr_desc.source << GICH_LR_PHYSID_CPUID_SHIFT;
- break;
- default:
- BUG();
- }
-
- if (lr_desc.state & LR_STATE_PENDING)
- lr_val |= ICH_LR_PENDING_BIT;
- if (lr_desc.state & LR_STATE_ACTIVE)
- lr_val |= ICH_LR_ACTIVE_BIT;
- if (lr_desc.state & LR_EOI_INT)
- lr_val |= ICH_LR_EOI;
- if (lr_desc.state & LR_HW) {
- lr_val |= ICH_LR_HW;
- lr_val |= ((u64)lr_desc.hwirq) << ICH_LR_PHYS_ID_SHIFT;
- }
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_lr[lr] = lr_val;
-
- if (!(lr_desc.state & LR_STATE_MASK))
- vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
- else
- vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
-}
-
-static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr;
-}
-
-static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
-{
- return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
-}
-
-static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
-}
-
-static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
-{
- u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
- u32 ret = 0;
-
- if (misr & ICH_MISR_EOI)
- ret |= INT_STATUS_EOI;
- if (misr & ICH_MISR_U)
- ret |= INT_STATUS_UNDERFLOW;
-
- return ret;
-}
-
-static void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr = vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr;
-
- vmcrp->ctlr = (vmcr & ICH_VMCR_CTLR_MASK) >> ICH_VMCR_CTLR_SHIFT;
- vmcrp->abpr = (vmcr & ICH_VMCR_BPR1_MASK) >> ICH_VMCR_BPR1_SHIFT;
- vmcrp->bpr = (vmcr & ICH_VMCR_BPR0_MASK) >> ICH_VMCR_BPR0_SHIFT;
- vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
-}
-
-static void vgic_v3_enable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr |= ICH_HCR_UIE;
-}
-
-static void vgic_v3_disable_underflow(struct kvm_vcpu *vcpu)
-{
- vcpu->arch.vgic_cpu.vgic_v3.vgic_hcr &= ~ICH_HCR_UIE;
-}
-
-static void vgic_v3_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
-{
- u32 vmcr;
-
- vmcr = (vmcrp->ctlr << ICH_VMCR_CTLR_SHIFT) & ICH_VMCR_CTLR_MASK;
- vmcr |= (vmcrp->abpr << ICH_VMCR_BPR1_SHIFT) & ICH_VMCR_BPR1_MASK;
- vmcr |= (vmcrp->bpr << ICH_VMCR_BPR0_SHIFT) & ICH_VMCR_BPR0_MASK;
- vmcr |= (vmcrp->pmr << ICH_VMCR_PMR_SHIFT) & ICH_VMCR_PMR_MASK;
-
- vcpu->arch.vgic_cpu.vgic_v3.vgic_vmcr = vmcr;
-}
-
-static void vgic_v3_enable(struct kvm_vcpu *vcpu)
-{
- struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
-
- /*
- * By forcing VMCR to zero, the GIC will restore the binary
- * points to their reset values. Anything else resets to zero
- * anyway.
- */
- vgic_v3->vgic_vmcr = 0;
- vgic_v3->vgic_elrsr = ~0;
-
- /*
- * If we are emulating a GICv3, we do it in an non-GICv2-compatible
- * way, so we force SRE to 1 to demonstrate this to the guest.
- * This goes with the spec allowing the value to be RAO/WI.
- */
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
- vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
- else
- vgic_v3->vgic_sre = 0;
-
- /* Get the show on the road... */
- vgic_v3->vgic_hcr = ICH_HCR_EN;
-}
-
-static const struct vgic_ops vgic_v3_ops = {
- .get_lr = vgic_v3_get_lr,
- .set_lr = vgic_v3_set_lr,
- .get_elrsr = vgic_v3_get_elrsr,
- .get_eisr = vgic_v3_get_eisr,
- .clear_eisr = vgic_v3_clear_eisr,
- .get_interrupt_status = vgic_v3_get_interrupt_status,
- .enable_underflow = vgic_v3_enable_underflow,
- .disable_underflow = vgic_v3_disable_underflow,
- .get_vmcr = vgic_v3_get_vmcr,
- .set_vmcr = vgic_v3_set_vmcr,
- .enable = vgic_v3_enable,
-};
-
-static struct vgic_params vgic_v3_params;
-
-static void vgic_cpu_init_lrs(void *params)
-{
- kvm_call_hyp(__vgic_v3_init_lrs);
-}
-
-/**
- * vgic_v3_probe - probe for a GICv3 compatible interrupt controller
- * @gic_kvm_info: pointer to the GIC description
- * @ops: address of a pointer to the GICv3 operations
- * @params: address of a pointer to HW-specific parameters
- *
- * Returns 0 if a GICv3 has been found, with the low level operations
- * in *ops and the HW parameters in *params. Returns an error code
- * otherwise.
- */
-int vgic_v3_probe(const struct gic_kvm_info *gic_kvm_info,
- const struct vgic_ops **ops,
- const struct vgic_params **params)
-{
- int ret = 0;
- struct vgic_params *vgic = &vgic_v3_params;
- const struct resource *vcpu_res = &gic_kvm_info->vcpu;
-
- vgic->maint_irq = gic_kvm_info->maint_irq;
-
- ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
-
- /*
- * The ListRegs field is 5 bits, but there is a architectural
- * maximum of 16 list registers. Just ignore bit 4...
- */
- vgic->nr_lr = (ich_vtr_el2 & 0xf) + 1;
- vgic->can_emulate_gicv2 = false;
-
- if (!vcpu_res->start) {
- kvm_info("GICv3: no GICV resource entry\n");
- vgic->vcpu_base = 0;
- } else if (!PAGE_ALIGNED(vcpu_res->start)) {
- pr_warn("GICV physical address 0x%llx not page aligned\n",
- (unsigned long long)vcpu_res->start);
- vgic->vcpu_base = 0;
- } else if (!PAGE_ALIGNED(resource_size(vcpu_res))) {
- pr_warn("GICV size 0x%llx not a multiple of page size 0x%lx\n",
- (unsigned long long)resource_size(vcpu_res),
- PAGE_SIZE);
- } else {
- vgic->vcpu_base = vcpu_res->start;
- vgic->can_emulate_gicv2 = true;
- kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
- KVM_DEV_TYPE_ARM_VGIC_V2);
- }
- if (vgic->vcpu_base == 0)
- kvm_info("disabling GICv2 emulation\n");
- kvm_register_device_ops(&kvm_arm_vgic_v3_ops, KVM_DEV_TYPE_ARM_VGIC_V3);
-
- vgic->vctrl_base = NULL;
- vgic->type = VGIC_V3;
- vgic->max_gic_vcpus = VGIC_V3_MAX_CPUS;
-
- kvm_info("GICV base=0x%llx, IRQ=%d\n",
- vgic->vcpu_base, vgic->maint_irq);
-
- on_each_cpu(vgic_cpu_init_lrs, vgic, 1);
-
- *ops = &vgic_v3_ops;
- *params = vgic;
-
- return ret;
-}
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
deleted file mode 100644
index 67cb5e948be2..000000000000
--- a/virt/kvm/arm/vgic.c
+++ /dev/null
@@ -1,2417 +0,0 @@
-/*
- * Copyright (C) 2012 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- */
-
-#include <linux/cpu.h>
-#include <linux/kvm.h>
-#include <linux/kvm_host.h>
-#include <linux/interrupt.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/rculist.h>
-#include <linux/uaccess.h>
-
-#include <asm/kvm_emulate.h>
-#include <asm/kvm_arm.h>
-#include <asm/kvm_mmu.h>
-#include <trace/events/kvm.h>
-#include <asm/kvm.h>
-#include <kvm/iodev.h>
-#include <linux/irqchip/arm-gic-common.h>
-
-#define CREATE_TRACE_POINTS
-#include "trace.h"
-
-/*
- * How the whole thing works (courtesy of Christoffer Dall):
- *
- * - At any time, the dist->irq_pending_on_cpu is the oracle that knows if
- * something is pending on the CPU interface.
- * - Interrupts that are pending on the distributor are stored on the
- * vgic.irq_pending vgic bitmap (this bitmap is updated by both user land
- * ioctls and guest mmio ops, and other in-kernel peripherals such as the
- * arch. timers).
- * - Every time the bitmap changes, the irq_pending_on_cpu oracle is
- * recalculated
- * - To calculate the oracle, we need info for each cpu from
- * compute_pending_for_cpu, which considers:
- * - PPI: dist->irq_pending & dist->irq_enable
- * - SPI: dist->irq_pending & dist->irq_enable & dist->irq_spi_target
- * - irq_spi_target is a 'formatted' version of the GICD_ITARGETSRn
- * registers, stored on each vcpu. We only keep one bit of
- * information per interrupt, making sure that only one vcpu can
- * accept the interrupt.
- * - If any of the above state changes, we must recalculate the oracle.
- * - The same is true when injecting an interrupt, except that we only
- * consider a single interrupt at a time. The irq_spi_cpu array
- * contains the target CPU for each SPI.
- *
- * The handling of level interrupts adds some extra complexity. We
- * need to track when the interrupt has been EOIed, so we can sample
- * the 'line' again. This is achieved as such:
- *
- * - When a level interrupt is moved onto a vcpu, the corresponding
- * bit in irq_queued is set. As long as this bit is set, the line
- * will be ignored for further interrupts. The interrupt is injected
- * into the vcpu with the GICH_LR_EOI bit set (generate a
- * maintenance interrupt on EOI).
- * - When the interrupt is EOIed, the maintenance interrupt fires,
- * and clears the corresponding bit in irq_queued. This allows the
- * interrupt line to be sampled again.
- * - Note that level-triggered interrupts can also be set to pending from
- * writes to GICD_ISPENDRn and lowering the external input line does not
- * cause the interrupt to become inactive in such a situation.
- * Conversely, writes to GICD_ICPENDRn do not cause the interrupt to become
- * inactive as long as the external input line is held high.
- *
- *
- * Initialization rules: there are multiple stages to the vgic
- * initialization, both for the distributor and the CPU interfaces.
- *
- * Distributor:
- *
- * - kvm_vgic_early_init(): initialization of static data that doesn't
- * depend on any sizing information or emulation type. No allocation
- * is allowed there.
- *
- * - vgic_init(): allocation and initialization of the generic data
- * structures that depend on sizing information (number of CPUs,
- * number of interrupts). Also initializes the vcpu specific data
- * structures. Can be executed lazily for GICv2.
- * [to be renamed to kvm_vgic_init??]
- *
- * CPU Interface:
- *
- * - kvm_vgic_cpu_early_init(): initialization of static data that
- * doesn't depend on any sizing information or emulation type. No
- * allocation is allowed there.
- */
-
-#include "vgic.h"
-
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu);
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu);
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr);
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr, struct vgic_lr lr_desc);
-static u64 vgic_get_elrsr(struct kvm_vcpu *vcpu);
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
- int virt_irq);
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu);
-
-static const struct vgic_ops *vgic_ops;
-static const struct vgic_params *vgic;
-
-static void add_sgi_source(struct kvm_vcpu *vcpu, int irq, int source)
-{
- vcpu->kvm->arch.vgic.vm_ops.add_sgi_source(vcpu, irq, source);
-}
-
-static bool queue_sgi(struct kvm_vcpu *vcpu, int irq)
-{
- return vcpu->kvm->arch.vgic.vm_ops.queue_sgi(vcpu, irq);
-}
-
-int kvm_vgic_map_resources(struct kvm *kvm)
-{
- return kvm->arch.vgic.vm_ops.map_resources(kvm, vgic);
-}
-
-/*
- * struct vgic_bitmap contains a bitmap made of unsigned longs, but
- * extracts u32s out of them.
- *
- * This does not work on 64-bit BE systems, because the bitmap access
- * will store two consecutive 32-bit words with the higher-addressed
- * register's bits at the lower index and the lower-addressed register's
- * bits at the higher index.
- *
- * Therefore, swizzle the register index when accessing the 32-bit word
- * registers to access the right register's value.
- */
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 64
-#define REG_OFFSET_SWIZZLE 1
-#else
-#define REG_OFFSET_SWIZZLE 0
-#endif
-
-static int vgic_init_bitmap(struct vgic_bitmap *b, int nr_cpus, int nr_irqs)
-{
- int nr_longs;
-
- nr_longs = nr_cpus + BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
-
- b->private = kzalloc(sizeof(unsigned long) * nr_longs, GFP_KERNEL);
- if (!b->private)
- return -ENOMEM;
-
- b->shared = b->private + nr_cpus;
-
- return 0;
-}
-
-static void vgic_free_bitmap(struct vgic_bitmap *b)
-{
- kfree(b->private);
- b->private = NULL;
- b->shared = NULL;
-}
-
-/*
- * Call this function to convert a u64 value to an unsigned long * bitmask
- * in a way that works on both 32-bit and 64-bit LE and BE platforms.
- *
- * Warning: Calling this function may modify *val.
- */
-static unsigned long *u64_to_bitmask(u64 *val)
-{
-#if defined(CONFIG_CPU_BIG_ENDIAN) && BITS_PER_LONG == 32
- *val = (*val >> 32) | (*val << 32);
-#endif
- return (unsigned long *)val;
-}
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset)
-{
- offset >>= 2;
- if (!offset)
- return (u32 *)(x->private + cpuid) + REG_OFFSET_SWIZZLE;
- else
- return (u32 *)(x->shared) + ((offset - 1) ^ REG_OFFSET_SWIZZLE);
-}
-
-static int vgic_bitmap_get_irq_val(struct vgic_bitmap *x,
- int cpuid, int irq)
-{
- if (irq < VGIC_NR_PRIVATE_IRQS)
- return test_bit(irq, x->private + cpuid);
-
- return test_bit(irq - VGIC_NR_PRIVATE_IRQS, x->shared);
-}
-
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
- int irq, int val)
-{
- unsigned long *reg;
-
- if (irq < VGIC_NR_PRIVATE_IRQS) {
- reg = x->private + cpuid;
- } else {
- reg = x->shared;
- irq -= VGIC_NR_PRIVATE_IRQS;
- }
-
- if (val)
- set_bit(irq, reg);
- else
- clear_bit(irq, reg);
-}
-
-static unsigned long *vgic_bitmap_get_cpu_map(struct vgic_bitmap *x, int cpuid)
-{
- return x->private + cpuid;
-}
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x)
-{
- return x->shared;
-}
-
-static int vgic_init_bytemap(struct vgic_bytemap *x, int nr_cpus, int nr_irqs)
-{
- int size;
-
- size = nr_cpus * VGIC_NR_PRIVATE_IRQS;
- size += nr_irqs - VGIC_NR_PRIVATE_IRQS;
-
- x->private = kzalloc(size, GFP_KERNEL);
- if (!x->private)
- return -ENOMEM;
-
- x->shared = x->private + nr_cpus * VGIC_NR_PRIVATE_IRQS / sizeof(u32);
- return 0;
-}
-
-static void vgic_free_bytemap(struct vgic_bytemap *b)
-{
- kfree(b->private);
- b->private = NULL;
- b->shared = NULL;
-}
-
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset)
-{
- u32 *reg;
-
- if (offset < VGIC_NR_PRIVATE_IRQS) {
- reg = x->private;
- offset += cpuid * VGIC_NR_PRIVATE_IRQS;
- } else {
- reg = x->shared;
- offset -= VGIC_NR_PRIVATE_IRQS;
- }
-
- return reg + (offset / sizeof(u32));
-}
-
-#define VGIC_CFG_LEVEL 0
-#define VGIC_CFG_EDGE 1
-
-static bool vgic_irq_is_edge(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- int irq_val;
-
- irq_val = vgic_bitmap_get_irq_val(&dist->irq_cfg, vcpu->vcpu_id, irq);
- return irq_val == VGIC_CFG_EDGE;
-}
-
-static int vgic_irq_is_enabled(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_enabled, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
-}
-
-static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
-}
-
-static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_level, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_set_level(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 1);
-}
-
-static void vgic_dist_irq_clear_level(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_level, vcpu->vcpu_id, irq, 0);
-}
-
-static int vgic_dist_irq_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq);
-}
-
-static void vgic_dist_irq_clear_soft_pend(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_soft_pend, vcpu->vcpu_id, irq, 0);
- if (!vgic_dist_irq_get_level(vcpu, irq)) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- if (!compute_pending_for_cpu(vcpu))
- clear_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
- }
-}
-
-static int vgic_dist_irq_is_pending(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return vgic_bitmap_get_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq);
-}
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 1);
-}
-
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- vgic_bitmap_set_irq_val(&dist->irq_pending, vcpu->vcpu_id, irq, 0);
-}
-
-static void vgic_cpu_irq_set(struct kvm_vcpu *vcpu, int irq)
-{
- if (irq < VGIC_NR_PRIVATE_IRQS)
- set_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
- else
- set_bit(irq - VGIC_NR_PRIVATE_IRQS,
- vcpu->arch.vgic_cpu.pending_shared);
-}
-
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq)
-{
- if (irq < VGIC_NR_PRIVATE_IRQS)
- clear_bit(irq, vcpu->arch.vgic_cpu.pending_percpu);
- else
- clear_bit(irq - VGIC_NR_PRIVATE_IRQS,
- vcpu->arch.vgic_cpu.pending_shared);
-}
-
-static bool vgic_can_sample_irq(struct kvm_vcpu *vcpu, int irq)
-{
- return !vgic_irq_is_queued(vcpu, irq);
-}
-
-/**
- * vgic_reg_access - access vgic register
- * @mmio: pointer to the data describing the mmio access
- * @reg: pointer to the virtual backing of vgic distributor data
- * @offset: least significant 2 bits used for word offset
- * @mode: ACCESS_ mode (see defines above)
- *
- * Helper to make vgic register access easier using one of the access
- * modes defined for vgic register access
- * (read,raz,write-ignored,setbit,clearbit,write)
- */
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
- phys_addr_t offset, int mode)
-{
- int word_offset = (offset & 3) * 8;
- u32 mask = (1UL << (mmio->len * 8)) - 1;
- u32 regval;
-
- /*
- * Any alignment fault should have been delivered to the guest
- * directly (ARM ARM B3.12.7 "Prioritization of aborts").
- */
-
- if (reg) {
- regval = *reg;
- } else {
- BUG_ON(mode != (ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED));
- regval = 0;
- }
-
- if (mmio->is_write) {
- u32 data = mmio_data_read(mmio, mask) << word_offset;
- switch (ACCESS_WRITE_MASK(mode)) {
- case ACCESS_WRITE_IGNORED:
- return;
-
- case ACCESS_WRITE_SETBIT:
- regval |= data;
- break;
-
- case ACCESS_WRITE_CLEARBIT:
- regval &= ~data;
- break;
-
- case ACCESS_WRITE_VALUE:
- regval = (regval & ~(mask << word_offset)) | data;
- break;
- }
- *reg = regval;
- } else {
- switch (ACCESS_READ_MASK(mode)) {
- case ACCESS_READ_RAZ:
- regval = 0;
- /* fall through */
-
- case ACCESS_READ_VALUE:
- mmio_data_write(mmio, mask, regval >> word_offset);
- }
- }
-}
-
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- vgic_reg_access(mmio, NULL, offset,
- ACCESS_READ_RAZ | ACCESS_WRITE_IGNORED);
- return false;
-}
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id, int access)
-{
- u32 *reg;
- int mode = ACCESS_READ_VALUE | access;
- struct kvm_vcpu *target_vcpu = kvm_get_vcpu(kvm, vcpu_id);
-
- reg = vgic_bitmap_get_reg(&kvm->arch.vgic.irq_enabled, vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset, mode);
- if (mmio->is_write) {
- if (access & ACCESS_WRITE_CLEARBIT) {
- if (offset < 4) /* Force SGI enabled */
- *reg |= 0xffff;
- vgic_retire_disabled_irqs(target_vcpu);
- }
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *reg, orig;
- u32 level_mask;
- int mode = ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_cfg, vcpu_id, offset);
- level_mask = (~(*reg));
-
- /* Mark both level and edge triggered irqs as pending */
- reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
- orig = *reg;
- vgic_reg_access(mmio, reg, offset, mode);
-
- if (mmio->is_write) {
- /* Set the soft-pending flag only for level-triggered irqs */
- reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
- vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset, mode);
- *reg &= level_mask;
-
- /* Ignore writes to SGIs */
- if (offset < 2) {
- *reg &= ~0xffff;
- *reg |= orig & 0xffff;
- }
-
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *level_active;
- u32 *reg, orig;
- int mode = ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
- orig = *reg;
- vgic_reg_access(mmio, reg, offset, mode);
- if (mmio->is_write) {
- /* Re-set level triggered level-active interrupts */
- level_active = vgic_bitmap_get_reg(&dist->irq_level,
- vcpu_id, offset);
- reg = vgic_bitmap_get_reg(&dist->irq_pending, vcpu_id, offset);
- *reg |= *level_active;
-
- /* Ignore writes to SGIs */
- if (offset < 2) {
- *reg &= ~0xffff;
- *reg |= orig & 0xffff;
- }
-
- /* Clear soft-pending flags */
- reg = vgic_bitmap_get_reg(&dist->irq_soft_pend,
- vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset, mode);
-
- vgic_update_state(kvm);
- return true;
- }
- return false;
-}
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *reg;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_SETBIT);
-
- if (mmio->is_write) {
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id)
-{
- u32 *reg;
- struct vgic_dist *dist = &kvm->arch.vgic;
-
- reg = vgic_bitmap_get_reg(&dist->irq_active, vcpu_id, offset);
- vgic_reg_access(mmio, reg, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_CLEARBIT);
-
- if (mmio->is_write) {
- vgic_update_state(kvm);
- return true;
- }
-
- return false;
-}
-
-static u32 vgic_cfg_expand(u16 val)
-{
- u32 res = 0;
- int i;
-
- /*
- * Turn a 16bit value like abcd...mnop into a 32bit word
- * a0b0c0d0...m0n0o0p0, which is what the HW cfg register is.
- */
- for (i = 0; i < 16; i++)
- res |= ((val >> i) & VGIC_CFG_EDGE) << (2 * i + 1);
-
- return res;
-}
-
-static u16 vgic_cfg_compress(u32 val)
-{
- u16 res = 0;
- int i;
-
- /*
- * Turn a 32bit word a0b0c0d0...m0n0o0p0 into 16bit value like
- * abcd...mnop which is what we really care about.
- */
- for (i = 0; i < 16; i++)
- res |= ((val >> (i * 2 + 1)) & VGIC_CFG_EDGE) << i;
-
- return res;
-}
-
-/*
- * The distributor uses 2 bits per IRQ for the CFG register, but the
- * LSB is always 0. As such, we only keep the upper bit, and use the
- * two above functions to compress/expand the bits
- */
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
- phys_addr_t offset)
-{
- u32 val;
-
- if (offset & 4)
- val = *reg >> 16;
- else
- val = *reg & 0xffff;
-
- val = vgic_cfg_expand(val);
- vgic_reg_access(mmio, &val, offset,
- ACCESS_READ_VALUE | ACCESS_WRITE_VALUE);
- if (mmio->is_write) {
- /* Ignore writes to read-only SGI and PPI bits */
- if (offset < 8)
- return false;
-
- val = vgic_cfg_compress(val);
- if (offset & 4) {
- *reg &= 0xffff;
- *reg |= val << 16;
- } else {
- *reg &= 0xffff << 16;
- *reg |= val;
- }
- }
-
- return false;
-}
-
-/**
- * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
- * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
- *
- * Move any IRQs that have already been assigned to LRs back to the
- * emulated distributor state so that the complete emulated state can be read
- * from the main emulation structures without investigating the LRs.
- */
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
-{
- u64 elrsr = vgic_get_elrsr(vcpu);
- unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
- int i;
-
- for_each_clear_bit(i, elrsr_ptr, vgic->nr_lr) {
- struct vgic_lr lr = vgic_get_lr(vcpu, i);
-
- /*
- * There are three options for the state bits:
- *
- * 01: pending
- * 10: active
- * 11: pending and active
- */
- BUG_ON(!(lr.state & LR_STATE_MASK));
-
- /* Reestablish SGI source for pending and active IRQs */
- if (lr.irq < VGIC_NR_SGIS)
- add_sgi_source(vcpu, lr.irq, lr.source);
-
- /*
- * If the LR holds an active (10) or a pending and active (11)
- * interrupt then move the active state to the
- * distributor tracking bit.
- */
- if (lr.state & LR_STATE_ACTIVE)
- vgic_irq_set_active(vcpu, lr.irq);
-
- /*
- * Reestablish the pending state on the distributor and the
- * CPU interface and mark the LR as free for other use.
- */
- vgic_retire_lr(i, vcpu);
-
- /* Finally update the VGIC state. */
- vgic_update_state(vcpu->kvm);
- }
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
- int len, gpa_t offset)
-{
- while (ranges->len) {
- if (offset >= ranges->base &&
- (offset + len) <= (ranges->base + ranges->len))
- return ranges;
- ranges++;
- }
-
- return NULL;
-}
-
-static bool vgic_validate_access(const struct vgic_dist *dist,
- const struct vgic_io_range *range,
- unsigned long offset)
-{
- int irq;
-
- if (!range->bits_per_irq)
- return true; /* Not an irq-based access */
-
- irq = offset * 8 / range->bits_per_irq;
- if (irq >= dist->nr_irqs)
- return false;
-
- return true;
-}
-
-/*
- * Call the respective handler function for the given range.
- * We split up any 64 bit accesses into two consecutive 32 bit
- * handler calls and merge the result afterwards.
- * We do this in a little endian fashion regardless of the host's
- * or guest's endianness, because the GIC is always LE and the rest of
- * the code (vgic_reg_access) also puts it in a LE fashion already.
- * At this point we have already identified the handle function, so
- * range points to that one entry and offset is relative to this.
- */
-static bool call_range_handler(struct kvm_vcpu *vcpu,
- struct kvm_exit_mmio *mmio,
- unsigned long offset,
- const struct vgic_io_range *range)
-{
- struct kvm_exit_mmio mmio32;
- bool ret;
-
- if (likely(mmio->len <= 4))
- return range->handle_mmio(vcpu, mmio, offset);
-
- /*
- * Any access bigger than 4 bytes (that we currently handle in KVM)
- * is actually 8 bytes long, caused by a 64-bit access
- */
-
- mmio32.len = 4;
- mmio32.is_write = mmio->is_write;
- mmio32.private = mmio->private;
-
- mmio32.phys_addr = mmio->phys_addr + 4;
- mmio32.data = &((u32 *)mmio->data)[1];
- ret = range->handle_mmio(vcpu, &mmio32, offset + 4);
-
- mmio32.phys_addr = mmio->phys_addr;
- mmio32.data = &((u32 *)mmio->data)[0];
- ret |= range->handle_mmio(vcpu, &mmio32, offset);
-
- return ret;
-}
-
-/**
- * vgic_handle_mmio_access - handle an in-kernel MMIO access
- * This is called by the read/write KVM IO device wrappers below.
- * @vcpu: pointer to the vcpu performing the access
- * @this: pointer to the KVM IO device in charge
- * @addr: guest physical address of the access
- * @len: size of the access
- * @val: pointer to the data region
- * @is_write: read or write access
- *
- * returns true if the MMIO access could be performed
- */
-static int vgic_handle_mmio_access(struct kvm_vcpu *vcpu,
- struct kvm_io_device *this, gpa_t addr,
- int len, void *val, bool is_write)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct vgic_io_device *iodev = container_of(this,
- struct vgic_io_device, dev);
- const struct vgic_io_range *range;
- struct kvm_exit_mmio mmio;
- bool updated_state;
- gpa_t offset;
-
- offset = addr - iodev->addr;
- range = vgic_find_range(iodev->reg_ranges, len, offset);
- if (unlikely(!range || !range->handle_mmio)) {
- pr_warn("Unhandled access %d %08llx %d\n", is_write, addr, len);
- return -ENXIO;
- }
-
- mmio.phys_addr = addr;
- mmio.len = len;
- mmio.is_write = is_write;
- mmio.data = val;
- mmio.private = iodev->redist_vcpu;
-
- spin_lock(&dist->lock);
- offset -= range->base;
- if (vgic_validate_access(dist, range, offset)) {
- updated_state = call_range_handler(vcpu, &mmio, offset, range);
- } else {
- if (!is_write)
- memset(val, 0, len);
- updated_state = false;
- }
- spin_unlock(&dist->lock);
-
- if (updated_state)
- vgic_kick_vcpus(vcpu->kvm);
-
- return 0;
-}
-
-static int vgic_handle_mmio_read(struct kvm_vcpu *vcpu,
- struct kvm_io_device *this,
- gpa_t addr, int len, void *val)
-{
- return vgic_handle_mmio_access(vcpu, this, addr, len, val, false);
-}
-
-static int vgic_handle_mmio_write(struct kvm_vcpu *vcpu,
- struct kvm_io_device *this,
- gpa_t addr, int len, const void *val)
-{
- return vgic_handle_mmio_access(vcpu, this, addr, len, (void *)val,
- true);
-}
-
-static struct kvm_io_device_ops vgic_io_ops = {
- .read = vgic_handle_mmio_read,
- .write = vgic_handle_mmio_write,
-};
-
-/**
- * vgic_register_kvm_io_dev - register VGIC register frame on the KVM I/O bus
- * @kvm: The VM structure pointer
- * @base: The (guest) base address for the register frame
- * @len: Length of the register frame window
- * @ranges: Describing the handler functions for each register
- * @redist_vcpu_id: The VCPU ID to pass on to the handlers on call
- * @iodev: Points to memory to be passed on to the handler
- *
- * @iodev stores the parameters of this function to be usable by the handler
- * respectively the dispatcher function (since the KVM I/O bus framework lacks
- * an opaque parameter). Initialization is done in this function, but the
- * reference should be valid and unique for the whole VGIC lifetime.
- * If the register frame is not mapped for a specific VCPU, pass -1 to
- * @redist_vcpu_id.
- */
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
- const struct vgic_io_range *ranges,
- int redist_vcpu_id,
- struct vgic_io_device *iodev)
-{
- struct kvm_vcpu *vcpu = NULL;
- int ret;
-
- if (redist_vcpu_id >= 0)
- vcpu = kvm_get_vcpu(kvm, redist_vcpu_id);
-
- iodev->addr = base;
- iodev->len = len;
- iodev->reg_ranges = ranges;
- iodev->redist_vcpu = vcpu;
-
- kvm_iodevice_init(&iodev->dev, &vgic_io_ops);
-
- mutex_lock(&kvm->slots_lock);
-
- ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, base, len,
- &iodev->dev);
- mutex_unlock(&kvm->slots_lock);
-
- /* Mark the iodev as invalid if registration fails. */
- if (ret)
- iodev->dev.ops = NULL;
-
- return ret;
-}
-
-static int vgic_nr_shared_irqs(struct vgic_dist *dist)
-{
- return dist->nr_irqs - VGIC_NR_PRIVATE_IRQS;
-}
-
-static int compute_active_for_cpu(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long *active, *enabled, *act_percpu, *act_shared;
- unsigned long active_private, active_shared;
- int nr_shared = vgic_nr_shared_irqs(dist);
- int vcpu_id;
-
- vcpu_id = vcpu->vcpu_id;
- act_percpu = vcpu->arch.vgic_cpu.active_percpu;
- act_shared = vcpu->arch.vgic_cpu.active_shared;
-
- active = vgic_bitmap_get_cpu_map(&dist->irq_active, vcpu_id);
- enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
- bitmap_and(act_percpu, active, enabled, VGIC_NR_PRIVATE_IRQS);
-
- active = vgic_bitmap_get_shared_map(&dist->irq_active);
- enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
- bitmap_and(act_shared, active, enabled, nr_shared);
- bitmap_and(act_shared, act_shared,
- vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
- nr_shared);
-
- active_private = find_first_bit(act_percpu, VGIC_NR_PRIVATE_IRQS);
- active_shared = find_first_bit(act_shared, nr_shared);
-
- return (active_private < VGIC_NR_PRIVATE_IRQS ||
- active_shared < nr_shared);
-}
-
-static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long *pending, *enabled, *pend_percpu, *pend_shared;
- unsigned long pending_private, pending_shared;
- int nr_shared = vgic_nr_shared_irqs(dist);
- int vcpu_id;
-
- vcpu_id = vcpu->vcpu_id;
- pend_percpu = vcpu->arch.vgic_cpu.pending_percpu;
- pend_shared = vcpu->arch.vgic_cpu.pending_shared;
-
- if (!dist->enabled) {
- bitmap_zero(pend_percpu, VGIC_NR_PRIVATE_IRQS);
- bitmap_zero(pend_shared, nr_shared);
- return 0;
- }
-
- pending = vgic_bitmap_get_cpu_map(&dist->irq_pending, vcpu_id);
- enabled = vgic_bitmap_get_cpu_map(&dist->irq_enabled, vcpu_id);
- bitmap_and(pend_percpu, pending, enabled, VGIC_NR_PRIVATE_IRQS);
-
- pending = vgic_bitmap_get_shared_map(&dist->irq_pending);
- enabled = vgic_bitmap_get_shared_map(&dist->irq_enabled);
- bitmap_and(pend_shared, pending, enabled, nr_shared);
- bitmap_and(pend_shared, pend_shared,
- vgic_bitmap_get_shared_map(&dist->irq_spi_target[vcpu_id]),
- nr_shared);
-
- pending_private = find_first_bit(pend_percpu, VGIC_NR_PRIVATE_IRQS);
- pending_shared = find_first_bit(pend_shared, nr_shared);
- return (pending_private < VGIC_NR_PRIVATE_IRQS ||
- pending_shared < vgic_nr_shared_irqs(dist));
-}
-
-/*
- * Update the interrupt state and determine which CPUs have pending
- * or active interrupts. Must be called with distributor lock held.
- */
-void vgic_update_state(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int c;
-
- kvm_for_each_vcpu(c, vcpu, kvm) {
- if (compute_pending_for_cpu(vcpu))
- set_bit(c, dist->irq_pending_on_cpu);
-
- if (compute_active_for_cpu(vcpu))
- set_bit(c, dist->irq_active_on_cpu);
- else
- clear_bit(c, dist->irq_active_on_cpu);
- }
-}
-
-static struct vgic_lr vgic_get_lr(const struct kvm_vcpu *vcpu, int lr)
-{
- return vgic_ops->get_lr(vcpu, lr);
-}
-
-static void vgic_set_lr(struct kvm_vcpu *vcpu, int lr,
- struct vgic_lr vlr)
-{
- vgic_ops->set_lr(vcpu, lr, vlr);
-}
-
-static inline u64 vgic_get_elrsr(struct kvm_vcpu *vcpu)
-{
- return vgic_ops->get_elrsr(vcpu);
-}
-
-static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
-{
- return vgic_ops->get_eisr(vcpu);
-}
-
-static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
-{
- vgic_ops->clear_eisr(vcpu);
-}
-
-static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
-{
- return vgic_ops->get_interrupt_status(vcpu);
-}
-
-static inline void vgic_enable_underflow(struct kvm_vcpu *vcpu)
-{
- vgic_ops->enable_underflow(vcpu);
-}
-
-static inline void vgic_disable_underflow(struct kvm_vcpu *vcpu)
-{
- vgic_ops->disable_underflow(vcpu);
-}
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
- vgic_ops->get_vmcr(vcpu, vmcr);
-}
-
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr)
-{
- vgic_ops->set_vmcr(vcpu, vmcr);
-}
-
-static inline void vgic_enable(struct kvm_vcpu *vcpu)
-{
- vgic_ops->enable(vcpu);
-}
-
-static void vgic_retire_lr(int lr_nr, struct kvm_vcpu *vcpu)
-{
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr_nr);
-
- vgic_irq_clear_queued(vcpu, vlr.irq);
-
- /*
- * We must transfer the pending state back to the distributor before
- * retiring the LR, otherwise we may loose edge-triggered interrupts.
- */
- if (vlr.state & LR_STATE_PENDING) {
- vgic_dist_irq_set_pending(vcpu, vlr.irq);
- vlr.hwirq = 0;
- }
-
- vlr.state = 0;
- vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-static bool dist_active_irq(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- return test_bit(vcpu->vcpu_id, dist->irq_active_on_cpu);
-}
-
-bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
- int i;
-
- for (i = 0; i < vgic->nr_lr; i++) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, i);
-
- if (vlr.irq == virt_irq && vlr.state & LR_STATE_ACTIVE)
- return true;
- }
-
- return vgic_irq_is_active(vcpu, virt_irq);
-}
-
-/*
- * An interrupt may have been disabled after being made pending on the
- * CPU interface (the classic case is a timer running while we're
- * rebooting the guest - the interrupt would kick as soon as the CPU
- * interface gets enabled, with deadly consequences).
- *
- * The solution is to examine already active LRs, and check the
- * interrupt is still enabled. If not, just retire it.
- */
-static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
-{
- u64 elrsr = vgic_get_elrsr(vcpu);
- unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
- int lr;
-
- for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
- if (!vgic_irq_is_enabled(vcpu, vlr.irq))
- vgic_retire_lr(lr, vcpu);
- }
-}
-
-static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
- int lr_nr, struct vgic_lr vlr)
-{
- if (vgic_irq_is_active(vcpu, irq)) {
- vlr.state |= LR_STATE_ACTIVE;
- kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
- vgic_irq_clear_active(vcpu, irq);
- vgic_update_state(vcpu->kvm);
- } else {
- WARN_ON(!vgic_dist_irq_is_pending(vcpu, irq));
- vlr.state |= LR_STATE_PENDING;
- kvm_debug("Set pending: 0x%x\n", vlr.state);
- }
-
- if (!vgic_irq_is_edge(vcpu, irq))
- vlr.state |= LR_EOI_INT;
-
- if (vlr.irq >= VGIC_NR_SGIS) {
- struct irq_phys_map *map;
- map = vgic_irq_map_search(vcpu, irq);
-
- if (map) {
- vlr.hwirq = map->phys_irq;
- vlr.state |= LR_HW;
- vlr.state &= ~LR_EOI_INT;
-
- /*
- * Make sure we're not going to sample this
- * again, as a HW-backed interrupt cannot be
- * in the PENDING_ACTIVE stage.
- */
- vgic_irq_set_queued(vcpu, irq);
- }
- }
-
- vgic_set_lr(vcpu, lr_nr, vlr);
-}
-
-/*
- * Queue an interrupt to a CPU virtual interface. Return true on success,
- * or false if it wasn't possible to queue it.
- * sgi_source must be zero for any non-SGI interrupts.
- */
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- u64 elrsr = vgic_get_elrsr(vcpu);
- unsigned long *elrsr_ptr = u64_to_bitmask(&elrsr);
- struct vgic_lr vlr;
- int lr;
-
- /* Sanitize the input... */
- BUG_ON(sgi_source_id & ~7);
- BUG_ON(sgi_source_id && irq >= VGIC_NR_SGIS);
- BUG_ON(irq >= dist->nr_irqs);
-
- kvm_debug("Queue IRQ%d\n", irq);
-
- /* Do we have an active interrupt for the same CPUID? */
- for_each_clear_bit(lr, elrsr_ptr, vgic->nr_lr) {
- vlr = vgic_get_lr(vcpu, lr);
- if (vlr.irq == irq && vlr.source == sgi_source_id) {
- kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
- vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
- return true;
- }
- }
-
- /* Try to use another LR for this interrupt */
- lr = find_first_bit(elrsr_ptr, vgic->nr_lr);
- if (lr >= vgic->nr_lr)
- return false;
-
- kvm_debug("LR%d allocated for IRQ%d %x\n", lr, irq, sgi_source_id);
-
- vlr.irq = irq;
- vlr.source = sgi_source_id;
- vlr.state = 0;
- vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
-
- return true;
-}
-
-static bool vgic_queue_hwirq(struct kvm_vcpu *vcpu, int irq)
-{
- if (!vgic_can_sample_irq(vcpu, irq))
- return true; /* level interrupt, already queued */
-
- if (vgic_queue_irq(vcpu, 0, irq)) {
- if (vgic_irq_is_edge(vcpu, irq)) {
- vgic_dist_irq_clear_pending(vcpu, irq);
- vgic_cpu_irq_clear(vcpu, irq);
- } else {
- vgic_irq_set_queued(vcpu, irq);
- }
-
- return true;
- }
-
- return false;
-}
-
-/*
- * Fill the list registers with pending interrupts before running the
- * guest.
- */
-static void __kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- unsigned long *pa_percpu, *pa_shared;
- int i, vcpu_id;
- int overflow = 0;
- int nr_shared = vgic_nr_shared_irqs(dist);
-
- vcpu_id = vcpu->vcpu_id;
-
- pa_percpu = vcpu->arch.vgic_cpu.pend_act_percpu;
- pa_shared = vcpu->arch.vgic_cpu.pend_act_shared;
-
- bitmap_or(pa_percpu, vgic_cpu->pending_percpu, vgic_cpu->active_percpu,
- VGIC_NR_PRIVATE_IRQS);
- bitmap_or(pa_shared, vgic_cpu->pending_shared, vgic_cpu->active_shared,
- nr_shared);
- /*
- * We may not have any pending interrupt, or the interrupts
- * may have been serviced from another vcpu. In all cases,
- * move along.
- */
- if (!kvm_vgic_vcpu_pending_irq(vcpu) && !dist_active_irq(vcpu))
- goto epilog;
-
- /* SGIs */
- for_each_set_bit(i, pa_percpu, VGIC_NR_SGIS) {
- if (!queue_sgi(vcpu, i))
- overflow = 1;
- }
-
- /* PPIs */
- for_each_set_bit_from(i, pa_percpu, VGIC_NR_PRIVATE_IRQS) {
- if (!vgic_queue_hwirq(vcpu, i))
- overflow = 1;
- }
-
- /* SPIs */
- for_each_set_bit(i, pa_shared, nr_shared) {
- if (!vgic_queue_hwirq(vcpu, i + VGIC_NR_PRIVATE_IRQS))
- overflow = 1;
- }
-
-
-
-
-epilog:
- if (overflow) {
- vgic_enable_underflow(vcpu);
- } else {
- vgic_disable_underflow(vcpu);
- /*
- * We're about to run this VCPU, and we've consumed
- * everything the distributor had in store for
- * us. Claim we don't have anything pending. We'll
- * adjust that if needed while exiting.
- */
- clear_bit(vcpu_id, dist->irq_pending_on_cpu);
- }
-}
-
-static int process_queued_irq(struct kvm_vcpu *vcpu,
- int lr, struct vgic_lr vlr)
-{
- int pending = 0;
-
- /*
- * If the IRQ was EOIed (called from vgic_process_maintenance) or it
- * went from active to non-active (called from vgic_sync_hwirq) it was
- * also ACKed and we we therefore assume we can clear the soft pending
- * state (should it had been set) for this interrupt.
- *
- * Note: if the IRQ soft pending state was set after the IRQ was
- * acked, it actually shouldn't be cleared, but we have no way of
- * knowing that unless we start trapping ACKs when the soft-pending
- * state is set.
- */
- vgic_dist_irq_clear_soft_pend(vcpu, vlr.irq);
-
- /*
- * Tell the gic to start sampling this interrupt again.
- */
- vgic_irq_clear_queued(vcpu, vlr.irq);
-
- /* Any additional pending interrupt? */
- if (vgic_irq_is_edge(vcpu, vlr.irq)) {
- BUG_ON(!(vlr.state & LR_HW));
- pending = vgic_dist_irq_is_pending(vcpu, vlr.irq);
- } else {
- if (vgic_dist_irq_get_level(vcpu, vlr.irq)) {
- vgic_cpu_irq_set(vcpu, vlr.irq);
- pending = 1;
- } else {
- vgic_dist_irq_clear_pending(vcpu, vlr.irq);
- vgic_cpu_irq_clear(vcpu, vlr.irq);
- }
- }
-
- /*
- * Despite being EOIed, the LR may not have
- * been marked as empty.
- */
- vlr.state = 0;
- vlr.hwirq = 0;
- vgic_set_lr(vcpu, lr, vlr);
-
- return pending;
-}
-
-static bool vgic_process_maintenance(struct kvm_vcpu *vcpu)
-{
- u32 status = vgic_get_interrupt_status(vcpu);
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct kvm *kvm = vcpu->kvm;
- int level_pending = 0;
-
- kvm_debug("STATUS = %08x\n", status);
-
- if (status & INT_STATUS_EOI) {
- /*
- * Some level interrupts have been EOIed. Clear their
- * active bit.
- */
- u64 eisr = vgic_get_eisr(vcpu);
- unsigned long *eisr_ptr = u64_to_bitmask(&eisr);
- int lr;
-
- for_each_set_bit(lr, eisr_ptr, vgic->nr_lr) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
- WARN_ON(vgic_irq_is_edge(vcpu, vlr.irq));
- WARN_ON(vlr.state & LR_STATE_MASK);
-
-
- /*
- * kvm_notify_acked_irq calls kvm_set_irq()
- * to reset the IRQ level, which grabs the dist->lock
- * so we call this before taking the dist->lock.
- */
- kvm_notify_acked_irq(kvm, 0,
- vlr.irq - VGIC_NR_PRIVATE_IRQS);
-
- spin_lock(&dist->lock);
- level_pending |= process_queued_irq(vcpu, lr, vlr);
- spin_unlock(&dist->lock);
- }
- }
-
- if (status & INT_STATUS_UNDERFLOW)
- vgic_disable_underflow(vcpu);
-
- /*
- * In the next iterations of the vcpu loop, if we sync the vgic state
- * after flushing it, but before entering the guest (this happens for
- * pending signals and vmid rollovers), then make sure we don't pick
- * up any old maintenance interrupts here.
- */
- vgic_clear_eisr(vcpu);
-
- return level_pending;
-}
-
-/*
- * Save the physical active state, and reset it to inactive.
- *
- * Return true if there's a pending forwarded interrupt to queue.
- */
-static bool vgic_sync_hwirq(struct kvm_vcpu *vcpu, int lr, struct vgic_lr vlr)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- bool level_pending;
-
- if (!(vlr.state & LR_HW))
- return false;
-
- if (vlr.state & LR_STATE_ACTIVE)
- return false;
-
- spin_lock(&dist->lock);
- level_pending = process_queued_irq(vcpu, lr, vlr);
- spin_unlock(&dist->lock);
- return level_pending;
-}
-
-/* Sync back the VGIC state after a guest run */
-static void __kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- u64 elrsr;
- unsigned long *elrsr_ptr;
- int lr, pending;
- bool level_pending;
-
- level_pending = vgic_process_maintenance(vcpu);
-
- /* Deal with HW interrupts, and clear mappings for empty LRs */
- for (lr = 0; lr < vgic->nr_lr; lr++) {
- struct vgic_lr vlr = vgic_get_lr(vcpu, lr);
-
- level_pending |= vgic_sync_hwirq(vcpu, lr, vlr);
- BUG_ON(vlr.irq >= dist->nr_irqs);
- }
-
- /* Check if we still have something up our sleeve... */
- elrsr = vgic_get_elrsr(vcpu);
- elrsr_ptr = u64_to_bitmask(&elrsr);
- pending = find_first_zero_bit(elrsr_ptr, vgic->nr_lr);
- if (level_pending || pending < vgic->nr_lr)
- set_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
- spin_lock(&dist->lock);
- __kvm_vgic_flush_hwstate(vcpu);
- spin_unlock(&dist->lock);
-}
-
-void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
-{
- if (!irqchip_in_kernel(vcpu->kvm))
- return;
-
- __kvm_vgic_sync_hwstate(vcpu);
-}
-
-int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
-
- if (!irqchip_in_kernel(vcpu->kvm))
- return 0;
-
- return test_bit(vcpu->vcpu_id, dist->irq_pending_on_cpu);
-}
-
-void vgic_kick_vcpus(struct kvm *kvm)
-{
- struct kvm_vcpu *vcpu;
- int c;
-
- /*
- * We've injected an interrupt, time to find out who deserves
- * a good kick...
- */
- kvm_for_each_vcpu(c, vcpu, kvm) {
- if (kvm_vgic_vcpu_pending_irq(vcpu))
- kvm_vcpu_kick(vcpu);
- }
-}
-
-static int vgic_validate_injection(struct kvm_vcpu *vcpu, int irq, int level)
-{
- int edge_triggered = vgic_irq_is_edge(vcpu, irq);
-
- /*
- * Only inject an interrupt if:
- * - edge triggered and we have a rising edge
- * - level triggered and we change level
- */
- if (edge_triggered) {
- int state = vgic_dist_irq_is_pending(vcpu, irq);
- return level > state;
- } else {
- int state = vgic_dist_irq_get_level(vcpu, irq);
- return level != state;
- }
-}
-
-static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
- unsigned int irq_num, bool level)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int edge_triggered, level_triggered;
- int enabled;
- bool ret = true, can_inject = true;
-
- trace_vgic_update_irq_pending(cpuid, irq_num, level);
-
- if (irq_num >= min(kvm->arch.vgic.nr_irqs, 1020))
- return -EINVAL;
-
- spin_lock(&dist->lock);
-
- vcpu = kvm_get_vcpu(kvm, cpuid);
- edge_triggered = vgic_irq_is_edge(vcpu, irq_num);
- level_triggered = !edge_triggered;
-
- if (!vgic_validate_injection(vcpu, irq_num, level)) {
- ret = false;
- goto out;
- }
-
- if (irq_num >= VGIC_NR_PRIVATE_IRQS) {
- cpuid = dist->irq_spi_cpu[irq_num - VGIC_NR_PRIVATE_IRQS];
- if (cpuid == VCPU_NOT_ALLOCATED) {
- /* Pretend we use CPU0, and prevent injection */
- cpuid = 0;
- can_inject = false;
- }
- vcpu = kvm_get_vcpu(kvm, cpuid);
- }
-
- kvm_debug("Inject IRQ%d level %d CPU%d\n", irq_num, level, cpuid);
-
- if (level) {
- if (level_triggered)
- vgic_dist_irq_set_level(vcpu, irq_num);
- vgic_dist_irq_set_pending(vcpu, irq_num);
- } else {
- if (level_triggered) {
- vgic_dist_irq_clear_level(vcpu, irq_num);
- if (!vgic_dist_irq_soft_pend(vcpu, irq_num)) {
- vgic_dist_irq_clear_pending(vcpu, irq_num);
- vgic_cpu_irq_clear(vcpu, irq_num);
- if (!compute_pending_for_cpu(vcpu))
- clear_bit(cpuid, dist->irq_pending_on_cpu);
- }
- }
-
- ret = false;
- goto out;
- }
-
- enabled = vgic_irq_is_enabled(vcpu, irq_num);
-
- if (!enabled || !can_inject) {
- ret = false;
- goto out;
- }
-
- if (!vgic_can_sample_irq(vcpu, irq_num)) {
- /*
- * Level interrupt in progress, will be picked up
- * when EOId.
- */
- ret = false;
- goto out;
- }
-
- if (level) {
- vgic_cpu_irq_set(vcpu, irq_num);
- set_bit(cpuid, dist->irq_pending_on_cpu);
- }
-
-out:
- spin_unlock(&dist->lock);
-
- if (ret) {
- /* kick the specified vcpu */
- kvm_vcpu_kick(kvm_get_vcpu(kvm, cpuid));
- }
-
- return 0;
-}
-
-static int vgic_lazy_init(struct kvm *kvm)
-{
- int ret = 0;
-
- if (unlikely(!vgic_initialized(kvm))) {
- /*
- * We only provide the automatic initialization of the VGIC
- * for the legacy case of a GICv2. Any other type must
- * be explicitly initialized once setup with the respective
- * KVM device call.
- */
- if (kvm->arch.vgic.vgic_model != KVM_DEV_TYPE_ARM_VGIC_V2)
- return -EBUSY;
-
- mutex_lock(&kvm->lock);
- ret = vgic_init(kvm);
- mutex_unlock(&kvm->lock);
- }
-
- return ret;
-}
-
-/**
- * kvm_vgic_inject_irq - Inject an IRQ from a device to the vgic
- * @kvm: The VM structure pointer
- * @cpuid: The CPU for PPIs
- * @irq_num: The IRQ number that is assigned to the device. This IRQ
- * must not be mapped to a HW interrupt.
- * @level: Edge-triggered: true: to trigger the interrupt
- * false: to ignore the call
- * Level-sensitive true: raise the input signal
- * false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts. You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, unsigned int irq_num,
- bool level)
-{
- struct irq_phys_map *map;
- int ret;
-
- ret = vgic_lazy_init(kvm);
- if (ret)
- return ret;
-
- map = vgic_irq_map_search(kvm_get_vcpu(kvm, cpuid), irq_num);
- if (map)
- return -EINVAL;
-
- return vgic_update_irq_pending(kvm, cpuid, irq_num, level);
-}
-
-/**
- * kvm_vgic_inject_mapped_irq - Inject a physically mapped IRQ to the vgic
- * @kvm: The VM structure pointer
- * @cpuid: The CPU for PPIs
- * @virt_irq: The virtual IRQ to be injected
- * @level: Edge-triggered: true: to trigger the interrupt
- * false: to ignore the call
- * Level-sensitive true: raise the input signal
- * false: lower the input signal
- *
- * The GIC is not concerned with devices being active-LOW or active-HIGH for
- * level-sensitive interrupts. You can think of the level parameter as 1
- * being HIGH and 0 being LOW and all devices being active-HIGH.
- */
-int kvm_vgic_inject_mapped_irq(struct kvm *kvm, int cpuid,
- unsigned int virt_irq, bool level)
-{
- int ret;
-
- ret = vgic_lazy_init(kvm);
- if (ret)
- return ret;
-
- return vgic_update_irq_pending(kvm, cpuid, virt_irq, level);
-}
-
-static irqreturn_t vgic_maintenance_handler(int irq, void *data)
-{
- /*
- * We cannot rely on the vgic maintenance interrupt to be
- * delivered synchronously. This means we can only use it to
- * exit the VM, and we perform the handling of EOIed
- * interrupts on the exit path (see vgic_process_maintenance).
- */
- return IRQ_HANDLED;
-}
-
-static struct list_head *vgic_get_irq_phys_map_list(struct kvm_vcpu *vcpu,
- int virt_irq)
-{
- if (virt_irq < VGIC_NR_PRIVATE_IRQS)
- return &vcpu->arch.vgic_cpu.irq_phys_map_list;
- else
- return &vcpu->kvm->arch.vgic.irq_phys_map_list;
-}
-
-/**
- * kvm_vgic_map_phys_irq - map a virtual IRQ to a physical IRQ
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number for the guest
- * @phys_irq: The hardware IRQ number of the host
- *
- * Establish a mapping between a guest visible irq (@virt_irq) and a
- * hardware irq (@phys_irq). On injection, @virt_irq will be associated with
- * the physical interrupt represented by @phys_irq. This mapping can be
- * established multiple times as long as the parameters are the same.
- *
- * Returns 0 on success or an error value otherwise.
- */
-int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, int virt_irq, int phys_irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
- struct irq_phys_map *map;
- struct irq_phys_map_entry *entry;
- int ret = 0;
-
- /* Create a new mapping */
- entry = kzalloc(sizeof(*entry), GFP_KERNEL);
- if (!entry)
- return -ENOMEM;
-
- spin_lock(&dist->irq_phys_map_lock);
-
- /* Try to match an existing mapping */
- map = vgic_irq_map_search(vcpu, virt_irq);
- if (map) {
- /* Make sure this mapping matches */
- if (map->phys_irq != phys_irq)
- ret = -EINVAL;
-
- /* Found an existing, valid mapping */
- goto out;
- }
-
- map = &entry->map;
- map->virt_irq = virt_irq;
- map->phys_irq = phys_irq;
-
- list_add_tail_rcu(&entry->entry, root);
-
-out:
- spin_unlock(&dist->irq_phys_map_lock);
- /* If we've found a hit in the existing list, free the useless
- * entry */
- if (ret || map != &entry->map)
- kfree(entry);
- return ret;
-}
-
-static struct irq_phys_map *vgic_irq_map_search(struct kvm_vcpu *vcpu,
- int virt_irq)
-{
- struct list_head *root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
- struct irq_phys_map_entry *entry;
- struct irq_phys_map *map;
-
- rcu_read_lock();
-
- list_for_each_entry_rcu(entry, root, entry) {
- map = &entry->map;
- if (map->virt_irq == virt_irq) {
- rcu_read_unlock();
- return map;
- }
- }
-
- rcu_read_unlock();
-
- return NULL;
-}
-
-static void vgic_free_phys_irq_map_rcu(struct rcu_head *rcu)
-{
- struct irq_phys_map_entry *entry;
-
- entry = container_of(rcu, struct irq_phys_map_entry, rcu);
- kfree(entry);
-}
-
-/**
- * kvm_vgic_unmap_phys_irq - Remove a virtual to physical IRQ mapping
- * @vcpu: The VCPU pointer
- * @virt_irq: The virtual IRQ number to be unmapped
- *
- * Remove an existing mapping between virtual and physical interrupts.
- */
-int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
-{
- struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
- struct irq_phys_map_entry *entry;
- struct list_head *root;
-
- root = vgic_get_irq_phys_map_list(vcpu, virt_irq);
-
- spin_lock(&dist->irq_phys_map_lock);
-
- list_for_each_entry(entry, root, entry) {
- if (entry->map.virt_irq == virt_irq) {
- list_del_rcu(&entry->entry);
- call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
- break;
- }
- }
-
- spin_unlock(&dist->irq_phys_map_lock);
-
- return 0;
-}
-
-static void vgic_destroy_irq_phys_map(struct kvm *kvm, struct list_head *root)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct irq_phys_map_entry *entry;
-
- spin_lock(&dist->irq_phys_map_lock);
-
- list_for_each_entry(entry, root, entry) {
- list_del_rcu(&entry->entry);
- call_rcu(&entry->rcu, vgic_free_phys_irq_map_rcu);
- }
-
- spin_unlock(&dist->irq_phys_map_lock);
-}
-
-void kvm_vgic_vcpu_destroy(struct kvm_vcpu *vcpu)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
-
- kfree(vgic_cpu->pending_shared);
- kfree(vgic_cpu->active_shared);
- kfree(vgic_cpu->pend_act_shared);
- vgic_destroy_irq_phys_map(vcpu->kvm, &vgic_cpu->irq_phys_map_list);
- vgic_cpu->pending_shared = NULL;
- vgic_cpu->active_shared = NULL;
- vgic_cpu->pend_act_shared = NULL;
-}
-
-static int vgic_vcpu_init_maps(struct kvm_vcpu *vcpu, int nr_irqs)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- int nr_longs = BITS_TO_LONGS(nr_irqs - VGIC_NR_PRIVATE_IRQS);
- int sz = nr_longs * sizeof(unsigned long);
- vgic_cpu->pending_shared = kzalloc(sz, GFP_KERNEL);
- vgic_cpu->active_shared = kzalloc(sz, GFP_KERNEL);
- vgic_cpu->pend_act_shared = kzalloc(sz, GFP_KERNEL);
-
- if (!vgic_cpu->pending_shared
- || !vgic_cpu->active_shared
- || !vgic_cpu->pend_act_shared) {
- kvm_vgic_vcpu_destroy(vcpu);
- return -ENOMEM;
- }
-
- return 0;
-}
-
-/**
- * kvm_vgic_vcpu_early_init - Earliest possible per-vcpu vgic init stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_vcpu_early_init(struct kvm_vcpu *vcpu)
-{
- struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
- INIT_LIST_HEAD(&vgic_cpu->irq_phys_map_list);
-}
-
-/**
- * kvm_vgic_get_max_vcpus - Get the maximum number of VCPUs allowed by HW
- *
- * The host's GIC naturally limits the maximum amount of VCPUs a guest
- * can use.
- */
-int kvm_vgic_get_max_vcpus(void)
-{
- return vgic->max_gic_vcpus;
-}
-
-void kvm_vgic_destroy(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int i;
-
- kvm_for_each_vcpu(i, vcpu, kvm)
- kvm_vgic_vcpu_destroy(vcpu);
-
- vgic_free_bitmap(&dist->irq_enabled);
- vgic_free_bitmap(&dist->irq_level);
- vgic_free_bitmap(&dist->irq_pending);
- vgic_free_bitmap(&dist->irq_soft_pend);
- vgic_free_bitmap(&dist->irq_queued);
- vgic_free_bitmap(&dist->irq_cfg);
- vgic_free_bytemap(&dist->irq_priority);
- if (dist->irq_spi_target) {
- for (i = 0; i < dist->nr_cpus; i++)
- vgic_free_bitmap(&dist->irq_spi_target[i]);
- }
- kfree(dist->irq_sgi_sources);
- kfree(dist->irq_spi_cpu);
- kfree(dist->irq_spi_mpidr);
- kfree(dist->irq_spi_target);
- kfree(dist->irq_pending_on_cpu);
- kfree(dist->irq_active_on_cpu);
- vgic_destroy_irq_phys_map(kvm, &dist->irq_phys_map_list);
- dist->irq_sgi_sources = NULL;
- dist->irq_spi_cpu = NULL;
- dist->irq_spi_target = NULL;
- dist->irq_pending_on_cpu = NULL;
- dist->irq_active_on_cpu = NULL;
- dist->nr_cpus = 0;
-}
-
-/*
- * Allocate and initialize the various data structures. Must be called
- * with kvm->lock held!
- */
-int vgic_init(struct kvm *kvm)
-{
- struct vgic_dist *dist = &kvm->arch.vgic;
- struct kvm_vcpu *vcpu;
- int nr_cpus, nr_irqs;
- int ret, i, vcpu_id;
-
- if (vgic_initialized(kvm))
- return 0;
-
- nr_cpus = dist->nr_cpus = atomic_read(&kvm->online_vcpus);
- if (!nr_cpus) /* No vcpus? Can't be good... */
- return -ENODEV;
-
- /*
- * If nobody configured the number of interrupts, use the
- * legacy one.
- */
- if (!dist->nr_irqs)
- dist->nr_irqs = VGIC_NR_IRQS_LEGACY;
-
- nr_irqs = dist->nr_irqs;
-
- ret = vgic_init_bitmap(&dist->irq_enabled, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_level, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_pending, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_soft_pend, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_queued, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_active, nr_cpus, nr_irqs);
- ret |= vgic_init_bitmap(&dist->irq_cfg, nr_cpus, nr_irqs);
- ret |= vgic_init_bytemap(&dist->irq_priority, nr_cpus, nr_irqs);
-
- if (ret)
- goto out;
-
- dist->irq_sgi_sources = kzalloc(nr_cpus * VGIC_NR_SGIS, GFP_KERNEL);
- dist->irq_spi_cpu = kzalloc(nr_irqs - VGIC_NR_PRIVATE_IRQS, GFP_KERNEL);
- dist->irq_spi_target = kzalloc(sizeof(*dist->irq_spi_target) * nr_cpus,
- GFP_KERNEL);
- dist->irq_pending_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
- GFP_KERNEL);
- dist->irq_active_on_cpu = kzalloc(BITS_TO_LONGS(nr_cpus) * sizeof(long),
- GFP_KERNEL);
- if (!dist->irq_sgi_sources ||
- !dist->irq_spi_cpu ||
- !dist->irq_spi_target ||
- !dist->irq_pending_on_cpu ||
- !dist->irq_active_on_cpu) {
- ret = -ENOMEM;
- goto out;
- }
-
- for (i = 0; i < nr_cpus; i++)
- ret |= vgic_init_bitmap(&dist->irq_spi_target[i],
- nr_cpus, nr_irqs);
-
- if (ret)
- goto out;
-
- ret = kvm->arch.vgic.vm_ops.init_model(kvm);
- if (ret)
- goto out;
-
- kvm_for_each_vcpu(vcpu_id, vcpu, kvm) {
- ret = vgic_vcpu_init_maps(vcpu, nr_irqs);
- if (ret) {
- kvm_err("VGIC: Failed to allocate vcpu memory\n");
- break;
- }
-
- /*
- * Enable and configure all SGIs to be edge-triggere and
- * configure all PPIs as level-triggered.
- */
- for (i = 0; i < VGIC_NR_PRIVATE_IRQS; i++) {
- if (i < VGIC_NR_SGIS) {
- /* SGIs */
- vgic_bitmap_set_irq_val(&dist->irq_enabled,
- vcpu->vcpu_id, i, 1);
- vgic_bitmap_set_irq_val(&dist->irq_cfg,
- vcpu->vcpu_id, i,
- VGIC_CFG_EDGE);
- } else if (i < VGIC_NR_PRIVATE_IRQS) {
- /* PPIs */
- vgic_bitmap_set_irq_val(&dist->irq_cfg,
- vcpu->vcpu_id, i,
- VGIC_CFG_LEVEL);
- }
- }
-
- vgic_enable(vcpu);
- }
-
-out:
- if (ret)
- kvm_vgic_destroy(kvm);
-
- return ret;
-}
-
-static int init_vgic_model(struct kvm *kvm, int type)
-{
- switch (type) {
- case KVM_DEV_TYPE_ARM_VGIC_V2:
- vgic_v2_init_emulation(kvm);
- break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
- case KVM_DEV_TYPE_ARM_VGIC_V3:
- vgic_v3_init_emulation(kvm);
- break;
-#endif
- default:
- return -ENODEV;
- }
-
- if (atomic_read(&kvm->online_vcpus) > kvm->arch.max_vcpus)
- return -E2BIG;
-
- return 0;
-}
-
-/**
- * kvm_vgic_early_init - Earliest possible vgic initialization stage
- *
- * No memory allocation should be performed here, only static init.
- */
-void kvm_vgic_early_init(struct kvm *kvm)
-{
- spin_lock_init(&kvm->arch.vgic.lock);
- spin_lock_init(&kvm->arch.vgic.irq_phys_map_lock);
- INIT_LIST_HEAD(&kvm->arch.vgic.irq_phys_map_list);
-}
-
-int kvm_vgic_create(struct kvm *kvm, u32 type)
-{
- int i, vcpu_lock_idx = -1, ret;
- struct kvm_vcpu *vcpu;
-
- mutex_lock(&kvm->lock);
-
- if (irqchip_in_kernel(kvm)) {
- ret = -EEXIST;
- goto out;
- }
-
- /*
- * This function is also called by the KVM_CREATE_IRQCHIP handler,
- * which had no chance yet to check the availability of the GICv2
- * emulation. So check this here again. KVM_CREATE_DEVICE does
- * the proper checks already.
- */
- if (type == KVM_DEV_TYPE_ARM_VGIC_V2 && !vgic->can_emulate_gicv2) {
- ret = -ENODEV;
- goto out;
- }
-
- /*
- * Any time a vcpu is run, vcpu_load is called which tries to grab the
- * vcpu->mutex. By grabbing the vcpu->mutex of all VCPUs we ensure
- * that no other VCPUs are run while we create the vgic.
- */
- ret = -EBUSY;
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (!mutex_trylock(&vcpu->mutex))
- goto out_unlock;
- vcpu_lock_idx = i;
- }
-
- kvm_for_each_vcpu(i, vcpu, kvm) {
- if (vcpu->arch.has_run_once)
- goto out_unlock;
- }
- ret = 0;
-
- ret = init_vgic_model(kvm, type);
- if (ret)
- goto out_unlock;
-
- kvm->arch.vgic.in_kernel = true;
- kvm->arch.vgic.vgic_model = type;
- kvm->arch.vgic.vctrl_base = vgic->vctrl_base;
- kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
- kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
- kvm->arch.vgic.vgic_redist_base = VGIC_ADDR_UNDEF;
-
-out_unlock:
- for (; vcpu_lock_idx >= 0; vcpu_lock_idx--) {
- vcpu = kvm_get_vcpu(kvm, vcpu_lock_idx);
- mutex_unlock(&vcpu->mutex);
- }
-
-out:
- mutex_unlock(&kvm->lock);
- return ret;
-}
-
-static int vgic_ioaddr_overlap(struct kvm *kvm)
-{
- phys_addr_t dist = kvm->arch.vgic.vgic_dist_base;
- phys_addr_t cpu = kvm->arch.vgic.vgic_cpu_base;
-
- if (IS_VGIC_ADDR_UNDEF(dist) || IS_VGIC_ADDR_UNDEF(cpu))
- return 0;
- if ((dist <= cpu && dist + KVM_VGIC_V2_DIST_SIZE > cpu) ||
- (cpu <= dist && cpu + KVM_VGIC_V2_CPU_SIZE > dist))
- return -EBUSY;
- return 0;
-}
-
-static int vgic_ioaddr_assign(struct kvm *kvm, phys_addr_t *ioaddr,
- phys_addr_t addr, phys_addr_t size)
-{
- int ret;
-
- if (addr & ~KVM_PHYS_MASK)
- return -E2BIG;
-
- if (addr & (SZ_4K - 1))
- return -EINVAL;
-
- if (!IS_VGIC_ADDR_UNDEF(*ioaddr))
- return -EEXIST;
- if (addr + size < addr)
- return -EINVAL;
-
- *ioaddr = addr;
- ret = vgic_ioaddr_overlap(kvm);
- if (ret)
- *ioaddr = VGIC_ADDR_UNDEF;
-
- return ret;
-}
-
-/**
- * kvm_vgic_addr - set or get vgic VM base addresses
- * @kvm: pointer to the vm struct
- * @type: the VGIC addr type, one of KVM_VGIC_V[23]_ADDR_TYPE_XXX
- * @addr: pointer to address value
- * @write: if true set the address in the VM address space, if false read the
- * address
- *
- * Set or get the vgic base addresses for the distributor and the virtual CPU
- * interface in the VM physical address space. These addresses are properties
- * of the emulated core/SoC and therefore user space initially knows this
- * information.
- */
-int kvm_vgic_addr(struct kvm *kvm, unsigned long type, u64 *addr, bool write)
-{
- int r = 0;
- struct vgic_dist *vgic = &kvm->arch.vgic;
- int type_needed;
- phys_addr_t *addr_ptr, block_size;
- phys_addr_t alignment;
-
- mutex_lock(&kvm->lock);
- switch (type) {
- case KVM_VGIC_V2_ADDR_TYPE_DIST:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
- addr_ptr = &vgic->vgic_dist_base;
- block_size = KVM_VGIC_V2_DIST_SIZE;
- alignment = SZ_4K;
- break;
- case KVM_VGIC_V2_ADDR_TYPE_CPU:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V2;
- addr_ptr = &vgic->vgic_cpu_base;
- block_size = KVM_VGIC_V2_CPU_SIZE;
- alignment = SZ_4K;
- break;
-#ifdef CONFIG_KVM_ARM_VGIC_V3
- case KVM_VGIC_V3_ADDR_TYPE_DIST:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
- addr_ptr = &vgic->vgic_dist_base;
- block_size = KVM_VGIC_V3_DIST_SIZE;
- alignment = SZ_64K;
- break;
- case KVM_VGIC_V3_ADDR_TYPE_REDIST:
- type_needed = KVM_DEV_TYPE_ARM_VGIC_V3;
- addr_ptr = &vgic->vgic_redist_base;
- block_size = KVM_VGIC_V3_REDIST_SIZE;
- alignment = SZ_64K;
- break;
-#endif
- default:
- r = -ENODEV;
- goto out;
- }
-
- if (vgic->vgic_model != type_needed) {
- r = -ENODEV;
- goto out;
- }
-
- if (write) {
- if (!IS_ALIGNED(*addr, alignment))
- r = -EINVAL;
- else
- r = vgic_ioaddr_assign(kvm, addr_ptr, *addr,
- block_size);
- } else {
- *addr = *addr_ptr;
- }
-
-out:
- mutex_unlock(&kvm->lock);
- return r;
-}
-
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
- int r;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR: {
- u64 __user *uaddr = (u64 __user *)(long)attr->addr;
- u64 addr;
- unsigned long type = (unsigned long)attr->attr;
-
- if (copy_from_user(&addr, uaddr, sizeof(addr)))
- return -EFAULT;
-
- r = kvm_vgic_addr(dev->kvm, type, &addr, true);
- return (r == -ENODEV) ? -ENXIO : r;
- }
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
- u32 val;
- int ret = 0;
-
- if (get_user(val, uaddr))
- return -EFAULT;
-
- /*
- * We require:
- * - at least 32 SPIs on top of the 16 SGIs and 16 PPIs
- * - at most 1024 interrupts
- * - a multiple of 32 interrupts
- */
- if (val < (VGIC_NR_PRIVATE_IRQS + 32) ||
- val > VGIC_MAX_IRQS ||
- (val & 31))
- return -EINVAL;
-
- mutex_lock(&dev->kvm->lock);
-
- if (vgic_ready(dev->kvm) || dev->kvm->arch.vgic.nr_irqs)
- ret = -EBUSY;
- else
- dev->kvm->arch.vgic.nr_irqs = val;
-
- mutex_unlock(&dev->kvm->lock);
-
- return ret;
- }
- case KVM_DEV_ARM_VGIC_GRP_CTRL: {
- switch (attr->attr) {
- case KVM_DEV_ARM_VGIC_CTRL_INIT:
- r = vgic_init(dev->kvm);
- return r;
- }
- break;
- }
- }
-
- return -ENXIO;
-}
-
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr)
-{
- int r = -ENXIO;
-
- switch (attr->group) {
- case KVM_DEV_ARM_VGIC_GRP_ADDR: {
- u64 __user *uaddr = (u64 __user *)(long)attr->addr;
- u64 addr;
- unsigned long type = (unsigned long)attr->attr;
-
- r = kvm_vgic_addr(dev->kvm, type, &addr, false);
- if (r)
- return (r == -ENODEV) ? -ENXIO : r;
-
- if (copy_to_user(uaddr, &addr, sizeof(addr)))
- return -EFAULT;
- break;
- }
- case KVM_DEV_ARM_VGIC_GRP_NR_IRQS: {
- u32 __user *uaddr = (u32 __user *)(long)attr->addr;
-
- r = put_user(dev->kvm->arch.vgic.nr_irqs, uaddr);
- break;
- }
-
- }
-
- return r;
-}
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset)
-{
- if (vgic_find_range(ranges, 4, offset))
- return 0;
- else
- return -ENXIO;
-}
-
-static int vgic_starting_cpu(unsigned int cpu)
-{
- enable_percpu_irq(vgic->maint_irq, 0);
- return 0;
-}
-
-static int vgic_dying_cpu(unsigned int cpu)
-{
- disable_percpu_irq(vgic->maint_irq);
- return 0;
-}
-
-static int kvm_vgic_probe(void)
-{
- const struct gic_kvm_info *gic_kvm_info;
- int ret;
-
- gic_kvm_info = gic_get_kvm_info();
- if (!gic_kvm_info)
- return -ENODEV;
-
- switch (gic_kvm_info->type) {
- case GIC_V2:
- ret = vgic_v2_probe(gic_kvm_info, &vgic_ops, &vgic);
- break;
- case GIC_V3:
- ret = vgic_v3_probe(gic_kvm_info, &vgic_ops, &vgic);
- break;
- default:
- ret = -ENODEV;
- }
-
- return ret;
-}
-
-int kvm_vgic_hyp_init(void)
-{
- int ret;
-
- ret = kvm_vgic_probe();
- if (ret) {
- kvm_err("error: KVM vGIC probing failed\n");
- return ret;
- }
-
- ret = request_percpu_irq(vgic->maint_irq, vgic_maintenance_handler,
- "vgic", kvm_get_running_vcpus());
- if (ret) {
- kvm_err("Cannot register interrupt %d\n", vgic->maint_irq);
- return ret;
- }
-
- cpuhp_setup_state(CPUHP_AP_KVM_ARM_VGIC_STARTING,
- "AP_KVM_ARM_VGIC_STARTING", vgic_starting_cpu,
- vgic_dying_cpu);
- return 0;
-}
-
-int kvm_irq_map_gsi(struct kvm *kvm,
- struct kvm_kernel_irq_routing_entry *entries,
- int gsi)
-{
- return 0;
-}
-
-int kvm_irq_map_chip_pin(struct kvm *kvm, unsigned irqchip, unsigned pin)
-{
- return pin;
-}
-
-int kvm_set_irq(struct kvm *kvm, int irq_source_id,
- u32 irq, int level, bool line_status)
-{
- unsigned int spi = irq + VGIC_NR_PRIVATE_IRQS;
-
- trace_kvm_set_irq(irq, level, irq_source_id);
-
- BUG_ON(!vgic_initialized(kvm));
-
- return kvm_vgic_inject_irq(kvm, 0, spi, level);
-}
-
-/* MSI not implemented yet */
-int kvm_set_msi(struct kvm_kernel_irq_routing_entry *e,
- struct kvm *kvm, int irq_source_id,
- int level, bool line_status)
-{
- return 0;
-}
diff --git a/virt/kvm/arm/vgic.h b/virt/kvm/arm/vgic.h
deleted file mode 100644
index 0df74cbb6200..000000000000
--- a/virt/kvm/arm/vgic.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Copyright (C) 2012-2014 ARM Ltd.
- * Author: Marc Zyngier <marc.zyngier@arm.com>
- *
- * Derived from virt/kvm/arm/vgic.c
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
- */
-
-#ifndef __KVM_VGIC_H__
-#define __KVM_VGIC_H__
-
-#include <kvm/iodev.h>
-
-#define VGIC_ADDR_UNDEF (-1)
-#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
-
-#define PRODUCT_ID_KVM 0x4b /* ASCII code K */
-#define IMPLEMENTER_ARM 0x43b
-
-#define ACCESS_READ_VALUE (1 << 0)
-#define ACCESS_READ_RAZ (0 << 0)
-#define ACCESS_READ_MASK(x) ((x) & (1 << 0))
-#define ACCESS_WRITE_IGNORED (0 << 1)
-#define ACCESS_WRITE_SETBIT (1 << 1)
-#define ACCESS_WRITE_CLEARBIT (2 << 1)
-#define ACCESS_WRITE_VALUE (3 << 1)
-#define ACCESS_WRITE_MASK(x) ((x) & (3 << 1))
-
-#define VCPU_NOT_ALLOCATED ((u8)-1)
-
-unsigned long *vgic_bitmap_get_shared_map(struct vgic_bitmap *x);
-
-void vgic_update_state(struct kvm *kvm);
-int vgic_init_common_maps(struct kvm *kvm);
-
-u32 *vgic_bitmap_get_reg(struct vgic_bitmap *x, int cpuid, u32 offset);
-u32 *vgic_bytemap_get_reg(struct vgic_bytemap *x, int cpuid, u32 offset);
-
-void vgic_dist_irq_set_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_dist_irq_clear_pending(struct kvm_vcpu *vcpu, int irq);
-void vgic_cpu_irq_clear(struct kvm_vcpu *vcpu, int irq);
-void vgic_bitmap_set_irq_val(struct vgic_bitmap *x, int cpuid,
- int irq, int val);
-
-void vgic_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-void vgic_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
-
-bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq);
-void vgic_unqueue_irqs(struct kvm_vcpu *vcpu);
-
-struct kvm_exit_mmio {
- phys_addr_t phys_addr;
- void *data;
- u32 len;
- bool is_write;
- void *private;
-};
-
-void vgic_reg_access(struct kvm_exit_mmio *mmio, u32 *reg,
- phys_addr_t offset, int mode);
-bool handle_mmio_raz_wi(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
- phys_addr_t offset);
-
-static inline
-u32 mmio_data_read(struct kvm_exit_mmio *mmio, u32 mask)
-{
- return le32_to_cpu(*((u32 *)mmio->data)) & mask;
-}
-
-static inline
-void mmio_data_write(struct kvm_exit_mmio *mmio, u32 mask, u32 value)
-{
- *((u32 *)mmio->data) = cpu_to_le32(value) & mask;
-}
-
-struct vgic_io_range {
- phys_addr_t base;
- unsigned long len;
- int bits_per_irq;
- bool (*handle_mmio)(struct kvm_vcpu *vcpu, struct kvm_exit_mmio *mmio,
- phys_addr_t offset);
-};
-
-int vgic_register_kvm_io_dev(struct kvm *kvm, gpa_t base, int len,
- const struct vgic_io_range *ranges,
- int redist_id,
- struct vgic_io_device *iodev);
-
-static inline bool is_in_range(phys_addr_t addr, unsigned long len,
- phys_addr_t baseaddr, unsigned long size)
-{
- return (addr >= baseaddr) && (addr + len <= baseaddr + size);
-}
-
-const
-struct vgic_io_range *vgic_find_range(const struct vgic_io_range *ranges,
- int len, gpa_t offset);
-
-bool vgic_handle_enable_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id, int access);
-
-bool vgic_handle_set_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_pending_reg(struct kvm *kvm, struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_set_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_clear_active_reg(struct kvm *kvm,
- struct kvm_exit_mmio *mmio,
- phys_addr_t offset, int vcpu_id);
-
-bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
- phys_addr_t offset);
-
-void vgic_kick_vcpus(struct kvm *kvm);
-
-int vgic_has_attr_regs(const struct vgic_io_range *ranges, phys_addr_t offset);
-int vgic_set_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-int vgic_get_common_attr(struct kvm_device *dev, struct kvm_device_attr *attr);
-
-int vgic_init(struct kvm *kvm);
-void vgic_v2_init_emulation(struct kvm *kvm);
-void vgic_v3_init_emulation(struct kvm *kvm);
-
-#endif
diff --git a/virt/kvm/arm/vgic/vgic-init.c b/virt/kvm/arm/vgic/vgic-init.c
index 2c7f0d5a62ea..1e30ce08700d 100644
--- a/virt/kvm/arm/vgic/vgic-init.c
+++ b/virt/kvm/arm/vgic/vgic-init.c
@@ -157,6 +157,9 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
int i;
+ INIT_LIST_HEAD(&dist->lpi_list_head);
+ spin_lock_init(&dist->lpi_list_lock);
+
dist->spis = kcalloc(nr_spis, sizeof(struct vgic_irq), GFP_KERNEL);
if (!dist->spis)
return -ENOMEM;
@@ -177,6 +180,7 @@ static int kvm_vgic_dist_init(struct kvm *kvm, unsigned int nr_spis)
spin_lock_init(&irq->irq_lock);
irq->vcpu = NULL;
irq->target_vcpu = vcpu0;
+ kref_init(&irq->refcount);
if (dist->vgic_model == KVM_DEV_TYPE_ARM_VGIC_V2)
irq->targets = 0;
else
@@ -211,6 +215,7 @@ static void kvm_vgic_vcpu_init(struct kvm_vcpu *vcpu)
irq->vcpu = NULL;
irq->target_vcpu = vcpu;
irq->targets = 1U << vcpu->vcpu_id;
+ kref_init(&irq->refcount);
if (vgic_irq_is_sgi(i)) {
/* SGIs */
irq->enabled = 1;
@@ -253,6 +258,9 @@ int vgic_init(struct kvm *kvm)
if (ret)
goto out;
+ if (vgic_has_its(kvm))
+ dist->msis_require_devid = true;
+
kvm_for_each_vcpu(i, vcpu, kvm)
kvm_vgic_vcpu_init(vcpu);
@@ -271,7 +279,6 @@ static void kvm_vgic_dist_destroy(struct kvm *kvm)
dist->initialized = false;
kfree(dist->spis);
- kfree(dist->redist_iodevs);
dist->nr_spis = 0;
mutex_unlock(&kvm->lock);
diff --git a/virt/kvm/arm/vgic/vgic-its.c b/virt/kvm/arm/vgic/vgic-its.c
new file mode 100644
index 000000000000..07411cf967b9
--- /dev/null
+++ b/virt/kvm/arm/vgic/vgic-its.c
@@ -0,0 +1,1500 @@
+/*
+ * GICv3 ITS emulation
+ *
+ * Copyright (C) 2015,2016 ARM Ltd.
+ * Author: Andre Przywara <andre.przywara@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <linux/cpu.h>
+#include <linux/kvm.h>
+#include <linux/kvm_host.h>
+#include <linux/interrupt.h>
+#include <linux/list.h>
+#include <linux/uaccess.h>
+
+#include <linux/irqchip/arm-gic-v3.h>
+
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_arm.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-mmio.h"
+
+/*
+ * Creates a new (reference to a) struct vgic_irq for a given LPI.
+ * If this LPI is already mapped on another ITS, we increase its refcount
+ * and return a pointer to the existing structure.
+ * If this is a "new" LPI, we allocate and initialize a new struct vgic_irq.
+ * This function returns a pointer to the _unlocked_ structure.
+ */
+static struct vgic_irq *vgic_add_lpi(struct kvm *kvm, u32 intid)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq = vgic_get_irq(kvm, NULL, intid), *oldirq;
+
+ /* In this case there is no put, since we keep the reference. */
+ if (irq)
+ return irq;
+
+ irq = kzalloc(sizeof(struct vgic_irq), GFP_KERNEL);
+ if (!irq)
+ return NULL;
+
+ INIT_LIST_HEAD(&irq->lpi_list);
+ INIT_LIST_HEAD(&irq->ap_list);
+ spin_lock_init(&irq->irq_lock);
+
+ irq->config = VGIC_CONFIG_EDGE;
+ kref_init(&irq->refcount);
+ irq->intid = intid;
+
+ spin_lock(&dist->lpi_list_lock);
+
+ /*
+ * There could be a race with another vgic_add_lpi(), so we need to
+ * check that we don't add a second list entry with the same LPI.
+ */
+ list_for_each_entry(oldirq, &dist->lpi_list_head, lpi_list) {
+ if (oldirq->intid != intid)
+ continue;
+
+ /* Someone was faster with adding this LPI, lets use that. */
+ kfree(irq);
+ irq = oldirq;
+
+ /*
+ * This increases the refcount, the caller is expected to
+ * call vgic_put_irq() on the returned pointer once it's
+ * finished with the IRQ.
+ */
+ vgic_get_irq_kref(irq);
+
+ goto out_unlock;
+ }
+
+ list_add_tail(&irq->lpi_list, &dist->lpi_list_head);
+ dist->lpi_list_count++;
+
+out_unlock:
+ spin_unlock(&dist->lpi_list_lock);
+
+ return irq;
+}
+
+struct its_device {
+ struct list_head dev_list;
+
+ /* the head for the list of ITTEs */
+ struct list_head itt_head;
+ u32 device_id;
+};
+
+#define COLLECTION_NOT_MAPPED ((u32)~0)
+
+struct its_collection {
+ struct list_head coll_list;
+
+ u32 collection_id;
+ u32 target_addr;
+};
+
+#define its_is_collection_mapped(coll) ((coll) && \
+ ((coll)->target_addr != COLLECTION_NOT_MAPPED))
+
+struct its_itte {
+ struct list_head itte_list;
+
+ struct vgic_irq *irq;
+ struct its_collection *collection;
+ u32 lpi;
+ u32 event_id;
+};
+
+/*
+ * Find and returns a device in the device table for an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_device *find_its_device(struct vgic_its *its, u32 device_id)
+{
+ struct its_device *device;
+
+ list_for_each_entry(device, &its->device_list, dev_list)
+ if (device_id == device->device_id)
+ return device;
+
+ return NULL;
+}
+
+/*
+ * Find and returns an interrupt translation table entry (ITTE) for a given
+ * Device ID/Event ID pair on an ITS.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_itte *find_itte(struct vgic_its *its, u32 device_id,
+ u32 event_id)
+{
+ struct its_device *device;
+ struct its_itte *itte;
+
+ device = find_its_device(its, device_id);
+ if (device == NULL)
+ return NULL;
+
+ list_for_each_entry(itte, &device->itt_head, itte_list)
+ if (itte->event_id == event_id)
+ return itte;
+
+ return NULL;
+}
+
+/* To be used as an iterator this macro misses the enclosing parentheses */
+#define for_each_lpi_its(dev, itte, its) \
+ list_for_each_entry(dev, &(its)->device_list, dev_list) \
+ list_for_each_entry(itte, &(dev)->itt_head, itte_list)
+
+/*
+ * We only implement 48 bits of PA at the moment, although the ITS
+ * supports more. Let's be restrictive here.
+ */
+#define BASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16))
+#define CBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12))
+#define PENDBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 16))
+#define PROPBASER_ADDRESS(x) ((x) & GENMASK_ULL(47, 12))
+
+#define GIC_LPI_OFFSET 8192
+
+/*
+ * Finds and returns a collection in the ITS collection table.
+ * Must be called with the its_lock mutex held.
+ */
+static struct its_collection *find_collection(struct vgic_its *its, int coll_id)
+{
+ struct its_collection *collection;
+
+ list_for_each_entry(collection, &its->collection_list, coll_list) {
+ if (coll_id == collection->collection_id)
+ return collection;
+ }
+
+ return NULL;
+}
+
+#define LPI_PROP_ENABLE_BIT(p) ((p) & LPI_PROP_ENABLED)
+#define LPI_PROP_PRIORITY(p) ((p) & 0xfc)
+
+/*
+ * Reads the configuration data for a given LPI from guest memory and
+ * updates the fields in struct vgic_irq.
+ * If filter_vcpu is not NULL, applies only if the IRQ is targeting this
+ * VCPU. Unconditionally applies if filter_vcpu is NULL.
+ */
+static int update_lpi_config(struct kvm *kvm, struct vgic_irq *irq,
+ struct kvm_vcpu *filter_vcpu)
+{
+ u64 propbase = PROPBASER_ADDRESS(kvm->arch.vgic.propbaser);
+ u8 prop;
+ int ret;
+
+ ret = kvm_read_guest(kvm, propbase + irq->intid - GIC_LPI_OFFSET,
+ &prop, 1);
+
+ if (ret)
+ return ret;
+
+ spin_lock(&irq->irq_lock);
+
+ if (!filter_vcpu || filter_vcpu == irq->target_vcpu) {
+ irq->priority = LPI_PROP_PRIORITY(prop);
+ irq->enabled = LPI_PROP_ENABLE_BIT(prop);
+
+ vgic_queue_irq_unlock(kvm, irq);
+ } else {
+ spin_unlock(&irq->irq_lock);
+ }
+
+ return 0;
+}
+
+/*
+ * Create a snapshot of the current LPI list, so that we can enumerate all
+ * LPIs without holding any lock.
+ * Returns the array length and puts the kmalloc'ed array into intid_ptr.
+ */
+static int vgic_copy_lpi_list(struct kvm *kvm, u32 **intid_ptr)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq;
+ u32 *intids;
+ int irq_count = dist->lpi_list_count, i = 0;
+
+ /*
+ * We use the current value of the list length, which may change
+ * after the kmalloc. We don't care, because the guest shouldn't
+ * change anything while the command handling is still running,
+ * and in the worst case we would miss a new IRQ, which one wouldn't
+ * expect to be covered by this command anyway.
+ */
+ intids = kmalloc_array(irq_count, sizeof(intids[0]), GFP_KERNEL);
+ if (!intids)
+ return -ENOMEM;
+
+ spin_lock(&dist->lpi_list_lock);
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ /* We don't need to "get" the IRQ, as we hold the list lock. */
+ intids[i] = irq->intid;
+ if (++i == irq_count)
+ break;
+ }
+ spin_unlock(&dist->lpi_list_lock);
+
+ *intid_ptr = intids;
+ return irq_count;
+}
+
+/*
+ * Promotes the ITS view of affinity of an ITTE (which redistributor this LPI
+ * is targeting) to the VGIC's view, which deals with target VCPUs.
+ * Needs to be called whenever either the collection for a LPIs has
+ * changed or the collection itself got retargeted.
+ */
+static void update_affinity_itte(struct kvm *kvm, struct its_itte *itte)
+{
+ struct kvm_vcpu *vcpu;
+
+ if (!its_is_collection_mapped(itte->collection))
+ return;
+
+ vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+
+ spin_lock(&itte->irq->irq_lock);
+ itte->irq->target_vcpu = vcpu;
+ spin_unlock(&itte->irq->irq_lock);
+}
+
+/*
+ * Updates the target VCPU for every LPI targeting this collection.
+ * Must be called with the its_lock mutex held.
+ */
+static void update_affinity_collection(struct kvm *kvm, struct vgic_its *its,
+ struct its_collection *coll)
+{
+ struct its_device *device;
+ struct its_itte *itte;
+
+ for_each_lpi_its(device, itte, its) {
+ if (!itte->collection || coll != itte->collection)
+ continue;
+
+ update_affinity_itte(kvm, itte);
+ }
+}
+
+static u32 max_lpis_propbaser(u64 propbaser)
+{
+ int nr_idbits = (propbaser & 0x1f) + 1;
+
+ return 1U << min(nr_idbits, INTERRUPT_ID_BITS_ITS);
+}
+
+/*
+ * Scan the whole LPI pending table and sync the pending bit in there
+ * with our own data structures. This relies on the LPI being
+ * mapped before.
+ */
+static int its_sync_lpi_pending_table(struct kvm_vcpu *vcpu)
+{
+ gpa_t pendbase = PENDBASER_ADDRESS(vcpu->arch.vgic_cpu.pendbaser);
+ struct vgic_irq *irq;
+ int last_byte_offset = -1;
+ int ret = 0;
+ u32 *intids;
+ int nr_irqs, i;
+
+ nr_irqs = vgic_copy_lpi_list(vcpu->kvm, &intids);
+ if (nr_irqs < 0)
+ return nr_irqs;
+
+ for (i = 0; i < nr_irqs; i++) {
+ int byte_offset, bit_nr;
+ u8 pendmask;
+
+ byte_offset = intids[i] / BITS_PER_BYTE;
+ bit_nr = intids[i] % BITS_PER_BYTE;
+
+ /*
+ * For contiguously allocated LPIs chances are we just read
+ * this very same byte in the last iteration. Reuse that.
+ */
+ if (byte_offset != last_byte_offset) {
+ ret = kvm_read_guest(vcpu->kvm, pendbase + byte_offset,
+ &pendmask, 1);
+ if (ret) {
+ kfree(intids);
+ return ret;
+ }
+ last_byte_offset = byte_offset;
+ }
+
+ irq = vgic_get_irq(vcpu->kvm, NULL, intids[i]);
+ spin_lock(&irq->irq_lock);
+ irq->pending = pendmask & (1U << bit_nr);
+ vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ kfree(intids);
+
+ return ret;
+}
+
+static unsigned long vgic_mmio_read_its_ctlr(struct kvm *vcpu,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u32 reg = 0;
+
+ mutex_lock(&its->cmd_lock);
+ if (its->creadr == its->cwriter)
+ reg |= GITS_CTLR_QUIESCENT;
+ if (its->enabled)
+ reg |= GITS_CTLR_ENABLE;
+ mutex_unlock(&its->cmd_lock);
+
+ return reg;
+}
+
+static void vgic_mmio_write_its_ctlr(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ its->enabled = !!(val & GITS_CTLR_ENABLE);
+}
+
+static unsigned long vgic_mmio_read_its_typer(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u64 reg = GITS_TYPER_PLPIS;
+
+ /*
+ * We use linear CPU numbers for redistributor addressing,
+ * so GITS_TYPER.PTA is 0.
+ * Also we force all PROPBASER registers to be the same, so
+ * CommonLPIAff is 0 as well.
+ * To avoid memory waste in the guest, we keep the number of IDBits and
+ * DevBits low - as least for the time being.
+ */
+ reg |= 0x0f << GITS_TYPER_DEVBITS_SHIFT;
+ reg |= 0x0f << GITS_TYPER_IDBITS_SHIFT;
+
+ return extract_bytes(reg, addr & 7, len);
+}
+
+static unsigned long vgic_mmio_read_its_iidr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
+}
+
+static unsigned long vgic_mmio_read_its_idregs(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ switch (addr & 0xffff) {
+ case GITS_PIDR0:
+ return 0x92; /* part number, bits[7:0] */
+ case GITS_PIDR1:
+ return 0xb4; /* part number, bits[11:8] */
+ case GITS_PIDR2:
+ return GIC_PIDR2_ARCH_GICv3 | 0x0b;
+ case GITS_PIDR4:
+ return 0x40; /* This is a 64K software visible page */
+ /* The following are the ID registers for (any) GIC. */
+ case GITS_CIDR0:
+ return 0x0d;
+ case GITS_CIDR1:
+ return 0xf0;
+ case GITS_CIDR2:
+ return 0x05;
+ case GITS_CIDR3:
+ return 0xb1;
+ }
+
+ return 0;
+}
+
+/*
+ * Find the target VCPU and the LPI number for a given devid/eventid pair
+ * and make this IRQ pending, possibly injecting it.
+ * Must be called with the its_lock mutex held.
+ */
+static void vgic_its_trigger_msi(struct kvm *kvm, struct vgic_its *its,
+ u32 devid, u32 eventid)
+{
+ struct its_itte *itte;
+
+ if (!its->enabled)
+ return;
+
+ itte = find_itte(its, devid, eventid);
+ /* Triggering an unmapped IRQ gets silently dropped. */
+ if (itte && its_is_collection_mapped(itte->collection)) {
+ struct kvm_vcpu *vcpu;
+
+ vcpu = kvm_get_vcpu(kvm, itte->collection->target_addr);
+ if (vcpu && vcpu->arch.vgic_cpu.lpis_enabled) {
+ spin_lock(&itte->irq->irq_lock);
+ itte->irq->pending = true;
+ vgic_queue_irq_unlock(kvm, itte->irq);
+ }
+ }
+}
+
+/*
+ * Queries the KVM IO bus framework to get the ITS pointer from the given
+ * doorbell address.
+ * We then call vgic_its_trigger_msi() with the decoded data.
+ */
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ u64 address;
+ struct kvm_io_device *kvm_io_dev;
+ struct vgic_io_device *iodev;
+
+ if (!vgic_has_its(kvm))
+ return -ENODEV;
+
+ if (!(msi->flags & KVM_MSI_VALID_DEVID))
+ return -EINVAL;
+
+ address = (u64)msi->address_hi << 32 | msi->address_lo;
+
+ kvm_io_dev = kvm_io_bus_get_dev(kvm, KVM_MMIO_BUS, address);
+ if (!kvm_io_dev)
+ return -ENODEV;
+
+ iodev = container_of(kvm_io_dev, struct vgic_io_device, dev);
+
+ mutex_lock(&iodev->its->its_lock);
+ vgic_its_trigger_msi(kvm, iodev->its, msi->devid, msi->data);
+ mutex_unlock(&iodev->its->its_lock);
+
+ return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void its_free_itte(struct kvm *kvm, struct its_itte *itte)
+{
+ list_del(&itte->itte_list);
+
+ /* This put matches the get in vgic_add_lpi. */
+ vgic_put_irq(kvm, itte->irq);
+
+ kfree(itte);
+}
+
+static u64 its_cmd_mask_field(u64 *its_cmd, int word, int shift, int size)
+{
+ return (le64_to_cpu(its_cmd[word]) >> shift) & (BIT_ULL(size) - 1);
+}
+
+#define its_cmd_get_command(cmd) its_cmd_mask_field(cmd, 0, 0, 8)
+#define its_cmd_get_deviceid(cmd) its_cmd_mask_field(cmd, 0, 32, 32)
+#define its_cmd_get_id(cmd) its_cmd_mask_field(cmd, 1, 0, 32)
+#define its_cmd_get_physical_id(cmd) its_cmd_mask_field(cmd, 1, 32, 32)
+#define its_cmd_get_collection(cmd) its_cmd_mask_field(cmd, 2, 0, 16)
+#define its_cmd_get_target_addr(cmd) its_cmd_mask_field(cmd, 2, 16, 32)
+#define its_cmd_get_validbit(cmd) its_cmd_mask_field(cmd, 2, 63, 1)
+
+/*
+ * The DISCARD command frees an Interrupt Translation Table Entry (ITTE).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_itte *itte;
+
+
+ itte = find_itte(its, device_id, event_id);
+ if (itte && itte->collection) {
+ /*
+ * Though the spec talks about removing the pending state, we
+ * don't bother here since we clear the ITTE anyway and the
+ * pending state is a property of the ITTE struct.
+ */
+ its_free_itte(kvm, itte);
+ return 0;
+ }
+
+ return E_ITS_DISCARD_UNMAPPED_INTERRUPT;
+}
+
+/*
+ * The MOVI command moves an ITTE to a different collection.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_movi(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct kvm_vcpu *vcpu;
+ struct its_itte *itte;
+ struct its_collection *collection;
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte)
+ return E_ITS_MOVI_UNMAPPED_INTERRUPT;
+
+ if (!its_is_collection_mapped(itte->collection))
+ return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+ collection = find_collection(its, coll_id);
+ if (!its_is_collection_mapped(collection))
+ return E_ITS_MOVI_UNMAPPED_COLLECTION;
+
+ itte->collection = collection;
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ spin_lock(&itte->irq->irq_lock);
+ itte->irq->target_vcpu = vcpu;
+ spin_unlock(&itte->irq->irq_lock);
+
+ return 0;
+}
+
+/*
+ * Check whether an ID can be stored into the corresponding guest table.
+ * For a direct table this is pretty easy, but gets a bit nasty for
+ * indirect tables. We check whether the resulting guest physical address
+ * is actually valid (covered by a memslot and guest accessbible).
+ * For this we have to read the respective first level entry.
+ */
+static bool vgic_its_check_id(struct vgic_its *its, u64 baser, int id)
+{
+ int l1_tbl_size = GITS_BASER_NR_PAGES(baser) * SZ_64K;
+ int index;
+ u64 indirect_ptr;
+ gfn_t gfn;
+
+ if (!(baser & GITS_BASER_INDIRECT)) {
+ phys_addr_t addr;
+
+ if (id >= (l1_tbl_size / GITS_BASER_ENTRY_SIZE(baser)))
+ return false;
+
+ addr = BASER_ADDRESS(baser) + id * GITS_BASER_ENTRY_SIZE(baser);
+ gfn = addr >> PAGE_SHIFT;
+
+ return kvm_is_visible_gfn(its->dev->kvm, gfn);
+ }
+
+ /* calculate and check the index into the 1st level */
+ index = id / (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+ if (index >= (l1_tbl_size / sizeof(u64)))
+ return false;
+
+ /* Each 1st level entry is represented by a 64-bit value. */
+ if (kvm_read_guest(its->dev->kvm,
+ BASER_ADDRESS(baser) + index * sizeof(indirect_ptr),
+ &indirect_ptr, sizeof(indirect_ptr)))
+ return false;
+
+ indirect_ptr = le64_to_cpu(indirect_ptr);
+
+ /* check the valid bit of the first level entry */
+ if (!(indirect_ptr & BIT_ULL(63)))
+ return false;
+
+ /*
+ * Mask the guest physical address and calculate the frame number.
+ * Any address beyond our supported 48 bits of PA will be caught
+ * by the actual check in the final step.
+ */
+ indirect_ptr &= GENMASK_ULL(51, 16);
+
+ /* Find the address of the actual entry */
+ index = id % (SZ_64K / GITS_BASER_ENTRY_SIZE(baser));
+ indirect_ptr += index * GITS_BASER_ENTRY_SIZE(baser);
+ gfn = indirect_ptr >> PAGE_SHIFT;
+
+ return kvm_is_visible_gfn(its->dev->kvm, gfn);
+}
+
+static int vgic_its_alloc_collection(struct vgic_its *its,
+ struct its_collection **colp,
+ u32 coll_id)
+{
+ struct its_collection *collection;
+
+ if (!vgic_its_check_id(its, its->baser_coll_table, coll_id))
+ return E_ITS_MAPC_COLLECTION_OOR;
+
+ collection = kzalloc(sizeof(*collection), GFP_KERNEL);
+
+ collection->collection_id = coll_id;
+ collection->target_addr = COLLECTION_NOT_MAPPED;
+
+ list_add_tail(&collection->coll_list, &its->collection_list);
+ *colp = collection;
+
+ return 0;
+}
+
+static void vgic_its_free_collection(struct vgic_its *its, u32 coll_id)
+{
+ struct its_collection *collection;
+ struct its_device *device;
+ struct its_itte *itte;
+
+ /*
+ * Clearing the mapping for that collection ID removes the
+ * entry from the list. If there wasn't any before, we can
+ * go home early.
+ */
+ collection = find_collection(its, coll_id);
+ if (!collection)
+ return;
+
+ for_each_lpi_its(device, itte, its)
+ if (itte->collection &&
+ itte->collection->collection_id == coll_id)
+ itte->collection = NULL;
+
+ list_del(&collection->coll_list);
+ kfree(collection);
+}
+
+/*
+ * The MAPTI and MAPI commands map LPIs to ITTEs.
+ * Must be called with its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapi(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct its_itte *itte;
+ struct its_device *device;
+ struct its_collection *collection, *new_coll = NULL;
+ int lpi_nr;
+
+ device = find_its_device(its, device_id);
+ if (!device)
+ return E_ITS_MAPTI_UNMAPPED_DEVICE;
+
+ if (its_cmd_get_command(its_cmd) == GITS_CMD_MAPTI)
+ lpi_nr = its_cmd_get_physical_id(its_cmd);
+ else
+ lpi_nr = event_id;
+ if (lpi_nr < GIC_LPI_OFFSET ||
+ lpi_nr >= max_lpis_propbaser(kvm->arch.vgic.propbaser))
+ return E_ITS_MAPTI_PHYSICALID_OOR;
+
+ collection = find_collection(its, coll_id);
+ if (!collection) {
+ int ret = vgic_its_alloc_collection(its, &collection, coll_id);
+ if (ret)
+ return ret;
+ new_coll = collection;
+ }
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte) {
+ itte = kzalloc(sizeof(struct its_itte), GFP_KERNEL);
+ if (!itte) {
+ if (new_coll)
+ vgic_its_free_collection(its, coll_id);
+ return -ENOMEM;
+ }
+
+ itte->event_id = event_id;
+ list_add_tail(&itte->itte_list, &device->itt_head);
+ }
+
+ itte->collection = collection;
+ itte->lpi = lpi_nr;
+ itte->irq = vgic_add_lpi(kvm, lpi_nr);
+ update_affinity_itte(kvm, itte);
+
+ /*
+ * We "cache" the configuration table entries in out struct vgic_irq's.
+ * However we only have those structs for mapped IRQs, so we read in
+ * the respective config data from memory here upon mapping the LPI.
+ */
+ update_lpi_config(kvm, itte->irq, NULL);
+
+ return 0;
+}
+
+/* Requires the its_lock to be held. */
+static void vgic_its_unmap_device(struct kvm *kvm, struct its_device *device)
+{
+ struct its_itte *itte, *temp;
+
+ /*
+ * The spec says that unmapping a device with still valid
+ * ITTEs associated is UNPREDICTABLE. We remove all ITTEs,
+ * since we cannot leave the memory unreferenced.
+ */
+ list_for_each_entry_safe(itte, temp, &device->itt_head, itte_list)
+ its_free_itte(kvm, itte);
+
+ list_del(&device->dev_list);
+ kfree(device);
+}
+
+/*
+ * MAPD maps or unmaps a device ID to Interrupt Translation Tables (ITTs).
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ bool valid = its_cmd_get_validbit(its_cmd);
+ struct its_device *device;
+
+ if (!vgic_its_check_id(its, its->baser_device_table, device_id))
+ return E_ITS_MAPD_DEVICE_OOR;
+
+ device = find_its_device(its, device_id);
+
+ /*
+ * The spec says that calling MAPD on an already mapped device
+ * invalidates all cached data for this device. We implement this
+ * by removing the mapping and re-establishing it.
+ */
+ if (device)
+ vgic_its_unmap_device(kvm, device);
+
+ /*
+ * The spec does not say whether unmapping a not-mapped device
+ * is an error, so we are done in any case.
+ */
+ if (!valid)
+ return 0;
+
+ device = kzalloc(sizeof(struct its_device), GFP_KERNEL);
+ if (!device)
+ return -ENOMEM;
+
+ device->device_id = device_id;
+ INIT_LIST_HEAD(&device->itt_head);
+
+ list_add_tail(&device->dev_list, &its->device_list);
+
+ return 0;
+}
+
+/*
+ * The MAPC command maps collection IDs to redistributors.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_mapc(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u16 coll_id;
+ u32 target_addr;
+ struct its_collection *collection;
+ bool valid;
+
+ valid = its_cmd_get_validbit(its_cmd);
+ coll_id = its_cmd_get_collection(its_cmd);
+ target_addr = its_cmd_get_target_addr(its_cmd);
+
+ if (target_addr >= atomic_read(&kvm->online_vcpus))
+ return E_ITS_MAPC_PROCNUM_OOR;
+
+ if (!valid) {
+ vgic_its_free_collection(its, coll_id);
+ } else {
+ collection = find_collection(its, coll_id);
+
+ if (!collection) {
+ int ret;
+
+ ret = vgic_its_alloc_collection(its, &collection,
+ coll_id);
+ if (ret)
+ return ret;
+ collection->target_addr = target_addr;
+ } else {
+ collection->target_addr = target_addr;
+ update_affinity_collection(kvm, its, collection);
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * The CLEAR command removes the pending state for a particular LPI.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_clear(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_itte *itte;
+
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte)
+ return E_ITS_CLEAR_UNMAPPED_INTERRUPT;
+
+ itte->irq->pending = false;
+
+ return 0;
+}
+
+/*
+ * The INV command syncs the configuration bits from the memory table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_inv(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 device_id = its_cmd_get_deviceid(its_cmd);
+ u32 event_id = its_cmd_get_id(its_cmd);
+ struct its_itte *itte;
+
+
+ itte = find_itte(its, device_id, event_id);
+ if (!itte)
+ return E_ITS_INV_UNMAPPED_INTERRUPT;
+
+ return update_lpi_config(kvm, itte->irq, NULL);
+}
+
+/*
+ * The INVALL command requests flushing of all IRQ data in this collection.
+ * Find the VCPU mapped to that collection, then iterate over the VM's list
+ * of mapped LPIs and update the configuration for each IRQ which targets
+ * the specified vcpu. The configuration will be read from the in-memory
+ * configuration table.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_invall(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 coll_id = its_cmd_get_collection(its_cmd);
+ struct its_collection *collection;
+ struct kvm_vcpu *vcpu;
+ struct vgic_irq *irq;
+ u32 *intids;
+ int irq_count, i;
+
+ collection = find_collection(its, coll_id);
+ if (!its_is_collection_mapped(collection))
+ return E_ITS_INVALL_UNMAPPED_COLLECTION;
+
+ vcpu = kvm_get_vcpu(kvm, collection->target_addr);
+
+ irq_count = vgic_copy_lpi_list(kvm, &intids);
+ if (irq_count < 0)
+ return irq_count;
+
+ for (i = 0; i < irq_count; i++) {
+ irq = vgic_get_irq(kvm, NULL, intids[i]);
+ if (!irq)
+ continue;
+ update_lpi_config(kvm, irq, vcpu);
+ vgic_put_irq(kvm, irq);
+ }
+
+ kfree(intids);
+
+ return 0;
+}
+
+/*
+ * The MOVALL command moves the pending state of all IRQs targeting one
+ * redistributor to another. We don't hold the pending state in the VCPUs,
+ * but in the IRQs instead, so there is really not much to do for us here.
+ * However the spec says that no IRQ must target the old redistributor
+ * afterwards, so we make sure that no LPI is using the associated target_vcpu.
+ * This command affects all LPIs in the system that target that redistributor.
+ */
+static int vgic_its_cmd_handle_movall(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ u32 target1_addr = its_cmd_get_target_addr(its_cmd);
+ u32 target2_addr = its_cmd_mask_field(its_cmd, 3, 16, 32);
+ struct kvm_vcpu *vcpu1, *vcpu2;
+ struct vgic_irq *irq;
+
+ if (target1_addr >= atomic_read(&kvm->online_vcpus) ||
+ target2_addr >= atomic_read(&kvm->online_vcpus))
+ return E_ITS_MOVALL_PROCNUM_OOR;
+
+ if (target1_addr == target2_addr)
+ return 0;
+
+ vcpu1 = kvm_get_vcpu(kvm, target1_addr);
+ vcpu2 = kvm_get_vcpu(kvm, target2_addr);
+
+ spin_lock(&dist->lpi_list_lock);
+
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ spin_lock(&irq->irq_lock);
+
+ if (irq->target_vcpu == vcpu1)
+ irq->target_vcpu = vcpu2;
+
+ spin_unlock(&irq->irq_lock);
+ }
+
+ spin_unlock(&dist->lpi_list_lock);
+
+ return 0;
+}
+
+/*
+ * The INT command injects the LPI associated with that DevID/EvID pair.
+ * Must be called with the its_lock mutex held.
+ */
+static int vgic_its_cmd_handle_int(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ u32 msi_data = its_cmd_get_id(its_cmd);
+ u64 msi_devid = its_cmd_get_deviceid(its_cmd);
+
+ vgic_its_trigger_msi(kvm, its, msi_devid, msi_data);
+
+ return 0;
+}
+
+/*
+ * This function is called with the its_cmd lock held, but the ITS data
+ * structure lock dropped.
+ */
+static int vgic_its_handle_command(struct kvm *kvm, struct vgic_its *its,
+ u64 *its_cmd)
+{
+ int ret = -ENODEV;
+
+ mutex_lock(&its->its_lock);
+ switch (its_cmd_get_command(its_cmd)) {
+ case GITS_CMD_MAPD:
+ ret = vgic_its_cmd_handle_mapd(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPC:
+ ret = vgic_its_cmd_handle_mapc(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPI:
+ ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MAPTI:
+ ret = vgic_its_cmd_handle_mapi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MOVI:
+ ret = vgic_its_cmd_handle_movi(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_DISCARD:
+ ret = vgic_its_cmd_handle_discard(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_CLEAR:
+ ret = vgic_its_cmd_handle_clear(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_MOVALL:
+ ret = vgic_its_cmd_handle_movall(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INT:
+ ret = vgic_its_cmd_handle_int(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INV:
+ ret = vgic_its_cmd_handle_inv(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_INVALL:
+ ret = vgic_its_cmd_handle_invall(kvm, its, its_cmd);
+ break;
+ case GITS_CMD_SYNC:
+ /* we ignore this command: we are in sync all of the time */
+ ret = 0;
+ break;
+ }
+ mutex_unlock(&its->its_lock);
+
+ return ret;
+}
+
+static u64 vgic_sanitise_its_baser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GITS_BASER_SHAREABILITY_MASK,
+ GITS_BASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GITS_BASER_INNER_CACHEABILITY_MASK,
+ GITS_BASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GITS_BASER_OUTER_CACHEABILITY_MASK,
+ GITS_BASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ /* Bits 15:12 contain bits 51:48 of the PA, which we don't support. */
+ reg &= ~GENMASK_ULL(15, 12);
+
+ /* We support only one (ITS) page size: 64K */
+ reg = (reg & ~GITS_BASER_PAGE_SIZE_MASK) | GITS_BASER_PAGE_SIZE_64K;
+
+ return reg;
+}
+
+static u64 vgic_sanitise_its_cbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GITS_CBASER_SHAREABILITY_MASK,
+ GITS_CBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GITS_CBASER_INNER_CACHEABILITY_MASK,
+ GITS_CBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GITS_CBASER_OUTER_CACHEABILITY_MASK,
+ GITS_CBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ /*
+ * Sanitise the physical address to be 64k aligned.
+ * Also limit the physical addresses to 48 bits.
+ */
+ reg &= ~(GENMASK_ULL(51, 48) | GENMASK_ULL(15, 12));
+
+ return reg;
+}
+
+static unsigned long vgic_mmio_read_its_cbaser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->cbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_its_cbaser(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ /* When GITS_CTLR.Enable is 1, this register is RO. */
+ if (its->enabled)
+ return;
+
+ mutex_lock(&its->cmd_lock);
+ its->cbaser = update_64bit_reg(its->cbaser, addr & 7, len, val);
+ its->cbaser = vgic_sanitise_its_cbaser(its->cbaser);
+ its->creadr = 0;
+ /*
+ * CWRITER is architecturally UNKNOWN on reset, but we need to reset
+ * it to CREADR to make sure we start with an empty command buffer.
+ */
+ its->cwriter = its->creadr;
+ mutex_unlock(&its->cmd_lock);
+}
+
+#define ITS_CMD_BUFFER_SIZE(baser) ((((baser) & 0xff) + 1) << 12)
+#define ITS_CMD_SIZE 32
+#define ITS_CMD_OFFSET(reg) ((reg) & GENMASK(19, 5))
+
+/*
+ * By writing to CWRITER the guest announces new commands to be processed.
+ * To avoid any races in the first place, we take the its_cmd lock, which
+ * protects our ring buffer variables, so that there is only one user
+ * per ITS handling commands at a given time.
+ */
+static void vgic_mmio_write_its_cwriter(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ gpa_t cbaser;
+ u64 cmd_buf[4];
+ u32 reg;
+
+ if (!its)
+ return;
+
+ mutex_lock(&its->cmd_lock);
+
+ reg = update_64bit_reg(its->cwriter, addr & 7, len, val);
+ reg = ITS_CMD_OFFSET(reg);
+ if (reg >= ITS_CMD_BUFFER_SIZE(its->cbaser)) {
+ mutex_unlock(&its->cmd_lock);
+ return;
+ }
+
+ its->cwriter = reg;
+ cbaser = CBASER_ADDRESS(its->cbaser);
+
+ while (its->cwriter != its->creadr) {
+ int ret = kvm_read_guest(kvm, cbaser + its->creadr,
+ cmd_buf, ITS_CMD_SIZE);
+ /*
+ * If kvm_read_guest() fails, this could be due to the guest
+ * programming a bogus value in CBASER or something else going
+ * wrong from which we cannot easily recover.
+ * According to section 6.3.2 in the GICv3 spec we can just
+ * ignore that command then.
+ */
+ if (!ret)
+ vgic_its_handle_command(kvm, its, cmd_buf);
+
+ its->creadr += ITS_CMD_SIZE;
+ if (its->creadr == ITS_CMD_BUFFER_SIZE(its->cbaser))
+ its->creadr = 0;
+ }
+
+ mutex_unlock(&its->cmd_lock);
+}
+
+static unsigned long vgic_mmio_read_its_cwriter(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->cwriter, addr & 0x7, len);
+}
+
+static unsigned long vgic_mmio_read_its_creadr(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ return extract_bytes(its->creadr, addr & 0x7, len);
+}
+
+#define BASER_INDEX(addr) (((addr) / sizeof(u64)) & 0x7)
+static unsigned long vgic_mmio_read_its_baser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len)
+{
+ u64 reg;
+
+ switch (BASER_INDEX(addr)) {
+ case 0:
+ reg = its->baser_device_table;
+ break;
+ case 1:
+ reg = its->baser_coll_table;
+ break;
+ default:
+ reg = 0;
+ break;
+ }
+
+ return extract_bytes(reg, addr & 7, len);
+}
+
+#define GITS_BASER_RO_MASK (GENMASK_ULL(52, 48) | GENMASK_ULL(58, 56))
+static void vgic_mmio_write_its_baser(struct kvm *kvm,
+ struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ u64 entry_size, device_type;
+ u64 reg, *regptr, clearbits = 0;
+
+ /* When GITS_CTLR.Enable is 1, we ignore write accesses. */
+ if (its->enabled)
+ return;
+
+ switch (BASER_INDEX(addr)) {
+ case 0:
+ regptr = &its->baser_device_table;
+ entry_size = 8;
+ device_type = GITS_BASER_TYPE_DEVICE;
+ break;
+ case 1:
+ regptr = &its->baser_coll_table;
+ entry_size = 8;
+ device_type = GITS_BASER_TYPE_COLLECTION;
+ clearbits = GITS_BASER_INDIRECT;
+ break;
+ default:
+ return;
+ }
+
+ reg = update_64bit_reg(*regptr, addr & 7, len, val);
+ reg &= ~GITS_BASER_RO_MASK;
+ reg &= ~clearbits;
+
+ reg |= (entry_size - 1) << GITS_BASER_ENTRY_SIZE_SHIFT;
+ reg |= device_type << GITS_BASER_TYPE_SHIFT;
+ reg = vgic_sanitise_its_baser(reg);
+
+ *regptr = reg;
+}
+
+#define REGISTER_ITS_DESC(off, rd, wr, length, acc) \
+{ \
+ .reg_offset = off, \
+ .len = length, \
+ .access_flags = acc, \
+ .its_read = rd, \
+ .its_write = wr, \
+}
+
+static void its_mmio_write_wi(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len, unsigned long val)
+{
+ /* Ignore */
+}
+
+static struct vgic_register_region its_registers[] = {
+ REGISTER_ITS_DESC(GITS_CTLR,
+ vgic_mmio_read_its_ctlr, vgic_mmio_write_its_ctlr, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_IIDR,
+ vgic_mmio_read_its_iidr, its_mmio_write_wi, 4,
+ VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_TYPER,
+ vgic_mmio_read_its_typer, its_mmio_write_wi, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CBASER,
+ vgic_mmio_read_its_cbaser, vgic_mmio_write_its_cbaser, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CWRITER,
+ vgic_mmio_read_its_cwriter, vgic_mmio_write_its_cwriter, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_CREADR,
+ vgic_mmio_read_its_creadr, its_mmio_write_wi, 8,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_BASER,
+ vgic_mmio_read_its_baser, vgic_mmio_write_its_baser, 0x40,
+ VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
+ REGISTER_ITS_DESC(GITS_IDREGS_BASE,
+ vgic_mmio_read_its_idregs, its_mmio_write_wi, 0x30,
+ VGIC_ACCESS_32bit),
+};
+
+/* This is called on setting the LPI enable bit in the redistributor. */
+void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+ if (!(vcpu->arch.vgic_cpu.pendbaser & GICR_PENDBASER_PTZ))
+ its_sync_lpi_pending_table(vcpu);
+}
+
+static int vgic_its_init_its(struct kvm *kvm, struct vgic_its *its)
+{
+ struct vgic_io_device *iodev = &its->iodev;
+ int ret;
+
+ if (its->initialized)
+ return 0;
+
+ if (IS_VGIC_ADDR_UNDEF(its->vgic_its_base))
+ return -ENXIO;
+
+ iodev->regions = its_registers;
+ iodev->nr_regions = ARRAY_SIZE(its_registers);
+ kvm_iodevice_init(&iodev->dev, &kvm_io_gic_ops);
+
+ iodev->base_addr = its->vgic_its_base;
+ iodev->iodev_type = IODEV_ITS;
+ iodev->its = its;
+ mutex_lock(&kvm->slots_lock);
+ ret = kvm_io_bus_register_dev(kvm, KVM_MMIO_BUS, iodev->base_addr,
+ KVM_VGIC_V3_ITS_SIZE, &iodev->dev);
+ mutex_unlock(&kvm->slots_lock);
+
+ if (!ret)
+ its->initialized = true;
+
+ return ret;
+}
+
+#define INITIAL_BASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GITS_BASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GITS_BASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GITS_BASER, InnerShareable) | \
+ ((8ULL - 1) << GITS_BASER_ENTRY_SIZE_SHIFT) | \
+ GITS_BASER_PAGE_SIZE_64K)
+
+#define INITIAL_PROPBASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GICR_PROPBASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GICR_PROPBASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GICR_PROPBASER, InnerShareable))
+
+static int vgic_its_create(struct kvm_device *dev, u32 type)
+{
+ struct vgic_its *its;
+
+ if (type != KVM_DEV_TYPE_ARM_VGIC_ITS)
+ return -ENODEV;
+
+ its = kzalloc(sizeof(struct vgic_its), GFP_KERNEL);
+ if (!its)
+ return -ENOMEM;
+
+ mutex_init(&its->its_lock);
+ mutex_init(&its->cmd_lock);
+
+ its->vgic_its_base = VGIC_ADDR_UNDEF;
+
+ INIT_LIST_HEAD(&its->device_list);
+ INIT_LIST_HEAD(&its->collection_list);
+
+ dev->kvm->arch.vgic.has_its = true;
+ its->initialized = false;
+ its->enabled = false;
+ its->dev = dev;
+
+ its->baser_device_table = INITIAL_BASER_VALUE |
+ ((u64)GITS_BASER_TYPE_DEVICE << GITS_BASER_TYPE_SHIFT);
+ its->baser_coll_table = INITIAL_BASER_VALUE |
+ ((u64)GITS_BASER_TYPE_COLLECTION << GITS_BASER_TYPE_SHIFT);
+ dev->kvm->arch.vgic.propbaser = INITIAL_PROPBASER_VALUE;
+
+ dev->private = its;
+
+ return 0;
+}
+
+static void vgic_its_destroy(struct kvm_device *kvm_dev)
+{
+ struct kvm *kvm = kvm_dev->kvm;
+ struct vgic_its *its = kvm_dev->private;
+ struct its_device *dev;
+ struct its_itte *itte;
+ struct list_head *dev_cur, *dev_temp;
+ struct list_head *cur, *temp;
+
+ /*
+ * We may end up here without the lists ever having been initialized.
+ * Check this and bail out early to avoid dereferencing a NULL pointer.
+ */
+ if (!its->device_list.next)
+ return;
+
+ mutex_lock(&its->its_lock);
+ list_for_each_safe(dev_cur, dev_temp, &its->device_list) {
+ dev = container_of(dev_cur, struct its_device, dev_list);
+ list_for_each_safe(cur, temp, &dev->itt_head) {
+ itte = (container_of(cur, struct its_itte, itte_list));
+ its_free_itte(kvm, itte);
+ }
+ list_del(dev_cur);
+ kfree(dev);
+ }
+
+ list_for_each_safe(cur, temp, &its->collection_list) {
+ list_del(cur);
+ kfree(container_of(cur, struct its_collection, coll_list));
+ }
+ mutex_unlock(&its->its_lock);
+
+ kfree(its);
+}
+
+static int vgic_its_has_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR:
+ switch (attr->attr) {
+ case KVM_VGIC_ITS_ADDR_TYPE:
+ return 0;
+ }
+ break;
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return 0;
+ }
+ break;
+ }
+ return -ENXIO;
+}
+
+static int vgic_its_set_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ struct vgic_its *its = dev->private;
+ int ret;
+
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ unsigned long type = (unsigned long)attr->attr;
+ u64 addr;
+
+ if (type != KVM_VGIC_ITS_ADDR_TYPE)
+ return -ENODEV;
+
+ if (its->initialized)
+ return -EBUSY;
+
+ if (copy_from_user(&addr, uaddr, sizeof(addr)))
+ return -EFAULT;
+
+ ret = vgic_check_ioaddr(dev->kvm, &its->vgic_its_base,
+ addr, SZ_64K);
+ if (ret)
+ return ret;
+
+ its->vgic_its_base = addr;
+
+ return 0;
+ }
+ case KVM_DEV_ARM_VGIC_GRP_CTRL:
+ switch (attr->attr) {
+ case KVM_DEV_ARM_VGIC_CTRL_INIT:
+ return vgic_its_init_its(dev->kvm, its);
+ }
+ break;
+ }
+ return -ENXIO;
+}
+
+static int vgic_its_get_attr(struct kvm_device *dev,
+ struct kvm_device_attr *attr)
+{
+ switch (attr->group) {
+ case KVM_DEV_ARM_VGIC_GRP_ADDR: {
+ struct vgic_its *its = dev->private;
+ u64 addr = its->vgic_its_base;
+ u64 __user *uaddr = (u64 __user *)(long)attr->addr;
+ unsigned long type = (unsigned long)attr->attr;
+
+ if (type != KVM_VGIC_ITS_ADDR_TYPE)
+ return -ENODEV;
+
+ if (copy_to_user(uaddr, &addr, sizeof(addr)))
+ return -EFAULT;
+ break;
+ default:
+ return -ENXIO;
+ }
+ }
+
+ return 0;
+}
+
+static struct kvm_device_ops kvm_arm_vgic_its_ops = {
+ .name = "kvm-arm-vgic-its",
+ .create = vgic_its_create,
+ .destroy = vgic_its_destroy,
+ .set_attr = vgic_its_set_attr,
+ .get_attr = vgic_its_get_attr,
+ .has_attr = vgic_its_has_attr,
+};
+
+int kvm_vgic_register_its_device(void)
+{
+ return kvm_register_device_ops(&kvm_arm_vgic_its_ops,
+ KVM_DEV_TYPE_ARM_VGIC_ITS);
+}
diff --git a/virt/kvm/arm/vgic/vgic-kvm-device.c b/virt/kvm/arm/vgic/vgic-kvm-device.c
index 0130c4b147b7..1813f93b5cde 100644
--- a/virt/kvm/arm/vgic/vgic-kvm-device.c
+++ b/virt/kvm/arm/vgic/vgic-kvm-device.c
@@ -21,8 +21,8 @@
/* common helpers */
-static int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
- phys_addr_t addr, phys_addr_t alignment)
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+ phys_addr_t addr, phys_addr_t alignment)
{
if (addr & ~KVM_PHYS_MASK)
return -E2BIG;
@@ -210,20 +210,27 @@ static void vgic_destroy(struct kvm_device *dev)
kfree(dev);
}
-void kvm_register_vgic_device(unsigned long type)
+int kvm_register_vgic_device(unsigned long type)
{
+ int ret = -ENODEV;
+
switch (type) {
case KVM_DEV_TYPE_ARM_VGIC_V2:
- kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
- KVM_DEV_TYPE_ARM_VGIC_V2);
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v2_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V2);
break;
#ifdef CONFIG_KVM_ARM_VGIC_V3
case KVM_DEV_TYPE_ARM_VGIC_V3:
- kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
- KVM_DEV_TYPE_ARM_VGIC_V3);
+ ret = kvm_register_device_ops(&kvm_arm_vgic_v3_ops,
+ KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret)
+ break;
+ ret = kvm_vgic_register_its_device();
break;
#endif
}
+
+ return ret;
}
/** vgic_attr_regs_access: allows user space to read/write VGIC registers
@@ -428,4 +435,3 @@ struct kvm_device_ops kvm_arm_vgic_v3_ops = {
};
#endif /* CONFIG_KVM_ARM_VGIC_V3 */
-
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v2.c b/virt/kvm/arm/vgic/vgic-mmio-v2.c
index a21393637e4b..b44b359cbbad 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v2.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v2.c
@@ -102,6 +102,7 @@ static void vgic_mmio_write_sgir(struct kvm_vcpu *source_vcpu,
irq->source |= 1U << source_vcpu->vcpu_id;
vgic_queue_irq_unlock(source_vcpu->kvm, irq);
+ vgic_put_irq(source_vcpu->kvm, irq);
}
}
@@ -116,6 +117,8 @@ static unsigned long vgic_mmio_read_target(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
val |= (u64)irq->targets << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return val;
@@ -143,6 +146,7 @@ static void vgic_mmio_write_target(struct kvm_vcpu *vcpu,
irq->target_vcpu = kvm_get_vcpu(vcpu->kvm, target);
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -157,6 +161,8 @@ static unsigned long vgic_mmio_read_sgipend(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
val |= (u64)irq->source << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return val;
}
@@ -178,6 +184,7 @@ static void vgic_mmio_write_sgipendc(struct kvm_vcpu *vcpu,
irq->pending = false;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -201,6 +208,7 @@ static void vgic_mmio_write_sgipends(struct kvm_vcpu *vcpu,
} else {
spin_unlock(&irq->irq_lock);
}
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -429,6 +437,7 @@ int vgic_v2_cpuif_uaccess(struct kvm_vcpu *vcpu, bool is_write,
struct vgic_io_device dev = {
.regions = vgic_v2_cpu_registers,
.nr_regions = ARRAY_SIZE(vgic_v2_cpu_registers),
+ .iodev_type = IODEV_CPUIF,
};
return vgic_uaccess(vcpu, &dev, is_write, offset, val);
@@ -440,6 +449,7 @@ int vgic_v2_dist_uaccess(struct kvm_vcpu *vcpu, bool is_write,
struct vgic_io_device dev = {
.regions = vgic_v2_dist_registers,
.nr_regions = ARRAY_SIZE(vgic_v2_dist_registers),
+ .iodev_type = IODEV_DIST,
};
return vgic_uaccess(vcpu, &dev, is_write, offset, val);
diff --git a/virt/kvm/arm/vgic/vgic-mmio-v3.c b/virt/kvm/arm/vgic/vgic-mmio-v3.c
index a0c515a412a7..ff668e0dd586 100644
--- a/virt/kvm/arm/vgic/vgic-mmio-v3.c
+++ b/virt/kvm/arm/vgic/vgic-mmio-v3.c
@@ -23,12 +23,35 @@
#include "vgic-mmio.h"
/* extract @num bytes at @offset bytes offset in data */
-static unsigned long extract_bytes(unsigned long data, unsigned int offset,
- unsigned int num)
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+ unsigned int num)
{
return (data >> (offset * 8)) & GENMASK_ULL(num * 8 - 1, 0);
}
+/* allows updates of any half of a 64-bit register (or the whole thing) */
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+ unsigned long val)
+{
+ int lower = (offset & 4) * 8;
+ int upper = lower + 8 * len - 1;
+
+ reg &= ~GENMASK_ULL(upper, lower);
+ val &= GENMASK_ULL(len * 8 - 1, 0);
+
+ return reg | ((u64)val << lower);
+}
+
+bool vgic_has_its(struct kvm *kvm)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+
+ if (dist->vgic_model != KVM_DEV_TYPE_ARM_VGIC_V3)
+ return false;
+
+ return dist->has_its;
+}
+
static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len)
{
@@ -43,7 +66,12 @@ static unsigned long vgic_mmio_read_v3_misc(struct kvm_vcpu *vcpu,
case GICD_TYPER:
value = vcpu->kvm->arch.vgic.nr_spis + VGIC_NR_PRIVATE_IRQS;
value = (value >> 5) - 1;
- value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+ if (vgic_has_its(vcpu->kvm)) {
+ value |= (INTERRUPT_ID_BITS_ITS - 1) << 19;
+ value |= GICD_TYPER_LPIS;
+ } else {
+ value |= (INTERRUPT_ID_BITS_SPIS - 1) << 19;
+ }
break;
case GICD_IIDR:
value = (PRODUCT_ID_KVM << 24) | (IMPLEMENTER_ARM << 0);
@@ -80,15 +108,17 @@ static unsigned long vgic_mmio_read_irouter(struct kvm_vcpu *vcpu,
{
int intid = VGIC_ADDR_TO_INTID(addr, 64);
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+ unsigned long ret = 0;
if (!irq)
return 0;
/* The upper word is RAZ for us. */
- if (addr & 4)
- return 0;
+ if (!(addr & 4))
+ ret = extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
- return extract_bytes(READ_ONCE(irq->mpidr), addr & 7, len);
+ vgic_put_irq(vcpu->kvm, irq);
+ return ret;
}
static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
@@ -96,15 +126,17 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
unsigned long val)
{
int intid = VGIC_ADDR_TO_INTID(addr, 64);
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, NULL, intid);
-
- if (!irq)
- return;
+ struct vgic_irq *irq;
/* The upper word is WI for us since we don't implement Aff3. */
if (addr & 4)
return;
+ irq = vgic_get_irq(vcpu->kvm, NULL, intid);
+
+ if (!irq)
+ return;
+
spin_lock(&irq->irq_lock);
/* We only care about and preserve Aff0, Aff1 and Aff2. */
@@ -112,6 +144,32 @@ static void vgic_mmio_write_irouter(struct kvm_vcpu *vcpu,
irq->target_vcpu = kvm_mpidr_to_vcpu(vcpu->kvm, irq->mpidr);
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
+}
+
+static unsigned long vgic_mmio_read_v3r_ctlr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ return vgic_cpu->lpis_enabled ? GICR_CTLR_ENABLE_LPIS : 0;
+}
+
+
+static void vgic_mmio_write_v3r_ctlr(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ bool was_enabled = vgic_cpu->lpis_enabled;
+
+ if (!vgic_has_its(vcpu->kvm))
+ return;
+
+ vgic_cpu->lpis_enabled = val & GICR_CTLR_ENABLE_LPIS;
+
+ if (!was_enabled && vgic_cpu->lpis_enabled)
+ vgic_enable_lpis(vcpu);
}
static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
@@ -125,6 +183,8 @@ static unsigned long vgic_mmio_read_v3r_typer(struct kvm_vcpu *vcpu,
value |= ((target_vcpu_id & 0xffff) << 8);
if (target_vcpu_id == atomic_read(&vcpu->kvm->online_vcpus) - 1)
value |= GICR_TYPER_LAST;
+ if (vgic_has_its(vcpu->kvm))
+ value |= GICR_TYPER_PLPIS;
return extract_bytes(value, addr & 7, len);
}
@@ -147,6 +207,142 @@ static unsigned long vgic_mmio_read_v3_idregs(struct kvm_vcpu *vcpu,
return 0;
}
+/* We want to avoid outer shareable. */
+u64 vgic_sanitise_shareability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_OuterShareable:
+ return GIC_BASER_InnerShareable;
+ default:
+ return field;
+ }
+}
+
+/* Avoid any inner non-cacheable mapping. */
+u64 vgic_sanitise_inner_cacheability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_CACHE_nCnB:
+ case GIC_BASER_CACHE_nC:
+ return GIC_BASER_CACHE_RaWb;
+ default:
+ return field;
+ }
+}
+
+/* Non-cacheable or same-as-inner are OK. */
+u64 vgic_sanitise_outer_cacheability(u64 field)
+{
+ switch (field) {
+ case GIC_BASER_CACHE_SameAsInner:
+ case GIC_BASER_CACHE_nC:
+ return field;
+ default:
+ return GIC_BASER_CACHE_nC;
+ }
+}
+
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+ u64 (*sanitise_fn)(u64))
+{
+ u64 field = (reg & field_mask) >> field_shift;
+
+ field = sanitise_fn(field) << field_shift;
+ return (reg & ~field_mask) | field;
+}
+
+#define PROPBASER_RES0_MASK \
+ (GENMASK_ULL(63, 59) | GENMASK_ULL(55, 52) | GENMASK_ULL(6, 5))
+#define PENDBASER_RES0_MASK \
+ (BIT_ULL(63) | GENMASK_ULL(61, 59) | GENMASK_ULL(55, 52) | \
+ GENMASK_ULL(15, 12) | GENMASK_ULL(6, 0))
+
+static u64 vgic_sanitise_pendbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_SHAREABILITY_MASK,
+ GICR_PENDBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_INNER_CACHEABILITY_MASK,
+ GICR_PENDBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GICR_PENDBASER_OUTER_CACHEABILITY_MASK,
+ GICR_PENDBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ reg &= ~PENDBASER_RES0_MASK;
+ reg &= ~GENMASK_ULL(51, 48);
+
+ return reg;
+}
+
+static u64 vgic_sanitise_propbaser(u64 reg)
+{
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_SHAREABILITY_MASK,
+ GICR_PROPBASER_SHAREABILITY_SHIFT,
+ vgic_sanitise_shareability);
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_INNER_CACHEABILITY_MASK,
+ GICR_PROPBASER_INNER_CACHEABILITY_SHIFT,
+ vgic_sanitise_inner_cacheability);
+ reg = vgic_sanitise_field(reg, GICR_PROPBASER_OUTER_CACHEABILITY_MASK,
+ GICR_PROPBASER_OUTER_CACHEABILITY_SHIFT,
+ vgic_sanitise_outer_cacheability);
+
+ reg &= ~PROPBASER_RES0_MASK;
+ reg &= ~GENMASK_ULL(51, 48);
+ return reg;
+}
+
+static unsigned long vgic_mmio_read_propbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+ return extract_bytes(dist->propbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_propbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 propbaser = dist->propbaser;
+
+ /* Storing a value with LPIs already enabled is undefined */
+ if (vgic_cpu->lpis_enabled)
+ return;
+
+ propbaser = update_64bit_reg(propbaser, addr & 4, len, val);
+ propbaser = vgic_sanitise_propbaser(propbaser);
+
+ dist->propbaser = propbaser;
+}
+
+static unsigned long vgic_mmio_read_pendbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+
+ return extract_bytes(vgic_cpu->pendbaser, addr & 7, len);
+}
+
+static void vgic_mmio_write_pendbase(struct kvm_vcpu *vcpu,
+ gpa_t addr, unsigned int len,
+ unsigned long val)
+{
+ struct vgic_cpu *vgic_cpu = &vcpu->arch.vgic_cpu;
+ u64 pendbaser = vgic_cpu->pendbaser;
+
+ /* Storing a value with LPIs already enabled is undefined */
+ if (vgic_cpu->lpis_enabled)
+ return;
+
+ pendbaser = update_64bit_reg(pendbaser, addr & 4, len, val);
+ pendbaser = vgic_sanitise_pendbaser(pendbaser);
+
+ vgic_cpu->pendbaser = pendbaser;
+}
+
/*
* The GICv3 per-IRQ registers are split to control PPIs and SGIs in the
* redistributors, while SPIs are covered by registers in the distributor
@@ -218,7 +414,7 @@ static const struct vgic_register_region vgic_v3_dist_registers[] = {
static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
REGISTER_DESC_WITH_LENGTH(GICR_CTLR,
- vgic_mmio_read_raz, vgic_mmio_write_wi, 4,
+ vgic_mmio_read_v3r_ctlr, vgic_mmio_write_v3r_ctlr, 4,
VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_IIDR,
vgic_mmio_read_v3r_iidr, vgic_mmio_write_wi, 4,
@@ -227,10 +423,10 @@ static const struct vgic_register_region vgic_v3_rdbase_registers[] = {
vgic_mmio_read_v3r_typer, vgic_mmio_write_wi, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_PROPBASER,
- vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+ vgic_mmio_read_propbase, vgic_mmio_write_propbase, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_PENDBASER,
- vgic_mmio_read_raz, vgic_mmio_write_wi, 8,
+ vgic_mmio_read_pendbase, vgic_mmio_write_pendbase, 8,
VGIC_ACCESS_64bit | VGIC_ACCESS_32bit),
REGISTER_DESC_WITH_LENGTH(GICR_IDREGS,
vgic_mmio_read_v3_idregs, vgic_mmio_write_wi, 48,
@@ -285,24 +481,18 @@ unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev)
int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
{
- int nr_vcpus = atomic_read(&kvm->online_vcpus);
struct kvm_vcpu *vcpu;
- struct vgic_io_device *devices;
int c, ret = 0;
- devices = kmalloc(sizeof(struct vgic_io_device) * nr_vcpus * 2,
- GFP_KERNEL);
- if (!devices)
- return -ENOMEM;
-
kvm_for_each_vcpu(c, vcpu, kvm) {
gpa_t rd_base = redist_base_address + c * SZ_64K * 2;
gpa_t sgi_base = rd_base + SZ_64K;
- struct vgic_io_device *rd_dev = &devices[c * 2];
- struct vgic_io_device *sgi_dev = &devices[c * 2 + 1];
+ struct vgic_io_device *rd_dev = &vcpu->arch.vgic_cpu.rd_iodev;
+ struct vgic_io_device *sgi_dev = &vcpu->arch.vgic_cpu.sgi_iodev;
kvm_iodevice_init(&rd_dev->dev, &kvm_io_gic_ops);
rd_dev->base_addr = rd_base;
+ rd_dev->iodev_type = IODEV_REDIST;
rd_dev->regions = vgic_v3_rdbase_registers;
rd_dev->nr_regions = ARRAY_SIZE(vgic_v3_rdbase_registers);
rd_dev->redist_vcpu = vcpu;
@@ -317,6 +507,7 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
kvm_iodevice_init(&sgi_dev->dev, &kvm_io_gic_ops);
sgi_dev->base_addr = sgi_base;
+ sgi_dev->iodev_type = IODEV_REDIST;
sgi_dev->regions = vgic_v3_sgibase_registers;
sgi_dev->nr_regions = ARRAY_SIZE(vgic_v3_sgibase_registers);
sgi_dev->redist_vcpu = vcpu;
@@ -335,14 +526,15 @@ int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t redist_base_address)
if (ret) {
/* The current c failed, so we start with the previous one. */
for (c--; c >= 0; c--) {
+ struct vgic_cpu *vgic_cpu;
+
+ vcpu = kvm_get_vcpu(kvm, c);
+ vgic_cpu = &vcpu->arch.vgic_cpu;
kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
- &devices[c * 2].dev);
+ &vgic_cpu->rd_iodev.dev);
kvm_io_bus_unregister_dev(kvm, KVM_MMIO_BUS,
- &devices[c * 2 + 1].dev);
+ &vgic_cpu->sgi_iodev.dev);
}
- kfree(devices);
- } else {
- kvm->arch.vgic.redist_iodevs = devices;
}
return ret;
@@ -451,5 +643,6 @@ void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg)
irq->pending = true;
vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
diff --git a/virt/kvm/arm/vgic/vgic-mmio.c b/virt/kvm/arm/vgic/vgic-mmio.c
index 9f6fab74dce7..3bad3c5ed431 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.c
+++ b/virt/kvm/arm/vgic/vgic-mmio.c
@@ -56,6 +56,8 @@ unsigned long vgic_mmio_read_enable(struct kvm_vcpu *vcpu,
if (irq->enabled)
value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -74,6 +76,8 @@ void vgic_mmio_write_senable(struct kvm_vcpu *vcpu,
spin_lock(&irq->irq_lock);
irq->enabled = true;
vgic_queue_irq_unlock(vcpu->kvm, irq);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -92,6 +96,7 @@ void vgic_mmio_write_cenable(struct kvm_vcpu *vcpu,
irq->enabled = false;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -108,6 +113,8 @@ unsigned long vgic_mmio_read_pending(struct kvm_vcpu *vcpu,
if (irq->pending)
value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -129,6 +136,7 @@ void vgic_mmio_write_spending(struct kvm_vcpu *vcpu,
irq->soft_pending = true;
vgic_queue_irq_unlock(vcpu->kvm, irq);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -152,6 +160,7 @@ void vgic_mmio_write_cpending(struct kvm_vcpu *vcpu,
}
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -168,6 +177,8 @@ unsigned long vgic_mmio_read_active(struct kvm_vcpu *vcpu,
if (irq->active)
value |= (1U << i);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -242,6 +253,7 @@ void vgic_mmio_write_cactive(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, false);
+ vgic_put_irq(vcpu->kvm, irq);
}
vgic_change_active_finish(vcpu, intid);
}
@@ -257,6 +269,7 @@ void vgic_mmio_write_sactive(struct kvm_vcpu *vcpu,
for_each_set_bit(i, &val, len * 8) {
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
vgic_mmio_change_active(vcpu, irq, true);
+ vgic_put_irq(vcpu->kvm, irq);
}
vgic_change_active_finish(vcpu, intid);
}
@@ -272,6 +285,8 @@ unsigned long vgic_mmio_read_priority(struct kvm_vcpu *vcpu,
struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
val |= (u64)irq->priority << (i * 8);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return val;
@@ -298,6 +313,8 @@ void vgic_mmio_write_priority(struct kvm_vcpu *vcpu,
/* Narrow the priority range to what we actually support */
irq->priority = (val >> (i * 8)) & GENMASK(7, 8 - VGIC_PRI_BITS);
spin_unlock(&irq->irq_lock);
+
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -313,6 +330,8 @@ unsigned long vgic_mmio_read_config(struct kvm_vcpu *vcpu,
if (irq->config == VGIC_CONFIG_EDGE)
value |= (2U << (i * 2));
+
+ vgic_put_irq(vcpu->kvm, irq);
}
return value;
@@ -326,7 +345,7 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
int i;
for (i = 0; i < len * 4; i++) {
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
+ struct vgic_irq *irq;
/*
* The configuration cannot be changed for SGIs in general,
@@ -337,14 +356,18 @@ void vgic_mmio_write_config(struct kvm_vcpu *vcpu,
if (intid + i < VGIC_NR_PRIVATE_IRQS)
continue;
+ irq = vgic_get_irq(vcpu->kvm, vcpu, intid + i);
spin_lock(&irq->irq_lock);
+
if (test_bit(i * 2 + 1, &val)) {
irq->config = VGIC_CONFIG_EDGE;
} else {
irq->config = VGIC_CONFIG_LEVEL;
irq->pending = irq->line_level | irq->soft_pending;
}
+
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -450,8 +473,7 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
{
struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
- struct kvm_vcpu *r_vcpu;
- unsigned long data;
+ unsigned long data = 0;
region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
addr - iodev->base_addr);
@@ -460,8 +482,21 @@ static int dispatch_mmio_read(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
return 0;
}
- r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
- data = region->read(r_vcpu, addr, len);
+ switch (iodev->iodev_type) {
+ case IODEV_CPUIF:
+ data = region->read(vcpu, addr, len);
+ break;
+ case IODEV_DIST:
+ data = region->read(vcpu, addr, len);
+ break;
+ case IODEV_REDIST:
+ data = region->read(iodev->redist_vcpu, addr, len);
+ break;
+ case IODEV_ITS:
+ data = region->its_read(vcpu->kvm, iodev->its, addr, len);
+ break;
+ }
+
vgic_data_host_to_mmio_bus(val, len, data);
return 0;
}
@@ -471,7 +506,6 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
{
struct vgic_io_device *iodev = kvm_to_vgic_iodev(dev);
const struct vgic_register_region *region;
- struct kvm_vcpu *r_vcpu;
unsigned long data = vgic_data_mmio_bus_to_host(val, len);
region = vgic_find_mmio_region(iodev->regions, iodev->nr_regions,
@@ -482,8 +516,21 @@ static int dispatch_mmio_write(struct kvm_vcpu *vcpu, struct kvm_io_device *dev,
if (!check_region(region, addr, len))
return 0;
- r_vcpu = iodev->redist_vcpu ? iodev->redist_vcpu : vcpu;
- region->write(r_vcpu, addr, len, data);
+ switch (iodev->iodev_type) {
+ case IODEV_CPUIF:
+ region->write(vcpu, addr, len, data);
+ break;
+ case IODEV_DIST:
+ region->write(vcpu, addr, len, data);
+ break;
+ case IODEV_REDIST:
+ region->write(iodev->redist_vcpu, addr, len, data);
+ break;
+ case IODEV_ITS:
+ region->its_write(vcpu->kvm, iodev->its, addr, len, data);
+ break;
+ }
+
return 0;
}
@@ -513,6 +560,7 @@ int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
}
io_device->base_addr = dist_base_address;
+ io_device->iodev_type = IODEV_DIST;
io_device->redist_vcpu = NULL;
mutex_lock(&kvm->slots_lock);
diff --git a/virt/kvm/arm/vgic/vgic-mmio.h b/virt/kvm/arm/vgic/vgic-mmio.h
index 850901482aec..0b3ecf9d100e 100644
--- a/virt/kvm/arm/vgic/vgic-mmio.h
+++ b/virt/kvm/arm/vgic/vgic-mmio.h
@@ -21,10 +21,19 @@ struct vgic_register_region {
unsigned int len;
unsigned int bits_per_irq;
unsigned int access_flags;
- unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
- unsigned int len);
- void (*write)(struct kvm_vcpu *vcpu, gpa_t addr, unsigned int len,
- unsigned long val);
+ union {
+ unsigned long (*read)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len);
+ unsigned long (*its_read)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len);
+ };
+ union {
+ void (*write)(struct kvm_vcpu *vcpu, gpa_t addr,
+ unsigned int len, unsigned long val);
+ void (*its_write)(struct kvm *kvm, struct vgic_its *its,
+ gpa_t addr, unsigned int len,
+ unsigned long val);
+ };
};
extern struct kvm_io_device_ops kvm_io_gic_ops;
@@ -87,6 +96,12 @@ unsigned long vgic_data_mmio_bus_to_host(const void *val, unsigned int len);
void vgic_data_host_to_mmio_bus(void *buf, unsigned int len,
unsigned long data);
+unsigned long extract_bytes(unsigned long data, unsigned int offset,
+ unsigned int num);
+
+u64 update_64bit_reg(u64 reg, unsigned int offset, unsigned int len,
+ unsigned long val);
+
unsigned long vgic_mmio_read_raz(struct kvm_vcpu *vcpu,
gpa_t addr, unsigned int len);
@@ -147,4 +162,12 @@ unsigned int vgic_v2_init_dist_iodev(struct vgic_io_device *dev);
unsigned int vgic_v3_init_dist_iodev(struct vgic_io_device *dev);
+#ifdef CONFIG_KVM_ARM_VGIC_V3
+u64 vgic_sanitise_outer_cacheability(u64 reg);
+u64 vgic_sanitise_inner_cacheability(u64 reg);
+u64 vgic_sanitise_shareability(u64 reg);
+u64 vgic_sanitise_field(u64 reg, u64 field_mask, int field_shift,
+ u64 (*sanitise_fn)(u64));
+#endif
+
#endif
diff --git a/virt/kvm/arm/vgic/vgic-v2.c b/virt/kvm/arm/vgic/vgic-v2.c
index e31405ee5515..0bf6709d1006 100644
--- a/virt/kvm/arm/vgic/vgic-v2.c
+++ b/virt/kvm/arm/vgic/vgic-v2.c
@@ -124,6 +124,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu)
}
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -332,20 +333,25 @@ int vgic_v2_probe(const struct gic_kvm_info *info)
vtr = readl_relaxed(kvm_vgic_global_state.vctrl_base + GICH_VTR);
kvm_vgic_global_state.nr_lr = (vtr & 0x3f) + 1;
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device\n");
+ iounmap(kvm_vgic_global_state.vctrl_base);
+ return ret;
+ }
+
ret = create_hyp_io_mappings(kvm_vgic_global_state.vctrl_base,
kvm_vgic_global_state.vctrl_base +
resource_size(&info->vctrl),
info->vctrl.start);
-
if (ret) {
kvm_err("Cannot map VCTRL into hyp\n");
+ kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
iounmap(kvm_vgic_global_state.vctrl_base);
return ret;
}
kvm_vgic_global_state.can_emulate_gicv2 = true;
- kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
-
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
kvm_vgic_global_state.type = VGIC_V2;
kvm_vgic_global_state.max_gic_vcpus = VGIC_V2_MAX_CPUS;
diff --git a/virt/kvm/arm/vgic/vgic-v3.c b/virt/kvm/arm/vgic/vgic-v3.c
index 346b4ad12b49..0506543df38a 100644
--- a/virt/kvm/arm/vgic/vgic-v3.c
+++ b/virt/kvm/arm/vgic/vgic-v3.c
@@ -81,6 +81,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
else
intid = val & GICH_LR_VIRTUALID;
irq = vgic_get_irq(vcpu->kvm, vcpu, intid);
+ if (!irq) /* An LPI could have been unmapped. */
+ continue;
spin_lock(&irq->irq_lock);
@@ -113,6 +115,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu)
}
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
}
}
@@ -190,6 +193,11 @@ void vgic_v3_get_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcrp)
vmcrp->pmr = (vmcr & ICH_VMCR_PMR_MASK) >> ICH_VMCR_PMR_SHIFT;
}
+#define INITIAL_PENDBASER_VALUE \
+ (GIC_BASER_CACHEABILITY(GICR_PENDBASER, INNER, RaWb) | \
+ GIC_BASER_CACHEABILITY(GICR_PENDBASER, OUTER, SameAsInner) | \
+ GIC_BASER_SHAREABILITY(GICR_PENDBASER, InnerShareable))
+
void vgic_v3_enable(struct kvm_vcpu *vcpu)
{
struct vgic_v3_cpu_if *vgic_v3 = &vcpu->arch.vgic_cpu.vgic_v3;
@@ -207,10 +215,12 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu)
* way, so we force SRE to 1 to demonstrate this to the guest.
* This goes with the spec allowing the value to be RAO/WI.
*/
- if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3)
+ if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V3) {
vgic_v3->vgic_sre = ICC_SRE_EL1_SRE;
- else
+ vcpu->arch.vgic_cpu.pendbaser = INITIAL_PENDBASER_VALUE;
+ } else {
vgic_v3->vgic_sre = 0;
+ }
/* Get the show on the road... */
vgic_v3->vgic_hcr = ICH_HCR_EN;
@@ -296,6 +306,7 @@ out:
int vgic_v3_probe(const struct gic_kvm_info *info)
{
u32 ich_vtr_el2 = kvm_call_hyp(__vgic_v3_get_ich_vtr_el2);
+ int ret;
/*
* The ListRegs field is 5 bits, but there is a architectural
@@ -319,12 +330,22 @@ int vgic_v3_probe(const struct gic_kvm_info *info)
} else {
kvm_vgic_global_state.vcpu_base = info->vcpu.start;
kvm_vgic_global_state.can_emulate_gicv2 = true;
- kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V2);
+ if (ret) {
+ kvm_err("Cannot register GICv2 KVM device.\n");
+ return ret;
+ }
kvm_info("vgic-v2@%llx\n", info->vcpu.start);
}
+ ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
+ if (ret) {
+ kvm_err("Cannot register GICv3 KVM device.\n");
+ kvm_unregister_device_ops(KVM_DEV_TYPE_ARM_VGIC_V2);
+ return ret;
+ }
+
if (kvm_vgic_global_state.vcpu_base == 0)
kvm_info("disabling GICv2 emulation\n");
- kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
kvm_vgic_global_state.vctrl_base = NULL;
kvm_vgic_global_state.type = VGIC_V3;
diff --git a/virt/kvm/arm/vgic/vgic.c b/virt/kvm/arm/vgic/vgic.c
index 69b61abefa19..39f3358c6d91 100644
--- a/virt/kvm/arm/vgic/vgic.c
+++ b/virt/kvm/arm/vgic/vgic.c
@@ -33,10 +33,17 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
/*
* Locking order is always:
- * vgic_cpu->ap_list_lock
- * vgic_irq->irq_lock
+ * its->cmd_lock (mutex)
+ * its->its_lock (mutex)
+ * vgic_cpu->ap_list_lock
+ * kvm->lpi_list_lock
+ * vgic_irq->irq_lock
*
- * (that is, always take the ap_list_lock before the struct vgic_irq lock).
+ * If you need to take multiple locks, always take the upper lock first,
+ * then the lower ones, e.g. first take the its_lock, then the irq_lock.
+ * If you are already holding a lock and need to take a higher one, you
+ * have to drop the lower ranking lock first and re-aquire it after having
+ * taken the upper one.
*
* When taking more than one ap_list_lock at the same time, always take the
* lowest numbered VCPU's ap_list_lock first, so:
@@ -45,6 +52,41 @@ struct vgic_global __section(.hyp.text) kvm_vgic_global_state;
* spin_lock(vcpuY->arch.vgic_cpu.ap_list_lock);
*/
+/*
+ * Iterate over the VM's list of mapped LPIs to find the one with a
+ * matching interrupt ID and return a reference to the IRQ structure.
+ */
+static struct vgic_irq *vgic_get_lpi(struct kvm *kvm, u32 intid)
+{
+ struct vgic_dist *dist = &kvm->arch.vgic;
+ struct vgic_irq *irq = NULL;
+
+ spin_lock(&dist->lpi_list_lock);
+
+ list_for_each_entry(irq, &dist->lpi_list_head, lpi_list) {
+ if (irq->intid != intid)
+ continue;
+
+ /*
+ * This increases the refcount, the caller is expected to
+ * call vgic_put_irq() later once it's finished with the IRQ.
+ */
+ vgic_get_irq_kref(irq);
+ goto out_unlock;
+ }
+ irq = NULL;
+
+out_unlock:
+ spin_unlock(&dist->lpi_list_lock);
+
+ return irq;
+}
+
+/*
+ * This looks up the virtual interrupt ID to get the corresponding
+ * struct vgic_irq. It also increases the refcount, so any caller is expected
+ * to call vgic_put_irq() once it's finished with this IRQ.
+ */
struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
u32 intid)
{
@@ -56,14 +98,43 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
if (intid <= VGIC_MAX_SPI)
return &kvm->arch.vgic.spis[intid - VGIC_NR_PRIVATE_IRQS];
- /* LPIs are not yet covered */
+ /* LPIs */
if (intid >= VGIC_MIN_LPI)
- return NULL;
+ return vgic_get_lpi(kvm, intid);
WARN(1, "Looking up struct vgic_irq for reserved INTID");
return NULL;
}
+/*
+ * We can't do anything in here, because we lack the kvm pointer to
+ * lock and remove the item from the lpi_list. So we keep this function
+ * empty and use the return value of kref_put() to trigger the freeing.
+ */
+static void vgic_irq_release(struct kref *ref)
+{
+}
+
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq)
+{
+ struct vgic_dist *dist;
+
+ if (irq->intid < VGIC_MIN_LPI)
+ return;
+
+ if (!kref_put(&irq->refcount, vgic_irq_release))
+ return;
+
+ dist = &kvm->arch.vgic;
+
+ spin_lock(&dist->lpi_list_lock);
+ list_del(&irq->lpi_list);
+ dist->lpi_list_count--;
+ spin_unlock(&dist->lpi_list_lock);
+
+ kfree(irq);
+}
+
/**
* kvm_vgic_target_oracle - compute the target vcpu for an irq
*
@@ -236,6 +307,11 @@ retry:
goto retry;
}
+ /*
+ * Grab a reference to the irq to reflect the fact that it is
+ * now in the ap_list.
+ */
+ vgic_get_irq_kref(irq);
list_add_tail(&irq->ap_list, &vcpu->arch.vgic_cpu.ap_list_head);
irq->vcpu = vcpu;
@@ -269,14 +345,17 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
if (!irq)
return -EINVAL;
- if (irq->hw != mapped_irq)
+ if (irq->hw != mapped_irq) {
+ vgic_put_irq(kvm, irq);
return -EINVAL;
+ }
spin_lock(&irq->irq_lock);
if (!vgic_validate_injection(irq, level)) {
/* Nothing to see here, move along... */
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(kvm, irq);
return 0;
}
@@ -288,6 +367,7 @@ static int vgic_update_irq_pending(struct kvm *kvm, int cpuid,
}
vgic_queue_irq_unlock(kvm, irq);
+ vgic_put_irq(kvm, irq);
return 0;
}
@@ -330,25 +410,28 @@ int kvm_vgic_map_phys_irq(struct kvm_vcpu *vcpu, u32 virt_irq, u32 phys_irq)
irq->hwintid = phys_irq;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
return 0;
}
int kvm_vgic_unmap_phys_irq(struct kvm_vcpu *vcpu, unsigned int virt_irq)
{
- struct vgic_irq *irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
-
- BUG_ON(!irq);
+ struct vgic_irq *irq;
if (!vgic_initialized(vcpu->kvm))
return -EAGAIN;
+ irq = vgic_get_irq(vcpu->kvm, vcpu, virt_irq);
+ BUG_ON(!irq);
+
spin_lock(&irq->irq_lock);
irq->hw = false;
irq->hwintid = 0;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
return 0;
}
@@ -386,6 +469,15 @@ retry:
list_del(&irq->ap_list);
irq->vcpu = NULL;
spin_unlock(&irq->irq_lock);
+
+ /*
+ * This vgic_put_irq call matches the
+ * vgic_get_irq_kref in vgic_queue_irq_unlock,
+ * where we added the LPI to the ap_list. As
+ * we remove the irq from the list, we drop
+ * also drop the refcount.
+ */
+ vgic_put_irq(vcpu->kvm, irq);
continue;
}
@@ -614,6 +706,15 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int virt_irq)
spin_lock(&irq->irq_lock);
map_is_active = irq->hw && irq->active;
spin_unlock(&irq->irq_lock);
+ vgic_put_irq(vcpu->kvm, irq);
return map_is_active;
}
+
+int kvm_send_userspace_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ if (vgic_has_its(kvm))
+ return vgic_its_inject_msi(kvm, msi);
+ else
+ return -ENODEV;
+}
diff --git a/virt/kvm/arm/vgic/vgic.h b/virt/kvm/arm/vgic/vgic.h
index 7b300ca370b7..1d8e21d5c13f 100644
--- a/virt/kvm/arm/vgic/vgic.h
+++ b/virt/kvm/arm/vgic/vgic.h
@@ -25,6 +25,7 @@
#define IS_VGIC_ADDR_UNDEF(_x) ((_x) == VGIC_ADDR_UNDEF)
#define INTERRUPT_ID_BITS_SPIS 10
+#define INTERRUPT_ID_BITS_ITS 16
#define VGIC_PRI_BITS 5
#define vgic_irq_is_sgi(intid) ((intid) < VGIC_NR_SGIS)
@@ -38,9 +39,13 @@ struct vgic_vmcr {
struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu,
u32 intid);
+void vgic_put_irq(struct kvm *kvm, struct vgic_irq *irq);
bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq);
void vgic_kick_vcpus(struct kvm *kvm);
+int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr,
+ phys_addr_t addr, phys_addr_t alignment);
+
void vgic_v2_process_maintenance(struct kvm_vcpu *vcpu);
void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu);
void vgic_v2_populate_lr(struct kvm_vcpu *vcpu, struct vgic_irq *irq, int lr);
@@ -59,6 +64,14 @@ int vgic_v2_map_resources(struct kvm *kvm);
int vgic_register_dist_iodev(struct kvm *kvm, gpa_t dist_base_address,
enum vgic_type);
+static inline void vgic_get_irq_kref(struct vgic_irq *irq)
+{
+ if (irq->intid < VGIC_MIN_LPI)
+ return;
+
+ kref_get(&irq->refcount);
+}
+
#ifdef CONFIG_KVM_ARM_VGIC_V3
void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu);
void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu);
@@ -71,6 +84,10 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu);
int vgic_v3_probe(const struct gic_kvm_info *info);
int vgic_v3_map_resources(struct kvm *kvm);
int vgic_register_redist_iodevs(struct kvm *kvm, gpa_t dist_base_address);
+bool vgic_has_its(struct kvm *kvm);
+int kvm_vgic_register_its_device(void);
+void vgic_enable_lpis(struct kvm_vcpu *vcpu);
+int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi);
#else
static inline void vgic_v3_process_maintenance(struct kvm_vcpu *vcpu)
{
@@ -122,9 +139,28 @@ static inline int vgic_register_redist_iodevs(struct kvm *kvm,
{
return -ENODEV;
}
+
+static inline bool vgic_has_its(struct kvm *kvm)
+{
+ return false;
+}
+
+static inline int kvm_vgic_register_its_device(void)
+{
+ return -ENODEV;
+}
+
+static inline void vgic_enable_lpis(struct kvm_vcpu *vcpu)
+{
+}
+
+static inline int vgic_its_inject_msi(struct kvm *kvm, struct kvm_msi *msi)
+{
+ return -ENODEV;
+}
#endif
-void kvm_register_vgic_device(unsigned long type);
+int kvm_register_vgic_device(unsigned long type);
int vgic_lazy_init(struct kvm *kvm);
int vgic_init(struct kvm *kvm);
diff --git a/virt/kvm/irqchip.c b/virt/kvm/irqchip.c
index 8db197bb6c7a..df99e9c3b64d 100644
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -135,7 +135,8 @@ void kvm_free_irq_routing(struct kvm *kvm)
free_irq_routing_table(rt);
}
-static int setup_routing_entry(struct kvm_irq_routing_table *rt,
+static int setup_routing_entry(struct kvm *kvm,
+ struct kvm_irq_routing_table *rt,
struct kvm_kernel_irq_routing_entry *e,
const struct kvm_irq_routing_entry *ue)
{
@@ -154,7 +155,7 @@ static int setup_routing_entry(struct kvm_irq_routing_table *rt,
e->gsi = ue->gsi;
e->type = ue->type;
- r = kvm_set_routing_entry(e, ue);
+ r = kvm_set_routing_entry(kvm, e, ue);
if (r)
goto out;
if (e->type == KVM_IRQ_ROUTING_IRQCHIP)
@@ -211,7 +212,7 @@ int kvm_set_irq_routing(struct kvm *kvm,
kfree(e);
goto out;
}
- r = setup_routing_entry(new, e, ue);
+ r = setup_routing_entry(kvm, new, e, ue);
if (r) {
kfree(e);
goto out;
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2e791367c576..cc081ccfcaa3 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -1444,6 +1444,52 @@ static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
return true;
}
+static int hva_to_pfn_remapped(struct vm_area_struct *vma,
+ unsigned long addr, bool *async,
+ bool write_fault, kvm_pfn_t *p_pfn)
+{
+ unsigned long pfn;
+ int r;
+
+ r = follow_pfn(vma, addr, &pfn);
+ if (r) {
+ /*
+ * get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
+ * not call the fault handler, so do it here.
+ */
+ bool unlocked = false;
+ r = fixup_user_fault(current, current->mm, addr,
+ (write_fault ? FAULT_FLAG_WRITE : 0),
+ &unlocked);
+ if (unlocked)
+ return -EAGAIN;
+ if (r)
+ return r;
+
+ r = follow_pfn(vma, addr, &pfn);
+ if (r)
+ return r;
+
+ }
+
+
+ /*
+ * Get a reference here because callers of *hva_to_pfn* and
+ * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the
+ * returned pfn. This is only needed if the VMA has VM_MIXEDMAP
+ * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will
+ * simply do nothing for reserved pfns.
+ *
+ * Whoever called remap_pfn_range is also going to call e.g.
+ * unmap_mapping_range before the underlying pages are freed,
+ * causing a call to our MMU notifier.
+ */
+ kvm_get_pfn(pfn);
+
+ *p_pfn = pfn;
+ return 0;
+}
+
/*
* Pin guest page in memory and return its pfn.
* @addr: host virtual address which maps memory to the guest
@@ -1463,7 +1509,7 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
{
struct vm_area_struct *vma;
kvm_pfn_t pfn = 0;
- int npages;
+ int npages, r;
/* we can do it either atomically or asynchronously, not both */
BUG_ON(atomic && async);
@@ -1485,14 +1531,17 @@ static kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool *async,
goto exit;
}
+retry:
vma = find_vma_intersection(current->mm, addr, addr + 1);
if (vma == NULL)
pfn = KVM_PFN_ERR_FAULT;
- else if ((vma->vm_flags & VM_PFNMAP)) {
- pfn = ((addr - vma->vm_start) >> PAGE_SHIFT) +
- vma->vm_pgoff;
- BUG_ON(!kvm_is_reserved_pfn(pfn));
+ else if (vma->vm_flags & (VM_IO | VM_PFNMAP)) {
+ r = hva_to_pfn_remapped(vma, addr, async, write_fault, &pfn);
+ if (r == -EAGAIN)
+ goto retry;
+ if (r < 0)
+ pfn = KVM_PFN_ERR_FAULT;
} else {
if (async && vma_is_valid(vma, write_fault))
*async = true;
@@ -2348,9 +2397,20 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
if (id >= KVM_MAX_VCPU_ID)
return -EINVAL;
+ mutex_lock(&kvm->lock);
+ if (kvm->created_vcpus == KVM_MAX_VCPUS) {
+ mutex_unlock(&kvm->lock);
+ return -EINVAL;
+ }
+
+ kvm->created_vcpus++;
+ mutex_unlock(&kvm->lock);
+
vcpu = kvm_arch_vcpu_create(kvm, id);
- if (IS_ERR(vcpu))
- return PTR_ERR(vcpu);
+ if (IS_ERR(vcpu)) {
+ r = PTR_ERR(vcpu);
+ goto vcpu_decrement;
+ }
preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops);
@@ -2359,14 +2419,6 @@ static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
goto vcpu_destroy;
mutex_lock(&kvm->lock);
- if (!kvm_vcpu_compatible(vcpu)) {
- r = -EINVAL;
- goto unlock_vcpu_destroy;
- }
- if (atomic_read(&kvm->online_vcpus) == KVM_MAX_VCPUS) {
- r = -EINVAL;
- goto unlock_vcpu_destroy;
- }
if (kvm_get_vcpu_by_id(kvm, id)) {
r = -EEXIST;
goto unlock_vcpu_destroy;
@@ -2399,6 +2451,10 @@ unlock_vcpu_destroy:
mutex_unlock(&kvm->lock);
vcpu_destroy:
kvm_arch_vcpu_destroy(vcpu);
+vcpu_decrement:
+ mutex_lock(&kvm->lock);
+ kvm->created_vcpus--;
+ mutex_unlock(&kvm->lock);
return r;
}
@@ -3487,6 +3543,30 @@ int kvm_io_bus_unregister_dev(struct kvm *kvm, enum kvm_bus bus_idx,
return r;
}
+struct kvm_io_device *kvm_io_bus_get_dev(struct kvm *kvm, enum kvm_bus bus_idx,
+ gpa_t addr)
+{
+ struct kvm_io_bus *bus;
+ int dev_idx, srcu_idx;
+ struct kvm_io_device *iodev = NULL;
+
+ srcu_idx = srcu_read_lock(&kvm->srcu);
+
+ bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
+
+ dev_idx = kvm_io_bus_get_first_dev(bus, addr, 1);
+ if (dev_idx < 0)
+ goto out_unlock;
+
+ iodev = bus->range[dev_idx].dev;
+
+out_unlock:
+ srcu_read_unlock(&kvm->srcu, srcu_idx);
+
+ return iodev;
+}
+EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
+
static int kvm_debugfs_open(struct inode *inode, struct file *file,
int (*get)(void *, u64 *), int (*set)(void *, u64),
const char *fmt)