diff options
Diffstat (limited to 'arch')
613 files changed, 10126 insertions, 7310 deletions
diff --git a/arch/Kconfig b/arch/Kconfig index 659bdd079277..19483aea4bbc 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -96,6 +96,7 @@ config KPROBES_ON_FTRACE config UPROBES def_bool n + depends on ARCH_SUPPORTS_UPROBES help Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') @@ -363,8 +364,9 @@ menuconfig GCC_PLUGINS See Documentation/gcc-plugins.txt for details. config GCC_PLUGIN_CYC_COMPLEXITY - bool "Compute the cyclomatic complexity of a function" + bool "Compute the cyclomatic complexity of a function" if EXPERT depends on GCC_PLUGINS + depends on !COMPILE_TEST help The complexity M of a function's control flow graph is defined as: M = E - N + 2P @@ -374,6 +376,10 @@ config GCC_PLUGIN_CYC_COMPLEXITY N = the number of nodes P = the number of connected components (exit nodes). + Enabling this plugin reports the complexity to stderr during the + build. It mainly serves as a simple example of how to create a + gcc plugin for the kernel. + config GCC_PLUGIN_SANCOV bool depends on GCC_PLUGINS @@ -512,6 +518,9 @@ config HAVE_CONTEXT_TRACKING config HAVE_VIRT_CPU_ACCOUNTING bool +config ARCH_HAS_SCALED_CPUTIME + bool + config HAVE_VIRT_CPU_ACCOUNTING_GEN bool default y if 64BIT diff --git a/arch/alpha/include/asm/mutex.h b/arch/alpha/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/alpha/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. 
(see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/alpha/include/asm/processor.h b/arch/alpha/include/asm/processor.h index 43a7559c448b..2fec2dee3020 100644 --- a/arch/alpha/include/asm/processor.h +++ b/arch/alpha/include/asm/processor.h @@ -58,7 +58,6 @@ unsigned long get_wchan(struct task_struct *p); ((tsk) == current ? rdusp() : task_thread_info(tsk)->pcb.usp) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #define ARCH_HAS_PREFETCH #define ARCH_HAS_PREFETCHW diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h index 9e46d6e656d9..afc901b7a6f6 100644 --- a/arch/alpha/include/uapi/asm/socket.h +++ b/arch/alpha/include/uapi/asm/socket.h @@ -97,4 +97,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/alpha/kernel/osf_sys.c b/arch/alpha/kernel/osf_sys.c index ffb93f499c83..56e427c7aa3c 100644 --- a/arch/alpha/kernel/osf_sys.c +++ b/arch/alpha/kernel/osf_sys.c @@ -1029,11 +1029,16 @@ SYSCALL_DEFINE2(osf_settimeofday, struct timeval32 __user *, tv, return do_sys_settimeofday(tv ? &kts : NULL, tz ? 
&ktz : NULL); } +asmlinkage long sys_ni_posix_timers(void); + SYSCALL_DEFINE2(osf_getitimer, int, which, struct itimerval32 __user *, it) { struct itimerval kit; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + error = do_getitimer(which, &kit); if (!error && put_it32(it, &kit)) error = -EFAULT; @@ -1047,6 +1052,9 @@ SYSCALL_DEFINE3(osf_setitimer, int, which, struct itimerval32 __user *, in, struct itimerval kin, kout; int error; + if (!IS_ENABLED(CONFIG_POSIX_TIMERS)) + return sys_ni_posix_timers(); + if (in) { if (get_it32(&kin, in)) return -EFAULT; diff --git a/arch/arc/include/asm/mutex.h b/arch/arc/include/asm/mutex.h deleted file mode 100644 index a2f88ff9f506..000000000000 --- a/arch/arc/include/asm/mutex.h +++ /dev/null @@ -1,18 +0,0 @@ -/* - * Copyright (C) 2004, 2007-2010, 2011-2012 Synopsys, Inc. (www.synopsys.com) - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -/* - * xchg() based mutex fast path maintains a state of 0 or 1, as opposed to - * atomic dec based which can "count" any number of lock contenders. - * This ideally needs to be fixed in core, but for now switching to dec ver. 
- */ -#if defined(CONFIG_SMP) && (CONFIG_NR_CPUS > 2) -#include <asm-generic/mutex-dec.h> -#else -#include <asm-generic/mutex-xchg.h> -#endif diff --git a/arch/arc/include/asm/processor.h b/arch/arc/include/asm/processor.h index 16b630fbeb6a..6e1242da0159 100644 --- a/arch/arc/include/asm/processor.h +++ b/arch/arc/include/asm/processor.h @@ -60,15 +60,12 @@ struct task_struct; #ifndef CONFIG_EZNPS_MTM_EXT #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #else #define cpu_relax() \ __asm__ __volatile__ (".word %0" : : "i"(CTOP_INST_SCHD_RW) : "memory") -#define cpu_relax_lowlatency() barrier() - #endif #define copy_segments(tsk, mm) do { } while (0) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index b5d529fdffab..caef68429b08 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -703,6 +703,7 @@ config ARCH_VIRT select ARM_GIC select ARM_GIC_V2M if PCI select ARM_GIC_V3 + select ARM_GIC_V3_ITS if PCI select ARM_PSCI select HAVE_ARM_ARCH_TIMER diff --git a/arch/arm/boot/dts/dra72-evm-revc.dts b/arch/arm/boot/dts/dra72-evm-revc.dts index 064b322a7a04..3b23b32e1b30 100644 --- a/arch/arm/boot/dts/dra72-evm-revc.dts +++ b/arch/arm/boot/dts/dra72-evm-revc.dts @@ -59,15 +59,17 @@ &davinci_mdio { dp83867_0: ethernet-phy@2 { reg = <2>; - ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_00_NS>; - ti,tx-internal-delay = <DP83867_RGMIIDCTL_1_NS>; + ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>; + ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>; ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>; + ti,min-output-impedance; }; dp83867_1: ethernet-phy@3 { reg = <3>; - ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_00_NS>; - ti,tx-internal-delay = <DP83867_RGMIIDCTL_1_NS>; + ti,rx-internal-delay = <DP83867_RGMIIDCTL_2_25_NS>; + ti,tx-internal-delay = <DP83867_RGMIIDCTL_250_PS>; ti,fifo-depth = <DP83867_PHYCR_FIFO_DEPTH_8_B_NIB>; + ti,min-output-imepdance; }; }; diff --git a/arch/arm/boot/dts/hisi-x5hd2.dtsi b/arch/arm/boot/dts/hisi-x5hd2.dtsi index 
fdcc23d203e5..0da76c5ff6d7 100644 --- a/arch/arm/boot/dts/hisi-x5hd2.dtsi +++ b/arch/arm/boot/dts/hisi-x5hd2.dtsi @@ -436,18 +436,20 @@ }; gmac0: ethernet@1840000 { - compatible = "hisilicon,hix5hd2-gmac"; + compatible = "hisilicon,hix5hd2-gemac", "hisilicon,hisi-gemac-v1"; reg = <0x1840000 0x1000>,<0x184300c 0x4>; interrupts = <0 71 4>; clocks = <&clock HIX5HD2_MAC0_CLK>; + clock-names = "mac_core"; status = "disabled"; }; gmac1: ethernet@1841000 { - compatible = "hisilicon,hix5hd2-gmac"; + compatible = "hisilicon,hix5hd2-gemac", "hisilicon,hisi-gemac-v1"; reg = <0x1841000 0x1000>,<0x1843010 0x4>; interrupts = <0 72 4>; clocks = <&clock HIX5HD2_MAC1_CLK>; + clock-names = "mac_core"; status = "disabled"; }; diff --git a/arch/arm/boot/dts/imx7s.dtsi b/arch/arm/boot/dts/imx7s.dtsi index 0d7d5ac6257b..2b6cb05bc01a 100644 --- a/arch/arm/boot/dts/imx7s.dtsi +++ b/arch/arm/boot/dts/imx7s.dtsi @@ -643,9 +643,8 @@ reg = <0x30730000 0x10000>; interrupts = <GIC_SPI 5 IRQ_TYPE_LEVEL_HIGH>; clocks = <&clks IMX7D_LCDIF_PIXEL_ROOT_CLK>, - <&clks IMX7D_CLK_DUMMY>, - <&clks IMX7D_CLK_DUMMY>; - clock-names = "pix", "axi", "disp_axi"; + <&clks IMX7D_LCDIF_PIXEL_ROOT_CLK>; + clock-names = "pix", "axi"; status = "disabled"; }; }; diff --git a/arch/arm/boot/dts/orion5x-linkstation-lsgl.dts b/arch/arm/boot/dts/orion5x-linkstation-lsgl.dts index 1cf644bfd7ea..51dc734cd5b9 100644 --- a/arch/arm/boot/dts/orion5x-linkstation-lsgl.dts +++ b/arch/arm/boot/dts/orion5x-linkstation-lsgl.dts @@ -82,6 +82,10 @@ gpios = <&gpio0 9 GPIO_ACTIVE_HIGH>; }; +&sata { + nr-ports = <2>; +}; + &ehci1 { status = "okay"; }; diff --git a/arch/arm/boot/dts/r8a7778.dtsi b/arch/arm/boot/dts/r8a7778.dtsi index e571d66ea0fe..3d0a18abd408 100644 --- a/arch/arm/boot/dts/r8a7778.dtsi +++ b/arch/arm/boot/dts/r8a7778.dtsi @@ -626,4 +626,9 @@ "sru-src6", "sru-src7", "sru-src8"; }; }; + + rst: reset-controller@ffcc0000 { + compatible = "renesas,r8a7778-reset-wdt"; + reg = <0xffcc0000 0x40>; + }; }; diff --git 
a/arch/arm/boot/dts/r8a7779.dtsi b/arch/arm/boot/dts/r8a7779.dtsi index b9bbcce69dfb..8cf16008a09b 100644 --- a/arch/arm/boot/dts/r8a7779.dtsi +++ b/arch/arm/boot/dts/r8a7779.dtsi @@ -590,6 +590,11 @@ }; }; + rst: reset-controller@ffcc0000 { + compatible = "renesas,r8a7779-reset-wdt"; + reg = <0xffcc0000 0x48>; + }; + sysc: system-controller@ffd85000 { compatible = "renesas,r8a7779-sysc"; reg = <0xffd85000 0x0200>; diff --git a/arch/arm/boot/dts/r8a7790.dtsi b/arch/arm/boot/dts/r8a7790.dtsi index 351fcc2f87df..3f10b0bf1b08 100644 --- a/arch/arm/boot/dts/r8a7790.dtsi +++ b/arch/arm/boot/dts/r8a7790.dtsi @@ -1471,6 +1471,11 @@ }; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7790-rst"; + reg = <0 0xe6160000 0 0x0100>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7790-sysc"; reg = <0 0xe6180000 0 0x0200>; diff --git a/arch/arm/boot/dts/r8a7791.dtsi b/arch/arm/boot/dts/r8a7791.dtsi index 162b55c665a3..c465c79bcca6 100644 --- a/arch/arm/boot/dts/r8a7791.dtsi +++ b/arch/arm/boot/dts/r8a7791.dtsi @@ -1482,6 +1482,11 @@ }; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7791-rst"; + reg = <0 0xe6160000 0 0x0100>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7791-sysc"; reg = <0 0xe6180000 0 0x0200>; diff --git a/arch/arm/boot/dts/r8a7792.dtsi b/arch/arm/boot/dts/r8a7792.dtsi index 713141d38b3e..6e1f61f65d29 100644 --- a/arch/arm/boot/dts/r8a7792.dtsi +++ b/arch/arm/boot/dts/r8a7792.dtsi @@ -118,6 +118,11 @@ IRQ_TYPE_LEVEL_LOW)>; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7792-rst"; + reg = <0 0xe6160000 0 0x0100>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7792-sysc"; reg = <0 0xe6180000 0 0x0200>; diff --git a/arch/arm/boot/dts/r8a7793.dtsi b/arch/arm/boot/dts/r8a7793.dtsi index 8d02aacf2892..e4b385eccf74 100644 --- a/arch/arm/boot/dts/r8a7793.dtsi +++ b/arch/arm/boot/dts/r8a7793.dtsi @@ -1279,6 +1279,11 @@ }; }; + rst: reset-controller@e6160000 { 
+ compatible = "renesas,r8a7793-rst"; + reg = <0 0xe6160000 0 0x0100>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7793-sysc"; reg = <0 0xe6180000 0 0x0200>; diff --git a/arch/arm/boot/dts/r8a7794.dtsi b/arch/arm/boot/dts/r8a7794.dtsi index 9365580a194f..69e4f4fad89b 100644 --- a/arch/arm/boot/dts/r8a7794.dtsi +++ b/arch/arm/boot/dts/r8a7794.dtsi @@ -1375,6 +1375,11 @@ }; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7794-rst"; + reg = <0 0xe6160000 0 0x0100>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7794-sysc"; reg = <0 0xe6180000 0 0x0200>; diff --git a/arch/arm/boot/dts/rk3036.dtsi b/arch/arm/boot/dts/rk3036.dtsi index a935523a1eb8..7c2dc19925a1 100644 --- a/arch/arm/boot/dts/rk3036.dtsi +++ b/arch/arm/boot/dts/rk3036.dtsi @@ -204,7 +204,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; status = "disabled"; }; diff --git a/arch/arm/boot/dts/rk3288.dtsi b/arch/arm/boot/dts/rk3288.dtsi index 17ec2e2d7a60..74a749c566ee 100644 --- a/arch/arm/boot/dts/rk3288.dtsi +++ b/arch/arm/boot/dts/rk3288.dtsi @@ -596,7 +596,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; phys = <&usbphy0>; phy-names = "usb2-phy"; status = "disabled"; diff --git a/arch/arm/boot/dts/rk3xxx.dtsi b/arch/arm/boot/dts/rk3xxx.dtsi index e15beb3c671e..8fbd3c806fa0 100644 --- a/arch/arm/boot/dts/rk3xxx.dtsi +++ b/arch/arm/boot/dts/rk3xxx.dtsi @@ -181,7 +181,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; phys = <&usbphy0>; phy-names = "usb2-phy"; status = "disabled"; diff --git a/arch/arm/boot/dts/sun8i-h3.dtsi b/arch/arm/boot/dts/sun8i-h3.dtsi index 75a865406d3e..f4ba088b225e 100644 --- a/arch/arm/boot/dts/sun8i-h3.dtsi +++ b/arch/arm/boot/dts/sun8i-h3.dtsi @@ -410,7 +410,7 @@ }; uart3_pins: uart3 { - allwinner,pins = "PG13", "PG14"; + allwinner,pins = 
"PA13", "PA14"; allwinner,function = "uart3"; allwinner,drive = <SUN4I_PINCTRL_10_MA>; allwinner,pull = <SUN4I_PINCTRL_NO_PULL>; diff --git a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts index 5c1fcab4a6f7..1552db00cc59 100644 --- a/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts +++ b/arch/arm/boot/dts/vf610-zii-dev-rev-b.dts @@ -88,10 +88,16 @@ switch0: switch0@0 { compatible = "marvell,mv88e6085"; + pinctrl-0 = <&pinctrl_gpio_switch0>; + pinctrl-names = "default"; #address-cells = <1>; #size-cells = <0>; reg = <0>; dsa,member = <0 0>; + interrupt-parent = <&gpio0>; + interrupts = <27 IRQ_TYPE_LEVEL_LOW>; + interrupt-controller; + #interrupt-cells = <2>; ports { #address-cells = <1>; @@ -99,16 +105,19 @@ port@0 { reg = <0>; label = "lan0"; + phy-handle = <&switch0phy0>; }; port@1 { reg = <1>; label = "lan1"; + phy-handle = <&switch0phy1>; }; port@2 { reg = <2>; label = "lan2"; + phy-handle = <&switch0phy2>; }; switch0port5: port@5 { @@ -133,6 +142,24 @@ }; }; }; + mdio { + #address-cells = <1>; + #size-cells = <0>; + switch0phy0: switch0phy0@0 { + reg = <0>; + interrupt-parent = <&switch0>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; + }; + switch0phy1: switch1phy0@1 { + reg = <1>; + interrupt-parent = <&switch0>; + interrupts = <1 IRQ_TYPE_LEVEL_HIGH>; }; + switch0phy2: switch1phy0@2 { + reg = <2>; + interrupt-parent = <&switch0>; + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; + }; + }; }; }; @@ -143,10 +170,16 @@ switch1: switch1@0 { compatible = "marvell,mv88e6085"; + pinctrl-0 = <&pinctrl_gpio_switch1>; + pinctrl-names = "default"; #address-cells = <1>; #size-cells = <0>; reg = <0>; dsa,member = <0 1>; + interrupt-parent = <&gpio0>; + interrupts = <26 IRQ_TYPE_LEVEL_LOW>; + interrupt-controller; + #interrupt-cells = <2>; ports { #address-cells = <1>; @@ -196,12 +229,18 @@ #size-cells = <0>; switch1phy0: switch1phy0@0 { reg = <0>; + interrupt-parent = <&switch1>; + interrupts = <0 IRQ_TYPE_LEVEL_HIGH>; }; switch1phy1: switch1phy0@1 { 
reg = <1>; + interrupt-parent = <&switch1>; + interrupts = <1 IRQ_TYPE_LEVEL_HIGH>; }; switch1phy2: switch1phy0@2 { reg = <2>; + interrupt-parent = <&switch1>; + interrupts = <2 IRQ_TYPE_LEVEL_HIGH>; }; }; }; @@ -636,6 +675,18 @@ >; }; + pinctrl_gpio_switch0: pinctrl-gpio-switch0 { + fsl,pins = < + VF610_PAD_PTB5__GPIO_27 0x219d + >; + }; + + pinctrl_gpio_switch1: pinctrl-gpio-switch1 { + fsl,pins = < + VF610_PAD_PTB4__GPIO_26 0x219d + >; + }; + pinctrl_i2c_mux_reset: pinctrl-i2c-mux-reset { fsl,pins = < VF610_PAD_PTE14__GPIO_119 0x31c2 diff --git a/arch/arm/common/bL_switcher.c b/arch/arm/common/bL_switcher.c index 37dc0fe1093f..46730017b3c5 100644 --- a/arch/arm/common/bL_switcher.c +++ b/arch/arm/common/bL_switcher.c @@ -757,19 +757,18 @@ EXPORT_SYMBOL_GPL(bL_switcher_put_enabled); * while the switcher is active. * We're just not ready to deal with that given the trickery involved. */ -static int bL_switcher_hotplug_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) +static int bL_switcher_cpu_pre(unsigned int cpu) { - if (bL_switcher_active) { - int pairing = bL_switcher_cpu_pairing[(unsigned long)hcpu]; - switch (action & 0xf) { - case CPU_UP_PREPARE: - case CPU_DOWN_PREPARE: - if (pairing == -1) - return NOTIFY_BAD; - } - } - return NOTIFY_DONE; + int pairing; + + if (!bL_switcher_active) + return 0; + + pairing = bL_switcher_cpu_pairing[cpu]; + + if (pairing == -1) + return -EINVAL; + return 0; } static bool no_bL_switcher; @@ -782,8 +781,15 @@ static int __init bL_switcher_init(void) if (!mcpm_is_available()) return -ENODEV; - cpu_notifier(bL_switcher_hotplug_callback, 0); - + cpuhp_setup_state_nocalls(CPUHP_ARM_BL_PREPARE, "arm/bl:prepare", + bL_switcher_cpu_pre, NULL); + ret = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "arm/bl:predown", + NULL, bL_switcher_cpu_pre); + if (ret < 0) { + cpuhp_remove_state_nocalls(CPUHP_ARM_BL_PREPARE); + pr_err("bL_switcher: Failed to allocate a hotplug state\n"); + return ret; + } if 
(!no_bL_switcher) { ret = bL_switcher_enable(); if (ret) diff --git a/arch/arm/configs/multi_v7_defconfig b/arch/arm/configs/multi_v7_defconfig index 11f37ed1dbff..30f39acd61bd 100644 --- a/arch/arm/configs/multi_v7_defconfig +++ b/arch/arm/configs/multi_v7_defconfig @@ -489,7 +489,7 @@ CONFIG_MFD_MAX8907=y CONFIG_MFD_MAX8997=y CONFIG_MFD_MAX8998=y CONFIG_MFD_RK808=y -CONFIG_MFD_PM8921_CORE=y +CONFIG_MFD_PM8XXX=y CONFIG_MFD_QCOM_RPM=y CONFIG_MFD_SPMI_PMIC=y CONFIG_MFD_SEC_CORE=y @@ -649,6 +649,9 @@ CONFIG_SND_SOC_AK4642=m CONFIG_SND_SOC_SGTL5000=m CONFIG_SND_SOC_SPDIF=m CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_STI=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SIMPLE_CARD=m CONFIG_USB=y CONFIG_USB_XHCI_HCD=y CONFIG_USB_XHCI_MVEBU=y @@ -790,6 +793,7 @@ CONFIG_DMA_OMAP=y CONFIG_QCOM_BAM_DMA=y CONFIG_XILINX_DMA=y CONFIG_DMA_SUN6I=y +CONFIG_ST_FDMA=m CONFIG_STAGING=y CONFIG_SENSORS_ISL29018=y CONFIG_SENSORS_ISL29028=y @@ -823,6 +827,8 @@ CONFIG_HWSPINLOCK_QCOM=y CONFIG_ROCKCHIP_IOMMU=y CONFIG_TEGRA_IOMMU_GART=y CONFIG_TEGRA_IOMMU_SMMU=y +CONFIG_REMOTEPROC=m +CONFIG_ST_REMOTEPROC=m CONFIG_PM_DEVFREQ=y CONFIG_ARM_TEGRA_DEVFREQ=m CONFIG_MEMORY=y diff --git a/arch/arm/configs/pxa_defconfig b/arch/arm/configs/pxa_defconfig index a016ecc0084b..e4314b1227a3 100644 --- a/arch/arm/configs/pxa_defconfig +++ b/arch/arm/configs/pxa_defconfig @@ -411,7 +411,6 @@ CONFIG_MFD_MAX77693=y CONFIG_MFD_MAX8907=m CONFIG_EZX_PCAP=y CONFIG_UCB1400_CORE=m -CONFIG_MFD_PM8921_CORE=m CONFIG_MFD_SEC_CORE=y CONFIG_MFD_PALMAS=y CONFIG_MFD_TPS65090=y diff --git a/arch/arm/configs/qcom_defconfig b/arch/arm/configs/qcom_defconfig index c2dff4fd5fc4..74e9cd759b99 100644 --- a/arch/arm/configs/qcom_defconfig +++ b/arch/arm/configs/qcom_defconfig @@ -119,7 +119,6 @@ CONFIG_POWER_RESET=y CONFIG_POWER_RESET_MSM=y CONFIG_THERMAL=y CONFIG_MFD_PM8XXX=y -CONFIG_MFD_PM8921_CORE=y CONFIG_MFD_QCOM_RPM=y CONFIG_MFD_SPMI_PMIC=y CONFIG_REGULATOR=y diff --git a/arch/arm/include/asm/arch_gicv3.h 
b/arch/arm/include/asm/arch_gicv3.h index a8088290b778..27475904e096 100644 --- a/arch/arm/include/asm/arch_gicv3.h +++ b/arch/arm/include/asm/arch_gicv3.h @@ -22,6 +22,7 @@ #include <linux/io.h> #include <asm/barrier.h> +#include <asm/cacheflush.h> #include <asm/cp15.h> #define ICC_EOIR1 __ACCESS_CP15(c12, 0, c12, 1) @@ -230,19 +231,14 @@ static inline void gic_write_bpr1(u32 val) * AArch32, since the syndrome register doesn't provide any information for * them. * Consequently, the following IO helpers use 32bit accesses. - * - * There are only two registers that need 64bit accesses in this driver: - * - GICD_IROUTERn, contain the affinity values associated to each interrupt. - * The upper-word (aff3) will always be 0, so there is no need for a lock. - * - GICR_TYPER is an ID register and doesn't need atomicity. */ -static inline void gic_write_irouter(u64 val, volatile void __iomem *addr) +static inline void __gic_writeq_nonatomic(u64 val, volatile void __iomem *addr) { writel_relaxed((u32)val, addr); writel_relaxed((u32)(val >> 32), addr + 4); } -static inline u64 gic_read_typer(const volatile void __iomem *addr) +static inline u64 __gic_readq_nonatomic(const volatile void __iomem *addr) { u64 val; @@ -251,5 +247,49 @@ static inline u64 gic_read_typer(const volatile void __iomem *addr) return val; } +#define gic_flush_dcache_to_poc(a,l) __cpuc_flush_dcache_area((a), (l)) + +/* + * GICD_IROUTERn, contain the affinity values associated to each interrupt. + * The upper-word (aff3) will always be 0, so there is no need for a lock. + */ +#define gic_write_irouter(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GICR_TYPER is an ID register and doesn't need atomicity. + */ +#define gic_read_typer(c) __gic_readq_nonatomic(c) + +/* + * GITS_BASER - hi and lo bits may be accessed independently. 
+ */ +#define gits_read_baser(c) __gic_readq_nonatomic(c) +#define gits_write_baser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GICR_PENDBASER and GICR_PROPBASE are changed with LPIs disabled, so they + * won't be being used during any updates and can be changed non-atomically + */ +#define gicr_read_propbaser(c) __gic_readq_nonatomic(c) +#define gicr_write_propbaser(v, c) __gic_writeq_nonatomic(v, c) +#define gicr_read_pendbaser(c) __gic_readq_nonatomic(c) +#define gicr_write_pendbaser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GITS_TYPER is an ID register and doesn't need atomicity. + */ +#define gits_read_typer(c) __gic_readq_nonatomic(c) + +/* + * GITS_CBASER - hi and lo bits may be accessed independently. + */ +#define gits_read_cbaser(c) __gic_readq_nonatomic(c) +#define gits_write_cbaser(v, c) __gic_writeq_nonatomic(v, c) + +/* + * GITS_CWRITER - hi and lo bits may be accessed independently. + */ +#define gits_write_cwriter(v, c) __gic_writeq_nonatomic(v, c) + #endif /* !__ASSEMBLY__ */ #endif /* !__ASM_ARCH_GICV3_H */ diff --git a/arch/arm/include/asm/efi.h b/arch/arm/include/asm/efi.h index 766bf9b78160..0b06f5341b45 100644 --- a/arch/arm/include/asm/efi.h +++ b/arch/arm/include/asm/efi.h @@ -57,6 +57,9 @@ void efi_virtmap_unload(void); #define __efi_call_early(f, ...) f(__VA_ARGS__) #define efi_is_64bit() (false) +#define efi_call_proto(protocol, f, instance, ...) 
\ + ((protocol##_t *)instance)->f(instance, ##__VA_ARGS__) + struct screen_info *alloc_screen_info(efi_system_table_t *sys_table_arg); void free_screen_info(efi_system_table_t *sys_table, struct screen_info *si); diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 021692c64de3..42871fb8340e 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -25,7 +25,6 @@ #include <linux/string.h> #include <linux/types.h> -#include <linux/blk_types.h> #include <asm/byteorder.h> #include <asm/memory.h> #include <asm-generic/pci_iomap.h> diff --git a/arch/arm/include/asm/mutex.h b/arch/arm/include/asm/mutex.h deleted file mode 100644 index 87c044910fe0..000000000000 --- a/arch/arm/include/asm/mutex.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * arch/arm/include/asm/mutex.h - * - * ARM optimized mutex locking primitives - * - * Please look into asm-generic/mutex-xchg.h for a formal definition. - */ -#ifndef _ASM_MUTEX_H -#define _ASM_MUTEX_H -/* - * On pre-ARMv6 hardware this results in a swp-based implementation, - * which is the most efficient. For ARMv6+, we have exclusive memory - * accessors and use atomic_dec to avoid the extra xchg operations - * on the locking slowpaths. 
- */ -#if __LINUX_ARM_ARCH__ < 6 -#include <asm-generic/mutex-xchg.h> -#else -#include <asm-generic/mutex-dec.h> -#endif -#endif /* _ASM_MUTEX_H */ diff --git a/arch/arm/include/asm/processor.h b/arch/arm/include/asm/processor.h index 8a1e8e995dae..c3d5fc124a05 100644 --- a/arch/arm/include/asm/processor.h +++ b/arch/arm/include/asm/processor.h @@ -82,8 +82,6 @@ unsigned long get_wchan(struct task_struct *p); #define cpu_relax() barrier() #endif -#define cpu_relax_lowlatency() cpu_relax() - #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 1e25cd80589e..3f2eb76243e3 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) tlb_add_flush(tlb, addr); } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. 
When we're doing a munmap, @@ -211,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { + tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); if (tlb->nr == tlb->max) return true; - tlb->pages[tlb->nr++] = page; return false; } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -231,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -284,5 +279,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #endif /* CONFIG_MMU */ #endif diff --git a/arch/arm/include/asm/xen/hypercall.h b/arch/arm/include/asm/xen/hypercall.h index 9d874db13c0e..3522cbaed316 100644 --- a/arch/arm/include/asm/xen/hypercall.h +++ b/arch/arm/include/asm/xen/hypercall.h @@ -1,87 +1 @@ -/****************************************************************************** - * hypercall.h - * - * Linux-specific hypervisor handling. 
- * - * Stefano Stabellini <stefano.stabellini@eu.citrix.com>, Citrix, 2012 - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation; or, when distributed - * separately from the Linux kernel or incorporated into other - * software packages, subject to the following license: - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this source file (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, modify, - * merge, publish, distribute, sublicense, and/or sell copies of the Software, - * and to permit persons to whom the Software is furnished to do so, subject to - * the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef _ASM_ARM_XEN_HYPERCALL_H -#define _ASM_ARM_XEN_HYPERCALL_H - -#include <linux/bug.h> - -#include <xen/interface/xen.h> -#include <xen/interface/sched.h> -#include <xen/interface/platform.h> - -long privcmd_call(unsigned call, unsigned long a1, - unsigned long a2, unsigned long a3, - unsigned long a4, unsigned long a5); -int HYPERVISOR_xen_version(int cmd, void *arg); -int HYPERVISOR_console_io(int cmd, int count, char *str); -int HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count); -int HYPERVISOR_sched_op(int cmd, void *arg); -int HYPERVISOR_event_channel_op(int cmd, void *arg); -unsigned long HYPERVISOR_hvm_op(int op, void *arg); -int HYPERVISOR_memory_op(unsigned int cmd, void *arg); -int HYPERVISOR_physdev_op(int cmd, void *arg); -int HYPERVISOR_vcpu_op(int cmd, int vcpuid, void *extra_args); -int HYPERVISOR_tmem_op(void *arg); -int HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type); -int HYPERVISOR_platform_op_raw(void *arg); -static inline int HYPERVISOR_platform_op(struct xen_platform_op *op) -{ - op->interface_version = XENPF_INTERFACE_VERSION; - return HYPERVISOR_platform_op_raw(op); -} -int HYPERVISOR_multicall(struct multicall_entry *calls, uint32_t nr); - -static inline int -HYPERVISOR_suspend(unsigned long start_info_mfn) -{ - struct sched_shutdown r = { .reason = SHUTDOWN_suspend }; - - /* start_info_mfn is unused on ARM */ - return HYPERVISOR_sched_op(SCHEDOP_shutdown, &r); -} - -static inline void -MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va, - unsigned int new_val, unsigned long flags) -{ - BUG(); -} - -static inline void -MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req, - int count, int *success_count, domid_t domid) -{ - BUG(); -} - -#endif /* _ASM_ARM_XEN_HYPERCALL_H */ +#include <xen/arm/hypercall.h> diff --git a/arch/arm/include/asm/xen/hypervisor.h b/arch/arm/include/asm/xen/hypervisor.h index 95251512e2c4..d6e7709d0688 100644 --- 
a/arch/arm/include/asm/xen/hypervisor.h +++ b/arch/arm/include/asm/xen/hypervisor.h @@ -1,39 +1 @@ -#ifndef _ASM_ARM_XEN_HYPERVISOR_H -#define _ASM_ARM_XEN_HYPERVISOR_H - -#include <linux/init.h> - -extern struct shared_info *HYPERVISOR_shared_info; -extern struct start_info *xen_start_info; - -/* Lazy mode for batching updates / context switch */ -enum paravirt_lazy_mode { - PARAVIRT_LAZY_NONE, - PARAVIRT_LAZY_MMU, - PARAVIRT_LAZY_CPU, -}; - -static inline enum paravirt_lazy_mode paravirt_get_lazy_mode(void) -{ - return PARAVIRT_LAZY_NONE; -} - -extern struct dma_map_ops *xen_dma_ops; - -#ifdef CONFIG_XEN -void __init xen_early_init(void); -#else -static inline void xen_early_init(void) { return; } -#endif - -#ifdef CONFIG_HOTPLUG_CPU -static inline void xen_arch_register_cpu(int num) -{ -} - -static inline void xen_arch_unregister_cpu(int num) -{ -} -#endif - -#endif /* _ASM_ARM_XEN_HYPERVISOR_H */ +#include <xen/arm/hypervisor.h> diff --git a/arch/arm/include/asm/xen/interface.h b/arch/arm/include/asm/xen/interface.h index 75d596862892..88c0d75da190 100644 --- a/arch/arm/include/asm/xen/interface.h +++ b/arch/arm/include/asm/xen/interface.h @@ -1,85 +1 @@ -/****************************************************************************** - * Guest OS interface to ARM Xen. 
- * - * Stefano Stabellini <stefano.stabellini@eu.citrix.com>, Citrix, 2012 - */ - -#ifndef _ASM_ARM_XEN_INTERFACE_H -#define _ASM_ARM_XEN_INTERFACE_H - -#include <linux/types.h> - -#define uint64_aligned_t uint64_t __attribute__((aligned(8))) - -#define __DEFINE_GUEST_HANDLE(name, type) \ - typedef struct { union { type *p; uint64_aligned_t q; }; } \ - __guest_handle_ ## name - -#define DEFINE_GUEST_HANDLE_STRUCT(name) \ - __DEFINE_GUEST_HANDLE(name, struct name) -#define DEFINE_GUEST_HANDLE(name) __DEFINE_GUEST_HANDLE(name, name) -#define GUEST_HANDLE(name) __guest_handle_ ## name - -#define set_xen_guest_handle(hnd, val) \ - do { \ - if (sizeof(hnd) == 8) \ - *(uint64_t *)&(hnd) = 0; \ - (hnd).p = val; \ - } while (0) - -#define __HYPERVISOR_platform_op_raw __HYPERVISOR_platform_op - -#ifndef __ASSEMBLY__ -/* Explicitly size integers that represent pfns in the interface with - * Xen so that we can have one ABI that works for 32 and 64 bit guests. - * Note that this means that the xen_pfn_t type may be capable of - * representing pfn's which the guest cannot represent in its own pfn - * type. However since pfn space is controlled by the guest this is - * fine since it simply wouldn't be able to create any sure pfns in - * the first place. - */ -typedef uint64_t xen_pfn_t; -#define PRI_xen_pfn "llx" -typedef uint64_t xen_ulong_t; -#define PRI_xen_ulong "llx" -typedef int64_t xen_long_t; -#define PRI_xen_long "llx" -/* Guest handles for primitive C types. */ -__DEFINE_GUEST_HANDLE(uchar, unsigned char); -__DEFINE_GUEST_HANDLE(uint, unsigned int); -DEFINE_GUEST_HANDLE(char); -DEFINE_GUEST_HANDLE(int); -DEFINE_GUEST_HANDLE(void); -DEFINE_GUEST_HANDLE(uint64_t); -DEFINE_GUEST_HANDLE(uint32_t); -DEFINE_GUEST_HANDLE(xen_pfn_t); -DEFINE_GUEST_HANDLE(xen_ulong_t); - -/* Maximum number of virtual CPUs in multi-processor guests. 
*/ -#define MAX_VIRT_CPUS 1 - -struct arch_vcpu_info { }; -struct arch_shared_info { }; - -/* TODO: Move pvclock definitions some place arch independent */ -struct pvclock_vcpu_time_info { - u32 version; - u32 pad0; - u64 tsc_timestamp; - u64 system_time; - u32 tsc_to_system_mul; - s8 tsc_shift; - u8 flags; - u8 pad[2]; -} __attribute__((__packed__)); /* 32 bytes */ - -/* It is OK to have a 12 bytes struct with no padding because it is packed */ -struct pvclock_wall_clock { - u32 version; - u32 sec; - u32 nsec; - u32 sec_hi; -} __attribute__((__packed__)); -#endif - -#endif /* _ASM_ARM_XEN_INTERFACE_H */ +#include <xen/arm/interface.h> diff --git a/arch/arm/include/asm/xen/page-coherent.h b/arch/arm/include/asm/xen/page-coherent.h index 95ce6ac3a971..b3ef061d8b74 100644 --- a/arch/arm/include/asm/xen/page-coherent.h +++ b/arch/arm/include/asm/xen/page-coherent.h @@ -1,98 +1 @@ -#ifndef _ASM_ARM_XEN_PAGE_COHERENT_H -#define _ASM_ARM_XEN_PAGE_COHERENT_H - -#include <asm/page.h> -#include <linux/dma-mapping.h> - -void __xen_dma_map_page(struct device *hwdev, struct page *page, - dma_addr_t dev_addr, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs); -void __xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, - unsigned long attrs); -void __xen_dma_sync_single_for_cpu(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir); - -void __xen_dma_sync_single_for_device(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir); - -static inline void *xen_alloc_coherent_pages(struct device *hwdev, size_t size, - dma_addr_t *dma_handle, gfp_t flags, unsigned long attrs) -{ - return __generic_dma_ops(hwdev)->alloc(hwdev, size, dma_handle, flags, attrs); -} - -static inline void xen_free_coherent_pages(struct device *hwdev, size_t size, - void *cpu_addr, dma_addr_t dma_handle, unsigned long attrs) -{ - 
__generic_dma_ops(hwdev)->free(hwdev, size, cpu_addr, dma_handle, attrs); -} - -static inline void xen_dma_map_page(struct device *hwdev, struct page *page, - dma_addr_t dev_addr, unsigned long offset, size_t size, - enum dma_data_direction dir, unsigned long attrs) -{ - unsigned long page_pfn = page_to_xen_pfn(page); - unsigned long dev_pfn = XEN_PFN_DOWN(dev_addr); - unsigned long compound_pages = - (1<<compound_order(page)) * XEN_PFN_PER_PAGE; - bool local = (page_pfn <= dev_pfn) && - (dev_pfn - page_pfn < compound_pages); - - /* - * Dom0 is mapped 1:1, while the Linux page can span across - * multiple Xen pages, it's not possible for it to contain a - * mix of local and foreign Xen pages. So if the first xen_pfn - * == mfn the page is local otherwise it's a foreign page - * grant-mapped in dom0. If the page is local we can safely - * call the native dma_ops function, otherwise we call the xen - * specific function. - */ - if (local) - __generic_dma_ops(hwdev)->map_page(hwdev, page, offset, size, dir, attrs); - else - __xen_dma_map_page(hwdev, page, dev_addr, offset, size, dir, attrs); -} - -static inline void xen_dma_unmap_page(struct device *hwdev, dma_addr_t handle, - size_t size, enum dma_data_direction dir, unsigned long attrs) -{ - unsigned long pfn = PFN_DOWN(handle); - /* - * Dom0 is mapped 1:1, while the Linux page can be spanned accross - * multiple Xen page, it's not possible to have a mix of local and - * foreign Xen page. Dom0 is mapped 1:1, so calling pfn_valid on a - * foreign mfn will always return false. If the page is local we can - * safely call the native dma_ops function, otherwise we call the xen - * specific function. 
- */ - if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->unmap_page) - __generic_dma_ops(hwdev)->unmap_page(hwdev, handle, size, dir, attrs); - } else - __xen_dma_unmap_page(hwdev, handle, size, dir, attrs); -} - -static inline void xen_dma_sync_single_for_cpu(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir) -{ - unsigned long pfn = PFN_DOWN(handle); - if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_cpu) - __generic_dma_ops(hwdev)->sync_single_for_cpu(hwdev, handle, size, dir); - } else - __xen_dma_sync_single_for_cpu(hwdev, handle, size, dir); -} - -static inline void xen_dma_sync_single_for_device(struct device *hwdev, - dma_addr_t handle, size_t size, enum dma_data_direction dir) -{ - unsigned long pfn = PFN_DOWN(handle); - if (pfn_valid(pfn)) { - if (__generic_dma_ops(hwdev)->sync_single_for_device) - __generic_dma_ops(hwdev)->sync_single_for_device(hwdev, handle, size, dir); - } else - __xen_dma_sync_single_for_device(hwdev, handle, size, dir); -} - -#endif /* _ASM_ARM_XEN_PAGE_COHERENT_H */ +#include <xen/arm/page-coherent.h> diff --git a/arch/arm/include/asm/xen/page.h b/arch/arm/include/asm/xen/page.h index 415dbc6e43fd..31bbc803cecb 100644 --- a/arch/arm/include/asm/xen/page.h +++ b/arch/arm/include/asm/xen/page.h @@ -1,122 +1 @@ -#ifndef _ASM_ARM_XEN_PAGE_H -#define _ASM_ARM_XEN_PAGE_H - -#include <asm/page.h> -#include <asm/pgtable.h> - -#include <linux/pfn.h> -#include <linux/types.h> -#include <linux/dma-mapping.h> - -#include <xen/xen.h> -#include <xen/interface/grant_table.h> - -#define phys_to_machine_mapping_valid(pfn) (1) - -/* Xen machine address */ -typedef struct xmaddr { - phys_addr_t maddr; -} xmaddr_t; - -/* Xen pseudo-physical address */ -typedef struct xpaddr { - phys_addr_t paddr; -} xpaddr_t; - -#define XMADDR(x) ((xmaddr_t) { .maddr = (x) }) -#define XPADDR(x) ((xpaddr_t) { .paddr = (x) }) - -#define INVALID_P2M_ENTRY (~0UL) - -/* - * The pseudo-physical frame (pfn) used in 
all the helpers is always based - * on Xen page granularity (i.e 4KB). - * - * A Linux page may be split across multiple non-contiguous Xen page so we - * have to keep track with frame based on 4KB page granularity. - * - * PV drivers should never make a direct usage of those helpers (particularly - * pfn_to_gfn and gfn_to_pfn). - */ - -unsigned long __pfn_to_mfn(unsigned long pfn); -extern struct rb_root phys_to_mach; - -/* Pseudo-physical <-> Guest conversion */ -static inline unsigned long pfn_to_gfn(unsigned long pfn) -{ - return pfn; -} - -static inline unsigned long gfn_to_pfn(unsigned long gfn) -{ - return gfn; -} - -/* Pseudo-physical <-> BUS conversion */ -static inline unsigned long pfn_to_bfn(unsigned long pfn) -{ - unsigned long mfn; - - if (phys_to_mach.rb_node != NULL) { - mfn = __pfn_to_mfn(pfn); - if (mfn != INVALID_P2M_ENTRY) - return mfn; - } - - return pfn; -} - -static inline unsigned long bfn_to_pfn(unsigned long bfn) -{ - return bfn; -} - -#define bfn_to_local_pfn(bfn) bfn_to_pfn(bfn) - -/* VIRT <-> GUEST conversion */ -#define virt_to_gfn(v) (pfn_to_gfn(virt_to_phys(v) >> XEN_PAGE_SHIFT)) -#define gfn_to_virt(m) (__va(gfn_to_pfn(m) << XEN_PAGE_SHIFT)) - -/* Only used in PV code. But ARM guests are always HVM. */ -static inline xmaddr_t arbitrary_virt_to_machine(void *vaddr) -{ - BUG(); -} - -/* TODO: this shouldn't be here but it is because the frontend drivers - * are using it (its rolled in headers) even though we won't hit the code path. - * So for right now just punt with this. 
- */ -static inline pte_t *lookup_address(unsigned long address, unsigned int *level) -{ - BUG(); - return NULL; -} - -extern int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops, - struct gnttab_map_grant_ref *kmap_ops, - struct page **pages, unsigned int count); - -extern int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops, - struct gnttab_unmap_grant_ref *kunmap_ops, - struct page **pages, unsigned int count); - -bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); -bool __set_phys_to_machine_multi(unsigned long pfn, unsigned long mfn, - unsigned long nr_pages); - -static inline bool set_phys_to_machine(unsigned long pfn, unsigned long mfn) -{ - return __set_phys_to_machine(pfn, mfn); -} - -#define xen_remap(cookie, size) ioremap_cache((cookie), (size)) -#define xen_unmap(cookie) iounmap((cookie)) - -bool xen_arch_need_swiotlb(struct device *dev, - phys_addr_t phys, - dma_addr_t dev_addr); -unsigned long xen_get_swiotlb_free_pages(unsigned int order); - -#endif /* _ASM_ARM_XEN_PAGE_H */ +#include <xen/arm/page.h> diff --git a/arch/arm/include/uapi/asm/kvm.h b/arch/arm/include/uapi/asm/kvm.h index b38c10c73579..af05f8e0903e 100644 --- a/arch/arm/include/uapi/asm/kvm.h +++ b/arch/arm/include/uapi/asm/kvm.h @@ -87,9 +87,11 @@ struct kvm_regs { /* Supported VGICv3 address types */ #define KVM_VGIC_V3_ADDR_TYPE_DIST 2 #define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 +#define KVM_VGIC_ITS_ADDR_TYPE 4 #define KVM_VGIC_V3_DIST_SIZE SZ_64K #define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) +#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) #define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ #define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ diff --git a/arch/arm/kernel/hw_breakpoint.c b/arch/arm/kernel/hw_breakpoint.c index b8df45883cf7..188180b5523d 100644 --- a/arch/arm/kernel/hw_breakpoint.c +++ b/arch/arm/kernel/hw_breakpoint.c @@ -925,9 +925,9 @@ static bool core_has_os_save_restore(void) } } -static void reset_ctrl_regs(void 
*unused) +static void reset_ctrl_regs(unsigned int cpu) { - int i, raw_num_brps, err = 0, cpu = smp_processor_id(); + int i, raw_num_brps, err = 0; u32 val; /* @@ -1020,25 +1020,20 @@ out_mdbgen: cpumask_or(&debug_err_mask, &debug_err_mask, cpumask_of(cpu)); } -static int dbg_reset_notify(struct notifier_block *self, - unsigned long action, void *cpu) +static int dbg_reset_online(unsigned int cpu) { - if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) - smp_call_function_single((int)cpu, reset_ctrl_regs, NULL, 1); - - return NOTIFY_OK; + local_irq_disable(); + reset_ctrl_regs(cpu); + local_irq_enable(); + return 0; } -static struct notifier_block dbg_reset_nb = { - .notifier_call = dbg_reset_notify, -}; - #ifdef CONFIG_CPU_PM static int dbg_cpu_pm_notify(struct notifier_block *self, unsigned long action, void *v) { if (action == CPU_PM_EXIT) - reset_ctrl_regs(NULL); + reset_ctrl_regs(smp_processor_id()); return NOTIFY_OK; } @@ -1059,6 +1054,8 @@ static inline void pm_init(void) static int __init arch_hw_breakpoint_init(void) { + int ret; + debug_arch = get_debug_arch(); if (!debug_arch_supported()) { @@ -1072,25 +1069,28 @@ static int __init arch_hw_breakpoint_init(void) core_num_brps = get_num_brps(); core_num_wrps = get_num_wrps(); - cpu_notifier_register_begin(); - /* * We need to tread carefully here because DBGSWENABLE may be * driven low on this core and there isn't an architected way to * determine that. */ + get_online_cpus(); register_undef_hook(&debug_reg_hook); /* - * Reset the breakpoint resources. We assume that a halting - * debugger will leave the world in a nice state for us. + * Register CPU notifier which resets the breakpoint resources. We + * assume that a halting debugger will leave the world in a nice state + * for us. 
*/ - on_each_cpu(reset_ctrl_regs, NULL, 1); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm/hw_breakpoint:online", + dbg_reset_online, NULL); unregister_undef_hook(&debug_reg_hook); - if (!cpumask_empty(&debug_err_mask)) { + if (WARN_ON(ret < 0) || !cpumask_empty(&debug_err_mask)) { core_num_brps = 0; core_num_wrps = 0; - cpu_notifier_register_done(); + if (ret > 0) + cpuhp_remove_state_nocalls(ret); + put_online_cpus(); return 0; } @@ -1108,12 +1108,9 @@ static int __init arch_hw_breakpoint_init(void) TRAP_HWBKPT, "watchpoint debug exception"); hook_ifault_code(FAULT_CODE_DEBUG, hw_breakpoint_pending, SIGTRAP, TRAP_HWBKPT, "breakpoint debug exception"); + put_online_cpus(); - /* Register hotplug and PM notifiers. */ - __register_cpu_notifier(&dbg_reset_nb); - - cpu_notifier_register_done(); - + /* Register PM notifiers. */ pm_init(); return 0; } diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig index 3e1cd0452d67..90d0176fb30d 100644 --- a/arch/arm/kvm/Kconfig +++ b/arch/arm/kvm/Kconfig @@ -34,6 +34,7 @@ config KVM select HAVE_KVM_IRQFD select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQ_ROUTING + select HAVE_KVM_MSI depends on ARM_VIRT_EXT && ARM_LPAE && ARM_ARCH_TIMER ---help--- Support hosting virtualized guest machines. 
diff --git a/arch/arm/kvm/Makefile b/arch/arm/kvm/Makefile index f19842ea5418..d571243ab4d1 100644 --- a/arch/arm/kvm/Makefile +++ b/arch/arm/kvm/Makefile @@ -32,5 +32,6 @@ obj-y += $(KVM)/arm/vgic/vgic-mmio.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v2.o obj-y += $(KVM)/arm/vgic/vgic-mmio-v3.o obj-y += $(KVM)/arm/vgic/vgic-kvm-device.o +obj-y += $(KVM)/arm/vgic/vgic-its.o obj-y += $(KVM)/irqchip.o obj-y += $(KVM)/arm/arch_timer.o diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c index 19b5f5c1c0ff..8f92efa8460e 100644 --- a/arch/arm/kvm/arm.c +++ b/arch/arm/kvm/arm.c @@ -221,6 +221,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; + case KVM_CAP_MSI_DEVID: + if (!kvm) + r = -EINVAL; + else + r = kvm->arch.vgic.msis_require_devid; + break; default: r = kvm_arch_dev_ioctl_check_extension(kvm, ext); break; diff --git a/arch/arm/mach-imx/gpc.c b/arch/arm/mach-imx/gpc.c index b54db47f6f32..1dc2a34b9dbd 100644 --- a/arch/arm/mach-imx/gpc.c +++ b/arch/arm/mach-imx/gpc.c @@ -380,13 +380,6 @@ static struct pu_domain imx6q_pu_domain = { .name = "PU", .power_off = imx6q_pm_pu_power_off, .power_on = imx6q_pm_pu_power_on, - .states = { - [0] = { - .power_off_latency_ns = 25000, - .power_on_latency_ns = 2000000, - }, - }, - .state_count = 1, }, }; @@ -430,6 +423,16 @@ static int imx_gpc_genpd_init(struct device *dev, struct regulator *pu_reg) if (!IS_ENABLED(CONFIG_PM_GENERIC_DOMAINS)) return 0; + imx6q_pu_domain.base.states = devm_kzalloc(dev, + sizeof(*imx6q_pu_domain.base.states), + GFP_KERNEL); + if (!imx6q_pu_domain.base.states) + return -ENOMEM; + + imx6q_pu_domain.base.states[0].power_off_latency_ns = 25000; + imx6q_pu_domain.base.states[0].power_on_latency_ns = 2000000; + imx6q_pu_domain.base.state_count = 1; + for (i = 0; i < ARRAY_SIZE(imx_gpc_domains); i++) pm_genpd_init(imx_gpc_domains[i], NULL, false); diff --git a/arch/arm/mach-integrator/impd1.c b/arch/arm/mach-integrator/impd1.c index 
ed9a01484030..a109f6482413 100644 --- a/arch/arm/mach-integrator/impd1.c +++ b/arch/arm/mach-integrator/impd1.c @@ -21,7 +21,6 @@ #include <linux/amba/bus.h> #include <linux/amba/clcd.h> #include <linux/amba/mmci.h> -#include <linux/amba/pl061.h> #include <linux/io.h> #include <linux/platform_data/clk-integrator.h> #include <linux/slab.h> diff --git a/arch/arm/mach-pxa/idp.c b/arch/arm/mach-pxa/idp.c index 66070acaa888..d1db32b1a2c6 100644 --- a/arch/arm/mach-pxa/idp.c +++ b/arch/arm/mach-pxa/idp.c @@ -85,6 +85,7 @@ static struct resource smc91x_resources[] = { static struct smc91x_platdata smc91x_platdata = { .flags = SMC91X_USE_8BIT | SMC91X_USE_16BIT | SMC91X_USE_32BIT | SMC91X_USE_DMA | SMC91X_NOWAIT, + .pxa_u16_align4 = true, }; static struct platform_device smc91x_device = { diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index 40964069a17c..a2d851a3a546 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -140,6 +140,7 @@ static struct resource smc91x_resources[] = { static struct smc91x_platdata mainstone_smc91x_info = { .flags = SMC91X_USE_8BIT | SMC91X_USE_16BIT | SMC91X_USE_32BIT | SMC91X_NOWAIT | SMC91X_USE_DMA, + .pxa_u16_align4 = true, }; static struct platform_device smc91x_device = { diff --git a/arch/arm/mach-pxa/stargate2.c b/arch/arm/mach-pxa/stargate2.c index 702f4f14b708..7b6610e9dae4 100644 --- a/arch/arm/mach-pxa/stargate2.c +++ b/arch/arm/mach-pxa/stargate2.c @@ -673,6 +673,7 @@ static struct resource smc91x_resources[] = { static struct smc91x_platdata stargate2_smc91x_info = { .flags = SMC91X_USE_8BIT | SMC91X_USE_16BIT | SMC91X_USE_32BIT | SMC91X_NOWAIT | SMC91X_USE_DMA, + .pxa_u16_align4 = true, }; static struct platform_device smc91x_device = { diff --git a/arch/arm/mach-shmobile/setup-r8a7778.c b/arch/arm/mach-shmobile/setup-r8a7778.c index cf236db686a9..7fa4a0b5f654 100644 --- a/arch/arm/mach-shmobile/setup-r8a7778.c +++ b/arch/arm/mach-shmobile/setup-r8a7778.c @@ -15,7 +15,6 @@ 
* GNU General Public License for more details. */ -#include <linux/clk/renesas.h> #include <linux/io.h> #include <linux/irqchip.h> @@ -23,19 +22,6 @@ #include "common.h" -#define MODEMR 0xffcc0020 - -static void __init r8a7778_timer_init(void) -{ - u32 mode; - void __iomem *modemr = ioremap_nocache(MODEMR, 4); - - BUG_ON(!modemr); - mode = ioread32(modemr); - iounmap(modemr); - r8a7778_clocks_init(mode); -} - #define INT2SMSKCR0 0x82288 /* 0xfe782288 */ #define INT2SMSKCR1 0x8228c /* 0xfe78228c */ @@ -70,6 +56,5 @@ DT_MACHINE_START(R8A7778_DT, "Generic R8A7778 (Flattened Device Tree)") .init_early = shmobile_init_delay, .init_irq = r8a7778_init_irq_dt, .init_late = shmobile_init_late, - .init_time = r8a7778_timer_init, .dt_compat = r8a7778_compat_dt, MACHINE_END diff --git a/arch/arm/mach-shmobile/setup-r8a7779.c b/arch/arm/mach-shmobile/setup-r8a7779.c index 0007ff51d180..0686112f2435 100644 --- a/arch/arm/mach-shmobile/setup-r8a7779.c +++ b/arch/arm/mach-shmobile/setup-r8a7779.c @@ -14,8 +14,6 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. 
*/ -#include <linux/clk/renesas.h> -#include <linux/clocksource.h> #include <linux/init.h> #include <linux/irq.h> #include <linux/irqchip.h> @@ -76,30 +74,6 @@ static void __init r8a7779_init_irq_dt(void) __raw_writel(0x003fee3f, INT2SMSKCR4); } -#define MODEMR 0xffcc0020 - -static u32 __init r8a7779_read_mode_pins(void) -{ - static u32 mode; - static bool mode_valid; - - if (!mode_valid) { - void __iomem *modemr = ioremap_nocache(MODEMR, PAGE_SIZE); - BUG_ON(!modemr); - mode = ioread32(modemr); - iounmap(modemr); - mode_valid = true; - } - - return mode; -} - -static void __init r8a7779_init_time(void) -{ - r8a7779_clocks_init(r8a7779_read_mode_pins()); - clocksource_probe(); -} - static const char *const r8a7779_compat_dt[] __initconst = { "renesas,r8a7779", NULL, @@ -109,7 +83,6 @@ DT_MACHINE_START(R8A7779_DT, "Generic R8A7779 (Flattened Device Tree)") .smp = smp_ops(r8a7779_smp_ops), .map_io = r8a7779_map_io, .init_early = shmobile_init_delay, - .init_time = r8a7779_init_time, .init_irq = r8a7779_init_irq_dt, .init_late = shmobile_init_late, .dt_compat = r8a7779_compat_dt, diff --git a/arch/arm/mach-shmobile/setup-rcar-gen2.c b/arch/arm/mach-shmobile/setup-rcar-gen2.c index afb9fdcd3d90..b527258e0a62 100644 --- a/arch/arm/mach-shmobile/setup-rcar-gen2.c +++ b/arch/arm/mach-shmobile/setup-rcar-gen2.c @@ -15,7 +15,7 @@ * GNU General Public License for more details. 
*/ -#include <linux/clk/renesas.h> +#include <linux/clk-provider.h> #include <linux/clocksource.h> #include <linux/device.h> #include <linux/dma-contiguous.h> @@ -71,7 +71,6 @@ static unsigned int __init get_extal_freq(void) void __init rcar_gen2_timer_init(void) { - u32 mode = rcar_gen2_read_mode_pins(); #ifdef CONFIG_ARM_ARCH_TIMER void __iomem *base; u32 freq; @@ -130,7 +129,7 @@ void __init rcar_gen2_timer_init(void) iounmap(base); #endif /* CONFIG_ARM_ARCH_TIMER */ - rcar_gen2_clocks_init(mode); + of_clk_init(NULL); clocksource_probe(); } diff --git a/arch/arm/xen/enlighten.c b/arch/arm/xen/enlighten.c index f193414d0f6f..4986dc0c1dff 100644 --- a/arch/arm/xen/enlighten.c +++ b/arch/arm/xen/enlighten.c @@ -372,8 +372,7 @@ static int __init xen_guest_init(void) * for secondary CPUs as they are brought up. * For uniformity we use VCPUOP_register_vcpu_info even on cpu0. */ - xen_vcpu_info = __alloc_percpu(sizeof(struct vcpu_info), - sizeof(struct vcpu_info)); + xen_vcpu_info = alloc_percpu(struct vcpu_info); if (xen_vcpu_info == NULL) return -ENOMEM; diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index d062f08f5020..bd62d94f8ac5 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -186,7 +186,6 @@ struct dma_map_ops *xen_dma_ops; EXPORT_SYMBOL(xen_dma_ops); static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, .alloc = xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 969ef880d234..111742126897 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -52,6 +52,7 @@ config ARM64 select GENERIC_TIME_VSYSCALL select HANDLE_DOMAIN_IRQ select HARDIRQS_SW_RESEND + select HAVE_ACPI_APEI if (ACPI && EFI) select HAVE_ALIGNED_STRUCT_PAGE if SLUB select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_BITREVERSE @@ -109,6 +110,7 @@ config ARM64 select POWER_SUPPLY select SPARSE_IRQ select 
SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK help ARM 64-bit (AArch64) Linux support. @@ -238,6 +240,9 @@ config PGTABLE_LEVELS default 3 if ARM64_16K_PAGES && ARM64_VA_BITS_47 default 4 if !ARM64_64K_PAGES && ARM64_VA_BITS_48 +config ARCH_SUPPORTS_UPROBES + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -790,6 +795,14 @@ config SETEND_EMULATION If unsure, say Y endif +config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved + zeroed area and reserved ASID. The user access routines + restore the valid TTBR0_EL1 temporarily. + menu "ARMv8.1 architectural features" config ARM64_HW_AFDBM diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug index b661fe742615..d1ebd46872fd 100644 --- a/arch/arm64/Kconfig.debug +++ b/arch/arm64/Kconfig.debug @@ -2,9 +2,13 @@ menu "Kernel hacking" source "lib/Kconfig.debug" -config ARM64_PTDUMP +config ARM64_PTDUMP_CORE + def_bool n + +config ARM64_PTDUMP_DEBUGFS bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL + select ARM64_PTDUMP_CORE select DEBUG_FS help Say Y here if you want to show the kernel pagetable layout in a @@ -38,6 +42,35 @@ config ARM64_RANDOMIZE_TEXT_OFFSET of TEXT_OFFSET and platforms must not require a specific value. +config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE + ---help--- + Generate a warning if any W+X mappings are found at boot. + + This is useful for discovering cases where the kernel is leaving + W+X mappings after applying NX, as such mappings are a security risk. + This check also includes UXN, which should be set on all kernel + mappings. + + Look for a message in dmesg output like this: + + arm64/mm: Checked W+X mappings: passed, no W+X pages found. 
+ + or like this, if the check failed: + + arm64/mm: Checked W+X mappings: FAILED, <N> W+X pages found. + + Note that even if the check fails, your kernel is possibly + still fine, as W+X mappings are not a security hole in + themselves, what they do is that they make the exploitation + of other unfixed kernel bugs easier. + + There is no runtime or memory usage effect of this option + once the kernel has booted up - it's a one time check. + + If in doubt, say "Y". + config DEBUG_SET_MODULE_RONX bool "Set loadable kernel module data as NX and text as RO" depends on MODULES diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 3635b8662724..b9a4a934ca05 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -37,10 +37,16 @@ $(warning LSE atomics not supported by binutils) endif endif -KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) +brokengasinst := $(call as-instr,1:\n.inst 0\n.rept . - 1b\n\nnop\n.endr\n,,-DCONFIG_BROKEN_GAS_INST=1) + +ifneq ($(brokengasinst),) +$(warning Detected assembler with broken .inst; disassembly will be unreliable) +endif + +KBUILD_CFLAGS += -mgeneral-regs-only $(lseinstr) $(brokengasinst) KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(call cc-option, -mpc-relative-literal-loads) -KBUILD_AFLAGS += $(lseinstr) +KBUILD_AFLAGS += $(lseinstr) $(brokengasinst) ifeq ($(CONFIG_CPU_BIG_ENDIAN), y) KBUILD_CPPFLAGS += -mbig-endian diff --git a/arch/arm64/boot/dts/broadcom/ns2-svk.dts b/arch/arm64/boot/dts/broadcom/ns2-svk.dts index b09f3bc5c6c1..c4d544244b19 100644 --- a/arch/arm64/boot/dts/broadcom/ns2-svk.dts +++ b/arch/arm64/boot/dts/broadcom/ns2-svk.dts @@ -56,6 +56,10 @@ }; }; +&enet { + status = "ok"; +}; + &pci_phy0 { status = "ok"; }; @@ -174,6 +178,7 @@ &mdio_mux_iproc { mdio@10 { gphy0: eth-phy@10 { + enet-phy-lane-swap; reg = <0x10>; }; }; diff --git a/arch/arm64/boot/dts/broadcom/ns2.dtsi b/arch/arm64/boot/dts/broadcom/ns2.dtsi index d95dc408629a..773ed593da4d 100644 --- 
a/arch/arm64/boot/dts/broadcom/ns2.dtsi +++ b/arch/arm64/boot/dts/broadcom/ns2.dtsi @@ -191,6 +191,18 @@ #include "ns2-clock.dtsi" + enet: ethernet@61000000 { + compatible = "brcm,ns2-amac"; + reg = <0x61000000 0x1000>, + <0x61090000 0x1000>, + <0x61030000 0x100>; + reg-names = "amac_base", "idm_base", "nicpm_base"; + interrupts = <GIC_SPI 341 IRQ_TYPE_LEVEL_HIGH>; + phy-handle = <&gphy0>; + phy-mode = "rgmii"; + status = "disabled"; + }; + dma0: dma@61360000 { compatible = "arm,pl330", "arm,primecell"; reg = <0x61360000 0x1000>; diff --git a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi index 7f0dc13b4087..d058e56db72d 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi +++ b/arch/arm64/boot/dts/freescale/fsl-ls2080a.dtsi @@ -216,6 +216,12 @@ clocks = <&sysclk>; }; + dcfg: dcfg@1e00000 { + compatible = "fsl,ls2080a-dcfg", "syscon"; + reg = <0x0 0x1e00000 0x0 0x10000>; + little-endian; + }; + serial0: serial@21c0500 { compatible = "fsl,ns16550", "ns16550a"; reg = <0x0 0x21c0500 0x0 0x100>; diff --git a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi index 17839db585d5..e0ea60382087 100644 --- a/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ b/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -747,7 +747,6 @@ clocks = <&sys_ctrl HI6220_USBOTG_HCLK>; clock-names = "otg"; dr_mode = "otg"; - g-use-dma; g-rx-fifo-size = <512>; g-np-tx-fifo-size = <128>; g-tx-fifo-size = <128 128 128 128 128 128>; diff --git a/arch/arm64/boot/dts/marvell/armada-3720-db.dts b/arch/arm64/boot/dts/marvell/armada-3720-db.dts index 1372e9a6aaa4..a59d36cd6caf 100644 --- a/arch/arm64/boot/dts/marvell/armada-3720-db.dts +++ b/arch/arm64/boot/dts/marvell/armada-3720-db.dts @@ -81,3 +81,26 @@ &pcie0 { status = "okay"; }; + +&mdio { + status = "okay"; + phy0: ethernet-phy@0 { + reg = <0>; + }; + + phy1: ethernet-phy@1 { + reg = <1>; + }; +}; + +&eth0 { + phy-mode = "rgmii-id"; + phy = <&phy0>; + status = "okay"; 
+}; + +&eth1 { + phy-mode = "sgmii"; + phy = <&phy1>; + status = "okay"; +}; diff --git a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi index e9bd58793464..3b8eb45bdc76 100644 --- a/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ b/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -140,6 +140,29 @@ }; }; + eth0: ethernet@30000 { + compatible = "marvell,armada-3700-neta"; + reg = <0x30000 0x4000>; + interrupts = <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&sb_periph_clk 8>; + status = "disabled"; + }; + + mdio: mdio@32004 { + #address-cells = <1>; + #size-cells = <0>; + compatible = "marvell,orion-mdio"; + reg = <0x32004 0x4>; + }; + + eth1: ethernet@40000 { + compatible = "marvell,armada-3700-neta"; + reg = <0x40000 0x4000>; + interrupts = <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&sb_periph_clk 7>; + status = "disabled"; + }; + usb3: usb@58000 { compatible = "marvell,armada3700-xhci", "generic-xhci"; diff --git a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts index 2a7f731c7759..0ecaad4333a7 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173-evb.dts +++ b/arch/arm64/boot/dts/mediatek/mt8173-evb.dts @@ -34,15 +34,6 @@ chosen { }; - usb_p1_vbus: regulator@0 { - compatible = "regulator-fixed"; - regulator-name = "usb_vbus"; - regulator-min-microvolt = <5000000>; - regulator-max-microvolt = <5000000>; - gpio = <&pio 130 GPIO_ACTIVE_HIGH>; - enable-active-high; - }; - connector { compatible = "hdmi-connector"; label = "hdmi"; @@ -54,6 +45,29 @@ }; }; }; + + extcon_usb: extcon_iddig { + compatible = "linux,extcon-usb-gpio"; + id-gpio = <&pio 16 GPIO_ACTIVE_HIGH>; + }; + + usb_p1_vbus: regulator@0 { + compatible = "regulator-fixed"; + regulator-name = "usb_vbus"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&pio 130 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + + usb_p0_vbus: regulator@1 { + compatible = "regulator-fixed"; + regulator-name 
= "vbus"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&pio 9 GPIO_ACTIVE_HIGH>; + enable-active-high; + }; }; &cec { @@ -243,6 +257,20 @@ bias-pull-down = <MTK_PUPD_SET_R1R0_10>; }; }; + + usb_id_pins_float: usb_iddig_pull_up { + pins_iddig { + pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>; + bias-pull-up; + }; + }; + + usb_id_pins_ground: usb_iddig_pull_down { + pins_iddig { + pinmux = <MT8173_PIN_16_IDDIG__FUNC_IDDIG>; + bias-pull-down; + }; + }; }; &pwm0 { @@ -469,12 +497,25 @@ status = "okay"; }; +&ssusb { + vusb33-supply = <&mt6397_vusb_reg>; + vbus-supply = <&usb_p0_vbus>; + extcon = <&extcon_usb>; + dr_mode = "otg"; + mediatek,enable-wakeup; + pinctrl-names = "default", "id_float", "id_ground"; + pinctrl-0 = <&usb_id_pins_float>; + pinctrl-1 = <&usb_id_pins_float>; + pinctrl-2 = <&usb_id_pins_ground>; + status = "okay"; +}; + &uart0 { status = "okay"; }; -&usb30 { +&usb_host { vusb33-supply = <&mt6397_vusb_reg>; vbus-supply = <&usb_p1_vbus>; - mediatek,wakeup-src = <1>; + status = "okay"; }; diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi index 1c71e256601d..c2d588ca59b7 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi @@ -707,11 +707,14 @@ status = "disabled"; }; - usb30: usb@11270000 { - compatible = "mediatek,mt8173-xhci"; - reg = <0 0x11270000 0 0x1000>, + ssusb: usb@11271000 { + compatible = "mediatek,mt8173-mtu3"; + reg = <0 0x11271000 0 0x3000>, <0 0x11280700 0 0x0100>; - interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + reg-names = "mac", "ippc"; + interrupts = <GIC_SPI 64 IRQ_TYPE_LEVEL_LOW>; + phys = <&phy_port0 PHY_TYPE_USB3>, + <&phy_port1 PHY_TYPE_USB2>; power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; clocks = <&topckgen CLK_TOP_USB30_SEL>, <&pericfg CLK_PERI_USB0>, @@ -719,10 +722,22 @@ clock-names = "sys_ck", "wakeup_deb_p0", "wakeup_deb_p1"; - phys = <&phy_port0 PHY_TYPE_USB3>, - <&phy_port1 
PHY_TYPE_USB2>; mediatek,syscon-wakeup = <&pericfg>; - status = "okay"; + #address-cells = <2>; + #size-cells = <2>; + ranges; + status = "disabled"; + + usb_host: xhci@11270000 { + compatible = "mediatek,mt8173-xhci"; + reg = <0 0x11270000 0 0x1000>; + reg-names = "mac"; + interrupts = <GIC_SPI 115 IRQ_TYPE_LEVEL_LOW>; + power-domains = <&scpsys MT8173_POWER_DOMAIN_USB>; + clocks = <&topckgen CLK_TOP_USB30_SEL>; + clock-names = "sys_ck"; + status = "disabled"; + }; }; u3phy: usb-phy@11290000 { diff --git a/arch/arm64/boot/dts/renesas/r8a7795.dtsi b/arch/arm64/boot/dts/renesas/r8a7795.dtsi index 8c15040f2540..625dda713548 100644 --- a/arch/arm64/boot/dts/renesas/r8a7795.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a7795.dtsi @@ -321,6 +321,11 @@ #power-domain-cells = <0>; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7795-rst"; + reg = <0 0xe6160000 0 0x0200>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7795-sysc"; reg = <0 0xe6180000 0 0x0400>; diff --git a/arch/arm64/boot/dts/renesas/r8a7796.dtsi b/arch/arm64/boot/dts/renesas/r8a7796.dtsi index 9217da983525..75c8c55a8248 100644 --- a/arch/arm64/boot/dts/renesas/r8a7796.dtsi +++ b/arch/arm64/boot/dts/renesas/r8a7796.dtsi @@ -233,6 +233,11 @@ #power-domain-cells = <0>; }; + rst: reset-controller@e6160000 { + compatible = "renesas,r8a7796-rst"; + reg = <0 0xe6160000 0 0x0200>; + }; + sysc: system-controller@e6180000 { compatible = "renesas,r8a7796-sysc"; reg = <0 0xe6180000 0 0x0400>; diff --git a/arch/arm64/boot/dts/rockchip/rk3368.dtsi b/arch/arm64/boot/dts/rockchip/rk3368.dtsi index 0fcb2147c9f9..df231c4df5a5 100644 --- a/arch/arm64/boot/dts/rockchip/rk3368.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3368.dtsi @@ -537,7 +537,6 @@ g-np-tx-fifo-size = <16>; g-rx-fifo-size = <275>; g-tx-fifo-size = <256 128 128 64 64 32>; - g-use-dma; status = "disabled"; }; diff --git a/arch/arm64/boot/dts/rockchip/rk3399.dtsi b/arch/arm64/boot/dts/rockchip/rk3399.dtsi index 
7afbfb0f96a3..1e24e455700b 100644 --- a/arch/arm64/boot/dts/rockchip/rk3399.dtsi +++ b/arch/arm64/boot/dts/rockchip/rk3399.dtsi @@ -174,6 +174,7 @@ <GIC_PPI 14 IRQ_TYPE_LEVEL_LOW 0>, <GIC_PPI 11 IRQ_TYPE_LEVEL_LOW 0>, <GIC_PPI 10 IRQ_TYPE_LEVEL_LOW 0>; + arm,no-tick-in-suspend; }; xin24m: xin24m { diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig index dab2cb0c1f1c..c3caaddde6cc 100644 --- a/arch/arm64/configs/defconfig +++ b/arch/arm64/configs/defconfig @@ -82,6 +82,7 @@ CONFIG_KEXEC=y # CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set CONFIG_COMPAT=y CONFIG_CPU_IDLE=y +CONFIG_HIBERNATION=y CONFIG_ARM_CPUIDLE=y CONFIG_CPU_FREQ=y CONFIG_CPUFREQ_DT=y @@ -257,6 +258,7 @@ CONFIG_GPIO_DWAPB=y CONFIG_GPIO_PL061=y CONFIG_GPIO_RCAR=y CONFIG_GPIO_XGENE=y +CONFIG_GPIO_XGENE_SB=y CONFIG_GPIO_PCA953X=y CONFIG_GPIO_PCA953X_IRQ=y CONFIG_GPIO_MAX77620=y diff --git a/arch/arm64/include/asm/Kbuild b/arch/arm64/include/asm/Kbuild index 44e1d7f10add..8365a84c2640 100644 --- a/arch/arm64/include/asm/Kbuild +++ b/arch/arm64/include/asm/Kbuild @@ -1,7 +1,6 @@ generic-y += bugs.h generic-y += clkdev.h generic-y += cputime.h -generic-y += current.h generic-y += delay.h generic-y += div64.h generic-y += dma.h @@ -24,7 +23,6 @@ generic-y += mm-arch-hooks.h generic-y += mman.h generic-y += msgbuf.h generic-y += msi.h -generic-y += mutex.h generic-y += poll.h generic-y += preempt.h generic-y += resource.h diff --git a/arch/arm64/include/asm/acpi.h b/arch/arm64/include/asm/acpi.h index e517088d635f..d0de0e032bc2 100644 --- a/arch/arm64/include/asm/acpi.h +++ b/arch/arm64/include/asm/acpi.h @@ -17,6 +17,7 @@ #include <asm/cputype.h> #include <asm/smp_plat.h> +#include <asm/tlbflush.h> /* Macros for consistency checks of the GICC subtable of MADT */ #define ACPI_MADT_GICC_LENGTH \ @@ -114,8 +115,28 @@ static inline const char *acpi_get_enable_method(int cpu) } #ifdef CONFIG_ACPI_APEI +/* + * acpi_disable_cmcff is used in drivers/acpi/apei/hest.c for disabling + * IA-32 
Architecture Corrected Machine Check (CMC) Firmware-First mode + * with a kernel command line parameter "acpi=nocmcoff". But we don't + * have this IA-32 specific feature on ARM64, this definition is only + * for compatibility. + */ +#define acpi_disable_cmcff 1 pgprot_t arch_apei_get_mem_attribute(phys_addr_t addr); -#endif + +/* + * Despite its name, this function must still broadcast the TLB + * invalidation in order to ensure other CPUs don't end up with junk + * entries as a result of speculation. Unusually, its also called in + * IRQ context (ghes_iounmap_irq) so if we ever need to use IPIs for + * TLB broadcasting, then we're in trouble here. + */ +static inline void arch_apei_flush_tlb_one(unsigned long addr) +{ + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); +} +#endif /* CONFIG_ACPI_APEI */ #ifdef CONFIG_ACPI_NUMA int arm64_acpi_numa_init(void); diff --git a/arch/arm64/include/asm/arch_gicv3.h b/arch/arm64/include/asm/arch_gicv3.h index f8ae6d6e4767..f37e3a21f6e7 100644 --- a/arch/arm64/include/asm/arch_gicv3.h +++ b/arch/arm64/include/asm/arch_gicv3.h @@ -79,19 +79,10 @@ #include <linux/stringify.h> #include <asm/barrier.h> +#include <asm/cacheflush.h> -#define read_gicreg(r) \ - ({ \ - u64 reg; \ - asm volatile("mrs_s %0, " __stringify(r) : "=r" (reg)); \ - reg; \ - }) - -#define write_gicreg(v,r) \ - do { \ - u64 __val = (v); \ - asm volatile("msr_s " __stringify(r) ", %0" : : "r" (__val));\ - } while (0) +#define read_gicreg read_sysreg_s +#define write_gicreg write_sysreg_s /* * Low-level accessors @@ -102,13 +93,13 @@ static inline void gic_write_eoir(u32 irq) { - asm volatile("msr_s " __stringify(ICC_EOIR1_EL1) ", %0" : : "r" ((u64)irq)); + write_sysreg_s(irq, ICC_EOIR1_EL1); isb(); } static inline void gic_write_dir(u32 irq) { - asm volatile("msr_s " __stringify(ICC_DIR_EL1) ", %0" : : "r" ((u64)irq)); + write_sysreg_s(irq, ICC_DIR_EL1); isb(); } @@ -116,7 +107,7 @@ static inline u64 gic_read_iar_common(void) { u64 irqstat; - asm volatile("mrs_s 
%0, " __stringify(ICC_IAR1_EL1) : "=r" (irqstat)); + irqstat = read_sysreg_s(ICC_IAR1_EL1); dsb(sy); return irqstat; } @@ -132,12 +123,9 @@ static inline u64 gic_read_iar_cavium_thunderx(void) { u64 irqstat; - asm volatile( - "nop;nop;nop;nop\n\t" - "nop;nop;nop;nop\n\t" - "mrs_s %0, " __stringify(ICC_IAR1_EL1) "\n\t" - "nop;nop;nop;nop" - : "=r" (irqstat)); + nops(8); + irqstat = read_sysreg_s(ICC_IAR1_EL1); + nops(4); mb(); return irqstat; @@ -145,37 +133,34 @@ static inline u64 gic_read_iar_cavium_thunderx(void) static inline void gic_write_pmr(u32 val) { - asm volatile("msr_s " __stringify(ICC_PMR_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_PMR_EL1); } static inline void gic_write_ctlr(u32 val) { - asm volatile("msr_s " __stringify(ICC_CTLR_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_CTLR_EL1); isb(); } static inline void gic_write_grpen1(u32 val) { - asm volatile("msr_s " __stringify(ICC_GRPEN1_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_GRPEN1_EL1); isb(); } static inline void gic_write_sgi1r(u64 val) { - asm volatile("msr_s " __stringify(ICC_SGI1R_EL1) ", %0" : : "r" (val)); + write_sysreg_s(val, ICC_SGI1R_EL1); } static inline u32 gic_read_sre(void) { - u64 val; - - asm volatile("mrs_s %0, " __stringify(ICC_SRE_EL1) : "=r" (val)); - return val; + return read_sysreg_s(ICC_SRE_EL1); } static inline void gic_write_sre(u32 val) { - asm volatile("msr_s " __stringify(ICC_SRE_EL1) ", %0" : : "r" ((u64)val)); + write_sysreg_s(val, ICC_SRE_EL1); isb(); } @@ -187,5 +172,21 @@ static inline void gic_write_bpr1(u32 val) #define gic_read_typer(c) readq_relaxed(c) #define gic_write_irouter(v, c) writeq_relaxed(v, c) +#define gic_flush_dcache_to_poc(a,l) __flush_dcache_area((a), (l)) + +#define gits_read_baser(c) readq_relaxed(c) +#define gits_write_baser(v, c) writeq_relaxed(v, c) + +#define gits_read_cbaser(c) readq_relaxed(c) +#define gits_write_cbaser(v, c) writeq_relaxed(v, c) + +#define gits_write_cwriter(v, c) 
writeq_relaxed(v, c) + +#define gicr_read_propbaser(c) readq_relaxed(c) +#define gicr_write_propbaser(v, c) writeq_relaxed(v, c) + +#define gicr_write_pendbaser(v, c) writeq_relaxed(v, c) +#define gicr_read_pendbaser(c) readq_relaxed(c) + #endif /* __ASSEMBLY__ */ #endif /* __ASM_ARCH_GICV3_H */ diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h index 28bfe6132eb6..446f6c46d4b1 100644 --- a/arch/arm64/include/asm/assembler.h +++ b/arch/arm64/include/asm/assembler.h @@ -41,6 +41,15 @@ msr daifclr, #2 .endm + .macro save_and_disable_irq, flags + mrs \flags, daif + msr daifset, #2 + .endm + + .macro restore_irq, flags + msr daif, \flags + .endm + /* * Enable and disable debug exceptions. */ @@ -202,14 +211,25 @@ lr .req x30 // link register .endm /* + * @dst: Result of per_cpu(sym, smp_processor_id()) * @sym: The name of the per-cpu variable - * @reg: Result of per_cpu(sym, smp_processor_id()) * @tmp: scratch register */ - .macro this_cpu_ptr, sym, reg, tmp - adr_l \reg, \sym + .macro adr_this_cpu, dst, sym, tmp + adr_l \dst, \sym mrs \tmp, tpidr_el1 - add \reg, \reg, \tmp + add \dst, \dst, \tmp + .endm + + /* + * @dst: Result of READ_ONCE(per_cpu(sym, smp_processor_id())) + * @sym: The name of the per-cpu variable + * @tmp: scratch register + */ + .macro ldr_this_cpu dst, sym, tmp + adr_l \dst, \sym + mrs \tmp, tpidr_el1 + ldr \dst, [\dst, \tmp] .endm /* @@ -395,4 +415,24 @@ alternative_endif movk \reg, :abs_g0_nc:\val .endm +/* + * Return the current thread_info. + */ + .macro get_thread_info, rd + mrs \rd, sp_el0 + .endm + +/* + * Errata workaround post TTBR0_EL1 update. 
+ */ + .macro post_ttbr0_update_workaround +#ifdef CONFIG_CAVIUM_ERRATUM_27456 +alternative_if ARM64_WORKAROUND_CAVIUM_27456 + ic iallu + dsb nsh + isb +alternative_else_nop_endif +#endif + .endm + #endif /* __ASM_ASSEMBLER_H */ diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index 2e5fb976a572..5a2a6ee65f65 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -65,12 +65,12 @@ * - kaddr - page address * - size - region size */ -extern void flush_cache_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); +extern void sync_icache_aliases(void *kaddr, unsigned long len); static inline void flush_cache_mm(struct mm_struct *mm) { @@ -81,6 +81,11 @@ static inline void flush_cache_page(struct vm_area_struct *vma, { } +static inline void flush_cache_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ +} + /* * Cache maintenance functions used by the DMA API. No to be used directly. 
*/ diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index 87b446535185..4174f09678c4 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -34,7 +34,8 @@ #define ARM64_HAS_32BIT_EL0 13 #define ARM64_HYP_OFFSET_LOW 14 #define ARM64_MISMATCHED_CACHE_LINE_SIZE 15 +#define ARM64_HAS_NO_FPSIMD 16 -#define ARM64_NCAPS 16 +#define ARM64_NCAPS 17 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 0bc0b1de90c4..b4989df48670 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -9,8 +9,6 @@ #ifndef __ASM_CPUFEATURE_H #define __ASM_CPUFEATURE_H -#include <linux/jump_label.h> - #include <asm/cpucaps.h> #include <asm/hwcap.h> #include <asm/sysreg.h> @@ -27,6 +25,8 @@ #ifndef __ASSEMBLY__ +#include <linux/bug.h> +#include <linux/jump_label.h> #include <linux/kernel.h> /* CPU feature register tracking */ @@ -104,14 +104,19 @@ static inline bool cpu_have_feature(unsigned int num) return elf_hwcap & (1UL << num); } +/* System capability check for constant caps */ +static inline bool cpus_have_const_cap(int num) +{ + if (num >= ARM64_NCAPS) + return false; + return static_branch_unlikely(&cpu_hwcap_keys[num]); +} + static inline bool cpus_have_cap(unsigned int num) { if (num >= ARM64_NCAPS) return false; - if (__builtin_constant_p(num)) - return static_branch_unlikely(&cpu_hwcap_keys[num]); - else - return test_bit(num, cpu_hwcaps); + return test_bit(num, cpu_hwcaps); } static inline void cpus_set_cap(unsigned int num) @@ -200,7 +205,7 @@ static inline bool cpu_supports_mixed_endian_el0(void) static inline bool system_supports_32bit_el0(void) { - return cpus_have_cap(ARM64_HAS_32BIT_EL0); + return cpus_have_const_cap(ARM64_HAS_32BIT_EL0); } static inline bool system_supports_mixed_endian_el0(void) @@ -208,6 +213,17 @@ static inline bool system_supports_mixed_endian_el0(void) return 
id_aa64mmfr0_mixed_endian_el0(read_system_reg(SYS_ID_AA64MMFR0_EL1)); } +static inline bool system_supports_fpsimd(void) +{ + return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); +} + +static inline bool system_uses_ttbr0_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && + !cpus_have_cap(ARM64_HAS_PAN); +} + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/arm64/include/asm/current.h b/arch/arm64/include/asm/current.h new file mode 100644 index 000000000000..f2bcbe2d9889 --- /dev/null +++ b/arch/arm64/include/asm/current.h @@ -0,0 +1,22 @@ +#ifndef __ASM_CURRENT_H +#define __ASM_CURRENT_H + +#include <linux/compiler.h> + +#include <asm/sysreg.h> + +#ifndef __ASSEMBLY__ + +struct task_struct; + +static __always_inline struct task_struct *get_current(void) +{ + return (struct task_struct *)read_sysreg(sp_el0); +} + +#define current get_current() + +#endif /* __ASSEMBLY__ */ + +#endif /* __ASM_CURRENT_H */ + diff --git a/arch/arm64/include/asm/debug-monitors.h b/arch/arm64/include/asm/debug-monitors.h index b71420a12f26..a44cf5225429 100644 --- a/arch/arm64/include/asm/debug-monitors.h +++ b/arch/arm64/include/asm/debug-monitors.h @@ -68,6 +68,9 @@ #define BRK64_ESR_MASK 0xFFFF #define BRK64_ESR_KPROBES 0x0004 #define BRK64_OPCODE_KPROBES (AARCH64_BREAK_MON | (BRK64_ESR_KPROBES << 5)) +/* uprobes BRK opcodes with ESR encoding */ +#define BRK64_ESR_UPROBES 0x0005 +#define BRK64_OPCODE_UPROBES (AARCH64_BREAK_MON | (BRK64_ESR_UPROBES << 5)) /* AArch32 */ #define DBG_ESR_EVT_BKPT 0x4 diff --git a/arch/arm64/include/asm/efi.h b/arch/arm64/include/asm/efi.h index a9e54aad15ef..0b6b1633017f 100644 --- a/arch/arm64/include/asm/efi.h +++ b/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include <asm/cpufeature.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/neon.h> @@ -51,6 +52,9 @@ int efi_set_mapping_permissions(struct mm_struct *mm, efi_memory_desc_t *md); #define __efi_call_early(f, ...) 
f(__VA_ARGS__) #define efi_is_64bit() (true) +#define efi_call_proto(protocol, f, instance, ...) \ + ((protocol##_t *)instance)->f(instance, ##__VA_ARGS__) + #define alloc_screen_info(x...) &screen_info #define free_screen_info(x...) @@ -75,7 +79,30 @@ static inline void efifb_setup_from_dmi(struct screen_info *si, const char *opt) static inline void efi_set_pgd(struct mm_struct *mm) { - switch_mm(NULL, mm, NULL); + __switch_mm(mm); + + if (system_uses_ttbr0_pan()) { + if (mm != current->active_mm) { + /* + * Update the current thread's saved ttbr0 since it is + * restored as part of a return from exception. Set + * the hardware TTBR0_EL1 using cpu_switch_mm() + * directly to enable potential errata workarounds. + */ + update_saved_ttbr0(current, mm); + cpu_switch_mm(mm->pgd, mm); + } else { + /* + * Defer the switch to the current thread's TTBR0_EL1 + * until uaccess_enable(). Restore the current + * thread's saved ttbr0 corresponding to its active_mm + * (if different from init_mm). + */ + cpu_set_reserved_ttbr0(); + if (current->active_mm != &init_mm) + update_saved_ttbr0(current, current->active_mm); + } + } } void efi_virtmap_load(void); diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index a55384f4a5d7..5d1700425efe 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -138,7 +138,11 @@ typedef struct user_fpsimd_state elf_fpregset_t; */ #define ELF_PLAT_INIT(_r, load_addr) (_r)->regs[0] = 0 -#define SET_PERSONALITY(ex) clear_thread_flag(TIF_32BIT); +#define SET_PERSONALITY(ex) \ +({ \ + clear_bit(TIF_32BIT, &current->mm->context.flags); \ + clear_thread_flag(TIF_32BIT); \ +}) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ #define ARCH_DLINFO \ @@ -183,7 +187,11 @@ typedef compat_elf_greg_t compat_elf_gregset_t[COMPAT_ELF_NGREG]; ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread -#define COMPAT_SET_PERSONALITY(ex) set_thread_flag(TIF_32BIT); +#define
COMPAT_SET_PERSONALITY(ex) \ +({ \ + set_bit(TIF_32BIT, &current->mm->context.flags); \ + set_thread_flag(TIF_32BIT); \ + }) #define COMPAT_ARCH_DLINFO extern int aarch32_setup_vectors_page(struct linux_binprm *bprm, int uses_interp); diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index f2585cdd32c2..85c4a8981d47 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -21,15 +21,12 @@ #include <linux/futex.h> #include <linux/uaccess.h> -#include <asm/alternative.h> -#include <asm/cpufeature.h> #include <asm/errno.h> -#include <asm/sysreg.h> #define __futex_atomic_op(insn, ret, oldval, uaddr, tmp, oparg) \ +do { \ + uaccess_enable(); \ asm volatile( \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ insn "\n" \ @@ -44,11 +41,11 @@ " .popsection\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (ret), "=&r" (oldval), "+Q" (*uaddr), "=&r" (tmp) \ : "r" (oparg), "Ir" (-EFAULT) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) @@ -118,8 +115,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; + uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" -ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" " sub %w3, %w1, %w4\n" @@ -134,10 +131,10 @@ ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, CONFIG_ARM64_PAN) " .popsection\n" _ASM_EXTABLE(1b, 4b) _ASM_EXTABLE(2b, 4b) -ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, CONFIG_ARM64_PAN) : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp) : "r" (oldval), "r" (newval), "Ir" (-EFAULT) : "memory"); + uaccess_disable(); *uval = val; return ret; diff --git
a/arch/arm64/include/asm/hw_breakpoint.h b/arch/arm64/include/asm/hw_breakpoint.h index 9510ace570e2..b6b167ac082b 100644 --- a/arch/arm64/include/asm/hw_breakpoint.h +++ b/arch/arm64/include/asm/hw_breakpoint.h @@ -77,7 +77,11 @@ static inline void decode_ctrl_reg(u32 reg, /* Lengths */ #define ARM_BREAKPOINT_LEN_1 0x1 #define ARM_BREAKPOINT_LEN_2 0x3 +#define ARM_BREAKPOINT_LEN_3 0x7 #define ARM_BREAKPOINT_LEN_4 0xf +#define ARM_BREAKPOINT_LEN_5 0x1f +#define ARM_BREAKPOINT_LEN_6 0x3f +#define ARM_BREAKPOINT_LEN_7 0x7f #define ARM_BREAKPOINT_LEN_8 0xff /* Kernel stepping */ @@ -119,7 +123,7 @@ struct perf_event; struct pmu; extern int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type); + int *gen_len, int *gen_type, int *offset); extern int arch_check_bp_in_kernelspace(struct perf_event *bp); extern int arch_validate_hwbkpt_settings(struct perf_event *bp); extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused, diff --git a/arch/arm64/include/asm/io.h b/arch/arm64/include/asm/io.h index 0bba427bb4c2..0c00c87bb9dd 100644 --- a/arch/arm64/include/asm/io.h +++ b/arch/arm64/include/asm/io.h @@ -22,7 +22,6 @@ #ifdef __KERNEL__ #include <linux/types.h> -#include <linux/blk_types.h> #include <asm/byteorder.h> #include <asm/barrier.h> diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 7e51d1b57c0c..7803343e5881 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -19,6 +19,7 @@ #ifndef __ASM_KERNEL_PGTABLE_H #define __ASM_KERNEL_PGTABLE_H +#include <asm/pgtable.h> #include <asm/sparsemem.h> /* @@ -54,6 +55,12 @@ #define SWAPPER_DIR_SIZE (SWAPPER_PGTABLE_LEVELS * PAGE_SIZE) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +#define RESERVED_TTBR0_SIZE (PAGE_SIZE) +#else +#define RESERVED_TTBR0_SIZE (0) +#endif + /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS 
#define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 8d9fce037b2f..47619411f0ff 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -19,6 +19,7 @@ typedef struct { atomic64_t id; void *vdso; + unsigned long flags; } mm_context_t; /* @@ -34,7 +35,7 @@ extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); extern void init_mem_pgprot(void); extern void create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot, bool allow_block_mappings); + pgprot_t prot, bool page_mappings_only); extern void *fixmap_remap_fdt(phys_addr_t dt_phys); #endif diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index a50185375f09..0363fe80455c 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -23,6 +23,7 @@ #include <linux/sched.h> #include <asm/cacheflush.h> +#include <asm/cpufeature.h> #include <asm/proc-fns.h> #include <asm-generic/mm_hooks.h> #include <asm/cputype.h> @@ -103,7 +104,7 @@ static inline void cpu_uninstall_idmap(void) local_flush_tlb_all(); cpu_set_default_tcr_t0sz(); - if (mm != &init_mm) + if (mm != &init_mm && !system_uses_ttbr0_pan()) cpu_switch_mm(mm->pgd, mm); } @@ -163,20 +164,26 @@ enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { } -/* - * This is the actual mm switch as far as the scheduler - * is concerned. No registers are touched. We avoid - * calling the CPU specific function when the mm hasn't - * actually changed. 
- */ -static inline void -switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) { - unsigned int cpu = smp_processor_id(); + if (system_uses_ttbr0_pan()) { + BUG_ON(mm->pgd == swapper_pg_dir); + task_thread_info(tsk)->ttbr0 = + virt_to_phys(mm->pgd) | ASID(mm) << 48; + } +} +#else +static inline void update_saved_ttbr0(struct task_struct *tsk, + struct mm_struct *mm) +{ +} +#endif - if (prev == next) - return; +static inline void __switch_mm(struct mm_struct *next) +{ + unsigned int cpu = smp_processor_id(); /* * init_mm.pgd does not contain any user mappings and it is always @@ -190,8 +197,26 @@ switch_mm(struct mm_struct *prev, struct mm_struct *next, check_and_switch_context(next, cpu); } +static inline void +switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + if (prev != next) + __switch_mm(next); + + /* + * Update the saved TTBR0_EL1 of the scheduled-in task as the previous + * value may have not been initialised yet (activate_mm caller) or the + * ASID has changed since the last run (following the context switch + * of another thread of the same process). Avoid setting the reserved + * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). 
+ */ + if (next != &init_mm) + update_saved_ttbr0(tsk, next); +} + #define deactivate_mm(tsk,mm) do { } while (0) -#define activate_mm(prev,next) switch_mm(prev, next, NULL) +#define activate_mm(prev,next) switch_mm(prev, next, current) void verify_cpu_asid_bits(void); diff --git a/arch/arm64/include/asm/neon.h b/arch/arm64/include/asm/neon.h index 13ce4cc18e26..ad4cdc966c0f 100644 --- a/arch/arm64/include/asm/neon.h +++ b/arch/arm64/include/asm/neon.h @@ -9,8 +9,9 @@ */ #include <linux/types.h> +#include <asm/fpsimd.h> -#define cpu_has_neon() (1) +#define cpu_has_neon() system_supports_fpsimd() #define kernel_neon_begin() kernel_neon_begin_partial(32) diff --git a/arch/arm64/include/asm/opcodes.h b/arch/arm64/include/asm/opcodes.h deleted file mode 100644 index 123f45d92cd1..000000000000 --- a/arch/arm64/include/asm/opcodes.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_CPU_BIG_ENDIAN -#define CONFIG_CPU_ENDIAN_BE8 CONFIG_CPU_BIG_ENDIAN -#endif - -#include <../../arm/include/asm/opcodes.h> diff --git a/arch/arm64/include/asm/percpu.h b/arch/arm64/include/asm/percpu.h index 5394c8405e66..3bd498e4de4c 100644 --- a/arch/arm64/include/asm/percpu.h +++ b/arch/arm64/include/asm/percpu.h @@ -16,6 +16,8 @@ #ifndef __ASM_PERCPU_H #define __ASM_PERCPU_H +#include <asm/stack_pointer.h> + static inline void set_my_cpu_offset(unsigned long off) { asm volatile("msr tpidr_el1, %0" :: "r" (off) : "memory"); @@ -101,16 +103,16 @@ static inline unsigned long __percpu_read(void *ptr, int size) switch (size) { case 1: - ret = ACCESS_ONCE(*(u8 *)ptr); + ret = READ_ONCE(*(u8 *)ptr); break; case 2: - ret = ACCESS_ONCE(*(u16 *)ptr); + ret = READ_ONCE(*(u16 *)ptr); break; case 4: - ret = ACCESS_ONCE(*(u32 *)ptr); + ret = READ_ONCE(*(u32 *)ptr); break; case 8: - ret = ACCESS_ONCE(*(u64 *)ptr); + ret = READ_ONCE(*(u64 *)ptr); break; default: BUILD_BUG(); @@ -123,16 +125,16 @@ static inline void __percpu_write(void *ptr, unsigned long val, int size) { switch (size) { case 1: - 
ACCESS_ONCE(*(u8 *)ptr) = (u8)val; + WRITE_ONCE(*(u8 *)ptr, (u8)val); break; case 2: - ACCESS_ONCE(*(u16 *)ptr) = (u16)val; + WRITE_ONCE(*(u16 *)ptr, (u16)val); break; case 4: - ACCESS_ONCE(*(u32 *)ptr) = (u32)val; + WRITE_ONCE(*(u32 *)ptr, (u32)val); break; case 8: - ACCESS_ONCE(*(u64 *)ptr) = (u64)val; + WRITE_ONCE(*(u64 *)ptr, (u64)val); break; default: BUILD_BUG(); diff --git a/arch/arm64/include/asm/perf_event.h b/arch/arm64/include/asm/perf_event.h index 38b6a2b49d68..8d5cbec17d80 100644 --- a/arch/arm64/include/asm/perf_event.h +++ b/arch/arm64/include/asm/perf_event.h @@ -17,6 +17,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H +#include <asm/stack_pointer.h> + #define ARMV8_PMU_MAX_COUNTERS 32 #define ARMV8_PMU_COUNTER_MASK (ARMV8_PMU_MAX_COUNTERS - 1) diff --git a/arch/arm64/include/asm/probes.h b/arch/arm64/include/asm/probes.h index 5af574d632fa..6a5b28904c33 100644 --- a/arch/arm64/include/asm/probes.h +++ b/arch/arm64/include/asm/probes.h @@ -15,21 +15,22 @@ #ifndef _ARM_PROBES_H #define _ARM_PROBES_H -#include <asm/opcodes.h> - -struct kprobe; -struct arch_specific_insn; - -typedef u32 kprobe_opcode_t; -typedef void (kprobes_handler_t) (u32 opcode, long addr, struct pt_regs *); +typedef u32 probe_opcode_t; +typedef void (probes_handler_t) (u32 opcode, long addr, struct pt_regs *); /* architecture specific copy of original instruction */ -struct arch_specific_insn { - kprobe_opcode_t *insn; +struct arch_probe_insn { + probe_opcode_t *insn; pstate_check_t *pstate_cc; - kprobes_handler_t *handler; + probes_handler_t *handler; /* restore address after step xol */ unsigned long restore; }; +#ifdef CONFIG_KPROBES +typedef u32 kprobe_opcode_t; +struct arch_specific_insn { + struct arch_probe_insn api; +}; +#endif #endif diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 60e34824e18c..747c65a616ed 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -149,8 +149,6 @@ 
static inline void cpu_relax(void) asm volatile("yield" ::: "memory"); } -#define cpu_relax_lowlatency() cpu_relax() - /* Thread switching */ extern struct task_struct *cpu_switch_to(struct task_struct *prev, struct task_struct *next); diff --git a/arch/arm64/include/asm/ptdump.h b/arch/arm64/include/asm/ptdump.h index 07b8ed037dee..6afd8476c60c 100644 --- a/arch/arm64/include/asm/ptdump.h +++ b/arch/arm64/include/asm/ptdump.h @@ -16,9 +16,10 @@ #ifndef __ASM_PTDUMP_H #define __ASM_PTDUMP_H -#ifdef CONFIG_ARM64_PTDUMP +#ifdef CONFIG_ARM64_PTDUMP_CORE #include <linux/mm_types.h> +#include <linux/seq_file.h> struct addr_marker { unsigned long start_address; @@ -29,16 +30,25 @@ struct ptdump_info { struct mm_struct *mm; const struct addr_marker *markers; unsigned long base_addr; - unsigned long max_addr; }; -int ptdump_register(struct ptdump_info *info, const char *name); - +void ptdump_walk_pgd(struct seq_file *s, struct ptdump_info *info); +#ifdef CONFIG_ARM64_PTDUMP_DEBUGFS +int ptdump_debugfs_register(struct ptdump_info *info, const char *name); #else -static inline int ptdump_register(struct ptdump_info *info, const char *name) +static inline int ptdump_debugfs_register(struct ptdump_info *info, + const char *name) { return 0; } -#endif /* CONFIG_ARM64_PTDUMP */ +#endif +void ptdump_check_wx(void); +#endif /* CONFIG_ARM64_PTDUMP_CORE */ + +#ifdef CONFIG_DEBUG_WX +#define debug_checkwx() ptdump_check_wx() +#else +#define debug_checkwx() do { } while (0) +#endif #endif /* __ASM_PTDUMP_H */ diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index ada08b5b036d..513daf050e84 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -217,6 +217,14 @@ int valid_user_regs(struct user_pt_regs *regs, struct task_struct *task); #include <asm-generic/ptrace.h> +#define procedure_link_pointer(regs) ((regs)->regs[30]) + +static inline void procedure_link_pointer_set(struct pt_regs *regs, + unsigned long val) +{ + 
procedure_link_pointer(regs) = val; +} + #undef profile_pc extern unsigned long profile_pc(struct pt_regs *regs); diff --git a/arch/arm64/include/asm/smp.h b/arch/arm64/include/asm/smp.h index 022644704a93..d050d720a1b4 100644 --- a/arch/arm64/include/asm/smp.h +++ b/arch/arm64/include/asm/smp.h @@ -29,11 +29,22 @@ #ifndef __ASSEMBLY__ +#include <asm/percpu.h> + #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/thread_info.h> -#define raw_smp_processor_id() (current_thread_info()->cpu) +DECLARE_PER_CPU_READ_MOSTLY(int, cpu_number); + +/* + * We don't use this_cpu_read(cpu_number) as that has implicit writes to + * preempt_count, and associated (compiler) barriers, that we'd like to avoid + * the expense of. If we're preemptible, the value can be stale at use anyway. + * And we can't use this_cpu_ptr() either, as that winds up recursing back + * here under CONFIG_DEBUG_PREEMPT=y. + */ +#define raw_smp_processor_id() (*raw_cpu_ptr(&cpu_number)) struct seq_file; @@ -73,6 +84,7 @@ asmlinkage void secondary_start_kernel(void); */ struct secondary_data { void *stack; + struct task_struct *task; long status; }; diff --git a/arch/arm64/include/asm/stack_pointer.h b/arch/arm64/include/asm/stack_pointer.h new file mode 100644 index 000000000000..ffcdf742cddf --- /dev/null +++ b/arch/arm64/include/asm/stack_pointer.h @@ -0,0 +1,9 @@ +#ifndef __ASM_STACK_POINTER_H +#define __ASM_STACK_POINTER_H + +/* + * how to get the current stack pointer from C + */ +register unsigned long current_stack_pointer asm ("sp"); + +#endif /* __ASM_STACK_POINTER_H */ diff --git a/arch/arm64/include/asm/suspend.h b/arch/arm64/include/asm/suspend.h index b8a313fd7a09..de5600f40adf 100644 --- a/arch/arm64/include/asm/suspend.h +++ b/arch/arm64/include/asm/suspend.h @@ -1,7 +1,7 @@ #ifndef __ASM_SUSPEND_H #define __ASM_SUSPEND_H -#define NR_CTX_REGS 10 +#define NR_CTX_REGS 12 #define NR_CALLEE_SAVED_REGS 12 /* diff --git a/arch/arm64/include/asm/sysreg.h 
b/arch/arm64/include/asm/sysreg.h index 6c80b3699cb8..98ae03f8eedd 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -22,8 +22,6 @@ #include <linux/stringify.h> -#include <asm/opcodes.h> - /* * ARMv8 ARM reserves the following encoding for system registers: * (Ref: ARMv8 ARM, Section: "System instruction class encoding overview", @@ -37,6 +35,33 @@ #define sys_reg(op0, op1, crn, crm, op2) \ ((((op0)&3)<<19)|((op1)<<16)|((crn)<<12)|((crm)<<8)|((op2)<<5)) +#ifndef CONFIG_BROKEN_GAS_INST + +#ifdef __ASSEMBLY__ +#define __emit_inst(x) .inst (x) +#else +#define __emit_inst(x) ".inst " __stringify((x)) "\n\t" +#endif + +#else /* CONFIG_BROKEN_GAS_INST */ + +#ifndef CONFIG_CPU_BIG_ENDIAN +#define __INSTR_BSWAP(x) (x) +#else /* CONFIG_CPU_BIG_ENDIAN */ +#define __INSTR_BSWAP(x) ((((x) << 24) & 0xff000000) | \ + (((x) << 8) & 0x00ff0000) | \ + (((x) >> 8) & 0x0000ff00) | \ + (((x) >> 24) & 0x000000ff)) +#endif /* CONFIG_CPU_BIG_ENDIAN */ + +#ifdef __ASSEMBLY__ +#define __emit_inst(x) .long __INSTR_BSWAP(x) +#else /* __ASSEMBLY__ */ +#define __emit_inst(x) ".long " __stringify(__INSTR_BSWAP(x)) "\n\t" +#endif /* __ASSEMBLY__ */ + +#endif /* CONFIG_BROKEN_GAS_INST */ + #define SYS_MIDR_EL1 sys_reg(3, 0, 0, 0, 0) #define SYS_MPIDR_EL1 sys_reg(3, 0, 0, 0, 5) #define SYS_REVIDR_EL1 sys_reg(3, 0, 0, 0, 6) @@ -81,10 +106,10 @@ #define REG_PSTATE_PAN_IMM sys_reg(0, 0, 4, 0, 4) #define REG_PSTATE_UAO_IMM sys_reg(0, 0, 4, 0, 3) -#define SET_PSTATE_PAN(x) __inst_arm(0xd5000000 | REG_PSTATE_PAN_IMM |\ - (!!x)<<8 | 0x1f) -#define SET_PSTATE_UAO(x) __inst_arm(0xd5000000 | REG_PSTATE_UAO_IMM |\ - (!!x)<<8 | 0x1f) +#define SET_PSTATE_PAN(x) __emit_inst(0xd5000000 | REG_PSTATE_PAN_IMM | \ + (!!x)<<8 | 0x1f) +#define SET_PSTATE_UAO(x) __emit_inst(0xd5000000 | REG_PSTATE_UAO_IMM | \ + (!!x)<<8 | 0x1f) /* Common SCTLR_ELx flags. 
*/ #define SCTLR_ELx_EE (1 << 25) @@ -228,11 +253,11 @@ .equ .L__reg_num_xzr, 31 .macro mrs_s, rt, sreg - .inst 0xd5200000|(\sreg)|(.L__reg_num_\rt) + __emit_inst(0xd5200000|(\sreg)|(.L__reg_num_\rt)) .endm .macro msr_s, sreg, rt - .inst 0xd5000000|(\sreg)|(.L__reg_num_\rt) + __emit_inst(0xd5000000|(\sreg)|(.L__reg_num_\rt)) .endm #else @@ -246,11 +271,11 @@ asm( " .equ .L__reg_num_xzr, 31\n" "\n" " .macro mrs_s, rt, sreg\n" -" .inst 0xd5200000|(\\sreg)|(.L__reg_num_\\rt)\n" + __emit_inst(0xd5200000|(\\sreg)|(.L__reg_num_\\rt)) " .endm\n" "\n" " .macro msr_s, sreg, rt\n" -" .inst 0xd5000000|(\\sreg)|(.L__reg_num_\\rt)\n" + __emit_inst(0xd5000000|(\\sreg)|(.L__reg_num_\\rt)) " .endm\n" ); diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index e9ea5a6bd449..46c3b93cf865 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -36,58 +36,31 @@ struct task_struct; +#include <asm/stack_pointer.h> #include <asm/types.h> typedef unsigned long mm_segment_t; /* * low level task data that entry.S needs immediate access to. - * __switch_to() assumes cpu_context follows immediately after cpu_domain. 
*/ struct thread_info { unsigned long flags; /* low level flags */ mm_segment_t addr_limit; /* address limit */ - struct task_struct *task; /* main task structure */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + u64 ttbr0; /* saved TTBR0_EL1 */ +#endif int preempt_count; /* 0 => preemptable, <0 => bug */ - int cpu; /* cpu */ }; #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ - .flags = 0, \ .preempt_count = INIT_PREEMPT_COUNT, \ .addr_limit = KERNEL_DS, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) -/* - * how to get the current stack pointer from C - */ -register unsigned long current_stack_pointer asm ("sp"); - -/* - * how to get the thread information struct from C - */ -static inline struct thread_info *current_thread_info(void) __attribute_const__; - -/* - * struct thread_info can be accessed directly via sp_el0. - * - * We don't use read_sysreg() as we want the compiler to cache the value where - * possible. - */ -static inline struct thread_info *current_thread_info(void) -{ - unsigned long sp_el0; - - asm ("mrs %0, sp_el0" : "=r" (sp_el0)); - - return (struct thread_info *)sp_el0; -} - #define thread_saved_pc(tsk) \ ((unsigned long)(tsk->thread.cpu_context.pc)) #define thread_saved_sp(tsk) \ @@ -112,6 +85,7 @@ static inline struct thread_info *current_thread_info(void) #define TIF_NEED_RESCHED 1 #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ +#define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -132,10 +106,12 @@ static inline struct thread_info *current_thread_info(void) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) +#define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK 
(_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE) + _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ + _TIF_UPROBE) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 55d0adbf6509..d26750ca6e06 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -18,6 +18,12 @@ #ifndef __ASM_UACCESS_H #define __ASM_UACCESS_H +#include <asm/alternative.h> +#include <asm/kernel-pgtable.h> +#include <asm/sysreg.h> + +#ifndef __ASSEMBLY__ + /* * User space memory access functions */ @@ -26,10 +32,8 @@ #include <linux/string.h> #include <linux/thread_info.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/ptrace.h> -#include <asm/sysreg.h> #include <asm/errno.h> #include <asm/memory.h> #include <asm/compiler.h> @@ -120,6 +124,99 @@ static inline void set_fs(mm_segment_t fs) " .popsection\n" /* + * User access enabling/disabling. + */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN +static inline void __uaccess_ttbr0_disable(void) +{ + unsigned long ttbr; + + /* reserved_ttbr0 placed at the end of swapper_pg_dir */ + ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; + write_sysreg(ttbr, ttbr0_el1); + isb(); +} + +static inline void __uaccess_ttbr0_enable(void) +{ + unsigned long flags; + + /* + * Disable interrupts to avoid preemption between reading the 'ttbr0' + * variable and the MSR. A context switch could trigger an ASID + * roll-over and an update of 'ttbr0'. 
+ */ + local_irq_save(flags); + write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + isb(); + local_irq_restore(flags); +} + +static inline bool uaccess_ttbr0_disable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_disable(); + return true; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + if (!system_uses_ttbr0_pan()) + return false; + __uaccess_ttbr0_enable(); + return true; +} +#else +static inline bool uaccess_ttbr0_disable(void) +{ + return false; +} + +static inline bool uaccess_ttbr0_enable(void) +{ + return false; +} +#endif + +#define __uaccess_disable(alt) \ +do { \ + if (!uaccess_ttbr0_disable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +#define __uaccess_enable(alt) \ +do { \ + if (!uaccess_ttbr0_enable()) \ + asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ + CONFIG_ARM64_PAN)); \ +} while (0) + +static inline void uaccess_disable(void) +{ + __uaccess_disable(ARM64_HAS_PAN); +} + +static inline void uaccess_enable(void) +{ + __uaccess_enable(ARM64_HAS_PAN); +} + +/* + * These functions are no-ops when UAO is present. + */ +static inline void uaccess_disable_not_uao(void) +{ + __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); +} + +static inline void uaccess_enable_not_uao(void) +{ + __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); +} + +/* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" * call. 
@@ -146,8 +243,7 @@ static inline void set_fs(mm_segment_t fs) do { \ unsigned long __gu_val; \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ @@ -168,9 +264,8 @@ do { \ default: \ BUILD_BUG(); \ } \ + uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ } while (0) #define __get_user(x, ptr) \ @@ -215,8 +310,7 @@ do { \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ __chk_user_ptr(ptr); \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ @@ -237,8 +331,7 @@ do { \ default: \ BUILD_BUG(); \ } \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_ALT_PAN_NOT_UAO,\ - CONFIG_ARM64_PAN)); \ + uaccess_disable_not_uao(); \ } while (0) #define __put_user(x, ptr) \ @@ -331,4 +424,66 @@ extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strlen_user(const char __user *str); extern __must_check long strnlen_user(const char __user *str, long n); +#else /* __ASSEMBLY__ */ + +#include <asm/assembler.h> + +/* + * User access enabling/disabling macros. 
+ */ +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + .macro __uaccess_ttbr0_disable, tmp1 + mrs \tmp1, ttbr1_el1 // swapper_pg_dir + add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir + msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 + isb + .endm + + .macro __uaccess_ttbr0_enable, tmp1 + get_thread_info \tmp1 + ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 + isb + .endm + + .macro uaccess_ttbr0_disable, tmp1 +alternative_if_not ARM64_HAS_PAN + __uaccess_ttbr0_disable \tmp1 +alternative_else_nop_endif + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 +alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption + __uaccess_ttbr0_enable \tmp1 + restore_irq \tmp2 +alternative_else_nop_endif + .endm +#else + .macro uaccess_ttbr0_disable, tmp1 + .endm + + .macro uaccess_ttbr0_enable, tmp1, tmp2 + .endm +#endif + +/* + * These macros are no-ops when UAO is present. + */ + .macro uaccess_disable_not_uao, tmp1 + uaccess_ttbr0_disable \tmp1 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(1) +alternative_else_nop_endif + .endm + + .macro uaccess_enable_not_uao, tmp1, tmp2 + uaccess_ttbr0_enable \tmp1, \tmp2 +alternative_if ARM64_ALT_PAN_NOT_UAO + SET_PSTATE_PAN(0) +alternative_else_nop_endif + .endm + +#endif /* __ASSEMBLY__ */ + #endif /* __ASM_UACCESS_H */ diff --git a/arch/arm64/include/asm/uprobes.h b/arch/arm64/include/asm/uprobes.h new file mode 100644 index 000000000000..8d004073d0e8 --- /dev/null +++ b/arch/arm64/include/asm/uprobes.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H + +#include <asm/debug-monitors.h> +#include <asm/insn.h> +#include <asm/probes.h> + +#define MAX_UINSN_BYTES AARCH64_INSN_SIZE + +#define UPROBE_SWBP_INSN BRK64_OPCODE_UPROBES +#define UPROBE_SWBP_INSN_SIZE AARCH64_INSN_SIZE +#define UPROBE_XOL_SLOT_BYTES MAX_UINSN_BYTES + +typedef u32 uprobe_opcode_t; + +struct arch_uprobe_task { +}; + +struct arch_uprobe { + union { + u8 insn[MAX_UINSN_BYTES]; + u8 ixol[MAX_UINSN_BYTES]; + }; + struct arch_probe_insn api; + bool simulate; +}; + +#endif diff --git a/arch/arm64/include/asm/xen/hypercall.h b/arch/arm64/include/asm/xen/hypercall.h index 74b0c423ff5b..3522cbaed316 100644 --- a/arch/arm64/include/asm/xen/hypercall.h +++ b/arch/arm64/include/asm/xen/hypercall.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/hypercall.h> +#include <xen/arm/hypercall.h> diff --git a/arch/arm64/include/asm/xen/hypervisor.h b/arch/arm64/include/asm/xen/hypervisor.h index f263da8e8769..d6e7709d0688 100644 --- a/arch/arm64/include/asm/xen/hypervisor.h +++ b/arch/arm64/include/asm/xen/hypervisor.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/hypervisor.h> +#include <xen/arm/hypervisor.h> diff --git a/arch/arm64/include/asm/xen/interface.h b/arch/arm64/include/asm/xen/interface.h index 44457aebeed4..88c0d75da190 100644 --- a/arch/arm64/include/asm/xen/interface.h +++ b/arch/arm64/include/asm/xen/interface.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/interface.h> +#include <xen/arm/interface.h> diff --git a/arch/arm64/include/asm/xen/page-coherent.h b/arch/arm64/include/asm/xen/page-coherent.h index 2052102b4e02..b3ef061d8b74 100644 --- a/arch/arm64/include/asm/xen/page-coherent.h +++ b/arch/arm64/include/asm/xen/page-coherent.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/page-coherent.h> +#include <xen/arm/page-coherent.h> diff --git a/arch/arm64/include/asm/xen/page.h b/arch/arm64/include/asm/xen/page.h index bed87ec36780..31bbc803cecb 100644 --- 
a/arch/arm64/include/asm/xen/page.h +++ b/arch/arm64/include/asm/xen/page.h @@ -1 +1 @@ -#include <../../arm/include/asm/xen/page.h> +#include <xen/arm/page.h> diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index b0988bb1bf64..04de188a36c9 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -14,10 +14,8 @@ #include <linux/slab.h> #include <linux/sysctl.h> -#include <asm/alternative.h> #include <asm/cpufeature.h> #include <asm/insn.h> -#include <asm/opcodes.h> #include <asm/sysreg.h> #include <asm/system_misc.h> #include <asm/traps.h> @@ -285,10 +283,10 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) #define __SWP_LL_SC_LOOPS 4 #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ +do { \ + uaccess_enable(); \ __asm__ __volatile__( \ " mov %w3, %w7\n" \ - ALTERNATIVE("nop", SET_PSTATE_PAN(0), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ "0: ldxr"B" %w2, [%4]\n" \ "1: stxr"B" %w0, %w1, [%4]\n" \ " cbz %w0, 2f\n" \ @@ -306,12 +304,12 @@ static void __init register_insn_emulation_sysctl(struct ctl_table *table) " .popsection" \ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ - ALTERNATIVE("nop", SET_PSTATE_PAN(1), ARM64_HAS_PAN, \ - CONFIG_ARM64_PAN) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ - : "memory") + : "memory"); \ + uaccess_disable(); \ +} while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ __user_swpX_asm(data, addr, res, temp, temp2, "") @@ -352,6 +350,10 @@ static int emulate_swpX(unsigned int address, unsigned int *data, return res; } +#define ARM_OPCODE_CONDTEST_FAIL 0 +#define ARM_OPCODE_CONDTEST_PASS 1 +#define ARM_OPCODE_CONDTEST_UNCOND 2 + #define ARM_OPCODE_CONDITION_UNCOND 0xf static unsigned int __kprobes aarch32_check_condition(u32 opcode, u32 psr) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 
4a2f0f0fef32..bc049afc73a7 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -36,11 +36,13 @@ int main(void) { DEFINE(TSK_ACTIVE_MM, offsetof(struct task_struct, active_mm)); BLANK(); - DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); - DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); - DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); - DEFINE(TI_TASK, offsetof(struct thread_info, task)); - DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); + DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); + DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); +#endif + DEFINE(TSK_STACK, offsetof(struct task_struct, stack)); BLANK(); DEFINE(THREAD_CPU_CONTEXT, offsetof(struct task_struct, thread.cpu_context)); BLANK(); @@ -123,6 +125,7 @@ int main(void) DEFINE(TZ_DSTTIME, offsetof(struct timezone, tz_dsttime)); BLANK(); DEFINE(CPU_BOOT_STACK, offsetof(struct secondary_data, stack)); + DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task)); BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_CONTEXT, offsetof(struct kvm_vcpu, arch.ctxt)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index c02504ea304b..fdf8f045929f 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -47,6 +47,7 @@ unsigned int compat_elf_hwcap2 __read_mostly; #endif DECLARE_BITMAP(cpu_hwcaps, ARM64_NCAPS); +EXPORT_SYMBOL(cpu_hwcaps); DEFINE_STATIC_KEY_ARRAY_FALSE(cpu_hwcap_keys, ARM64_NCAPS); EXPORT_SYMBOL(cpu_hwcap_keys); @@ -746,6 +747,14 @@ static bool hyp_offset_low(const struct arm64_cpu_capabilities *entry, return idmap_addr > GENMASK(VA_BITS - 2, 0) && !is_kernel_in_hyp_mode(); } +static bool has_no_fpsimd(const struct 
arm64_cpu_capabilities *entry, int __unused) +{ + u64 pfr0 = read_system_reg(SYS_ID_AA64PFR0_EL1); + + return cpuid_feature_extract_signed_field(pfr0, + ID_AA64PFR0_FP_SHIFT) < 0; +} + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -829,6 +838,13 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, + { + /* FP/SIMD is not implemented */ + .capability = ARM64_HAS_NO_FPSIMD, + .def_scope = SCOPE_SYSTEM, + .min_field_value = 0, + .matches = has_no_fpsimd, + }, {}, }; @@ -1102,5 +1118,5 @@ void __init setup_cpu_features(void) static bool __maybe_unused cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) { - return (cpus_have_cap(ARM64_HAS_PAN) && !cpus_have_cap(ARM64_HAS_UAO)); + return (cpus_have_const_cap(ARM64_HAS_PAN) && !cpus_have_const_cap(ARM64_HAS_UAO)); } diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c index b3d5b3e8fbcb..7b7be71e87bf 100644 --- a/arch/arm64/kernel/cpuinfo.c +++ b/arch/arm64/kernel/cpuinfo.c @@ -227,7 +227,7 @@ static struct attribute_group cpuregs_attr_group = { .name = "identification" }; -static int cpuid_add_regs(int cpu) +static int cpuid_cpu_online(unsigned int cpu) { int rc; struct device *dev; @@ -248,7 +248,7 @@ out: return rc; } -static int cpuid_remove_regs(int cpu) +static int cpuid_cpu_offline(unsigned int cpu) { struct device *dev; struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); @@ -264,40 +264,22 @@ static int cpuid_remove_regs(int cpu) return 0; } -static int cpuid_callback(struct notifier_block *nb, - unsigned long action, void *hcpu) -{ - int rc = 0; - unsigned long cpu = (unsigned long)hcpu; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - rc = cpuid_add_regs(cpu); - break; - case CPU_DEAD: - rc = cpuid_remove_regs(cpu); - break; - } - - return notifier_from_errno(rc); -} - static int __init cpuinfo_regs_init(void) { - 
int cpu; - - cpu_notifier_register_begin(); + int cpu, ret; for_each_possible_cpu(cpu) { struct cpuinfo_arm64 *info = &per_cpu(cpu_data, cpu); kobject_init(&info->kobj, &cpuregs_kobj_type); - if (cpu_online(cpu)) - cpuid_add_regs(cpu); } - __hotcpu_notifier(cpuid_callback, 0); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "arm64/cpuinfo:online", + cpuid_cpu_online, cpuid_cpu_offline); + if (ret < 0) { + pr_err("cpuinfo: failed to register hotplug callbacks.\n"); + return ret; + } return 0; } static void cpuinfo_detect_icache_policy(struct cpuinfo_arm64 *info) diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 73ae90ef434c..605df76f0a06 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -226,6 +226,8 @@ static void send_user_sigtrap(int si_code) static int single_step_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { + bool handler_found = false; + /* * If we are stepping a pending breakpoint, call the hw_breakpoint * handler first. @@ -233,7 +235,14 @@ static int single_step_handler(unsigned long addr, unsigned int esr, if (!reinstall_suspended_bps(regs)) return 0; - if (user_mode(regs)) { +#ifdef CONFIG_KPROBES + if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; +#endif + if (!handler_found && call_step_hook(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; + + if (!handler_found && user_mode(regs)) { send_user_sigtrap(TRAP_TRACE); /* @@ -243,15 +252,8 @@ static int single_step_handler(unsigned long addr, unsigned int esr, * to the active-not-pending state). 
*/ user_rewind_single_step(current); - } else { -#ifdef CONFIG_KPROBES - if (kprobe_single_step_handler(regs, esr) == DBG_HOOK_HANDLED) - return 0; -#endif - if (call_step_hook(regs, esr) == DBG_HOOK_HANDLED) - return 0; - - pr_warning("Unexpected kernel single-step exception at EL1\n"); + } else if (!handler_found) { + pr_warn("Unexpected kernel single-step exception at EL1\n"); /* * Re-enable stepping since we know that we will be * returning to regs. @@ -304,16 +306,20 @@ NOKPROBE_SYMBOL(call_break_hook); static int brk_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - if (user_mode(regs)) { - send_user_sigtrap(TRAP_BRKPT); - } + bool handler_found = false; + #ifdef CONFIG_KPROBES - else if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { - if (kprobe_breakpoint_handler(regs, esr) != DBG_HOOK_HANDLED) - return -EFAULT; + if ((esr & BRK64_ESR_MASK) == BRK64_ESR_KPROBES) { + if (kprobe_breakpoint_handler(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; } #endif - else if (call_break_hook(regs, esr) != DBG_HOOK_HANDLED) { + if (!handler_found && call_break_hook(regs, esr) == DBG_HOOK_HANDLED) + handler_found = true; + + if (!handler_found && user_mode(regs)) { + send_user_sigtrap(TRAP_BRKPT); + } else if (!handler_found) { pr_warn("Unexpected kernel BRK exception at EL1\n"); return -EFAULT; } diff --git a/arch/arm64/kernel/efi.c b/arch/arm64/kernel/efi.c index ba9bee389fd5..5d17f377d905 100644 --- a/arch/arm64/kernel/efi.c +++ b/arch/arm64/kernel/efi.c @@ -62,8 +62,8 @@ struct screen_info screen_info __section(.data); int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) { pteval_t prot_val = create_mapping_protection(md); - bool allow_block_mappings = (md->type != EFI_RUNTIME_SERVICES_CODE && - md->type != EFI_RUNTIME_SERVICES_DATA); + bool page_mappings_only = (md->type == EFI_RUNTIME_SERVICES_CODE || + md->type == EFI_RUNTIME_SERVICES_DATA); if (!PAGE_ALIGNED(md->phys_addr) || !PAGE_ALIGNED(md->num_pages << 
EFI_PAGE_SHIFT)) { @@ -76,12 +76,12 @@ int __init efi_create_mapping(struct mm_struct *mm, efi_memory_desc_t *md) * from the MMU routines. So avoid block mappings altogether in * that case. */ - allow_block_mappings = false; + page_mappings_only = true; } create_pgd_mapping(mm, md->phys_addr, md->virt_addr, md->num_pages << EFI_PAGE_SHIFT, - __pgprot(prot_val | PTE_NG), allow_block_mappings); + __pgprot(prot_val | PTE_NG), page_mappings_only); return 0; } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 223d54a4d66b..4f0d76339414 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -29,7 +29,9 @@ #include <asm/esr.h> #include <asm/irq.h> #include <asm/memory.h> +#include <asm/ptrace.h> #include <asm/thread_info.h> +#include <asm/uaccess.h> #include <asm/unistd.h> /* @@ -90,9 +92,8 @@ .if \el == 0 mrs x21, sp_el0 - mov tsk, sp - and tsk, tsk, #~(THREAD_SIZE - 1) // Ensure MDSCR_EL1.SS is clear, - ldr x19, [tsk, #TI_FLAGS] // since we can unmask debug + ldr_this_cpu tsk, __entry_task, x20 // Ensure MDSCR_EL1.SS is clear, + ldr x19, [tsk, #TSK_TI_FLAGS] // since we can unmask debug disable_step_tsk x19, x20 // exceptions when scheduling. mov x29, xzr // fp pointed to user-space @@ -100,15 +101,41 @@ add x21, sp, #S_FRAME_SIZE get_thread_info tsk /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ - ldr x20, [tsk, #TI_ADDR_LIMIT] + ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] str x20, [sp, #S_ORIG_ADDR_LIMIT] mov x20, #TASK_SIZE_64 - str x20, [tsk, #TI_ADDR_LIMIT] + str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Set the TTBR0 PAN bit in SPSR. When the exception is taken from + * EL0, there is no need to check the state of TTBR0_EL1 since + * accesses are always enabled. 
+ * Note that the meaning of this bit differs from the ARMv8.1 PAN + * feature as all TTBR0_EL1 accesses are disabled, not just those to + * user mappings. + */ +alternative_if ARM64_HAS_PAN + b 1f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + mrs x21, ttbr0_el1 + tst x21, #0xffff << 48 // Check for the reserved ASID + orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR + b.eq 1f // TTBR0 access already disabled + and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR + .endif + + __uaccess_ttbr0_disable x21 +1: +#endif + stp x22, x23, [sp, #S_PC] /* @@ -139,7 +166,7 @@ .if \el != 0 /* Restore the task's original addr_limit. */ ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TI_ADDR_LIMIT] + str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif @@ -147,6 +174,40 @@ ldp x21, x22, [sp, #S_PC] // load ELR, SPSR .if \el == 0 ct_user_enter + .endif + +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Restore access to TTBR0_EL1. If returning to EL0, no need for SPSR + * PAN bit checking. + */ +alternative_if ARM64_HAS_PAN + b 2f // skip TTBR0 PAN +alternative_else_nop_endif + + .if \el != 0 + tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set + .endif + + __uaccess_ttbr0_enable x0 + + .if \el == 0 + /* + * Enable errata workarounds only if returning to user. The only + * workaround currently required for TTBR0_EL1 changes are for the + * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache + * corruption). 
+ */ + post_ttbr0_update_workaround + .endif +1: + .if \el != 0 + and x22, x22, #~PSR_PAN_BIT // ARMv8.0 CPUs do not understand this bit + .endif +2: +#endif + + .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 #ifdef CONFIG_ARM64_ERRATUM_845719 @@ -162,6 +223,7 @@ alternative_if ARM64_WORKAROUND_845719 alternative_else_nop_endif #endif .endif + msr elr_el1, x21 // set up the return data msr spsr_el1, x22 ldp x0, x1, [sp, #16 * 0] @@ -184,23 +246,20 @@ alternative_else_nop_endif eret // return to kernel .endm - .macro get_thread_info, rd - mrs \rd, sp_el0 - .endm - .macro irq_stack_entry mov x19, sp // preserve the original sp /* - * Compare sp with the current thread_info, if the top - * ~(THREAD_SIZE - 1) bits match, we are on a task stack, and - * should switch to the irq stack. + * Compare sp with the base of the task stack. + * If the top ~(THREAD_SIZE - 1) bits match, we are on a task stack, + * and should switch to the irq stack. */ - and x25, x19, #~(THREAD_SIZE - 1) - cmp x25, tsk - b.ne 9998f + ldr x25, [tsk, TSK_STACK] + eor x25, x25, x19 + and x25, x25, #~(THREAD_SIZE - 1) + cbnz x25, 9998f - this_cpu_ptr irq_stack, x25, x26 + adr_this_cpu x25, irq_stack, x26 mov x26, #IRQ_STACK_START_SP add x26, x25, x26 @@ -427,9 +486,9 @@ el1_irq: irq_handler #ifdef CONFIG_PREEMPT - ldr w24, [tsk, #TI_PREEMPT] // get preempt count + ldr w24, [tsk, #TSK_TI_PREEMPT] // get preempt count cbnz w24, 1f // preempt count != 0 - ldr x0, [tsk, #TI_FLAGS] // get flags + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags tbz x0, #TIF_NEED_RESCHED, 1f // needs rescheduling? bl el1_preempt 1: @@ -444,7 +503,7 @@ ENDPROC(el1_irq) el1_preempt: mov x24, lr 1: bl preempt_schedule_irq // irq en/disable is done inside - ldr x0, [tsk, #TI_FLAGS] // get new tasks TI_FLAGS + ldr x0, [tsk, #TSK_TI_FLAGS] // get new tasks TI_FLAGS tbnz x0, #TIF_NEED_RESCHED, 1b // needs rescheduling? 
ret x24 #endif @@ -674,8 +733,7 @@ ENTRY(cpu_switch_to) ldp x29, x9, [x8], #16 ldr lr, [x8] mov sp, x9 - and x9, x9, #~(THREAD_SIZE - 1) - msr sp_el0, x9 + msr sp_el0, x1 ret ENDPROC(cpu_switch_to) @@ -686,7 +744,7 @@ ENDPROC(cpu_switch_to) ret_fast_syscall: disable_irq // disable interrupts str x0, [sp, #S_X0] // returned x0 - ldr x1, [tsk, #TI_FLAGS] // re-check for syscall tracing + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for syscall tracing and x2, x1, #_TIF_SYSCALL_WORK cbnz x2, ret_fast_syscall_trace and x2, x1, #_TIF_WORK_MASK @@ -706,14 +764,14 @@ work_pending: #ifdef CONFIG_TRACE_IRQFLAGS bl trace_hardirqs_on // enabled while in userspace #endif - ldr x1, [tsk, #TI_FLAGS] // re-check for single-step + ldr x1, [tsk, #TSK_TI_FLAGS] // re-check for single-step b finish_ret_to_user /* * "slow" syscall return path. */ ret_to_user: disable_irq // disable interrupts - ldr x1, [tsk, #TI_FLAGS] + ldr x1, [tsk, #TSK_TI_FLAGS] and x2, x1, #_TIF_WORK_MASK cbnz x2, work_pending finish_ret_to_user: @@ -746,7 +804,7 @@ el0_svc_naked: // compat entry point enable_dbg_and_irq ct_user_exit 1 - ldr x16, [tsk, #TI_FLAGS] // check for syscall hooks + ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace cmp scno, sc_nr // check upper syscall limit diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 394c61db5566..b883f1f75216 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -127,6 +127,8 @@ void do_fpsimd_exc(unsigned int esr, struct pt_regs *regs) void fpsimd_thread_switch(struct task_struct *next) { + if (!system_supports_fpsimd()) + return; /* * Save the current FPSIMD state to memory, but only if whatever is in * the registers is in fact the most recent userland FPSIMD state of @@ -157,6 +159,8 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { + if (!system_supports_fpsimd()) + return; memset(&current->thread.fpsimd_state, 0, sizeof(struct
fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); @@ -168,6 +172,8 @@ void fpsimd_flush_thread(void) */ void fpsimd_preserve_current_state(void) { + if (!system_supports_fpsimd()) + return; preempt_disable(); if (!test_thread_flag(TIF_FOREIGN_FPSTATE)) fpsimd_save_state(&current->thread.fpsimd_state); @@ -181,6 +187,8 @@ void fpsimd_preserve_current_state(void) */ void fpsimd_restore_current_state(void) { + if (!system_supports_fpsimd()) + return; preempt_disable(); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { struct fpsimd_state *st = &current->thread.fpsimd_state; @@ -199,6 +207,8 @@ void fpsimd_restore_current_state(void) */ void fpsimd_update_current_state(struct fpsimd_state *state) { + if (!system_supports_fpsimd()) + return; preempt_disable(); fpsimd_load_state(state); if (test_and_clear_thread_flag(TIF_FOREIGN_FPSTATE)) { @@ -228,6 +238,8 @@ static DEFINE_PER_CPU(struct fpsimd_partial_state, softirq_fpsimdstate); */ void kernel_neon_begin_partial(u32 num_regs) { + if (WARN_ON(!system_supports_fpsimd())) + return; if (in_interrupt()) { struct fpsimd_partial_state *s = this_cpu_ptr( in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); @@ -252,6 +264,8 @@ EXPORT_SYMBOL(kernel_neon_begin_partial); void kernel_neon_end(void) { + if (!system_supports_fpsimd()) + return; if (in_interrupt()) { struct fpsimd_partial_state *s = this_cpu_ptr( in_irq() ? &hardirq_fpsimdstate : &softirq_fpsimdstate); diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 332e33193ccf..4b1abac3485a 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -326,14 +326,14 @@ __create_page_tables: * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE bl __inval_cache_range /* * Clear the idmap and swapper page tables.
*/ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 @@ -412,7 +412,7 @@ __create_page_tables: * tables again to remove any speculatively loaded cache lines. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE dmb sy bl __inval_cache_range @@ -428,7 +428,8 @@ ENDPROC(__create_page_tables) __primary_switched: adrp x4, init_thread_union add sp, x4, #THREAD_SIZE - msr sp_el0, x4 // Save thread_info + adr_l x5, init_task + msr sp_el0, x5 // Save thread_info adr_l x8, vectors // load VBAR_EL1 with virtual msr vbar_el1, x8 // vector table address @@ -524,10 +525,21 @@ set_hcr: msr hcr_el2, x0 isb - /* Generic timers. */ + /* + * Allow Non-secure EL1 and EL0 to access physical timer and counter. + * This is not necessary for VHE, since the host kernel runs in EL2, + * and EL0 accesses are configured in the later stage of boot process. + * Note that when HCR_EL2.E2H == 1, CNTHCTL_EL2 has the same bit layout + * as CNTKCTL_EL1, and CNTKCTL_EL1 accessing instructions are redefined + * to access CNTHCTL_EL2. This allows the kernel designed to run at EL1 + * to transparently mess with the EL0 bits via CNTKCTL_EL1 access in + * EL2. 
+ */ + cbnz x2, 1f mrs x0, cnthctl_el2 orr x0, x0, #3 // Enable EL1 physical timers msr cnthctl_el2, x0 +1: msr cntvoff_el2, xzr // Clear virtual offset #ifdef CONFIG_ARM_GIC_V3 @@ -699,10 +711,10 @@ __secondary_switched: isb adr_l x0, secondary_data - ldr x0, [x0, #CPU_BOOT_STACK] // get secondary_data.stack - mov sp, x0 - and x0, x0, #~(THREAD_SIZE - 1) - msr sp_el0, x0 // save thread_info + ldr x1, [x0, #CPU_BOOT_STACK] // get secondary_data.stack + mov sp, x1 + ldr x2, [x0, #CPU_BOOT_TASK] + msr sp_el0, x2 mov x29, #0 b secondary_start_kernel ENDPROC(__secondary_switched) diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index 948b73148d56..1b3c747fedda 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -317,9 +317,21 @@ static int get_hbp_len(u8 hbp_len) case ARM_BREAKPOINT_LEN_2: len_in_bytes = 2; break; + case ARM_BREAKPOINT_LEN_3: + len_in_bytes = 3; + break; case ARM_BREAKPOINT_LEN_4: len_in_bytes = 4; break; + case ARM_BREAKPOINT_LEN_5: + len_in_bytes = 5; + break; + case ARM_BREAKPOINT_LEN_6: + len_in_bytes = 6; + break; + case ARM_BREAKPOINT_LEN_7: + len_in_bytes = 7; + break; case ARM_BREAKPOINT_LEN_8: len_in_bytes = 8; break; @@ -349,7 +361,7 @@ int arch_check_bp_in_kernelspace(struct perf_event *bp) * to generic breakpoint descriptions. 
*/ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, - int *gen_len, int *gen_type) + int *gen_len, int *gen_type, int *offset) { /* Type */ switch (ctrl.type) { @@ -369,17 +381,33 @@ int arch_bp_generic_fields(struct arch_hw_breakpoint_ctrl ctrl, return -EINVAL; } + if (!ctrl.len) + return -EINVAL; + *offset = __ffs(ctrl.len); + /* Len */ - switch (ctrl.len) { + switch (ctrl.len >> *offset) { case ARM_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; case ARM_BREAKPOINT_LEN_2: *gen_len = HW_BREAKPOINT_LEN_2; break; + case ARM_BREAKPOINT_LEN_3: + *gen_len = HW_BREAKPOINT_LEN_3; + break; case ARM_BREAKPOINT_LEN_4: *gen_len = HW_BREAKPOINT_LEN_4; break; + case ARM_BREAKPOINT_LEN_5: + *gen_len = HW_BREAKPOINT_LEN_5; + break; + case ARM_BREAKPOINT_LEN_6: + *gen_len = HW_BREAKPOINT_LEN_6; + break; + case ARM_BREAKPOINT_LEN_7: + *gen_len = HW_BREAKPOINT_LEN_7; + break; case ARM_BREAKPOINT_LEN_8: *gen_len = HW_BREAKPOINT_LEN_8; break; @@ -423,9 +451,21 @@ static int arch_build_bp_info(struct perf_event *bp) case HW_BREAKPOINT_LEN_2: info->ctrl.len = ARM_BREAKPOINT_LEN_2; break; + case HW_BREAKPOINT_LEN_3: + info->ctrl.len = ARM_BREAKPOINT_LEN_3; + break; case HW_BREAKPOINT_LEN_4: info->ctrl.len = ARM_BREAKPOINT_LEN_4; break; + case HW_BREAKPOINT_LEN_5: + info->ctrl.len = ARM_BREAKPOINT_LEN_5; + break; + case HW_BREAKPOINT_LEN_6: + info->ctrl.len = ARM_BREAKPOINT_LEN_6; + break; + case HW_BREAKPOINT_LEN_7: + info->ctrl.len = ARM_BREAKPOINT_LEN_7; + break; case HW_BREAKPOINT_LEN_8: info->ctrl.len = ARM_BREAKPOINT_LEN_8; break; @@ -517,18 +557,17 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp) default: return -EINVAL; } - - info->address &= ~alignment_mask; - info->ctrl.len <<= offset; } else { if (info->ctrl.type == ARM_BREAKPOINT_EXECUTE) alignment_mask = 0x3; else alignment_mask = 0x7; - if (info->address & alignment_mask) - return -EINVAL; + offset = info->address & alignment_mask; } + info->address &= ~alignment_mask; + info->ctrl.len 
<<= offset; + /* * Disallow per-task kernel breakpoints since these would * complicate the stepping code. @@ -661,12 +700,47 @@ unlock: } NOKPROBE_SYMBOL(breakpoint_handler); +/* + * Arm64 hardware does not always report a watchpoint hit address that matches + * one of the watchpoints set. It can also report an address "near" the + * watchpoint if a single instruction accesses both watched and unwatched + * addresses. There is no straight-forward way, short of disassembling the + * offending instruction, to map that address back to the watchpoint. This + * function computes the distance of the memory access from the watchpoint as a + * heuristic for the likelihood that a given access triggered the watchpoint. + * + * See Section D2.10.5 "Determining the memory location that caused a Watchpoint + * exception" of ARMv8 Architecture Reference Manual for details. + * + * The function returns the distance of the address from the bytes watched by + * the watchpoint. In case of an exact match, it returns 0. + */ +static u64 get_distance_from_watchpoint(unsigned long addr, u64 val, + struct arch_hw_breakpoint_ctrl *ctrl) +{ + u64 wp_low, wp_high; + u32 lens, lene; + + lens = __ffs(ctrl->len); + lene = __fls(ctrl->len); + + wp_low = val + lens; + wp_high = val + lene; + if (addr < wp_low) + return wp_low - addr; + else if (addr > wp_high) + return addr - wp_high; + else + return 0; +} + static int watchpoint_handler(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - int i, step = 0, *kernel_step, access; + int i, step = 0, *kernel_step, access, closest_match = 0; + u64 min_dist = -1, dist; u32 ctrl_reg; - u64 val, alignment_mask; + u64 val; struct perf_event *wp, **slots; struct debug_info *debug_info; struct arch_hw_breakpoint *info; @@ -675,35 +749,15 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, slots = this_cpu_ptr(wp_on_reg); debug_info = &current->thread.debug; + /* + * Find all watchpoints that match the reported address.
If no exact + * match is found, attribute the hit to the closest watchpoint. + */ + rcu_read_lock(); for (i = 0; i < core_num_wrps; ++i) { - rcu_read_lock(); - wp = slots[i]; - if (wp == NULL) - goto unlock; - - info = counter_arch_bp(wp); - /* AArch32 watchpoints are either 4 or 8 bytes aligned. */ - if (is_compat_task()) { - if (info->ctrl.len == ARM_BREAKPOINT_LEN_8) - alignment_mask = 0x7; - else - alignment_mask = 0x3; - } else { - alignment_mask = 0x7; - } - - /* Check if the watchpoint value matches. */ - val = read_wb_reg(AARCH64_DBG_REG_WVR, i); - if (val != (addr & ~alignment_mask)) - goto unlock; - - /* Possible match, check the byte address select to confirm. */ - ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); - decode_ctrl_reg(ctrl_reg, &ctrl); - if (!((1 << (addr & alignment_mask)) & ctrl.len)) - goto unlock; + continue; /* * Check that the access type matches. @@ -712,18 +766,41 @@ static int watchpoint_handler(unsigned long addr, unsigned int esr, access = (esr & AARCH64_ESR_ACCESS_MASK) ? HW_BREAKPOINT_W : HW_BREAKPOINT_R; if (!(access & hw_breakpoint_type(wp))) - goto unlock; + continue; + /* Check if the watchpoint value and byte select match. */ + val = read_wb_reg(AARCH64_DBG_REG_WVR, i); + ctrl_reg = read_wb_reg(AARCH64_DBG_REG_WCR, i); + decode_ctrl_reg(ctrl_reg, &ctrl); + dist = get_distance_from_watchpoint(addr, val, &ctrl); + if (dist < min_dist) { + min_dist = dist; + closest_match = i; + } + /* Is this an exact match? */ + if (dist != 0) + continue; + + info = counter_arch_bp(wp); info->trigger = addr; perf_bp_event(wp, regs); /* Do we need to handle the stepping? */ if (is_default_overflow_handler(wp)) step = 1; + } + if (min_dist > 0 && min_dist != -1) { + /* No exact match found. */ + wp = slots[closest_match]; + info = counter_arch_bp(wp); + info->trigger = addr; + perf_bp_event(wp, regs); -unlock: - rcu_read_unlock(); + /* Do we need to handle the stepping?
*/ + if (is_default_overflow_handler(wp)) + step = 1; } + rcu_read_unlock(); if (!step) return 0; diff --git a/arch/arm64/kernel/insn.c b/arch/arm64/kernel/insn.c index 6f2ac4fc66ca..94b62c1fa4df 100644 --- a/arch/arm64/kernel/insn.c +++ b/arch/arm64/kernel/insn.c @@ -30,7 +30,6 @@ #include <asm/cacheflush.h> #include <asm/debug-monitors.h> #include <asm/fixmap.h> -#include <asm/opcodes.h> #include <asm/insn.h> #define AARCH64_INSN_SF_BIT BIT(31) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index e017a9493b92..d217c9e95b06 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -247,6 +247,9 @@ NOKPROBE_SYMBOL(kgdb_compiled_brk_fn); static int kgdb_step_brk_fn(struct pt_regs *regs, unsigned int esr) { + if (!kgdb_single_step) + return DBG_HOOK_ERROR; + kgdb_handle_exception(1, SIGTRAP, 0, regs); return 0; } diff --git a/arch/arm64/kernel/probes/Makefile b/arch/arm64/kernel/probes/Makefile index ce06312e3d34..89b6df613dde 100644 --- a/arch/arm64/kernel/probes/Makefile +++ b/arch/arm64/kernel/probes/Makefile @@ -1,3 +1,5 @@ obj-$(CONFIG_KPROBES) += kprobes.o decode-insn.o \ kprobes_trampoline.o \ simulate-insn.o +obj-$(CONFIG_UPROBES) += uprobes.o decode-insn.o \ + simulate-insn.o diff --git a/arch/arm64/kernel/probes/decode-insn.c b/arch/arm64/kernel/probes/decode-insn.c index d1731bf977ef..6bf6657a5a52 100644 --- a/arch/arm64/kernel/probes/decode-insn.c +++ b/arch/arm64/kernel/probes/decode-insn.c @@ -17,7 +17,6 @@ #include <linux/kprobes.h> #include <linux/module.h> #include <linux/kallsyms.h> -#include <asm/kprobes.h> #include <asm/insn.h> #include <asm/sections.h> @@ -78,8 +77,8 @@ static bool __kprobes aarch64_insn_is_steppable(u32 insn) * INSN_GOOD If instruction is supported and uses instruction slot, * INSN_GOOD_NO_SLOT If instruction is supported but doesn't use its slot. 
*/ -static enum kprobe_insn __kprobes -arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *api) { /* * Instructions reading or modifying the PC won't work from the XOL @@ -89,26 +88,26 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD; if (aarch64_insn_is_bcond(insn)) { - asi->handler = simulate_b_cond; + api->handler = simulate_b_cond; } else if (aarch64_insn_is_cbz(insn) || aarch64_insn_is_cbnz(insn)) { - asi->handler = simulate_cbz_cbnz; + api->handler = simulate_cbz_cbnz; } else if (aarch64_insn_is_tbz(insn) || aarch64_insn_is_tbnz(insn)) { - asi->handler = simulate_tbz_tbnz; + api->handler = simulate_tbz_tbnz; } else if (aarch64_insn_is_adr_adrp(insn)) { - asi->handler = simulate_adr_adrp; + api->handler = simulate_adr_adrp; } else if (aarch64_insn_is_b(insn) || aarch64_insn_is_bl(insn)) { - asi->handler = simulate_b_bl; + api->handler = simulate_b_bl; } else if (aarch64_insn_is_br(insn) || aarch64_insn_is_blr(insn) || aarch64_insn_is_ret(insn)) { - asi->handler = simulate_br_blr_ret; + api->handler = simulate_br_blr_ret; } else if (aarch64_insn_is_ldr_lit(insn)) { - asi->handler = simulate_ldr_literal; + api->handler = simulate_ldr_literal; } else if (aarch64_insn_is_ldrsw_lit(insn)) { - asi->handler = simulate_ldrsw_literal; + api->handler = simulate_ldrsw_literal; } else { /* * Instruction cannot be stepped out-of-line and we don't @@ -120,6 +119,7 @@ arm_probe_decode_insn(kprobe_opcode_t insn, struct arch_specific_insn *asi) return INSN_GOOD_NO_SLOT; } +#ifdef CONFIG_KPROBES static bool __kprobes is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) { @@ -138,12 +138,12 @@ is_probed_address_atomic(kprobe_opcode_t *scan_start, kprobe_opcode_t *scan_end) return false; } -enum kprobe_insn __kprobes +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, 
struct arch_specific_insn *asi) { - enum kprobe_insn decoded; - kprobe_opcode_t insn = le32_to_cpu(*addr); - kprobe_opcode_t *scan_end = NULL; + enum probe_insn decoded; + probe_opcode_t insn = le32_to_cpu(*addr); + probe_opcode_t *scan_end = NULL; unsigned long size = 0, offset = 0; /* @@ -162,7 +162,7 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) else scan_end = addr - MAX_ATOMIC_CONTEXT_SIZE; } - decoded = arm_probe_decode_insn(insn, asi); + decoded = arm_probe_decode_insn(insn, &asi->api); if (decoded != INSN_REJECTED && scan_end) if (is_probed_address_atomic(addr - 1, scan_end)) @@ -170,3 +170,4 @@ arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi) return decoded; } +#endif diff --git a/arch/arm64/kernel/probes/decode-insn.h b/arch/arm64/kernel/probes/decode-insn.h index d438289646a6..76d3f315407f 100644 --- a/arch/arm64/kernel/probes/decode-insn.h +++ b/arch/arm64/kernel/probes/decode-insn.h @@ -23,13 +23,17 @@ */ #define MAX_ATOMIC_CONTEXT_SIZE (128 / sizeof(kprobe_opcode_t)) -enum kprobe_insn { +enum probe_insn { INSN_REJECTED, INSN_GOOD_NO_SLOT, INSN_GOOD, }; -enum kprobe_insn __kprobes +#ifdef CONFIG_KPROBES +enum probe_insn __kprobes arm_kprobe_decode_insn(kprobe_opcode_t *addr, struct arch_specific_insn *asi); +#endif +enum probe_insn __kprobes +arm_probe_decode_insn(probe_opcode_t insn, struct arch_probe_insn *asi); #endif /* _ARM_KERNEL_KPROBES_ARM64_H */ diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f5077ea7af6d..1decd2b2c730 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -44,31 +44,31 @@ post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { /* prepare insn slot */ - p->ainsn.insn[0] = cpu_to_le32(p->opcode); + p->ainsn.api.insn[0] = cpu_to_le32(p->opcode); - flush_icache_range((uintptr_t) (p->ainsn.insn), - (uintptr_t) 
(p->ainsn.insn) + + flush_icache_range((uintptr_t) (p->ainsn.api.insn), + (uintptr_t) (p->ainsn.api.insn) + MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); /* * Needs restoring of return address after stepping xol. */ - p->ainsn.restore = (unsigned long) p->addr + + p->ainsn.api.restore = (unsigned long) p->addr + sizeof(kprobe_opcode_t); } static void __kprobes arch_prepare_simulate(struct kprobe *p) { /* This instructions is not executed xol. No need to adjust the PC */ - p->ainsn.restore = 0; + p->ainsn.api.restore = 0; } static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - if (p->ainsn.handler) - p->ainsn.handler((u32)p->opcode, (long)p->addr, regs); + if (p->ainsn.api.handler) + p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ post_kprobe_handler(kcb, regs); @@ -98,18 +98,18 @@ int __kprobes arch_prepare_kprobe(struct kprobe *p) return -EINVAL; case INSN_GOOD_NO_SLOT: /* insn need simulation */ - p->ainsn.insn = NULL; + p->ainsn.api.insn = NULL; break; case INSN_GOOD: /* instruction uses slot */ - p->ainsn.insn = get_insn_slot(); - if (!p->ainsn.insn) + p->ainsn.api.insn = get_insn_slot(); + if (!p->ainsn.api.insn) return -ENOMEM; break; }; /* prepare the instruction */ - if (p->ainsn.insn) + if (p->ainsn.api.insn) arch_prepare_ss_slot(p); else arch_prepare_simulate(p); @@ -142,9 +142,9 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { - if (p->ainsn.insn) { - free_insn_slot(p->ainsn.insn, 0); - p->ainsn.insn = NULL; + if (p->ainsn.api.insn) { + free_insn_slot(p->ainsn.api.insn, 0); + p->ainsn.api.insn = NULL; } } @@ -244,9 +244,9 @@ static void __kprobes setup_singlestep(struct kprobe *p, } - if (p->ainsn.insn) { + if (p->ainsn.api.insn) { /* prepare for single stepping */ - slot = (unsigned long)p->ainsn.insn; + slot = (unsigned long)p->ainsn.api.insn; 
set_ss_context(kcb, slot); /* mark pending ss */ @@ -295,8 +295,8 @@ post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) return; /* return addr restore if non-branching insn */ - if (cur->ainsn.restore != 0) - instruction_pointer_set(regs, cur->ainsn.restore); + if (cur->ainsn.api.restore != 0) + instruction_pointer_set(regs, cur->ainsn.api.restore); /* restore back original saved kprobe variables and continue */ if (kcb->kprobe_status == KPROBE_REENTER) { diff --git a/arch/arm64/kernel/probes/simulate-insn.c b/arch/arm64/kernel/probes/simulate-insn.c index 8977ce9d009d..357d3efe1366 100644 --- a/arch/arm64/kernel/probes/simulate-insn.c +++ b/arch/arm64/kernel/probes/simulate-insn.c @@ -13,28 +13,26 @@ * General Public License for more details. */ +#include <linux/bitops.h> #include <linux/kernel.h> #include <linux/kprobes.h> #include "simulate-insn.h" -#define sign_extend(x, signbit) \ - ((x) | (0 - ((x) & (1 << (signbit))))) - #define bbl_displacement(insn) \ - sign_extend(((insn) & 0x3ffffff) << 2, 27) + sign_extend32(((insn) & 0x3ffffff) << 2, 27) #define bcond_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) #define cbz_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) #define tbz_displacement(insn) \ - sign_extend(((insn >> 5) & 0x3fff) << 2, 15) + sign_extend32(((insn >> 5) & 0x3fff) << 2, 15) #define ldr_displacement(insn) \ - sign_extend(((insn >> 5) & 0x7ffff) << 2, 20) + sign_extend32(((insn >> 5) & 0x7ffff) << 2, 20) static inline void set_x_reg(struct pt_regs *regs, int reg, u64 val) { @@ -106,7 +104,7 @@ simulate_adr_adrp(u32 opcode, long addr, struct pt_regs *regs) xn = opcode & 0x1f; imm = ((opcode >> 3) & 0x1ffffc) | ((opcode >> 29) & 0x3); - imm = sign_extend(imm, 20); + imm = sign_extend64(imm, 20); if (opcode & 0x80000000) val = (imm<<12) + (addr & 0xfffffffffffff000); else diff --git 
a/arch/arm64/kernel/probes/uprobes.c b/arch/arm64/kernel/probes/uprobes.c new file mode 100644 index 000000000000..26c998534dca --- /dev/null +++ b/arch/arm64/kernel/probes/uprobes.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2014-2016 Pratyush Anand <panand@redhat.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/uprobes.h> +#include <asm/cacheflush.h> + +#include "decode-insn.h" + +#define UPROBE_INV_FAULT_CODE UINT_MAX + +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + void *xol_page_kaddr = kmap_atomic(page); + void *dst = xol_page_kaddr + (vaddr & ~PAGE_MASK); + + /* Initialize the slot */ + memcpy(dst, src, len); + + /* flush caches (dcache/icache) */ + sync_icache_aliases(dst, len); + + kunmap_atomic(xol_page_kaddr); +} + +unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, + unsigned long addr) +{ + probe_opcode_t insn; + + /* TODO: Currently we do not support AARCH32 instruction probing */ + if (test_bit(TIF_32BIT, &mm->context.flags)) + return -ENOTSUPP; + else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) + return -EINVAL; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + + switch (arm_probe_decode_insn(insn, &auprobe->api)) { + case INSN_REJECTED: + return -EINVAL; + + case INSN_GOOD_NO_SLOT: + auprobe->simulate = true; + break; + + default: + break; + } + + return 0; +} + +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* Initialize with an invalid fault code to detect if ol insn trapped */ + current->thread.fault_code = UPROBE_INV_FAULT_CODE; + + /* Instruction points to 
execute ol */ + instruction_pointer_set(regs, utask->xol_vaddr); + + user_enable_single_step(current); + + return 0; +} + +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + WARN_ON_ONCE(current->thread.fault_code != UPROBE_INV_FAULT_CODE); + + /* Instruction points to execute next to breakpoint address */ + instruction_pointer_set(regs, utask->vaddr + 4); + + user_disable_single_step(current); + + return 0; +} +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + /* + * Between arch_uprobe_pre_xol and arch_uprobe_post_xol, if an xol + * insn itself is trapped, then detect the case with the help of + * invalid fault code which is being set in arch_uprobe_pre_xol + */ + if (t->thread.fault_code != UPROBE_INV_FAULT_CODE) + return true; + + return false; +} + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + probe_opcode_t insn; + unsigned long addr; + + if (!auprobe->simulate) + return false; + + insn = *(probe_opcode_t *)(&auprobe->insn[0]); + addr = instruction_pointer(regs); + + if (auprobe->api.handler) + auprobe->api.handler(insn, addr, regs); + + return true; +} + +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + /* + * Task has received a fatal signal, so reset back to probbed + * address. + */ + instruction_pointer_set(regs, utask->vaddr); + + user_disable_single_step(current); +} + +bool arch_uretprobe_is_alive(struct return_instance *ret, enum rp_check ctx, + struct pt_regs *regs) +{ + /* + * If a simple branch instruction (B) was called for retprobed + * assembly label then return true even when regs->sp and ret->stack + * are same. It will ensure that cleanup and reporting of return + * instances corresponding to callee label is done when + * handle_trampoline for called function is executed. 
+ */ + if (ctx == RP_CHECK_CHAIN_CALL) + return regs->sp <= ret->stack; + else + return regs->sp < ret->stack; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, + struct pt_regs *regs) +{ + unsigned long orig_ret_vaddr; + + orig_ret_vaddr = procedure_link_pointer(regs); + /* Replace the return addr with trampoline addr */ + procedure_link_pointer_set(regs, trampoline_vaddr); + + return orig_ret_vaddr; +} + +int arch_uprobe_exception_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + +static int uprobe_breakpoint_handler(struct pt_regs *regs, + unsigned int esr) +{ + if (user_mode(regs) && uprobe_pre_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; + + return DBG_HOOK_ERROR; +} + +static int uprobe_single_step_handler(struct pt_regs *regs, + unsigned int esr) +{ + struct uprobe_task *utask = current->utask; + + if (user_mode(regs)) { + WARN_ON(utask && + (instruction_pointer(regs) != utask->xol_vaddr + 4)); + + if (uprobe_post_sstep_notifier(regs)) + return DBG_HOOK_HANDLED; + } + + return DBG_HOOK_ERROR; +} + +/* uprobe breakpoint handler hook */ +static struct break_hook uprobes_break_hook = { + .esr_mask = BRK64_ESR_MASK, + .esr_val = BRK64_ESR_UPROBES, + .fn = uprobe_breakpoint_handler, +}; + +/* uprobe single step handler hook */ +static struct step_hook uprobes_step_hook = { + .fn = uprobe_single_step_handler, +}; + +static int __init arch_init_uprobes(void) +{ + register_break_hook(&uprobes_break_hook); + register_step_hook(&uprobes_step_hook); + + return 0; +} + +device_initcall(arch_init_uprobes); diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 01753cd7d3f0..a3a2816ba73a 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -45,6 +45,7 @@ #include <linux/personality.h> #include <linux/notifier.h> #include <trace/events/power.h> +#include <linux/percpu.h> #include <asm/alternative.h> #include <asm/compat.h> @@ -282,7 
+283,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, memset(childregs, 0, sizeof(struct pt_regs)); childregs->pstate = PSR_MODE_EL1h; if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_cap(ARM64_HAS_UAO)) + cpus_have_const_cap(ARM64_HAS_UAO)) childregs->pstate |= PSR_UAO_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; @@ -322,6 +323,20 @@ void uao_thread_switch(struct task_struct *next) } /* + * We store our current task in sp_el0, which is clobbered by userspace. Keep a + * shadow copy so that we can restore this upon entry from userspace. + * + * This is *only* for exception entry from EL0, and is not valid until we + * __switch_to() a user task. + */ +DEFINE_PER_CPU(struct task_struct *, __entry_task); + +static void entry_task_switch(struct task_struct *next) +{ + __this_cpu_write(__entry_task, next); +} + +/* * Thread switching. */ struct task_struct *__switch_to(struct task_struct *prev, @@ -333,6 +348,7 @@ struct task_struct *__switch_to(struct task_struct *prev, tls_thread_switch(next); hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); + entry_task_switch(next); uao_thread_switch(next); /* @@ -350,27 +366,35 @@ struct task_struct *__switch_to(struct task_struct *prev, unsigned long get_wchan(struct task_struct *p) { struct stackframe frame; - unsigned long stack_page; + unsigned long stack_page, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + stack_page = (unsigned long)try_get_task_stack(p); + if (!stack_page) + return 0; + frame.fp = thread_saved_fp(p); frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif - stack_page = (unsigned long)task_stack_page(p); do { if (frame.sp < stack_page || frame.sp >= stack_page + THREAD_SIZE || unwind_frame(p, &frame)) - return 0; - if (!in_sched_functions(frame.pc)) - return frame.pc; + goto out; + if 
(!in_sched_functions(frame.pc)) { + ret = frame.pc; + goto out; + } } while (count ++ < 16); - return 0; + +out: + put_task_stack(p); + return ret; } unsigned long arch_align_stack(unsigned long sp) diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index e0c81da60f76..fc35e06ccaac 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -327,13 +327,13 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, struct arch_hw_breakpoint_ctrl ctrl, struct perf_event_attr *attr) { - int err, len, type, disabled = !ctrl.enabled; + int err, len, type, offset, disabled = !ctrl.enabled; attr->disabled = disabled; if (disabled) return 0; - err = arch_bp_generic_fields(ctrl, &len, &type); + err = arch_bp_generic_fields(ctrl, &len, &type, &offset); if (err) return err; @@ -352,6 +352,7 @@ static int ptrace_hbp_fill_attr_ctrl(unsigned int note_type, attr->bp_len = len; attr->bp_type = type; + attr->bp_addr += offset; return 0; } @@ -404,7 +405,7 @@ static int ptrace_hbp_get_addr(unsigned int note_type, if (IS_ERR(bp)) return PTR_ERR(bp); - *addr = bp ? bp->attr.bp_addr : 0; + *addr = bp ? 
counter_arch_bp(bp)->address : 0; return 0; } diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c index 1718706fde83..12a87f2600f2 100644 --- a/arch/arm64/kernel/return_address.c +++ b/arch/arm64/kernel/return_address.c @@ -12,6 +12,7 @@ #include <linux/export.h> #include <linux/ftrace.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> struct return_address_data { diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index f534f492a268..a53f52ac81c6 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -291,6 +291,15 @@ void __init setup_arch(char **cmdline_p) smp_init_cpus(); smp_build_mpidr_hash(); +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + /* + * Make sure init_thread_info.ttbr0 always generates translation + * faults in case uaccess_enable() is inadvertently called by the init + * thread. + */ + init_task.thread_info.ttbr0 = virt_to_phys(empty_zero_page); +#endif + #ifdef CONFIG_VT #if defined(CONFIG_VGA_CONSOLE) conswitchp = &vga_con; diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index 404dd67080b9..c7b6de62f9d3 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -414,6 +414,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, } else { local_irq_enable(); + if (thread_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + if (thread_flags & _TIF_SIGPENDING) do_signal(regs); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index 1bec41b5fda3..df67652e46f0 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -125,9 +125,6 @@ ENTRY(_cpu_resume) /* load sp from context */ ldr x2, [x0, #CPU_CTX_SP] mov sp, x2 - /* save thread_info */ - and x2, x2, #~(THREAD_SIZE - 1) - msr sp_el0, x2 /* * cpu_do_resume expects x0 to contain context address pointer */ diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 8507703dabe4..cb87234cfcf2 100644 --- a/arch/arm64/kernel/smp.c +++ 
b/arch/arm64/kernel/smp.c @@ -58,6 +58,9 @@ #define CREATE_TRACE_POINTS #include <trace/events/ipi.h> +DEFINE_PER_CPU_READ_MOSTLY(int, cpu_number); +EXPORT_PER_CPU_SYMBOL(cpu_number); + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -146,6 +149,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) * We need to tell the secondary core where to find its stack and the * page tables. */ + secondary_data.task = idle; secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); @@ -170,6 +174,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) pr_err("CPU%u: failed to boot: %d\n", cpu, ret); } + secondary_data.task = NULL; secondary_data.stack = NULL; status = READ_ONCE(secondary_data.status); if (ret && status) { @@ -208,7 +213,10 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle) asmlinkage void secondary_start_kernel(void) { struct mm_struct *mm = &init_mm; - unsigned int cpu = smp_processor_id(); + unsigned int cpu; + + cpu = task_cpu(current); + set_my_cpu_offset(per_cpu_offset(cpu)); /* * All kernel threads share the same mm context; grab a @@ -217,8 +225,6 @@ asmlinkage void secondary_start_kernel(void) atomic_inc(&mm->mm_count); current->active_mm = mm; - set_my_cpu_offset(per_cpu_offset(smp_processor_id())); - /* * TTBR0 is only used for the identity mapping at this stage. Make it * point to zero page to avoid speculatively fetching new entries. 
@@ -718,6 +724,8 @@ void __init smp_prepare_cpus(unsigned int max_cpus) */ for_each_possible_cpu(cpu) { + per_cpu(cpu_number, cpu) = cpu; + if (cpu == smp_processor_id()) continue; diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c index c2efddfca18c..8a552a33c6ef 100644 --- a/arch/arm64/kernel/stacktrace.c +++ b/arch/arm64/kernel/stacktrace.c @@ -22,6 +22,7 @@ #include <linux/stacktrace.h> #include <asm/irq.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> /* @@ -128,7 +129,6 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame, break; } } -EXPORT_SYMBOL(walk_stackframe); #ifdef CONFIG_STACKTRACE struct stack_trace_data { @@ -181,6 +181,9 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) struct stack_trace_data data; struct stackframe frame; + if (!try_get_task_stack(tsk)) + return; + data.trace = trace; data.skip = trace->skip; @@ -202,6 +205,8 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) walk_stackframe(tsk, &frame, save_trace, &data); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } void save_stack_trace(struct stack_trace *trace) diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index bb0cd787a9d3..1e3be9064cfa 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -47,12 +47,6 @@ void notrace __cpu_suspend_exit(void) cpu_uninstall_idmap(); /* - * Restore per-cpu offset before any kernel - * subsystem relying on it has a chance to run. - */ - set_my_cpu_offset(per_cpu_offset(cpu)); - - /* * PSTATE was not saved over suspend/resume, re-enable any detected * features that might not have been set correctly. 
*/ diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 694f6deedbab..23e9e13bd2aa 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -19,10 +19,226 @@ #include <linux/nodemask.h> #include <linux/of.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/cpufreq.h> +#include <asm/cpu.h> #include <asm/cputype.h> #include <asm/topology.h> +static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; +static DEFINE_MUTEX(cpu_scale_mutex); + +unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) +{ + return per_cpu(cpu_scale, cpu); +} + +static void set_capacity_scale(unsigned int cpu, unsigned long capacity) +{ + per_cpu(cpu_scale, cpu) = capacity; +} + +#ifdef CONFIG_PROC_SYSCTL +static ssize_t cpu_capacity_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + + return sprintf(buf, "%lu\n", + arch_scale_cpu_capacity(NULL, cpu->dev.id)); +} + +static ssize_t cpu_capacity_store(struct device *dev, + struct device_attribute *attr, + const char *buf, + size_t count) +{ + struct cpu *cpu = container_of(dev, struct cpu, dev); + int this_cpu = cpu->dev.id, i; + unsigned long new_capacity; + ssize_t ret; + + if (count) { + ret = kstrtoul(buf, 0, &new_capacity); + if (ret) + return ret; + if (new_capacity > SCHED_CAPACITY_SCALE) + return -EINVAL; + + mutex_lock(&cpu_scale_mutex); + for_each_cpu(i, &cpu_topology[this_cpu].core_sibling) + set_capacity_scale(i, new_capacity); + mutex_unlock(&cpu_scale_mutex); + } + + return count; +} + +static DEVICE_ATTR_RW(cpu_capacity); + +static int register_cpu_capacity_sysctl(void) +{ + int i; + struct device *cpu; + + for_each_possible_cpu(i) { + cpu = get_cpu_device(i); + if (!cpu) { + pr_err("%s: too early to get CPU%d device!\n", + __func__, i); + continue; + } + device_create_file(cpu, &dev_attr_cpu_capacity); + } + + return 0; +} 
+subsys_initcall(register_cpu_capacity_sysctl); +#endif + +static u32 capacity_scale; +static u32 *raw_capacity; +static bool cap_parsing_failed; + +static void __init parse_cpu_capacity(struct device_node *cpu_node, int cpu) +{ + int ret; + u32 cpu_capacity; + + if (cap_parsing_failed) + return; + + ret = of_property_read_u32(cpu_node, + "capacity-dmips-mhz", + &cpu_capacity); + if (!ret) { + if (!raw_capacity) { + raw_capacity = kcalloc(num_possible_cpus(), + sizeof(*raw_capacity), + GFP_KERNEL); + if (!raw_capacity) { + pr_err("cpu_capacity: failed to allocate memory for raw capacities\n"); + cap_parsing_failed = true; + return; + } + } + capacity_scale = max(cpu_capacity, capacity_scale); + raw_capacity[cpu] = cpu_capacity; + pr_debug("cpu_capacity: %s cpu_capacity=%u (raw)\n", + cpu_node->full_name, raw_capacity[cpu]); + } else { + if (raw_capacity) { + pr_err("cpu_capacity: missing %s raw capacity\n", + cpu_node->full_name); + pr_err("cpu_capacity: partial information: fallback to 1024 for all CPUs\n"); + } + cap_parsing_failed = true; + kfree(raw_capacity); + } +} + +static void normalize_cpu_capacity(void) +{ + u64 capacity; + int cpu; + + if (!raw_capacity || cap_parsing_failed) + return; + + pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale); + mutex_lock(&cpu_scale_mutex); + for_each_possible_cpu(cpu) { + pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n", + cpu, raw_capacity[cpu]); + capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT) + / capacity_scale; + set_capacity_scale(cpu, capacity); + pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n", + cpu, arch_scale_cpu_capacity(NULL, cpu)); + } + mutex_unlock(&cpu_scale_mutex); +} + +#ifdef CONFIG_CPU_FREQ +static cpumask_var_t cpus_to_visit; +static bool cap_parsing_done; +static void parsing_done_workfn(struct work_struct *work); +static DECLARE_WORK(parsing_done_work, parsing_done_workfn); + +static int +init_cpu_capacity_callback(struct notifier_block *nb, + unsigned long val, + void *data) 
+{ + struct cpufreq_policy *policy = data; + int cpu; + + if (cap_parsing_failed || cap_parsing_done) + return 0; + + switch (val) { + case CPUFREQ_NOTIFY: + pr_debug("cpu_capacity: init cpu capacity for CPUs [%*pbl] (to_visit=%*pbl)\n", + cpumask_pr_args(policy->related_cpus), + cpumask_pr_args(cpus_to_visit)); + cpumask_andnot(cpus_to_visit, + cpus_to_visit, + policy->related_cpus); + for_each_cpu(cpu, policy->related_cpus) { + raw_capacity[cpu] = arch_scale_cpu_capacity(NULL, cpu) * + policy->cpuinfo.max_freq / 1000UL; + capacity_scale = max(raw_capacity[cpu], capacity_scale); + } + if (cpumask_empty(cpus_to_visit)) { + normalize_cpu_capacity(); + kfree(raw_capacity); + pr_debug("cpu_capacity: parsing done\n"); + cap_parsing_done = true; + schedule_work(&parsing_done_work); + } + } + return 0; +} + +static struct notifier_block init_cpu_capacity_notifier = { + .notifier_call = init_cpu_capacity_callback, +}; + +static int __init register_cpufreq_notifier(void) +{ + if (cap_parsing_failed) + return -EINVAL; + + if (!alloc_cpumask_var(&cpus_to_visit, GFP_KERNEL)) { + pr_err("cpu_capacity: failed to allocate memory for cpus_to_visit\n"); + return -ENOMEM; + } + cpumask_copy(cpus_to_visit, cpu_possible_mask); + + return cpufreq_register_notifier(&init_cpu_capacity_notifier, + CPUFREQ_POLICY_NOTIFIER); +} +core_initcall(register_cpufreq_notifier); + +static void parsing_done_workfn(struct work_struct *work) +{ + cpufreq_unregister_notifier(&init_cpu_capacity_notifier, + CPUFREQ_POLICY_NOTIFIER); +} + +#else +static int __init free_raw_capacity(void) +{ + kfree(raw_capacity); + + return 0; +} +core_initcall(free_raw_capacity); +#endif + static int __init get_cpu_for_node(struct device_node *node) { struct device_node *cpu_node; @@ -34,6 +250,7 @@ static int __init get_cpu_for_node(struct device_node *node) for_each_possible_cpu(cpu) { if (of_get_cpu_node(cpu, NULL) == cpu_node) { + parse_cpu_capacity(cpu_node, cpu); of_node_put(cpu_node); return cpu; } @@ -178,13 
+395,17 @@ static int __init parse_dt_topology(void) * cluster with restricted subnodes. */ map = of_get_child_by_name(cn, "cpu-map"); - if (!map) + if (!map) { + cap_parsing_failed = true; goto out; + } ret = parse_cluster(map, 0); if (ret != 0) goto out_map; + normalize_cpu_capacity(); + /* * Check that all cores are in the topology; the SMP code will * only mark cores described in the DT as possible. diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index c9986b3e0a96..5b830be79c01 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -38,6 +38,7 @@ #include <asm/esr.h> #include <asm/insn.h> #include <asm/traps.h> +#include <asm/stack_pointer.h> #include <asm/stacktrace.h> #include <asm/exception.h> #include <asm/system_misc.h> @@ -147,6 +148,9 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) if (!tsk) tsk = current; + if (!try_get_task_stack(tsk)) + return; + /* * Switching between stacks is valid when tracing current and in * non-preemptible context. 
@@ -212,6 +216,8 @@ static void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) stack + sizeof(struct pt_regs)); } } + + put_task_stack(tsk); } void show_stack(struct task_struct *tsk, unsigned long *sp) @@ -227,10 +233,9 @@ void show_stack(struct task_struct *tsk, unsigned long *sp) #endif #define S_SMP " SMP" -static int __die(const char *str, int err, struct thread_info *thread, - struct pt_regs *regs) +static int __die(const char *str, int err, struct pt_regs *regs) { - struct task_struct *tsk = thread->task; + struct task_struct *tsk = current; static int die_counter; int ret; @@ -245,7 +250,8 @@ static int __die(const char *str, int err, struct thread_info *thread, print_modules(); __show_regs(regs); pr_emerg("Process %.*s (pid: %d, stack limit = 0x%p)\n", - TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), thread + 1); + TASK_COMM_LEN, tsk->comm, task_pid_nr(tsk), + end_of_stack(tsk)); if (!user_mode(regs)) { dump_mem(KERN_EMERG, "Stack: ", regs->sp, @@ -264,7 +270,6 @@ static DEFINE_RAW_SPINLOCK(die_lock); */ void die(const char *str, struct pt_regs *regs, int err) { - struct thread_info *thread = current_thread_info(); int ret; oops_enter(); @@ -272,9 +277,9 @@ void die(const char *str, struct pt_regs *regs, int err) raw_spin_lock_irq(&die_lock); console_verbose(); bust_spinlocks(1); - ret = __die(str, err, thread, regs); + ret = __die(str, err, regs); - if (regs && kexec_should_crash(thread->task)) + if (regs && kexec_should_crash(current)) crash_kexec(regs); bust_spinlocks(0); @@ -435,9 +440,10 @@ int cpu_enable_cache_maint_trap(void *__unused) } #define __user_cache_maint(insn, address, res) \ - if (untagged_addr(address) >= user_addr_max()) \ + if (untagged_addr(address) >= user_addr_max()) { \ res = -EFAULT; \ - else \ + } else { \ + uaccess_ttbr0_enable(); \ asm volatile ( \ "1: " insn ", %1\n" \ " mov %w0, #0\n" \ @@ -449,7 +455,9 @@ int cpu_enable_cache_maint_trap(void *__unused) " .popsection\n" \ _ASM_EXTABLE(1b, 3b) \ : "=r" (res) \ - : 
"r" (address), "i" (-EFAULT) ) + : "r" (address), "i" (-EFAULT)); \ + uaccess_ttbr0_disable(); \ + } static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) { diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1105aab1e6d6..b8deffa9e1bf 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -216,6 +216,11 @@ SECTIONS swapper_pg_dir = .; . += SWAPPER_DIR_SIZE; +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + reserved_ttbr0 = .; + . += RESERVED_TTBR0_SIZE; +#endif + _end = .; STABS_DEBUG diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index 6eaf12c1d627..52cb7ad9b2fd 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -16,9 +16,6 @@ menuconfig VIRTUALIZATION if VIRTUALIZATION -config KVM_ARM_VGIC_V3_ITS - bool - config KVM bool "Kernel-based Virtual Machine (KVM) support" depends on OF @@ -34,7 +31,6 @@ config KVM select KVM_VFIO select HAVE_KVM_EVENTFD select HAVE_KVM_IRQFD - select KVM_ARM_VGIC_V3_ITS select KVM_ARM_PMU if HW_PERF_EVENTS select HAVE_KVM_MSI select HAVE_KVM_IRQCHIP diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index a204adf29f0a..1bfe30dfbfe7 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -57,6 +57,16 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) return 1; } +/* + * Guest access to FP/ASIMD registers are routed to this handler only + * when the system doesn't support FP/ASIMD. 
+ */ +static int handle_no_fpsimd(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + kvm_inject_undefined(vcpu); + return 1; +} + /** * kvm_handle_wfx - handle a wait-for-interrupts or wait-for-event * instruction executed by a guest @@ -144,6 +154,7 @@ static exit_handle_fn arm_exit_handlers[] = { [ESR_ELx_EC_BREAKPT_LOW]= kvm_handle_guest_debug, [ESR_ELx_EC_BKPT32] = kvm_handle_guest_debug, [ESR_ELx_EC_BRK64] = kvm_handle_guest_debug, + [ESR_ELx_EC_FP_ASIMD] = handle_no_fpsimd, }; static exit_handle_fn kvm_get_exit_handler(struct kvm_vcpu *vcpu) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 4e92399f7105..5e9052f087f2 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -106,9 +106,16 @@ el1_trap: * x0: ESR_EC */ - /* Guest accessed VFP/SIMD registers, save host, restore Guest */ + /* + * We trap the first access to the FP/SIMD to save the host context + * and restore the guest context lazily. + * If FP/SIMD is not implemented, handle the trap and inject an + * undefined instruction exception to the guest. + */ +alternative_if_not ARM64_HAS_NO_FPSIMD cmp x0, #ESR_ELx_EC_FP_ASIMD b.eq __fpsimd_guest_restore +alternative_else_nop_endif mrs x1, tpidr_el2 mov x0, #ARM_EXCEPTION_TRAP diff --git a/arch/arm64/kvm/hyp/switch.c b/arch/arm64/kvm/hyp/switch.c index 83037cd62d01..75e83dd40d43 100644 --- a/arch/arm64/kvm/hyp/switch.c +++ b/arch/arm64/kvm/hyp/switch.c @@ -21,6 +21,7 @@ #include <asm/kvm_asm.h> #include <asm/kvm_emulate.h> #include <asm/kvm_hyp.h> +#include <asm/fpsimd.h> static bool __hyp_text __fpsimd_enabled_nvhe(void) { @@ -76,16 +77,24 @@ static void __hyp_text __activate_traps(struct kvm_vcpu *vcpu) * traps are only taken to EL2 if the operation would not otherwise * trap to EL1. Therefore, always make sure that for 32-bit guests, * we set FPEXC.EN to prevent traps to EL1, when setting the TFP bit. 
+ * If FP/ASIMD is not implemented, FPEXC is UNDEFINED and any access to + * it will cause an exception. */ val = vcpu->arch.hcr_el2; - if (!(val & HCR_RW)) { + if (!(val & HCR_RW) && system_supports_fpsimd()) { write_sysreg(1 << 30, fpexc32_el2); isb(); } write_sysreg(val, hcr_el2); /* Trap on AArch32 cp15 c15 accesses (EL1 or EL0) */ write_sysreg(1 << 15, hstr_el2); - /* Make sure we trap PMU access from EL0 to EL2 */ + /* + * Make sure we trap PMU access from EL0 to EL2. Also sanitize + * PMSELR_EL0 to make sure it never contains the cycle + * counter, which could make a PMXEVCNTR_EL0 access UNDEF at + * EL1 instead of being trapped to EL2. + */ + write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); __activate_traps_arch()(); diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index 5bc460884639..e95d4f68bf54 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -86,12 +86,6 @@ int kvm_arch_dev_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_VCPU_ATTRIBUTES: r = 1; break; - case KVM_CAP_MSI_DEVID: - if (!kvm) - r = -EINVAL; - else - r = kvm->arch.vgic.msis_require_devid; - break; default: r = 0; } diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 5d1cad3ce6d6..d7150e30438a 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -17,10 +17,7 @@ */ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> .text @@ -33,8 +30,7 @@ * Alignment fixed up by hardware. 
*/ ENTRY(__clear_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x2, x3 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -54,8 +50,7 @@ uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x2 ret ENDPROC(__clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 4fd67ea03bb0..cfe13396085b 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -16,11 +16,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy from user space to a kernel buffer (alignment handled by the hardware) @@ -67,12 +64,10 @@ end .req x5 ENTRY(__arch_copy_from_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index f7292dd08c84..718b1c4e2f85 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -18,11 +18,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy from user space to user space (alignment handled by the hardware) @@ -68,12 +65,10 @@ end .req x5 ENTRY(__copy_in_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), 
ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__copy_in_user) diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 7a7efe255034..e99e31c9acac 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -16,11 +16,8 @@ #include <linux/linkage.h> -#include <asm/alternative.h> -#include <asm/assembler.h> #include <asm/cache.h> -#include <asm/cpufeature.h> -#include <asm/sysreg.h> +#include <asm/uaccess.h> /* * Copy to user space from a kernel buffer (alignment handled by the hardware) @@ -66,12 +63,10 @@ end .req x5 ENTRY(__arch_copy_to_user) -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(0)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_enable_not_uao x3, x4 add end, x0, x2 #include "copy_template.S" -ALTERNATIVE("nop", __stringify(SET_PSTATE_PAN(1)), ARM64_ALT_PAN_NOT_UAO, \ - CONFIG_ARM64_PAN) + uaccess_disable_not_uao x3 mov x0, #0 ret ENDPROC(__arch_copy_to_user) diff --git a/arch/arm64/mm/Makefile b/arch/arm64/mm/Makefile index 54bb209cae8e..e703fb9defad 100644 --- a/arch/arm64/mm/Makefile +++ b/arch/arm64/mm/Makefile @@ -3,7 +3,8 @@ obj-y := dma-mapping.o extable.o fault.o init.o \ ioremap.o mmap.o pgd.o mmu.o \ context.o proc.o pageattr.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o -obj-$(CONFIG_ARM64_PTDUMP) += dump.o +obj-$(CONFIG_ARM64_PTDUMP_CORE) += dump.o +obj-$(CONFIG_ARM64_PTDUMP_DEBUGFS) += ptdump_debugfs.o obj-$(CONFIG_NUMA) += numa.o obj-$(CONFIG_KASAN) += kasan_init.o diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S index 58b5a906ff78..da9576932322 100644 --- a/arch/arm64/mm/cache.S +++ b/arch/arm64/mm/cache.S @@ -23,6 +23,7 @@ #include <asm/assembler.h> #include <asm/cpufeature.h> #include <asm/alternative.h> +#include <asm/uaccess.h> /* * 
flush_icache_range(start,end) @@ -48,6 +49,7 @@ ENTRY(flush_icache_range) * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) + uaccess_ttbr0_enable x2, x3 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -69,10 +71,12 @@ USER(9f, ic ivau, x4 ) // invalidate I line PoU dsb ish isb mov x0, #0 +1: + uaccess_ttbr0_disable x1 ret 9: mov x0, #-EFAULT - ret + b 1b ENDPROC(flush_icache_range) ENDPROC(__flush_cache_user_range) diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c index efcf1f7ef1e4..4c63cb154859 100644 --- a/arch/arm64/mm/context.c +++ b/arch/arm64/mm/context.c @@ -221,7 +221,12 @@ void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: - cpu_switch_mm(mm->pgd, mm); + /* + * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when + * emulating PAN. + */ + if (!system_uses_ttbr0_pan()) + cpu_switch_mm(mm->pgd, mm); } static int asids_init(void) diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 3f74d0d98de6..aa6c8f834d9e 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -938,11 +938,6 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, void arch_teardown_dma_ops(struct device *dev) { - struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - - if (WARN_ON(domain)) - iommu_detach_device(domain, dev); - dev->archdata.dma_ops = NULL; } diff --git a/arch/arm64/mm/dump.c b/arch/arm64/mm/dump.c index 9c3e75df2180..ca74a2aace42 100644 --- a/arch/arm64/mm/dump.c +++ b/arch/arm64/mm/dump.c @@ -50,6 +50,18 @@ static const struct addr_marker address_markers[] = { { -1, NULL }, }; +#define pt_dump_seq_printf(m, fmt, args...) 
\ +({ \ + if (m) \ + seq_printf(m, fmt, ##args); \ +}) + +#define pt_dump_seq_puts(m, fmt) \ +({ \ + if (m) \ + seq_printf(m, fmt); \ +}) + /* * The page dumper groups page table entries of the same type into a single * description. It uses pg_state to track the range information while @@ -62,6 +74,9 @@ struct pg_state { unsigned long start_address; unsigned level; u64 current_prot; + bool check_wx; + unsigned long wx_pages; + unsigned long uxn_pages; }; struct prot_bits { @@ -186,10 +201,39 @@ static void dump_prot(struct pg_state *st, const struct prot_bits *bits, s = bits->clear; if (s) - seq_printf(st->seq, " %s", s); + pt_dump_seq_printf(st->seq, " %s", s); } } +static void note_prot_uxn(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + + if ((st->current_prot & PTE_UXN) == PTE_UXN) + return; + + WARN_ONCE(1, "arm64/mm: Found non-UXN mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->uxn_pages += (addr - st->start_address) / PAGE_SIZE; +} + +static void note_prot_wx(struct pg_state *st, unsigned long addr) +{ + if (!st->check_wx) + return; + if ((st->current_prot & PTE_RDONLY) == PTE_RDONLY) + return; + if ((st->current_prot & PTE_PXN) == PTE_PXN) + return; + + WARN_ONCE(1, "arm64/mm: Found insecure W+X mapping at address %p/%pS\n", + (void *)st->start_address, (void *)st->start_address); + + st->wx_pages += (addr - st->start_address) / PAGE_SIZE; +} + static void note_page(struct pg_state *st, unsigned long addr, unsigned level, u64 val) { @@ -200,14 +244,16 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, st->level = level; st->current_prot = prot; st->start_address = addr; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } else if (prot != st->current_prot || level != st->level || addr >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; if 
(st->current_prot) { - seq_printf(st->seq, "0x%016lx-0x%016lx ", + note_prot_uxn(st, addr); + note_prot_wx(st, addr); + pt_dump_seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr); delta = (addr - st->start_address) >> 10; @@ -215,17 +261,17 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, delta >>= 10; unit++; } - seq_printf(st->seq, "%9lu%c %s", delta, *unit, + pt_dump_seq_printf(st->seq, "%9lu%c %s", delta, *unit, pg_level[st->level].name); if (pg_level[st->level].bits) dump_prot(st, pg_level[st->level].bits, pg_level[st->level].num); - seq_puts(st->seq, "\n"); + pt_dump_seq_puts(st->seq, "\n"); } if (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; @@ -235,7 +281,7 @@ static void note_page(struct pg_state *st, unsigned long addr, unsigned level, if (addr >= st->marker[1].start_address) { st->marker++; - seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); + pt_dump_seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } } @@ -304,9 +350,8 @@ static void walk_pgd(struct pg_state *st, struct mm_struct *mm, } } -static int ptdump_show(struct seq_file *m, void *v) +void ptdump_walk_pgd(struct seq_file *m, struct ptdump_info *info) { - struct ptdump_info *info = m->private; struct pg_state st = { .seq = m, .marker = info->markers, @@ -315,33 +360,16 @@ static int ptdump_show(struct seq_file *m, void *v) walk_pgd(&st, info->mm, info->base_addr); note_page(&st, 0, 0, 0); - return 0; } -static int ptdump_open(struct inode *inode, struct file *file) +static void ptdump_initialize(void) { - return single_open(file, ptdump_show, inode->i_private); -} - -static const struct file_operations ptdump_fops = { - .open = ptdump_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -int ptdump_register(struct ptdump_info *info, const char 
*name) -{ - struct dentry *pe; unsigned i, j; for (i = 0; i < ARRAY_SIZE(pg_level); i++) if (pg_level[i].bits) for (j = 0; j < pg_level[i].num; j++) pg_level[i].mask |= pg_level[i].bits[j].mask; - - pe = debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); - return pe ? 0 : -ENOMEM; } static struct ptdump_info kernel_ptdump_info = { @@ -350,8 +378,30 @@ static struct ptdump_info kernel_ptdump_info = { .base_addr = VA_START, }; +void ptdump_check_wx(void) +{ + struct pg_state st = { + .seq = NULL, + .marker = (struct addr_marker[]) { + { 0, NULL}, + { -1, NULL}, + }, + .check_wx = true, + }; + + walk_pgd(&st, &init_mm, 0); + note_page(&st, 0, 0, 0); + if (st.wx_pages || st.uxn_pages) + pr_warn("Checked W+X mappings: FAILED, %lu W+X pages found, %lu non-UXN pages found\n", + st.wx_pages, st.uxn_pages); + else + pr_info("Checked W+X mappings: passed, no W+X pages found\n"); +} + static int ptdump_init(void) { - return ptdump_register(&kernel_ptdump_info, "kernel_page_tables"); + ptdump_initialize(); + return ptdump_debugfs_register(&kernel_ptdump_info, + "kernel_page_tables"); } device_initcall(ptdump_init); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 0f8788374815..a78a5c401806 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -269,13 +269,19 @@ out: return fault; } -static inline bool is_permission_fault(unsigned int esr) +static inline bool is_permission_fault(unsigned int esr, struct pt_regs *regs) { unsigned int ec = ESR_ELx_EC(esr); unsigned int fsc_type = esr & ESR_ELx_FSC_TYPE; - return (ec == ESR_ELx_EC_DABT_CUR && fsc_type == ESR_ELx_FSC_PERM) || - (ec == ESR_ELx_EC_IABT_CUR && fsc_type == ESR_ELx_FSC_PERM); + if (ec != ESR_ELx_EC_DABT_CUR && ec != ESR_ELx_EC_IABT_CUR) + return false; + + if (system_uses_ttbr0_pan()) + return fsc_type == ESR_ELx_FSC_FAULT && + (regs->pstate & PSR_PAN_BIT); + else + return fsc_type == ESR_ELx_FSC_PERM; } static bool is_el0_instruction_abort(unsigned int esr) @@ -315,7 +321,7 @@ static 
int __kprobes do_page_fault(unsigned long addr, unsigned int esr, mm_flags |= FAULT_FLAG_WRITE; } - if (is_permission_fault(esr) && (addr < USER_DS)) { + if (addr < USER_DS && is_permission_fault(esr, regs)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -507,10 +513,10 @@ static const struct fault_info { { do_bad, SIGBUS, 0, "unknown 17" }, { do_bad, SIGBUS, 0, "unknown 18" }, { do_bad, SIGBUS, 0, "unknown 19" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, - { do_bad, SIGBUS, 0, "synchronous abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, + { do_bad, SIGBUS, 0, "synchronous external abort (translation table walk)" }, { do_bad, SIGBUS, 0, "synchronous parity error" }, { do_bad, SIGBUS, 0, "unknown 25" }, { do_bad, SIGBUS, 0, "unknown 26" }, diff --git a/arch/arm64/mm/flush.c b/arch/arm64/mm/flush.c index 8377329d8c97..554a2558c12e 100644 --- a/arch/arm64/mm/flush.c +++ b/arch/arm64/mm/flush.c @@ -25,14 +25,7 @@ #include <asm/cachetype.h> #include <asm/tlbflush.h> -void flush_cache_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end) -{ - if (vma->vm_flags & VM_EXEC) - __flush_icache_all(); -} - -static void sync_icache_aliases(void *kaddr, unsigned long len) +void sync_icache_aliases(void *kaddr, unsigned long len) { unsigned long addr = (unsigned long)kaddr; diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 2e49bd252fe7..964b7549af5c 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ 
b/arch/arm64/mm/hugetlbpage.c @@ -51,20 +51,8 @@ static int find_num_contig(struct mm_struct *mm, unsigned long addr, *pgsize = PAGE_SIZE; if (!pte_cont(pte)) return 1; - if (!pgd_present(*pgd)) { - VM_BUG_ON(!pgd_present(*pgd)); - return 1; - } pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) { - VM_BUG_ON(!pud_present(*pud)); - return 1; - } pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) { - VM_BUG_ON(!pmd_present(*pmd)); - return 1; - } if ((pte_t *)pmd == ptep) { *pgsize = PMD_SIZE; return CONT_PMDS; @@ -212,7 +200,7 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); /* save the 1st pte to return */ pte = ptep_get_and_clear(mm, addr, cpte); - for (i = 1; i < ncontig; ++i) { + for (i = 1, addr += pgsize; i < ncontig; ++i, addr += pgsize) { /* * If HW_AFDBM is enabled, then the HW could * turn on the dirty bit for any of the page @@ -250,7 +238,7 @@ int huge_ptep_set_access_flags(struct vm_area_struct *vma, pfn = pte_pfn(*cpte); ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) { + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) { changed = ptep_set_access_flags(vma, addr, cpte, pfn_pte(pfn, hugeprot), @@ -273,7 +261,7 @@ void huge_ptep_set_wrprotect(struct mm_struct *mm, cpte = huge_pte_offset(mm, addr); ncontig = find_num_contig(mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_set_wrprotect(mm, addr, cpte); } else { ptep_set_wrprotect(mm, addr, ptep); @@ -291,7 +279,7 @@ void huge_ptep_clear_flush(struct vm_area_struct *vma, cpte = huge_pte_offset(vma->vm_mm, addr); ncontig = find_num_contig(vma->vm_mm, addr, cpte, *cpte, &pgsize); - for (i = 0; i < ncontig; ++i, ++cpte) + for (i = 0; i < ncontig; ++i, ++cpte, addr += pgsize) ptep_clear_flush(vma, addr, cpte); } else { ptep_clear_flush(vma, addr, ptep); @@ -323,7 +311,7 @@ 
__setup("hugepagesz=", setup_hugepagesz); static __init int add_default_hugepagesz(void) { if (size_to_hstate(CONT_PTES * PAGE_SIZE) == NULL) - hugetlb_add_hstate(CONT_PMD_SHIFT); + hugetlb_add_hstate(CONT_PTE_SHIFT); return 0; } arch_initcall(add_default_hugepagesz); diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index 05615a3fdc6f..17243e43184e 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -28,8 +28,6 @@ #include <linux/memblock.h> #include <linux/fs.h> #include <linux/io.h> -#include <linux/slab.h> -#include <linux/stop_machine.h> #include <asm/barrier.h> #include <asm/cputype.h> @@ -42,6 +40,7 @@ #include <asm/tlb.h> #include <asm/memblock.h> #include <asm/mmu_context.h> +#include <asm/ptdump.h> u64 idmap_t0sz = TCR_T0SZ(VA_BITS); @@ -95,11 +94,24 @@ static phys_addr_t __init early_pgtable_alloc(void) return phys; } +static bool pgattr_change_is_safe(u64 old, u64 new) +{ + /* + * The following mapping attributes may be updated in live + * kernel mappings without the need for break-before-make. + */ + static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; + + return old == 0 || new == 0 || ((old ^ new) & ~mask) == 0; +} + static void alloc_init_pte(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot, - phys_addr_t (*pgtable_alloc)(void)) + phys_addr_t (*pgtable_alloc)(void), + bool page_mappings_only) { + pgprot_t __prot = prot; pte_t *pte; BUG_ON(pmd_sect(*pmd)); @@ -115,8 +127,28 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, pte = pte_set_fixmap_offset(pmd, addr); do { - set_pte(pte, pfn_pte(pfn, prot)); + pte_t old_pte = *pte; + + /* + * Set the contiguous bit for the subsequent group of PTEs if + * its size and alignment are appropriate. 
+ */ + if (((addr | PFN_PHYS(pfn)) & ~CONT_PTE_MASK) == 0) { + if (end - addr >= CONT_PTE_SIZE && !page_mappings_only) + __prot = __pgprot(pgprot_val(prot) | PTE_CONT); + else + __prot = prot; + } + + set_pte(pte, pfn_pte(pfn, __prot)); pfn++; + + /* + * After the PTE entry has been populated once, we + * only allow updates to the permission attributes. + */ + BUG_ON(!pgattr_change_is_safe(pte_val(old_pte), pte_val(*pte))); + } while (pte++, addr += PAGE_SIZE, addr != end); pte_clear_fixmap(); @@ -125,8 +157,9 @@ static void alloc_init_pte(pmd_t *pmd, unsigned long addr, static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { + pgprot_t __prot = prot; pmd_t *pmd; unsigned long next; @@ -146,27 +179,39 @@ static void alloc_init_pmd(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_set_fixmap_offset(pud, addr); do { + pmd_t old_pmd = *pmd; + next = pmd_addr_end(addr, end); + /* try section mapping first */ if (((addr | next | phys) & ~SECTION_MASK) == 0 && - allow_block_mappings) { - pmd_t old_pmd =*pmd; - pmd_set_huge(pmd, phys, prot); + !page_mappings_only) { /* - * Check for previous table entries created during - * boot (__create_page_tables) and flush them. + * Set the contiguous bit for the subsequent group of + * PMDs if its size and alignment are appropriate. */ - if (!pmd_none(old_pmd)) { - flush_tlb_all(); - if (pmd_table(old_pmd)) { - phys_addr_t table = pmd_page_paddr(old_pmd); - if (!WARN_ON_ONCE(slab_is_available())) - memblock_free(table, PAGE_SIZE); - } + if (((addr | phys) & ~CONT_PMD_MASK) == 0) { + if (end - addr >= CONT_PMD_SIZE) + __prot = __pgprot(pgprot_val(prot) | + PTE_CONT); + else + __prot = prot; } + pmd_set_huge(pmd, phys, __prot); + + /* + * After the PMD entry has been populated once, we + * only allow updates to the permission attributes. 
+ */ + BUG_ON(!pgattr_change_is_safe(pmd_val(old_pmd), + pmd_val(*pmd))); } else { alloc_init_pte(pmd, addr, next, __phys_to_pfn(phys), - prot, pgtable_alloc); + prot, pgtable_alloc, + page_mappings_only); + + BUG_ON(pmd_val(old_pmd) != 0 && + pmd_val(old_pmd) != pmd_val(*pmd)); } phys += next - addr; } while (pmd++, addr = next, addr != end); @@ -189,7 +234,7 @@ static inline bool use_1G_block(unsigned long addr, unsigned long next, static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, phys_addr_t phys, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { pud_t *pud; unsigned long next; @@ -204,33 +249,28 @@ static void alloc_init_pud(pgd_t *pgd, unsigned long addr, unsigned long end, pud = pud_set_fixmap_offset(pgd, addr); do { + pud_t old_pud = *pud; + next = pud_addr_end(addr, end); /* * For 4K granule only, attempt to put down a 1GB block */ - if (use_1G_block(addr, next, phys) && allow_block_mappings) { - pud_t old_pud = *pud; + if (use_1G_block(addr, next, phys) && !page_mappings_only) { pud_set_huge(pud, phys, prot); /* - * If we have an old value for a pud, it will - * be pointing to a pmd table that we no longer - * need (from swapper_pg_dir). - * - * Look up the old pmd table and free it. + * After the PUD entry has been populated once, we + * only allow updates to the permission attributes. 
*/ - if (!pud_none(old_pud)) { - flush_tlb_all(); - if (pud_table(old_pud)) { - phys_addr_t table = pud_page_paddr(old_pud); - if (!WARN_ON_ONCE(slab_is_available())) - memblock_free(table, PAGE_SIZE); - } - } + BUG_ON(!pgattr_change_is_safe(pud_val(old_pud), + pud_val(*pud))); } else { alloc_init_pmd(pud, addr, next, phys, prot, - pgtable_alloc, allow_block_mappings); + pgtable_alloc, page_mappings_only); + + BUG_ON(pud_val(old_pud) != 0 && + pud_val(old_pud) != pud_val(*pud)); } phys += next - addr; } while (pud++, addr = next, addr != end); @@ -242,7 +282,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, unsigned long virt, phys_addr_t size, pgprot_t prot, phys_addr_t (*pgtable_alloc)(void), - bool allow_block_mappings) + bool page_mappings_only) { unsigned long addr, length, end, next; pgd_t *pgd = pgd_offset_raw(pgdir, virt); @@ -262,7 +302,7 @@ static void __create_pgd_mapping(pgd_t *pgdir, phys_addr_t phys, do { next = pgd_addr_end(addr, end); alloc_init_pud(pgd, addr, next, phys, prot, pgtable_alloc, - allow_block_mappings); + page_mappings_only); phys += next - addr; } while (pgd++, addr = next, addr != end); } @@ -291,17 +331,17 @@ static void __init create_mapping_noalloc(phys_addr_t phys, unsigned long virt, &phys, virt); return; } - __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, true); + __create_pgd_mapping(init_mm.pgd, phys, virt, size, prot, NULL, false); } void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys, unsigned long virt, phys_addr_t size, - pgprot_t prot, bool allow_block_mappings) + pgprot_t prot, bool page_mappings_only) { BUG_ON(mm == &init_mm); __create_pgd_mapping(mm->pgd, phys, virt, size, prot, - pgd_pgtable_alloc, allow_block_mappings); + pgd_pgtable_alloc, page_mappings_only); } static void create_mapping_late(phys_addr_t phys, unsigned long virt, @@ -314,7 +354,7 @@ static void create_mapping_late(phys_addr_t phys, unsigned long virt, } __create_pgd_mapping(init_mm.pgd, phys, 
virt, size, prot, - NULL, !debug_pagealloc_enabled()); + NULL, debug_pagealloc_enabled()); } static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) @@ -332,7 +372,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); return; } @@ -345,13 +385,13 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end __phys_to_virt(start), kernel_start - start, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); if (kernel_end < end) __create_pgd_mapping(pgd, kernel_end, __phys_to_virt(kernel_end), end - kernel_end, PAGE_KERNEL, early_pgtable_alloc, - !debug_pagealloc_enabled()); + debug_pagealloc_enabled()); /* * Map the linear alias of the [_text, __init_begin) interval as @@ -361,7 +401,7 @@ static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end */ __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), kernel_end - kernel_start, PAGE_KERNEL_RO, - early_pgtable_alloc, !debug_pagealloc_enabled()); + early_pgtable_alloc, debug_pagealloc_enabled()); } static void __init map_mem(pgd_t *pgd) @@ -396,6 +436,11 @@ void mark_rodata_ro(void) section_size = (unsigned long)__init_begin - (unsigned long)__start_rodata; create_mapping_late(__pa(__start_rodata), (unsigned long)__start_rodata, section_size, PAGE_KERNEL_RO); + + /* flush the TLBs after updating live kernel mappings */ + flush_tlb_all(); + + debug_checkwx(); } static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, @@ -408,7 +453,7 @@ static void __init map_kernel_segment(pgd_t *pgd, void *va_start, void *va_end, BUG_ON(!PAGE_ALIGNED(size)); __create_pgd_mapping(pgd, pa_start, (unsigned long)va_start, size, prot, - early_pgtable_alloc, !debug_pagealloc_enabled()); + 
early_pgtable_alloc, debug_pagealloc_enabled()); vma->addr = va_start; vma->phys_addr = pa_start; diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 352c73b6a59e..32682be978e0 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -70,11 +70,14 @@ ENTRY(cpu_do_suspend) mrs x8, mdscr_el1 mrs x9, oslsr_el1 mrs x10, sctlr_el1 + mrs x11, tpidr_el1 + mrs x12, sp_el0 stp x2, x3, [x0] stp x4, xzr, [x0, #16] stp x5, x6, [x0, #32] stp x7, x8, [x0, #48] stp x9, x10, [x0, #64] + stp x11, x12, [x0, #80] ret ENDPROC(cpu_do_suspend) @@ -90,6 +93,7 @@ ENTRY(cpu_do_resume) ldp x6, x8, [x0, #32] ldp x9, x10, [x0, #48] ldp x11, x12, [x0, #64] + ldp x13, x14, [x0, #80] msr tpidr_el0, x2 msr tpidrro_el0, x3 msr contextidr_el1, x4 @@ -112,6 +116,8 @@ ENTRY(cpu_do_resume) msr mdscr_el1, x10 msr sctlr_el1, x12 + msr tpidr_el1, x13 + msr sp_el0, x14 /* * Restore oslsr_el1 by writing oslar_el1 */ @@ -136,11 +142,7 @@ ENTRY(cpu_do_switch_mm) bfi x0, x1, #48, #16 // set the ASID msr ttbr0_el1, x0 // set TTBR0 isb -alternative_if ARM64_WORKAROUND_CAVIUM_27456 - ic iallu - dsb nsh - isb -alternative_else_nop_endif + post_ttbr0_update_workaround ret ENDPROC(cpu_do_switch_mm) diff --git a/arch/arm64/mm/ptdump_debugfs.c b/arch/arm64/mm/ptdump_debugfs.c new file mode 100644 index 000000000000..eee4d864350c --- /dev/null +++ b/arch/arm64/mm/ptdump_debugfs.c @@ -0,0 +1,31 @@ +#include <linux/debugfs.h> +#include <linux/seq_file.h> + +#include <asm/ptdump.h> + +static int ptdump_show(struct seq_file *m, void *v) +{ + struct ptdump_info *info = m->private; + ptdump_walk_pgd(m, info); + return 0; +} + +static int ptdump_open(struct inode *inode, struct file *file) +{ + return single_open(file, ptdump_show, inode->i_private); +} + +static const struct file_operations ptdump_fops = { + .open = ptdump_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +int ptdump_debugfs_register(struct ptdump_info *info, const char *name) +{ + struct dentry *pe; + pe 
= debugfs_create_file(name, 0400, NULL, info, &ptdump_fops); + return pe ? 0 : -ENOMEM; + +} diff --git a/arch/arm64/xen/hypercall.S b/arch/arm64/xen/hypercall.S index 329c8027b0a9..b41aff25426d 100644 --- a/arch/arm64/xen/hypercall.S +++ b/arch/arm64/xen/hypercall.S @@ -49,6 +49,7 @@ #include <linux/linkage.h> #include <asm/assembler.h> +#include <asm/uaccess.h> #include <xen/interface/xen.h> @@ -91,6 +92,20 @@ ENTRY(privcmd_call) mov x2, x3 mov x3, x4 mov x4, x5 + /* + * Privcmd calls are issued by the userspace. The kernel needs to + * enable access to TTBR0_EL1 as the hypervisor would issue stage 1 + * translations to user memory via AT instructions. Since AT + * instructions are not affected by the PAN bit (ARMv8.1), we only + * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation + * is enabled (it implies that hardware UAO and PAN disabled). + */ + uaccess_ttbr0_enable x6, x7 hvc XEN_IMM + + /* + * Disable userspace access from kernel once the hyp call completed. + */ + uaccess_ttbr0_disable x6 ret ENDPROC(privcmd_call); diff --git a/arch/avr32/include/asm/mutex.h b/arch/avr32/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/avr32/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. 
(see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/avr32/include/asm/processor.h b/arch/avr32/include/asm/processor.h index 941593c7d9f3..972adcc1e8f4 100644 --- a/arch/avr32/include/asm/processor.h +++ b/arch/avr32/include/asm/processor.h @@ -92,7 +92,6 @@ extern struct avr32_cpuinfo boot_cpu_data; #define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 3)) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #define cpu_sync_pipeline() asm volatile("sub pc, -2" : : : "memory") struct cpu_context { diff --git a/arch/avr32/include/uapi/asm/socket.h b/arch/avr32/include/uapi/asm/socket.h index 1fd147f09a38..5a650426f357 100644 --- a/arch/avr32/include/uapi/asm/socket.h +++ b/arch/avr32/include/uapi/asm/socket.h @@ -90,4 +90,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _UAPI__ASM_AVR32_SOCKET_H */ diff --git a/arch/avr32/include/uapi/asm/unistd.h b/arch/avr32/include/uapi/asm/unistd.h index 2c8a0d2b6c30..236505d889d0 100644 --- a/arch/avr32/include/uapi/asm/unistd.h +++ b/arch/avr32/include/uapi/asm/unistd.h @@ -340,5 +340,8 @@ #define __NR_copy_file_range 325 #define __NR_preadv2 326 #define __NR_pwritev2 327 +#define __NR_pkey_mprotect 328 +#define __NR_pkey_alloc 329 +#define __NR_pkey_free 330 #endif /* _UAPI__ASM_AVR32_UNISTD_H */ diff --git a/arch/avr32/kernel/syscall_table.S b/arch/avr32/kernel/syscall_table.S index 7b348ba70e41..774ce57f4948 100644 --- a/arch/avr32/kernel/syscall_table.S +++ b/arch/avr32/kernel/syscall_table.S @@ -341,4 +341,7 @@ sys_call_table: .long __sys_copy_file_range .long __sys_preadv2 .long __sys_pwritev2 + .long sys_pkey_mprotect + .long sys_pkey_alloc + .long sys_pkey_free /* 330 */ .long sys_ni_syscall /* r8 is saturated at nr_syscalls */ diff --git a/arch/avr32/mach-at32ap/clock.c b/arch/avr32/mach-at32ap/clock.c index 52c179bec0cc..fdf1caecb7b9 100644 --- a/arch/avr32/mach-at32ap/clock.c +++ b/arch/avr32/mach-at32ap/clock.c @@ 
-242,7 +242,7 @@ dump_clock(struct clk *parent, struct clkinf *r) clk_get_rate(parent)); if (parent->dev) seq_printf(r->s, ", for %s", dev_name(parent->dev)); - seq_printf(r->s, "\n"); + seq_putc(r->s, '\n'); /* cost of this scan is small, but not linear... */ r->nest = nest + NEST_DELTA; @@ -261,23 +261,32 @@ static int clk_show(struct seq_file *s, void *unused) struct clk *clk; /* show all the power manager registers */ - seq_printf(s, "MCCTRL = %8x\n", pm_readl(MCCTRL)); - seq_printf(s, "CKSEL = %8x\n", pm_readl(CKSEL)); - seq_printf(s, "CPUMASK = %8x\n", pm_readl(CPU_MASK)); - seq_printf(s, "HSBMASK = %8x\n", pm_readl(HSB_MASK)); - seq_printf(s, "PBAMASK = %8x\n", pm_readl(PBA_MASK)); - seq_printf(s, "PBBMASK = %8x\n", pm_readl(PBB_MASK)); - seq_printf(s, "PLL0 = %8x\n", pm_readl(PLL0)); - seq_printf(s, "PLL1 = %8x\n", pm_readl(PLL1)); - seq_printf(s, "IMR = %8x\n", pm_readl(IMR)); + seq_printf(s, + "MCCTRL = %8x\n" + "CKSEL = %8x\n" + "CPUMASK = %8x\n" + "HSBMASK = %8x\n" + "PBAMASK = %8x\n" + "PBBMASK = %8x\n" + "PLL0 = %8x\n" + "PLL1 = %8x\n" + "IMR = %8x\n", + pm_readl(MCCTRL), + pm_readl(CKSEL), + pm_readl(CPU_MASK), + pm_readl(HSB_MASK), + pm_readl(PBA_MASK), + pm_readl(PBB_MASK), + pm_readl(PLL0), + pm_readl(PLL1), + pm_readl(IMR)); for (i = 0; i < 8; i++) { if (i == 5) continue; seq_printf(s, "GCCTRL%d = %8x\n", i, pm_readl(GCCTRL(i))); } - seq_printf(s, "\n"); - + seq_putc(s, '\n'); r.s = s; r.nest = 0; /* protected from changes on the list while dumping */ diff --git a/arch/avr32/mach-at32ap/pio.c b/arch/avr32/mach-at32ap/pio.c index 13d3fc4270b7..7fae6ec7e8ec 100644 --- a/arch/avr32/mach-at32ap/pio.c +++ b/arch/avr32/mach-at32ap/pio.c @@ -367,13 +367,13 @@ static void pio_bank_show(struct seq_file *s, struct gpio_chip *chip) (mask & pdsr) ? "hi" : "lo", (mask & pusr) ? 
" " : "up"); if (ifsr & mask) - seq_printf(s, " deglitch"); + seq_puts(s, " deglitch"); if ((osr & mdsr) & mask) - seq_printf(s, " open-drain"); + seq_puts(s, " open-drain"); if (imr & mask) seq_printf(s, " irq-%d edge-both", gpio_to_irq(chip->base + i)); - seq_printf(s, "\n"); + seq_putc(s, '\n'); } } diff --git a/arch/blackfin/include/asm/Kbuild b/arch/blackfin/include/asm/Kbuild index 91d49c0a3118..2fb67b59d188 100644 --- a/arch/blackfin/include/asm/Kbuild +++ b/arch/blackfin/include/asm/Kbuild @@ -24,7 +24,6 @@ generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += mman.h generic-y += msgbuf.h -generic-y += mutex.h generic-y += param.h generic-y += percpu.h generic-y += pgalloc.h diff --git a/arch/blackfin/include/asm/processor.h b/arch/blackfin/include/asm/processor.h index 0c265aba94ad..85d4af97c986 100644 --- a/arch/blackfin/include/asm/processor.h +++ b/arch/blackfin/include/asm/processor.h @@ -92,7 +92,6 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk) == current ? rdusp() : (tsk)->thread.usp) #define cpu_relax() smp_mb() -#define cpu_relax_lowlatency() cpu_relax() /* Get the Silicon Revision of the chip */ static inline uint32_t __pure bfin_revid(void) diff --git a/arch/blackfin/mach-bf561/coreb.c b/arch/blackfin/mach-bf561/coreb.c index 8a2543c654b3..cf27554e76bf 100644 --- a/arch/blackfin/mach-bf561/coreb.c +++ b/arch/blackfin/mach-bf561/coreb.c @@ -1,5 +1,7 @@ /* Load firmware into Core B on a BF561 * + * Author: Bas Vermeulen <bvermeul@blackstar.xs4all.nl> + * * Copyright 2004-2009 Analog Devices Inc. * Licensed under the GPL-2 or later. 
*/ @@ -14,9 +16,9 @@ #include <linux/device.h> #include <linux/fs.h> +#include <linux/init.h> #include <linux/kernel.h> #include <linux/miscdevice.h> -#include <linux/module.h> #define CMD_COREB_START _IO('b', 0) #define CMD_COREB_STOP _IO('b', 1) @@ -59,8 +61,4 @@ static struct miscdevice coreb_dev = { .name = "coreb", .fops = &coreb_fops, }; -module_misc_device(coreb_dev); - -MODULE_AUTHOR("Bas Vermeulen <bvermeul@blackstar.xs4all.nl>"); -MODULE_DESCRIPTION("BF561 Core B Support"); -MODULE_LICENSE("GPL"); +builtin_misc_device(coreb_dev); diff --git a/arch/c6x/include/asm/mutex.h b/arch/c6x/include/asm/mutex.h deleted file mode 100644 index 7a7248e0462d..000000000000 --- a/arch/c6x/include/asm/mutex.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_C6X_MUTEX_H -#define _ASM_C6X_MUTEX_H - -#include <asm-generic/mutex-null.h> - -#endif /* _ASM_C6X_MUTEX_H */ diff --git a/arch/c6x/include/asm/processor.h b/arch/c6x/include/asm/processor.h index f2ef31be2f8b..b9eb3da7f278 100644 --- a/arch/c6x/include/asm/processor.h +++ b/arch/c6x/include/asm/processor.h @@ -121,7 +121,6 @@ extern unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(task) (task_pt_regs(task)->sp) #define cpu_relax() do { } while (0) -#define cpu_relax_lowlatency() cpu_relax() extern const struct seq_operations cpuinfo_op; diff --git a/arch/cris/boot/compressed/Makefile b/arch/cris/boot/compressed/Makefile index 8fe9338c1775..e4ba0be0e782 100644 --- a/arch/cris/boot/compressed/Makefile +++ b/arch/cris/boot/compressed/Makefile @@ -2,9 +2,6 @@ # arch/cris/boot/compressed/Makefile # -asflags-y += $(LINUXINCLUDE) -ccflags-y += -O2 $(LINUXINCLUDE) - # asflags-$(CONFIG_ETRAX_ARCH_V32) += -I$(srctree)/include/asm/mach \ # -I$(srctree)/include/asm/arch # ccflags-$(CONFIG_ETRAX_ARCH_V32) += -O2 -I$(srctree)/include/asm/mach diff --git a/arch/cris/boot/rescue/Makefile b/arch/cris/boot/rescue/Makefile index 52bd0bd1dd22..a82025940006 100644 --- a/arch/cris/boot/rescue/Makefile +++ 
b/arch/cris/boot/rescue/Makefile @@ -8,8 +8,8 @@ # asflags-y += -I $(srctree)/include/asm/arch/mach/ -I $(srctree)/include/asm/arch # LD = gcc-cris -mlinux -march=v32 -nostdlib -asflags-y += $(LINUXINCLUDE) -ccflags-y += -O2 $(LINUXINCLUDE) +ifdef CONFIG_ETRAX_AXISFLASHMAP + arch-$(CONFIG_ETRAX_ARCH_V10) = v10 arch-$(CONFIG_ETRAX_ARCH_V32) = v32 @@ -28,6 +28,11 @@ $(obj)/rescue.bin: $(obj)/rescue.o FORCE $(call if_changed,objcopy) cp -p $(obj)/rescue.bin $(objtree) +else +$(obj)/rescue.bin: + +endif + $(obj)/testrescue.bin: $(obj)/testrescue.o $(OBJCOPY) $(OBJCOPYFLAGS) $(obj)/testrescue.o tr.bin # Pad it to 784 bytes diff --git a/arch/cris/include/asm/mutex.h b/arch/cris/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/cris/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/cris/include/asm/processor.h b/arch/cris/include/asm/processor.h index 862126b58116..15b815df29c1 100644 --- a/arch/cris/include/asm/processor.h +++ b/arch/cris/include/asm/processor.h @@ -63,7 +63,6 @@ static inline void release_thread(struct task_struct *dead_task) #define init_stack (init_thread_union.stack) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() void default_idle(void); diff --git a/arch/frv/include/asm/mutex.h b/arch/frv/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/frv/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. 
(see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/frv/include/asm/processor.h b/arch/frv/include/asm/processor.h index 73f0a79ad8e6..ddaeb9cc9143 100644 --- a/arch/frv/include/asm/processor.h +++ b/arch/frv/include/asm/processor.h @@ -107,7 +107,6 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk)->thread.frame0->sp) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() /* data cache prefetch */ #define ARCH_HAS_PREFETCH diff --git a/arch/frv/include/uapi/asm/socket.h b/arch/frv/include/uapi/asm/socket.h index afbc98f02d27..81e03530ed39 100644 --- a/arch/frv/include/uapi/asm/socket.h +++ b/arch/frv/include/uapi/asm/socket.h @@ -90,5 +90,7 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/h8300/include/asm/mutex.h b/arch/h8300/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/h8300/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/h8300/include/asm/processor.h b/arch/h8300/include/asm/processor.h index 111df7397ac7..65132d7ae9e5 100644 --- a/arch/h8300/include/asm/processor.h +++ b/arch/h8300/include/asm/processor.h @@ -127,7 +127,6 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk) == current ? 
rdusp() : (tsk)->thread.usp) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #define HARD_RESET_NOW() ({ \ local_irq_disable(); \ diff --git a/arch/hexagon/include/asm/mutex.h b/arch/hexagon/include/asm/mutex.h deleted file mode 100644 index 58b52de1bc22..000000000000 --- a/arch/hexagon/include/asm/mutex.h +++ /dev/null @@ -1,8 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ -#include <asm-generic/mutex-xchg.h> diff --git a/arch/hexagon/include/asm/processor.h b/arch/hexagon/include/asm/processor.h index d8501137c8d0..45a825402f63 100644 --- a/arch/hexagon/include/asm/processor.h +++ b/arch/hexagon/include/asm/processor.h @@ -56,7 +56,6 @@ struct thread_struct { } #define cpu_relax() __vmyield() -#define cpu_relax_lowlatency() cpu_relax() /* * Decides where the kernel will search for a free chunk of vm space during diff --git a/arch/ia64/include/asm/mutex.h b/arch/ia64/include/asm/mutex.h deleted file mode 100644 index 28cb819e0ff9..000000000000 --- a/arch/ia64/include/asm/mutex.h +++ /dev/null @@ -1,90 +0,0 @@ -/* - * ia64 implementation of the mutex fastpath. - * - * Copyright (C) 2006 Ken Chen <kenneth.w.chen@intel.com> - * - */ - -#ifndef _ASM_MUTEX_H -#define _ASM_MUTEX_H - -/** - * __mutex_fastpath_lock - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 - * - * Change the count from 1 to a value lower than 1, and call <fail_fn> if - * it wasn't 1 originally. This function MUST leave the value lower than - * 1 even when the "1" assertion wasn't true. 
- */ -static inline void -__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) -{ - if (unlikely(ia64_fetchadd4_acq(count, -1) != 1)) - fail_fn(count); -} - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int -__mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(ia64_fetchadd4_acq(count, -1) != 1)) - return -1; - return 0; -} - -/** - * __mutex_fastpath_unlock - try to promote the count from 0 to 1 - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 0 - * - * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>. - * In the failure case, this function is allowed to either set the value to - * 1, or to set it to a value lower than 1. - * - * If the implementation sets it to a value of lower than 1, then the - * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs - * to return 0 otherwise. - */ -static inline void -__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) -{ - int ret = ia64_fetchadd4_rel(count, 1); - if (unlikely(ret < 0)) - fail_fn(count); -} - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to a value lower than 1, and return 0 (failure) - * if it wasn't 1 originally, or return 1 (success) otherwise. This function - * MUST leave the value lower than 1 even when the "1" assertion wasn't true. - * Additionally, if the value was < 0 originally, this function must not leave - * it to 0 on failure. 
- * - * If the architecture has no effective trylock variant, it should call the - * <fail_fn> spinlock-based trylock variant unconditionally. - */ -static inline int -__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) -{ - if (atomic_read(count) == 1 && cmpxchg_acq(count, 1, 0) == 1) - return 1; - return 0; -} - -#endif diff --git a/arch/ia64/include/asm/processor.h b/arch/ia64/include/asm/processor.h index ce53c50d0ba4..03911a336406 100644 --- a/arch/ia64/include/asm/processor.h +++ b/arch/ia64/include/asm/processor.h @@ -547,7 +547,6 @@ ia64_eoi (void) } #define cpu_relax() ia64_hint(ia64_hint_pause) -#define cpu_relax_lowlatency() cpu_relax() static inline int ia64_get_irr(unsigned int vector) diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 77e541cf0e5d..fced197b9626 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) */ static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (tlb->nr == tlb->max) - return true; - tlb->need_flush = 1; if (!tlb->nr && tlb->pages == tlb->local) __tlb_alloc_page(tlb); tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); + if (tlb->nr == tlb->max) + return true; return false; } @@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void 
tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -283,6 +275,15 @@ do { \ __tlb_remove_tlb_entry(tlb, ptep, addr); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/arch/ia64/include/uapi/asm/socket.h b/arch/ia64/include/uapi/asm/socket.h index 0018fad9039f..57feb0c1f7d7 100644 --- a/arch/ia64/include/uapi/asm/socket.h +++ b/arch/ia64/include/uapi/asm/socket.h @@ -99,4 +99,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_IA64_SOCKET_H */ diff --git a/arch/ia64/kernel/err_inject.c b/arch/ia64/kernel/err_inject.c index 5ed0ea92c5bf..85bba43e7d5d 100644 --- a/arch/ia64/kernel/err_inject.c +++ b/arch/ia64/kernel/err_inject.c @@ -224,85 +224,45 @@ static struct attribute_group err_inject_attr_group = { .name = "err_inject" }; /* Add/Remove err_inject interface for CPU device */ -static int err_inject_add_dev(struct device *sys_dev) +static int err_inject_add_dev(unsigned int cpu) { + struct device *sys_dev = get_cpu_device(cpu); + return sysfs_create_group(&sys_dev->kobj, &err_inject_attr_group); } -static int err_inject_remove_dev(struct device *sys_dev) +static int err_inject_remove_dev(unsigned int cpu) { + struct device *sys_dev = get_cpu_device(cpu); + sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group); return 0; } -static int err_inject_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct device *sys_dev; - - sys_dev = get_cpu_device(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - err_inject_add_dev(sys_dev); - break; - case CPU_DEAD: - case 
CPU_DEAD_FROZEN: - err_inject_remove_dev(sys_dev); - break; - } - - return NOTIFY_OK; -} -static struct notifier_block err_inject_cpu_notifier = -{ - .notifier_call = err_inject_cpu_callback, -}; +static enum cpuhp_state hp_online; -static int __init -err_inject_init(void) +static int __init err_inject_init(void) { - int i; - + int ret; #ifdef ERR_INJ_DEBUG printk(KERN_INFO "Enter error injection driver.\n"); #endif - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - err_inject_cpu_callback(&err_inject_cpu_notifier, CPU_ONLINE, - (void *)(long)i); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/err_inj:online", + err_inject_add_dev, err_inject_remove_dev); + if (ret >= 0) { + hp_online = ret; + ret = 0; } - - __register_hotcpu_notifier(&err_inject_cpu_notifier); - - cpu_notifier_register_done(); - - return 0; + return ret; } -static void __exit -err_inject_exit(void) +static void __exit err_inject_exit(void) { - int i; - struct device *sys_dev; - #ifdef ERR_INJ_DEBUG printk(KERN_INFO "Exit error injection driver.\n"); #endif - - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - sys_dev = get_cpu_device(i); - sysfs_remove_group(&sys_dev->kobj, &err_inject_attr_group); - } - - __unregister_hotcpu_notifier(&err_inject_cpu_notifier); - - cpu_notifier_register_done(); + cpuhp_remove_state(hp_online); } module_init(err_inject_init); diff --git a/arch/ia64/kernel/palinfo.c b/arch/ia64/kernel/palinfo.c index c39c3cd3ac34..b6e597860888 100644 --- a/arch/ia64/kernel/palinfo.c +++ b/arch/ia64/kernel/palinfo.c @@ -932,8 +932,7 @@ static const struct file_operations proc_palinfo_fops = { .release = single_release, }; -static void -create_palinfo_proc_entries(unsigned int cpu) +static int palinfo_add_proc(unsigned int cpu) { pal_func_cpu_u_t f; struct proc_dir_entry *cpu_dir; @@ -943,7 +942,7 @@ create_palinfo_proc_entries(unsigned int cpu) cpu_dir = proc_mkdir(cpustr, palinfo_dir); if (!cpu_dir) - return; + return -EINVAL; f.req_cpu = cpu; @@ 
-952,42 +951,21 @@ create_palinfo_proc_entries(unsigned int cpu) proc_create_data(palinfo_entries[j].name, 0, cpu_dir, &proc_palinfo_fops, (void *)f.value); } + return 0; } -static void -remove_palinfo_proc_entries(unsigned int hcpu) +static int palinfo_del_proc(unsigned int hcpu) { char cpustr[3+4+1]; /* cpu numbers are up to 4095 on itanic */ + sprintf(cpustr, "cpu%d", hcpu); remove_proc_subtree(cpustr, palinfo_dir); + return 0; } -static int palinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int hotcpu = (unsigned long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - create_palinfo_proc_entries(hotcpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - remove_palinfo_proc_entries(hotcpu); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block __refdata palinfo_cpu_notifier = -{ - .notifier_call = palinfo_cpu_callback, - .priority = 0, -}; +static enum cpuhp_state hp_online; -static int __init -palinfo_init(void) +static int __init palinfo_init(void) { int i = 0; @@ -996,25 +974,19 @@ palinfo_init(void) if (!palinfo_dir) return -ENOMEM; - cpu_notifier_register_begin(); - - /* Create palinfo dirs in /proc for all online cpus */ - for_each_online_cpu(i) { - create_palinfo_proc_entries(i); + i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/palinfo:online", + palinfo_add_proc, palinfo_del_proc); + if (i < 0) { + remove_proc_subtree("pal", NULL); + return i; } - - /* Register for future delivery via notify registration */ - __register_hotcpu_notifier(&palinfo_cpu_notifier); - - cpu_notifier_register_done(); - + hp_online = i; return 0; } -static void __exit -palinfo_exit(void) +static void __exit palinfo_exit(void) { - unregister_hotcpu_notifier(&palinfo_cpu_notifier); + cpuhp_remove_state(hp_online); remove_proc_subtree("pal", NULL); } diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c index 5313007d5423..aaf74f36cfa1 100644 --- a/arch/ia64/kernel/salinfo.c +++ 
b/arch/ia64/kernel/salinfo.c @@ -550,52 +550,40 @@ static const struct file_operations salinfo_data_fops = { .llseek = default_llseek, }; -static int -salinfo_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) +static int salinfo_cpu_online(unsigned int cpu) { - unsigned int i, cpu = (unsigned long)hcpu; - unsigned long flags; + unsigned int i, end = ARRAY_SIZE(salinfo_data); struct salinfo_data *data; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - spin_lock_irqsave(&data_saved_lock, flags); - for (i = 0, data = salinfo_data; - i < ARRAY_SIZE(salinfo_data); - ++i, ++data) { - cpumask_set_cpu(cpu, &data->cpu_event); - wake_up_interruptible(&data->read_wait); - } - spin_unlock_irqrestore(&data_saved_lock, flags); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - spin_lock_irqsave(&data_saved_lock, flags); - for (i = 0, data = salinfo_data; - i < ARRAY_SIZE(salinfo_data); - ++i, ++data) { - struct salinfo_data_saved *data_saved; - int j; - for (j = ARRAY_SIZE(data->data_saved) - 1, data_saved = data->data_saved + j; - j >= 0; - --j, --data_saved) { - if (data_saved->buffer && data_saved->cpu == cpu) { - shift1_data_saved(data, j); - } - } - cpumask_clear_cpu(cpu, &data->cpu_event); - } - spin_unlock_irqrestore(&data_saved_lock, flags); - break; + + spin_lock_irq(&data_saved_lock); + for (i = 0, data = salinfo_data; i < end; ++i, ++data) { + cpumask_set_cpu(cpu, &data->cpu_event); + wake_up_interruptible(&data->read_wait); } - return NOTIFY_OK; + spin_unlock_irq(&data_saved_lock); + return 0; } -static struct notifier_block salinfo_cpu_notifier = +static int salinfo_cpu_pre_down(unsigned int cpu) { - .notifier_call = salinfo_cpu_callback, - .priority = 0, -}; + unsigned int i, end = ARRAY_SIZE(salinfo_data); + struct salinfo_data *data; + + spin_lock_irq(&data_saved_lock); + for (i = 0, data = salinfo_data; i < end; ++i, ++data) { + struct salinfo_data_saved *data_saved; + int j = ARRAY_SIZE(data->data_saved) - 1; + + for 
(data_saved = data->data_saved + j; j >= 0; + --j, --data_saved) { + if (data_saved->buffer && data_saved->cpu == cpu) + shift1_data_saved(data, j); + } + cpumask_clear_cpu(cpu, &data->cpu_event); + } + spin_unlock_irq(&data_saved_lock); + return 0; +} static int __init salinfo_init(void) @@ -604,7 +592,7 @@ salinfo_init(void) struct proc_dir_entry **sdir = salinfo_proc_entries; /* keeps track of every entry */ struct proc_dir_entry *dir, *entry; struct salinfo_data *data; - int i, j; + int i; salinfo_dir = proc_mkdir("sal", NULL); if (!salinfo_dir) @@ -617,8 +605,6 @@ salinfo_init(void) (void *)salinfo_entries[i].feature); } - cpu_notifier_register_begin(); - for (i = 0; i < ARRAY_SIZE(salinfo_log_name); i++) { data = salinfo_data + i; data->type = i; @@ -639,10 +625,6 @@ salinfo_init(void) continue; *sdir++ = entry; - /* we missed any events before now */ - for_each_online_cpu(j) - cpumask_set_cpu(j, &data->cpu_event); - *sdir++ = dir; } @@ -653,10 +635,9 @@ salinfo_init(void) salinfo_timer.function = &salinfo_timeout; add_timer(&salinfo_timer); - __register_hotcpu_notifier(&salinfo_cpu_notifier); - - cpu_notifier_register_done(); - + i = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/salinfo:online", + salinfo_cpu_online, salinfo_cpu_pre_down); + WARN_ON(i < 0); return 0; } diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 6f892b94e906..021f44ab4bfb 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -68,7 +68,7 @@ void vtime_account_user(struct task_struct *tsk) if (ti->ac_utime) { delta_utime = cycle_to_cputime(ti->ac_utime); - account_user_time(tsk, delta_utime, delta_utime); + account_user_time(tsk, delta_utime); ti->ac_utime = 0; } } @@ -112,7 +112,7 @@ void vtime_account_system(struct task_struct *tsk) { cputime_t delta = vtime_delta(tsk); - account_system_time(tsk, 0, delta, delta); + account_system_time(tsk, 0, delta); } EXPORT_SYMBOL_GPL(vtime_account_system); diff --git a/arch/ia64/kernel/topology.c 
b/arch/ia64/kernel/topology.c index c01fe8991244..1a68f012a6dc 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -349,9 +349,9 @@ static int cpu_cache_sysfs_init(unsigned int cpu) } /* Add cache interface for CPU device */ -static int cache_add_dev(struct device *sys_dev) +static int cache_add_dev(unsigned int cpu) { - unsigned int cpu = sys_dev->id; + struct device *sys_dev = get_cpu_device(cpu); unsigned long i, j; struct cache_info *this_object; int retval = 0; @@ -399,9 +399,8 @@ static int cache_add_dev(struct device *sys_dev) } /* Remove cache interface for CPU device */ -static int cache_remove_dev(struct device *sys_dev) +static int cache_remove_dev(unsigned int cpu) { - unsigned int cpu = sys_dev->id; unsigned long i; for (i = 0; i < all_cpu_cache_info[cpu].num_cache_leaves; i++) @@ -419,52 +418,13 @@ static int cache_remove_dev(struct device *sys_dev) return 0; } -/* - * When a cpu is hot-plugged, do a check and initiate - * cache kobject if necessary - */ -static int cache_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct device *sys_dev; - - sys_dev = get_cpu_device(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cache_cpu_notifier = -{ - .notifier_call = cache_cpu_callback -}; - static int __init cache_sysfs_init(void) { - int i; - - cpu_notifier_register_begin(); - - for_each_online_cpu(i) { - struct device *sys_dev = get_cpu_device((unsigned int)i); - cache_add_dev(sys_dev); - } - - __register_hotcpu_notifier(&cache_cpu_notifier); - - cpu_notifier_register_done(); + int ret; + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "ia64/topology:online", + cache_add_dev, cache_remove_dev); + WARN_ON(ret < 0); return 0; } - device_initcall(cache_sysfs_init); - 
diff --git a/arch/m32r/Kconfig b/arch/m32r/Kconfig index 3cc8498fe0fe..d227a6988d6b 100644 --- a/arch/m32r/Kconfig +++ b/arch/m32r/Kconfig @@ -34,7 +34,7 @@ config NO_IOPORT_MAP def_bool y config NO_DMA - def_bool y + def_bool n config HZ int diff --git a/arch/m32r/include/asm/device.h b/arch/m32r/include/asm/device.h index d8f9872b0e2d..4a9f35e0973f 100644 --- a/arch/m32r/include/asm/device.h +++ b/arch/m32r/include/asm/device.h @@ -3,5 +3,9 @@ * * This file is released under the GPLv2 */ -#include <asm-generic/device.h> +struct dev_archdata { + struct dma_map_ops *dma_ops; +}; +struct pdev_archdata { +}; diff --git a/arch/m32r/include/asm/dma-mapping.h b/arch/m32r/include/asm/dma-mapping.h new file mode 100644 index 000000000000..2c43a77fe942 --- /dev/null +++ b/arch/m32r/include/asm/dma-mapping.h @@ -0,0 +1,32 @@ +#ifndef _ASM_M32R_DMA_MAPPING_H +#define _ASM_M32R_DMA_MAPPING_H + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/mm.h> +#include <linux/scatterlist.h> +#include <linux/dma-debug.h> +#include <linux/io.h> + +#define DMA_ERROR_CODE (~(dma_addr_t)0x0) + +static inline struct dma_map_ops *get_dma_ops(struct device *dev) +{ + if (dev && dev->archdata.dma_ops) + return dev->archdata.dma_ops; + return &dma_noop_ops; +} + +static inline void dma_cache_sync(struct device *dev, void *vaddr, size_t size, + enum dma_data_direction direction) +{ +} + +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size) +{ + if (!dev->dma_mask) + return false; + return addr + size - 1 <= *dev->dma_mask; +} + +#endif /* _ASM_M32R_DMA_MAPPING_H */ diff --git a/arch/m32r/include/asm/mutex.h b/arch/m32r/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/m32r/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. 
- * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/m32r/include/asm/processor.h b/arch/m32r/include/asm/processor.h index 9f8fd9bef70f..5767367550c6 100644 --- a/arch/m32r/include/asm/processor.h +++ b/arch/m32r/include/asm/processor.h @@ -133,6 +133,5 @@ unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk)->thread.sp) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #endif /* _ASM_M32R_PROCESSOR_H */ diff --git a/arch/m32r/include/uapi/asm/socket.h b/arch/m32r/include/uapi/asm/socket.h index 5fe42fc7b6c5..5853f8e92c20 100644 --- a/arch/m32r/include/uapi/asm/socket.h +++ b/arch/m32r/include/uapi/asm/socket.h @@ -90,4 +90,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_M32R_SOCKET_H */ diff --git a/arch/m32r/platforms/m32700ut/setup.c b/arch/m32r/platforms/m32700ut/setup.c index 9a4ba8a8589d..349eb341752c 100644 --- a/arch/m32r/platforms/m32700ut/setup.c +++ b/arch/m32r/platforms/m32700ut/setup.c @@ -201,6 +201,7 @@ static struct irq_chip m32700ut_lanpld_irq_type = #define lcdpldirq2port(x) (unsigned long)((int)M32700UT_LCD_ICUCR1 + \ (((x) - 1) * sizeof(unsigned short))) +#ifdef CONFIG_USB static pld_icu_data_t lcdpld_icu_data[M32700UT_NUM_LCD_PLD_IRQ]; static void disable_m32700ut_lcdpld_irq(unsigned int irq) @@ -253,6 +254,7 @@ static struct irq_chip m32700ut_lcdpld_irq_type = .irq_mask = mask_m32700ut_lcdpld, .irq_unmask = unmask_m32700ut_lcdpld, }; +#endif void __init init_IRQ(void) { diff --git a/arch/m68k/Kconfig.machine b/arch/m68k/Kconfig.machine index 2a5c7abb2896..9225b4ad9aeb 100644 --- a/arch/m68k/Kconfig.machine +++ b/arch/m68k/Kconfig.machine @@ -259,6 +259,12 @@ config M5407C3 help Support for the Motorola M5407C3 board. 
+config AMCORE + bool "Sysam AMCORE board support" + depends on M5307 + help + Support for the Sysam AMCORE open-hardware generic board. + config FIREBEE bool "FireBee board support" depends on M547x diff --git a/arch/m68k/amiga/config.c b/arch/m68k/amiga/config.c index ddb8192a3661..65f63a457130 100644 --- a/arch/m68k/amiga/config.c +++ b/arch/m68k/amiga/config.c @@ -786,8 +786,7 @@ static void amiga_get_hardware_list(struct seq_file *m) if (AMIGAHW_PRESENT(name)) \ seq_printf (m, "\t%s\n", str) - seq_printf (m, "Detected hardware:\n"); - + seq_puts(m, "Detected hardware:\n"); AMIGAHW_ANNOUNCE(AMI_VIDEO, "Amiga Video"); AMIGAHW_ANNOUNCE(AMI_BLITTER, "Blitter"); AMIGAHW_ANNOUNCE(AMBER_FF, "Amber Flicker Fixer"); diff --git a/arch/m68k/atari/config.c b/arch/m68k/atari/config.c index 97a3c38cd1f5..e328eaf816e3 100644 --- a/arch/m68k/atari/config.c +++ b/arch/m68k/atari/config.c @@ -629,7 +629,7 @@ static void atari_get_hardware_list(struct seq_file *m) if (ATARIHW_PRESENT(name)) \ seq_printf(m, "\t%s\n", str) - seq_printf(m, "Detected hardware:\n"); + seq_puts(m, "Detected hardware:\n"); ATARIHW_ANNOUNCE(STND_SHIFTER, "ST Shifter"); ATARIHW_ANNOUNCE(EXTD_SHIFTER, "STe Shifter"); ATARIHW_ANNOUNCE(TT_SHIFTER, "TT Shifter"); diff --git a/arch/m68k/coldfire/Makefile b/arch/m68k/coldfire/Makefile index 68f0fac60099..4aa2c57afc35 100644 --- a/arch/m68k/coldfire/Makefile +++ b/arch/m68k/coldfire/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_NETtel) += nettel.o obj-$(CONFIG_CLEOPATRA) += nettel.o obj-$(CONFIG_FIREBEE) += firebee.o obj-$(CONFIG_MCF8390) += mcf8390.o +obj-$(CONFIG_AMCORE) += amcore.o obj-$(CONFIG_PCI) += pci.o diff --git a/arch/m68k/coldfire/amcore.c b/arch/m68k/coldfire/amcore.c new file mode 100644 index 000000000000..c6cb1a5cc1a5 --- /dev/null +++ b/arch/m68k/coldfire/amcore.c @@ -0,0 +1,156 @@ +/* + * amcore.c -- Support for Sysam AMCORE open board + * + * (C) Copyright 2016, Angelo Dureghello <angelo@sysam.it> + * + * This file is subject to the terms and 
conditions of the GNU General Public + * License. See the file COPYING in the main directory of this archive + * for more details. + */ + +#include <linux/device.h> +#include <linux/platform_device.h> +#include <linux/dm9000.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/map.h> +#include <linux/mtd/partitions.h> +#include <linux/mtd/physmap.h> +#include <linux/i2c.h> + +#include <asm/coldfire.h> +#include <asm/mcfsim.h> +#include <asm/io.h> + +#if IS_ENABLED(CONFIG_DM9000) + +#define DM9000_IRQ 25 +#define DM9000_ADDR 0x30000000 + +/* + * DEVICES and related device RESOURCES + */ +static struct resource dm9000_resources[] = { + /* physical address of the address register (CMD [A2] to 0)*/ + [0] = { + .start = DM9000_ADDR, + .end = DM9000_ADDR, + .flags = IORESOURCE_MEM, + }, + /* + * physical address of the data register (CMD [A2] to 1), + * driver wants a range >=4 to assume a 32bit data bus + */ + [1] = { + .start = DM9000_ADDR + 4, + .end = DM9000_ADDR + 7, + .flags = IORESOURCE_MEM, + }, + /* IRQ line the device's interrupt pin is connected to */ + [2] = { + .start = DM9000_IRQ, + .end = DM9000_IRQ, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct dm9000_plat_data dm9000_platdata = { + .flags = DM9000_PLATF_32BITONLY, +}; + +static struct platform_device dm9000_device = { + .name = "dm9000", + .id = 0, + .num_resources = ARRAY_SIZE(dm9000_resources), + .resource = dm9000_resources, + .dev = { + .platform_data = &dm9000_platdata, + } +}; +#endif + +static void __init dm9000_pre_init(void) +{ + /* Set the dm9000 interrupt to be auto-vectored */ + mcf_autovector(DM9000_IRQ); +} + +/* + * Partitioning of parallel NOR flash (39VF3201B) + */ +static struct mtd_partition amcore_partitions[] = { + { + .name = "U-Boot (128K)", + .size = 0x20000, + .offset = 0x0 + }, + { + .name = "Kernel+ROMfs (2994K)", + .size = 0x2E0000, + .offset = MTDPART_OFS_APPEND + }, + { + .name = "Flash Free Space (1024K)", + .size 
= MTDPART_SIZ_FULL, + .offset = MTDPART_OFS_APPEND + } +}; + +static struct physmap_flash_data flash_data = { + .parts = amcore_partitions, + .nr_parts = ARRAY_SIZE(amcore_partitions), + .width = 2, +}; + +static struct resource flash_resource = { + .start = 0xffc00000, + .end = 0xffffffff, + .flags = IORESOURCE_MEM, +}; + +static struct platform_device flash_device = { + .name = "physmap-flash", + .id = -1, + .resource = &flash_resource, + .num_resources = 1, + .dev = { + .platform_data = &flash_data, + }, +}; + +static struct platform_device rtc_device = { + .name = "rtc-ds1307", + .id = -1, +}; + +static struct i2c_board_info amcore_i2c_info[] __initdata = { + { + I2C_BOARD_INFO("ds1338", 0x68), + }, +}; + +static struct platform_device *amcore_devices[] __initdata = { +#if IS_ENABLED(CONFIG_DM9000) + &dm9000_device, +#endif + &flash_device, + &rtc_device, +}; + +static int __init init_amcore(void) +{ +#if IS_ENABLED(CONFIG_DM9000) + dm9000_pre_init(); +#endif + + /* Add i2c RTC Dallas chip supprt */ + i2c_register_board_info(0, amcore_i2c_info, + ARRAY_SIZE(amcore_i2c_info)); + + platform_add_devices(amcore_devices, ARRAY_SIZE(amcore_devices)); + + return 0; +} + +arch_initcall(init_amcore); diff --git a/arch/m68k/coldfire/device.c b/arch/m68k/coldfire/device.c index a0fc0c192427..84938fdbbada 100644 --- a/arch/m68k/coldfire/device.c +++ b/arch/m68k/coldfire/device.c @@ -327,6 +327,147 @@ static struct platform_device mcf_qspi = { }; #endif /* IS_ENABLED(CONFIG_SPI_COLDFIRE_QSPI) */ +#if IS_ENABLED(CONFIG_I2C_IMX) +static struct resource mcf_i2c0_resources[] = { + { + .start = MCFI2C_BASE0, + .end = MCFI2C_BASE0 + MCFI2C_SIZE0 - 1, + .flags = IORESOURCE_MEM, + }, + { + .start = MCF_IRQ_I2C0, + .end = MCF_IRQ_I2C0, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_i2c0 = { + .name = "imx1-i2c", + .id = 0, + .num_resources = ARRAY_SIZE(mcf_i2c0_resources), + .resource = mcf_i2c0_resources, +}; +#ifdef MCFI2C_BASE1 + +static struct resource 
mcf_i2c1_resources[] = { + { + .start = MCFI2C_BASE1, + .end = MCFI2C_BASE1 + MCFI2C_SIZE1 - 1, + .flags = IORESOURCE_MEM, + }, + { + .start = MCF_IRQ_I2C1, + .end = MCF_IRQ_I2C1, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_i2c1 = { + .name = "imx1-i2c", + .id = 1, + .num_resources = ARRAY_SIZE(mcf_i2c1_resources), + .resource = mcf_i2c1_resources, +}; + +#endif /* MCFI2C_BASE1 */ + +#ifdef MCFI2C_BASE2 + +static struct resource mcf_i2c2_resources[] = { + { + .start = MCFI2C_BASE2, + .end = MCFI2C_BASE2 + MCFI2C_SIZE2 - 1, + .flags = IORESOURCE_MEM, + }, + { + .start = MCF_IRQ_I2C2, + .end = MCF_IRQ_I2C2, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_i2c2 = { + .name = "imx1-i2c", + .id = 2, + .num_resources = ARRAY_SIZE(mcf_i2c2_resources), + .resource = mcf_i2c2_resources, +}; + +#endif /* MCFI2C_BASE2 */ + +#ifdef MCFI2C_BASE3 + +static struct resource mcf_i2c3_resources[] = { + { + .start = MCFI2C_BASE3, + .end = MCFI2C_BASE3 + MCFI2C_SIZE3 - 1, + .flags = IORESOURCE_MEM, + }, + { + .start = MCF_IRQ_I2C3, + .end = MCF_IRQ_I2C3, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_i2c3 = { + .name = "imx1-i2c", + .id = 3, + .num_resources = ARRAY_SIZE(mcf_i2c3_resources), + .resource = mcf_i2c3_resources, +}; + +#endif /* MCFI2C_BASE3 */ + +#ifdef MCFI2C_BASE4 + +static struct resource mcf_i2c4_resources[] = { + { + .start = MCFI2C_BASE4, + .end = MCFI2C_BASE4 + MCFI2C_SIZE4 - 1, + .flags = IORESOURCE_MEM, + }, + { + .start = MCF_IRQ_I2C4, + .end = MCF_IRQ_I2C4, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_i2c4 = { + .name = "imx1-i2c", + .id = 4, + .num_resources = ARRAY_SIZE(mcf_i2c4_resources), + .resource = mcf_i2c4_resources, +}; + +#endif /* MCFI2C_BASE4 */ + +#ifdef MCFI2C_BASE5 + +static struct resource mcf_i2c5_resources[] = { + { + .start = MCFI2C_BASE5, + .end = MCFI2C_BASE5 + MCFI2C_SIZE5 - 1, + .flags = IORESOURCE_MEM, + }, + { + .start = 
MCF_IRQ_I2C5, + .end = MCF_IRQ_I2C5, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mcf_i2c5 = { + .name = "imx1-i2c", + .id = 5, + .num_resources = ARRAY_SIZE(mcf_i2c5_resources), + .resource = mcf_i2c5_resources, +}; + +#endif /* MCFI2C_BASE5 */ +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ + static struct platform_device *mcf_devices[] __initdata = { &mcf_uart, #if IS_ENABLED(CONFIG_FEC) @@ -338,6 +479,24 @@ static struct platform_device *mcf_devices[] __initdata = { #if IS_ENABLED(CONFIG_SPI_COLDFIRE_QSPI) &mcf_qspi, #endif +#if IS_ENABLED(CONFIG_I2C_IMX) + &mcf_i2c0, +#ifdef MCFI2C_BASE1 + &mcf_i2c1, +#endif +#ifdef MCFI2C_BASE2 + &mcf_i2c2, +#endif +#ifdef MCFI2C_BASE3 + &mcf_i2c3, +#endif +#ifdef MCFI2C_BASE4 + &mcf_i2c4, +#endif +#ifdef MCFI2C_BASE5 + &mcf_i2c5, +#endif +#endif }; /* diff --git a/arch/m68k/coldfire/m5206.c b/arch/m68k/coldfire/m5206.c index 8945f5e7b39c..a3bcf0883f98 100644 --- a/arch/m68k/coldfire/m5206.c +++ b/arch/m68k/coldfire/m5206.c @@ -26,6 +26,7 @@ DEFINE_CLK(mcftmr0, "mcftmr.0", MCF_BUSCLK); DEFINE_CLK(mcftmr1, "mcftmr.1", MCF_BUSCLK); DEFINE_CLK(mcfuart0, "mcfuart.0", MCF_BUSCLK); DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -34,11 +35,21 @@ struct clk *mcf_clks[] = { &clk_mcftmr1, &clk_mcfuart0, &clk_mcfuart1, + &clk_mcfi2c0, NULL }; /***************************************************************************/ +static void __init m5206_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + writeb(MCFSIM_ICR_AUTOVEC | MCFSIM_ICR_LEVEL5 | MCFSIM_ICR_PRI0, + MCFSIM_I2CICR); + mcf_mapirq2imr(MCF_IRQ_I2C0, MCFINTC_I2C); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + void __init config_BSP(char *commandp, int size) { #if defined(CONFIG_NETtel) @@ -53,6 +64,7 @@ void __init config_BSP(char *commandp, int size) mcf_mapirq2imr(25, MCFINTC_EINT1); mcf_mapirq2imr(28, MCFINTC_EINT4); mcf_mapirq2imr(31, MCFINTC_EINT7); + m5206_i2c_init(); } 
/***************************************************************************/ diff --git a/arch/m68k/coldfire/m520x.c b/arch/m68k/coldfire/m520x.c index 173834f251eb..5ba69217ce6c 100644 --- a/arch/m68k/coldfire/m520x.c +++ b/arch/m68k/coldfire/m520x.c @@ -28,7 +28,7 @@ DEFINE_CLK(0, "fec.0", 12, MCF_CLK); DEFINE_CLK(0, "edma", 17, MCF_CLK); DEFINE_CLK(0, "intc.0", 18, MCF_CLK); DEFINE_CLK(0, "iack.0", 21, MCF_CLK); -DEFINE_CLK(0, "mcfi2c.0", 22, MCF_CLK); +DEFINE_CLK(0, "imx1-i2c.0", 22, MCF_CLK); DEFINE_CLK(0, "mcfqspi.0", 23, MCF_CLK); DEFINE_CLK(0, "mcfuart.0", 24, MCF_BUSCLK); DEFINE_CLK(0, "mcfuart.1", 25, MCF_BUSCLK); @@ -53,7 +53,7 @@ struct clk *mcf_clks[] = { &__clk_0_17, /* edma */ &__clk_0_18, /* intc.0 */ &__clk_0_21, /* iack.0 */ - &__clk_0_22, /* mcfi2c.0 */ + &__clk_0_22, /* imx1-i2c.0 */ &__clk_0_23, /* mcfqspi.0 */ &__clk_0_24, /* mcfuart.0 */ &__clk_0_25, /* mcfuart.1 */ @@ -71,7 +71,7 @@ struct clk *mcf_clks[] = { &__clk_0_40, /* sys.0 */ &__clk_0_41, /* gpio.0 */ &__clk_0_42, /* sdram.0 */ -NULL, + NULL, }; static struct clk * const enable_clks[] __initconst = { @@ -94,7 +94,7 @@ static struct clk * const enable_clks[] __initconst = { static struct clk * const disable_clks[] __initconst = { &__clk_0_12, /* fec.0 */ &__clk_0_17, /* edma */ - &__clk_0_22, /* mcfi2c.0 */ + &__clk_0_22, /* imx1-i2c.0 */ &__clk_0_23, /* mcfqspi.0 */ &__clk_0_28, /* mcftmr.0 */ &__clk_0_29, /* mcftmr.1 */ @@ -133,6 +133,21 @@ static void __init m520x_qspi_init(void) /***************************************************************************/ +static void __init m520x_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + u8 par; + + /* setup Port FECI2C Pin Assignment Register for I2C */ + /* set PAR_SCL to SCL and PAR_SDA to SDA */ + par = readb(MCF_GPIO_PAR_FECI2C); + par |= 0x0f; + writeb(par, MCF_GPIO_PAR_FECI2C); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + static void __init 
m520x_uarts_init(void) { u16 par; @@ -175,6 +190,7 @@ void __init config_BSP(char *commandp, int size) m520x_uarts_init(); m520x_fec_init(); m520x_qspi_init(); + m520x_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m523x.c b/arch/m68k/coldfire/m523x.c index a191a467eff2..f7a0fcc5618c 100644 --- a/arch/m68k/coldfire/m523x.c +++ b/arch/m68k/coldfire/m523x.c @@ -34,6 +34,7 @@ DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); DEFINE_CLK(mcfuart2, "mcfuart.2", MCF_BUSCLK); DEFINE_CLK(mcfqspi0, "mcfqspi.0", MCF_BUSCLK); DEFINE_CLK(fec0, "fec.0", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -47,6 +48,7 @@ struct clk *mcf_clks[] = { &clk_mcfuart2, &clk_mcfqspi0, &clk_fec0, + &clk_mcfi2c0, NULL }; @@ -68,6 +70,21 @@ static void __init m523x_qspi_init(void) /***************************************************************************/ +static void __init m523x_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + u8 par; + + /* setup Port AS Pin Assignment Register for I2C */ + /* set PASPA0 to SCL and PASPA1 to SDA */ + par = readb(MCFGPIO_PAR_FECI2C); + par |= 0x0f; + writeb(par, MCFGPIO_PAR_FECI2C); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + static void __init m523x_fec_init(void) { /* Set multi-function pins to ethernet use */ @@ -81,6 +98,7 @@ void __init config_BSP(char *commandp, int size) mach_sched_init = hw_timer_init; m523x_fec_init(); m523x_qspi_init(); + m523x_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m5249.c b/arch/m68k/coldfire/m5249.c index e48f55adc447..b16cf9b4580c 100644 --- a/arch/m68k/coldfire/m5249.c +++ b/arch/m68k/coldfire/m5249.c @@ -27,6 +27,8 @@ DEFINE_CLK(mcftmr1, "mcftmr.1", MCF_BUSCLK); DEFINE_CLK(mcfuart0, "mcfuart.0", MCF_BUSCLK); DEFINE_CLK(mcfuart1, 
"mcfuart.1", MCF_BUSCLK); DEFINE_CLK(mcfqspi0, "mcfqspi.0", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); +DEFINE_CLK(mcfi2c1, "imx1-i2c.1", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -36,6 +38,8 @@ struct clk *mcf_clks[] = { &clk_mcfuart0, &clk_mcfuart1, &clk_mcfqspi0, + &clk_mcfi2c0, + &clk_mcfi2c1, NULL }; @@ -85,6 +89,26 @@ static void __init m5249_qspi_init(void) /***************************************************************************/ +static void __init m5249_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + u32 r; + + /* first I2C controller uses regular irq setup */ + writeb(MCFSIM_ICR_AUTOVEC | MCFSIM_ICR_LEVEL5 | MCFSIM_ICR_PRI0, + MCFSIM_I2CICR); + mcf_mapirq2imr(MCF_IRQ_I2C0, MCFINTC_I2C); + + /* second I2C controller is completely different */ + r = readl(MCFINTC2_INTPRI_REG(MCF_IRQ_I2C1)); + r &= ~MCFINTC2_INTPRI_BITS(0xf, MCF_IRQ_I2C1); + r |= MCFINTC2_INTPRI_BITS(0x5, MCF_IRQ_I2C1); + writel(r, MCFINTC2_INTPRI_REG(MCF_IRQ_I2C1)); +#endif /* CONFIG_I2C_IMX */ +} + +/***************************************************************************/ + #ifdef CONFIG_M5249C3 static void __init m5249_smc91x_init(void) @@ -111,6 +135,7 @@ void __init config_BSP(char *commandp, int size) m5249_smc91x_init(); #endif m5249_qspi_init(); + m5249_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m525x.c b/arch/m68k/coldfire/m525x.c index 3d8583e2187c..110e2cd34e62 100644 --- a/arch/m68k/coldfire/m525x.c +++ b/arch/m68k/coldfire/m525x.c @@ -27,6 +27,8 @@ DEFINE_CLK(mcftmr1, "mcftmr.1", MCF_BUSCLK); DEFINE_CLK(mcfuart0, "mcfuart.0", MCF_BUSCLK); DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); DEFINE_CLK(mcfqspi0, "mcfqspi.0", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); +DEFINE_CLK(mcfi2c1, "imx1-i2c.1", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -36,6 +38,8 @@ struct clk *mcf_clks[] = { &clk_mcfuart0, &clk_mcfuart1, &clk_mcfqspi0, + 
&clk_mcfi2c0, + &clk_mcfi2c1, NULL }; @@ -59,12 +63,12 @@ static void __init m525x_qspi_init(void) static void __init m525x_i2c_init(void) { -#if IS_ENABLED(CONFIG_I2C_COLDFIRE) +#if IS_ENABLED(CONFIG_I2C_IMX) u32 r; /* first I2C controller uses regular irq setup */ writeb(MCFSIM_ICR_AUTOVEC | MCFSIM_ICR_LEVEL5 | MCFSIM_ICR_PRI0, - MCFSIM_I2CICR); + MCFSIM_I2CICR); mcf_mapirq2imr(MCF_IRQ_I2C0, MCFINTC_I2C); /* second I2C controller is completely different */ @@ -72,7 +76,7 @@ static void __init m525x_i2c_init(void) r &= ~MCFINTC2_INTPRI_BITS(0xf, MCF_IRQ_I2C1); r |= MCFINTC2_INTPRI_BITS(0x5, MCF_IRQ_I2C1); writel(r, MCFINTC2_INTPRI_REG(MCF_IRQ_I2C1)); -#endif /* IS_ENABLED(CONFIG_I2C_COLDFIRE) */ +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m527x.c b/arch/m68k/coldfire/m527x.c index c0b3e28f91df..b10b436b5a31 100644 --- a/arch/m68k/coldfire/m527x.c +++ b/arch/m68k/coldfire/m527x.c @@ -36,6 +36,7 @@ DEFINE_CLK(mcfuart2, "mcfuart.2", MCF_BUSCLK); DEFINE_CLK(mcfqspi0, "mcfqspi.0", MCF_BUSCLK); DEFINE_CLK(fec0, "fec.0", MCF_BUSCLK); DEFINE_CLK(fec1, "fec.1", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -50,6 +51,7 @@ struct clk *mcf_clks[] = { &clk_mcfqspi0, &clk_fec0, &clk_fec1, + &clk_mcfi2c0, NULL }; @@ -76,6 +78,31 @@ static void __init m527x_qspi_init(void) /***************************************************************************/ +static void __init m527x_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) +#if defined(CONFIG_M5271) + u8 par; + + /* setup Port FECI2C Pin Assignment Register for I2C */ + /* set PAR_SCL to SCL and PAR_SDA to SDA */ + par = readb(MCFGPIO_PAR_FECI2C); + par |= 0x0f; + writeb(par, MCFGPIO_PAR_FECI2C); +#elif defined(CONFIG_M5275) + u16 par; + + /* setup Port FECI2C Pin Assignment Register for I2C */ + /* set PAR_SCL to SCL and PAR_SDA to SDA */ + par = 
readw(MCFGPIO_PAR_FECI2C); + par |= 0x0f; + writew(par, MCFGPIO_PAR_FECI2C); +#endif +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + static void __init m527x_uarts_init(void) { u16 sepmask; @@ -122,6 +149,7 @@ void __init config_BSP(char *commandp, int size) m527x_uarts_init(); m527x_fec_init(); m527x_qspi_init(); + m527x_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m528x.c b/arch/m68k/coldfire/m528x.c index 12f9e370d8dd..ea76998d5ab9 100644 --- a/arch/m68k/coldfire/m528x.c +++ b/arch/m68k/coldfire/m528x.c @@ -36,6 +36,7 @@ DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); DEFINE_CLK(mcfuart2, "mcfuart.2", MCF_BUSCLK); DEFINE_CLK(mcfqspi0, "mcfqspi.0", MCF_BUSCLK); DEFINE_CLK(fec0, "fec.0", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -49,6 +50,7 @@ struct clk *mcf_clks[] = { &clk_mcfuart2, &clk_mcfqspi0, &clk_fec0, + &clk_mcfi2c0, NULL }; @@ -64,6 +66,21 @@ static void __init m528x_qspi_init(void) /***************************************************************************/ +static void __init m528x_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + u16 paspar; + + /* setup Port AS Pin Assignment Register for I2C */ + /* set PASPA0 to SCL and PASPA1 to SDA */ + paspar = readw(MCFGPIO_PASPAR); + paspar |= 0xF; + writew(paspar, MCFGPIO_PASPAR); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + static void __init m528x_uarts_init(void) { u8 port; @@ -127,6 +144,7 @@ void __init config_BSP(char *commandp, int size) m528x_uarts_init(); m528x_fec_init(); m528x_qspi_init(); + m528x_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m5307.c b/arch/m68k/coldfire/m5307.c index 2da1d146e344..cc5e8a50a423 100644 --- 
a/arch/m68k/coldfire/m5307.c +++ b/arch/m68k/coldfire/m5307.c @@ -35,6 +35,7 @@ DEFINE_CLK(mcftmr0, "mcftmr.0", MCF_BUSCLK); DEFINE_CLK(mcftmr1, "mcftmr.1", MCF_BUSCLK); DEFINE_CLK(mcfuart0, "mcfuart.0", MCF_BUSCLK); DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -43,11 +44,23 @@ struct clk *mcf_clks[] = { &clk_mcftmr1, &clk_mcfuart0, &clk_mcfuart1, + &clk_mcfi2c0, NULL }; /***************************************************************************/ +static void __init m5307_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + writeb(MCFSIM_ICR_AUTOVEC | MCFSIM_ICR_LEVEL5 | MCFSIM_ICR_PRI0, + MCFSIM_I2CICR); + mcf_mapirq2imr(MCF_IRQ_I2C0, MCFINTC_I2C); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + void __init config_BSP(char *commandp, int size) { #if defined(CONFIG_NETtel) || \ @@ -73,6 +86,7 @@ void __init config_BSP(char *commandp, int size) */ wdebug(MCFDEBUG_CSR, MCFDEBUG_CSR_PSTCLK); #endif + m5307_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m53xx.c b/arch/m68k/coldfire/m53xx.c index 2502f63960bc..cf1917934b8a 100644 --- a/arch/m68k/coldfire/m53xx.c +++ b/arch/m68k/coldfire/m53xx.c @@ -38,7 +38,7 @@ DEFINE_CLK(0, "edma", 17, MCF_CLK); DEFINE_CLK(0, "intc.0", 18, MCF_CLK); DEFINE_CLK(0, "intc.1", 19, MCF_CLK); DEFINE_CLK(0, "iack.0", 21, MCF_CLK); -DEFINE_CLK(0, "mcfi2c.0", 22, MCF_CLK); +DEFINE_CLK(0, "imx1-i2c.0", 22, MCF_CLK); DEFINE_CLK(0, "mcfqspi.0", 23, MCF_CLK); DEFINE_CLK(0, "mcfuart.0", 24, MCF_BUSCLK); DEFINE_CLK(0, "mcfuart.1", 25, MCF_BUSCLK); @@ -77,7 +77,7 @@ struct clk *mcf_clks[] = { &__clk_0_18, /* intc.0 */ &__clk_0_19, /* intc.1 */ &__clk_0_21, /* iack.0 */ - &__clk_0_22, /* mcfi2c.0 */ + &__clk_0_22, /* imx1-i2c.0 */ &__clk_0_23, /* mcfqspi.0 */ &__clk_0_24, /* mcfuart.0 */ &__clk_0_25, /* 
mcfuart.1 */ @@ -133,7 +133,7 @@ static struct clk * const disable_clks[] __initconst = { &__clk_0_8, /* mcfcan.0 */ &__clk_0_12, /* fec.0 */ &__clk_0_17, /* edma */ - &__clk_0_22, /* mcfi2c.0 */ + &__clk_0_22, /* imx1-i2c.0 */ &__clk_0_23, /* mcfqspi.0 */ &__clk_0_30, /* mcftmr.2 */ &__clk_0_31, /* mcftmr.3 */ @@ -176,6 +176,19 @@ static void __init m53xx_qspi_init(void) /***************************************************************************/ +static void __init m53xx_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + /* setup Port FECI2C Pin Assignment Register for I2C */ + /* set PAR_SCL to SCL and PAR_SDA to SDA */ + u8 r = readb(MCFGPIO_PAR_FECI2C); + r |= 0x0f; + writeb(r, MCFGPIO_PAR_FECI2C); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + static void __init m53xx_uarts_init(void) { /* UART GPIO initialization */ @@ -218,6 +231,7 @@ void __init config_BSP(char *commandp, int size) m53xx_uarts_init(); m53xx_fec_init(); m53xx_qspi_init(); + m53xx_i2c_init(); #ifdef CONFIG_BDM_DISABLE /* diff --git a/arch/m68k/coldfire/m5407.c b/arch/m68k/coldfire/m5407.c index 738eba6be40e..38863ddbeab0 100644 --- a/arch/m68k/coldfire/m5407.c +++ b/arch/m68k/coldfire/m5407.c @@ -26,6 +26,7 @@ DEFINE_CLK(mcftmr0, "mcftmr.0", MCF_BUSCLK); DEFINE_CLK(mcftmr1, "mcftmr.1", MCF_BUSCLK); DEFINE_CLK(mcfuart0, "mcfuart.0", MCF_BUSCLK); DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -34,11 +35,23 @@ struct clk *mcf_clks[] = { &clk_mcftmr1, &clk_mcfuart0, &clk_mcfuart1, + &clk_mcfi2c0, NULL }; /***************************************************************************/ +static void __init m5407_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + writeb(MCFSIM_ICR_AUTOVEC | MCFSIM_ICR_LEVEL5 | MCFSIM_ICR_PRI0, + MCFSIM_I2CICR); + mcf_mapirq2imr(MCF_IRQ_I2C0, MCFINTC_I2C); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} +
+/***************************************************************************/ + void __init config_BSP(char *commandp, int size) { mach_sched_init = hw_timer_init; @@ -48,6 +61,7 @@ void __init config_BSP(char *commandp, int size) mcf_mapirq2imr(27, MCFINTC_EINT3); mcf_mapirq2imr(29, MCFINTC_EINT5); mcf_mapirq2imr(31, MCFINTC_EINT7); + m5407_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/coldfire/m5441x.c b/arch/m68k/coldfire/m5441x.c index 98a13cce93d8..dc589b039b62 100644 --- a/arch/m68k/coldfire/m5441x.c +++ b/arch/m68k/coldfire/m5441x.c @@ -19,13 +19,13 @@ DEFINE_CLK(0, "flexbus", 2, MCF_CLK); DEFINE_CLK(0, "mcfcan.0", 8, MCF_CLK); DEFINE_CLK(0, "mcfcan.1", 9, MCF_CLK); -DEFINE_CLK(0, "mcfi2c.1", 14, MCF_CLK); +DEFINE_CLK(0, "imx1-i2c.1", 14, MCF_CLK); DEFINE_CLK(0, "mcfdspi.1", 15, MCF_CLK); DEFINE_CLK(0, "edma", 17, MCF_CLK); DEFINE_CLK(0, "intc.0", 18, MCF_CLK); DEFINE_CLK(0, "intc.1", 19, MCF_CLK); DEFINE_CLK(0, "intc.2", 20, MCF_CLK); -DEFINE_CLK(0, "mcfi2c.0", 22, MCF_CLK); +DEFINE_CLK(0, "imx1-i2c.0", 22, MCF_CLK); DEFINE_CLK(0, "mcfdspi.0", 23, MCF_CLK); DEFINE_CLK(0, "mcfuart.0", 24, MCF_BUSCLK); DEFINE_CLK(0, "mcfuart.1", 25, MCF_BUSCLK); @@ -59,10 +59,10 @@ DEFINE_CLK(0, "switch.1", 56, MCF_CLK); DEFINE_CLK(0, "nand.0", 63, MCF_CLK); DEFINE_CLK(1, "mcfow.0", 2, MCF_CLK); -DEFINE_CLK(1, "mcfi2c.2", 4, MCF_CLK); -DEFINE_CLK(1, "mcfi2c.3", 5, MCF_CLK); -DEFINE_CLK(1, "mcfi2c.4", 6, MCF_CLK); -DEFINE_CLK(1, "mcfi2c.5", 7, MCF_CLK); +DEFINE_CLK(1, "imx1-i2c.2", 4, MCF_CLK); +DEFINE_CLK(1, "imx1-i2c.3", 5, MCF_CLK); +DEFINE_CLK(1, "imx1-i2c.4", 6, MCF_CLK); +DEFINE_CLK(1, "imx1-i2c.5", 7, MCF_CLK); DEFINE_CLK(1, "mcfuart.4", 24, MCF_BUSCLK); DEFINE_CLK(1, "mcfuart.5", 25, MCF_BUSCLK); DEFINE_CLK(1, "mcfuart.6", 26, MCF_BUSCLK); diff --git a/arch/m68k/coldfire/m54xx.c b/arch/m68k/coldfire/m54xx.c index 386df3b68cdf..c552851ec617 100644 --- a/arch/m68k/coldfire/m54xx.c +++ 
b/arch/m68k/coldfire/m54xx.c @@ -37,6 +37,7 @@ DEFINE_CLK(mcfuart0, "mcfuart.0", MCF_BUSCLK); DEFINE_CLK(mcfuart1, "mcfuart.1", MCF_BUSCLK); DEFINE_CLK(mcfuart2, "mcfuart.2", MCF_BUSCLK); DEFINE_CLK(mcfuart3, "mcfuart.3", MCF_BUSCLK); +DEFINE_CLK(mcfi2c0, "imx1-i2c.0", MCF_BUSCLK); struct clk *mcf_clks[] = { &clk_pll, @@ -47,6 +48,7 @@ struct clk *mcf_clks[] = { &clk_mcfuart1, &clk_mcfuart2, &clk_mcfuart3, + &clk_mcfi2c0, NULL }; @@ -65,6 +67,20 @@ static void __init m54xx_uarts_init(void) /***************************************************************************/ +static void __init m54xx_i2c_init(void) +{ +#if IS_ENABLED(CONFIG_I2C_IMX) + u32 r; + + /* set the fec/i2c/irq pin assignment register for i2c */ + r = readl(MCF_PAR_FECI2CIRQ); + r |= MCF_PAR_FECI2CIRQ_SDA | MCF_PAR_FECI2CIRQ_SCL; + writel(r, MCF_PAR_FECI2CIRQ); +#endif /* IS_ENABLED(CONFIG_I2C_IMX) */ +} + +/***************************************************************************/ + static void mcf54xx_reset(void) { /* disable interrupts and enable the watchdog */ @@ -86,6 +102,7 @@ void __init config_BSP(char *commandp, int size) mach_reset = mcf54xx_reset; mach_sched_init = hw_timer_init; m54xx_uarts_init(); + m54xx_i2c_init(); } /***************************************************************************/ diff --git a/arch/m68k/configs/amcore_defconfig b/arch/m68k/configs/amcore_defconfig new file mode 100644 index 000000000000..f108dd121e9a --- /dev/null +++ b/arch/m68k/configs/amcore_defconfig @@ -0,0 +1,118 @@ +CONFIG_LOCALVERSION="amcore-001" +CONFIG_DEFAULT_HOSTNAME="amcore" +CONFIG_SYSVIPC=y +# CONFIG_FHANDLE is not set +# CONFIG_USELIB is not set +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_NAMESPACES=y +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_AIO is not set +# CONFIG_ADVISE_SYSCALLS is not set +# CONFIG_MEMBARRIER is not set +CONFIG_EMBEDDED=y +# CONFIG_VM_EVENT_COUNTERS is not set +# CONFIG_COMPAT_BRK is not set +# CONFIG_LBDAF is not set +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_MMU is not set 
+CONFIG_M5307=y +CONFIG_AMCORE=y +CONFIG_UBOOT=y +CONFIG_RAMSIZE=0x1000000 +CONFIG_KERNELBASE=0x20000 +CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 +CONFIG_BINFMT_FLAT=y +# CONFIG_COREDUMP is not set +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_IPV6 is not set +# CONFIG_WIRELESS is not set +# CONFIG_UEVENT_HELPER is not set +CONFIG_FW_LOADER_USER_HELPER_FALLBACK=y +# CONFIG_ALLOW_DEV_COREDUMP is not set +CONFIG_CONNECTOR=y +CONFIG_MTD=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_CFI=y +CONFIG_MTD_JEDECPROBE=y +CONFIG_MTD_CFI_ADV_OPTIONS=y +CONFIG_MTD_CFI_LE_BYTE_SWAP=y +CONFIG_MTD_CFI_GEOMETRY=y +# CONFIG_MTD_CFI_I2 is not set +CONFIG_MTD_CFI_AMDSTD=y +CONFIG_MTD_CFI_STAA=y +CONFIG_MTD_ROM=y +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=y +CONFIG_MTD_UCLINUX=y +CONFIG_MTD_PLATRAM=y +CONFIG_BLK_DEV_RAM=y +CONFIG_NETDEVICES=y +# CONFIG_NET_VENDOR_ARC is not set +# CONFIG_NET_CADENCE is not set +# CONFIG_NET_VENDOR_BROADCOM is not set +CONFIG_DM9000=y +# CONFIG_NET_VENDOR_EZCHIP is not set +# CONFIG_NET_VENDOR_INTEL is not set +# CONFIG_NET_VENDOR_MARVELL is not set +# CONFIG_NET_VENDOR_MICREL is not set +# CONFIG_NET_VENDOR_NATSEMI is not set +# CONFIG_NET_VENDOR_NETRONOME is not set +# CONFIG_NET_VENDOR_QUALCOMM is not set +# CONFIG_NET_VENDOR_RENESAS is not set +# CONFIG_NET_VENDOR_ROCKER is not set +# CONFIG_NET_VENDOR_SAMSUNG is not set +# CONFIG_NET_VENDOR_SEEQ is not set +# CONFIG_NET_VENDOR_SMSC is not set +# CONFIG_NET_VENDOR_STMICRO is not set +# CONFIG_NET_VENDOR_SYNOPSYS is not set +# CONFIG_NET_VENDOR_VIA is not set +# CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_WLAN is not set +# CONFIG_INPUT is not set +# CONFIG_SERIO is not set +# CONFIG_VT is not set +# CONFIG_UNIX98_PTYS is not set +# CONFIG_DEVMEM is not set +# CONFIG_DEVKMEM is not set +CONFIG_SERIAL_MCF=y +CONFIG_SERIAL_MCF_BAUDRATE=115200 
+CONFIG_SERIAL_MCF_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_I2C=y +# CONFIG_I2C_COMPAT is not set +CONFIG_I2C_CHARDEV=y +# CONFIG_I2C_HELPER_AUTO is not set +CONFIG_I2C_IMX=y +CONFIG_PPS=y +# CONFIG_HWMON is not set +# CONFIG_USB_SUPPORT is not set +CONFIG_RTC_CLASS=y +# CONFIG_RTC_SYSTOHC is not set +CONFIG_RTC_DRV_DS1307=y +CONFIG_EXT2_FS=y +CONFIG_EXT2_FS_XATTR=y +# CONFIG_FILE_LOCKING is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY_USER is not set +CONFIG_FSCACHE=y +# CONFIG_PROC_SYSCTL is not set +CONFIG_JFFS2_FS=y +CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BOTH=y +# CONFIG_NETWORK_FILESYSTEMS is not set +CONFIG_PRINTK_TIME=y +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_SECTION_MISMATCH_WARN_ONLY is not set +CONFIG_PANIC_ON_OOPS=y +# CONFIG_SCHED_DEBUG is not set +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_CRYPTO_ECHAINIV is not set +CONFIG_CRYPTO_ANSI_CPRNG=y +# CONFIG_CRYPTO_HW is not set +CONFIG_CRC16=y diff --git a/arch/m68k/configs/amiga_defconfig b/arch/m68k/configs/amiga_defconfig index 55be7e3ff109..b98acd15ca22 100644 --- a/arch/m68k/configs/amiga_defconfig +++ b/arch/m68k/configs/amiga_defconfig @@ -95,9 +95,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -105,8 +106,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -366,6 +369,7 @@ CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_A2065=y CONFIG_ARIADNE=y # CONFIG_NET_VENDOR_ARC is not set diff --git a/arch/m68k/configs/apollo_defconfig 
b/arch/m68k/configs/apollo_defconfig index 365dda66b0e6..f80dc57e6374 100644 --- a/arch/m68k/configs/apollo_defconfig +++ b/arch/m68k/configs/apollo_defconfig @@ -93,9 +93,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -103,8 +104,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -347,6 +350,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set diff --git a/arch/m68k/configs/atari_defconfig b/arch/m68k/configs/atari_defconfig index ce3cbfd16fcd..4e16b1821fbb 100644 --- a/arch/m68k/configs/atari_defconfig +++ b/arch/m68k/configs/atari_defconfig @@ -93,9 +93,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -103,8 +104,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -356,6 +359,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_ATARILANCE=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/configs/bvme6000_defconfig b/arch/m68k/configs/bvme6000_defconfig index 
8db496a9797d..2767bbf5ad61 100644 --- a/arch/m68k/configs/bvme6000_defconfig +++ b/arch/m68k/configs/bvme6000_defconfig @@ -91,9 +91,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -101,8 +102,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -346,6 +349,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set diff --git a/arch/m68k/configs/hp300_defconfig b/arch/m68k/configs/hp300_defconfig index 8314156f7149..d13ba309265e 100644 --- a/arch/m68k/configs/hp300_defconfig +++ b/arch/m68k/configs/hp300_defconfig @@ -93,9 +93,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -103,8 +104,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -347,6 +350,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_HPLANCE=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/configs/mac_defconfig b/arch/m68k/configs/mac_defconfig index 6600270b9622..78b5101c1aa6 100644 --- 
a/arch/m68k/configs/mac_defconfig +++ b/arch/m68k/configs/mac_defconfig @@ -92,9 +92,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -102,8 +103,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -363,6 +366,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_MACMACE=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/configs/multi_defconfig b/arch/m68k/configs/multi_defconfig index 90abfe9eabba..38e5bcbd0d62 100644 --- a/arch/m68k/configs/multi_defconfig +++ b/arch/m68k/configs/multi_defconfig @@ -102,9 +102,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -112,8 +113,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -397,6 +400,7 @@ CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_A2065=y CONFIG_ARIADNE=y CONFIG_ATARILANCE=y diff --git a/arch/m68k/configs/mvme147_defconfig b/arch/m68k/configs/mvme147_defconfig index 0d502c2f73d5..28687192b68e 100644 --- a/arch/m68k/configs/mvme147_defconfig +++ b/arch/m68k/configs/mvme147_defconfig @@ -90,9 
+90,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -100,8 +101,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -345,6 +348,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_MVME147_NET=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/configs/mvme16x_defconfig b/arch/m68k/configs/mvme16x_defconfig index 5930e91fc710..5a5f109ab3cd 100644 --- a/arch/m68k/configs/mvme16x_defconfig +++ b/arch/m68k/configs/mvme16x_defconfig @@ -91,9 +91,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -101,8 +102,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -346,6 +349,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set # CONFIG_NET_VENDOR_BROADCOM is not set diff --git a/arch/m68k/configs/q40_defconfig b/arch/m68k/configs/q40_defconfig index 74e3ad82eca9..e557c9de3fbc 100644 --- a/arch/m68k/configs/q40_defconfig +++ b/arch/m68k/configs/q40_defconfig @@ -91,9 +91,10 @@ CONFIG_NF_TABLES_INET=m 
CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -101,8 +102,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -353,6 +356,7 @@ CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m # CONFIG_NET_VENDOR_3COM is not set +# CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_AMD is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/configs/sun3_defconfig b/arch/m68k/configs/sun3_defconfig index 4ba8606a4e69..c6a748a36daf 100644 --- a/arch/m68k/configs/sun3_defconfig +++ b/arch/m68k/configs/sun3_defconfig @@ -88,9 +88,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -98,8 +99,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -343,6 +346,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_SUN3LANCE=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/configs/sun3x_defconfig b/arch/m68k/configs/sun3x_defconfig index c6f49726a6c9..10d60857b9a6 100644 --- a/arch/m68k/configs/sun3x_defconfig +++ b/arch/m68k/configs/sun3x_defconfig @@ -88,9 +88,10 @@ CONFIG_NF_TABLES_INET=m CONFIG_NF_TABLES_NETDEV=m CONFIG_NFT_EXTHDR=m 
CONFIG_NFT_META=m +CONFIG_NFT_NUMGEN=m CONFIG_NFT_CT=m -CONFIG_NFT_RBTREE=m -CONFIG_NFT_HASH=m +CONFIG_NFT_SET_RBTREE=m +CONFIG_NFT_SET_HASH=m CONFIG_NFT_COUNTER=m CONFIG_NFT_LOG=m CONFIG_NFT_LIMIT=m @@ -98,8 +99,10 @@ CONFIG_NFT_MASQ=m CONFIG_NFT_REDIR=m CONFIG_NFT_NAT=m CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m CONFIG_NFT_REJECT=m CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m CONFIG_NFT_DUP_NETDEV=m CONFIG_NFT_FWD_NETDEV=m CONFIG_NETFILTER_XT_SET=m @@ -343,6 +346,7 @@ CONFIG_MACSEC=m CONFIG_NETCONSOLE=m CONFIG_NETCONSOLE_DYNAMIC=y CONFIG_VETH=m +# CONFIG_NET_VENDOR_AMAZON is not set CONFIG_SUN3LANCE=y # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_CADENCE is not set diff --git a/arch/m68k/emu/nfeth.c b/arch/m68k/emu/nfeth.c index a0985fd088d1..fc4be028c418 100644 --- a/arch/m68k/emu/nfeth.c +++ b/arch/m68k/emu/nfeth.c @@ -184,7 +184,6 @@ static const struct net_device_ops nfeth_netdev_ops = { .ndo_start_xmit = nfeth_xmit, .ndo_tx_timeout = nfeth_tx_timeout, .ndo_validate_addr = eth_validate_addr, - .ndo_change_mtu = eth_change_mtu, .ndo_set_mac_address = eth_mac_addr, }; diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index eb85bd9c6180..1f2e5d31cb24 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += mman.h -generic-y += mutex.h generic-y += percpu.h generic-y += preempt.h generic-y += resource.h diff --git a/arch/m68k/include/asm/delay.h b/arch/m68k/include/asm/delay.h index d28fa8fe26fe..c598d847d56b 100644 --- a/arch/m68k/include/asm/delay.h +++ b/arch/m68k/include/asm/delay.h @@ -114,6 +114,6 @@ static inline void __udelay(unsigned long usecs) */ #define HZSCALE (268435456 / (1000000 / HZ)) -#define ndelay(n) __delay(DIV_ROUND_UP((n) * ((((HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6), 1000)); +#define ndelay(n) __delay(DIV_ROUND_UP((n) * ((((HZSCALE) >> 11) * (loops_per_jiffy >> 11)) >> 6), 1000)) #endif 
/* defined(_M68K_DELAY_H) */ diff --git a/arch/m68k/include/asm/m5206sim.h b/arch/m68k/include/asm/m5206sim.h index 4cf864f5ea7a..0ddf3efbcae9 100644 --- a/arch/m68k/include/asm/m5206sim.h +++ b/arch/m68k/include/asm/m5206sim.h @@ -110,6 +110,7 @@ /* * Define system peripheral IRQ usage. */ +#define MCF_IRQ_I2C0 29 /* I2C, Level 5 */ #define MCF_IRQ_TIMER 30 /* Timer0, Level 6 */ #define MCF_IRQ_PROFILER 31 /* Timer1, Level 7 */ #define MCF_IRQ_UART0 73 /* UART0 */ @@ -138,6 +139,7 @@ #define MCFSIM_SWDICR MCFSIM_ICR8 /* Watchdog timer ICR */ #define MCFSIM_TIMER1ICR MCFSIM_ICR9 /* Timer 1 ICR */ #define MCFSIM_TIMER2ICR MCFSIM_ICR10 /* Timer 2 ICR */ +#define MCFSIM_I2CICR MCFSIM_ICR11 /* I2C ICR */ #define MCFSIM_UART1ICR MCFSIM_ICR12 /* UART 1 ICR */ #define MCFSIM_UART2ICR MCFSIM_ICR13 /* UART 2 ICR */ #ifdef CONFIG_M5206e @@ -145,5 +147,11 @@ #define MCFSIM_DMA2ICR MCFSIM_ICR15 /* DMA 2 ICR */ #endif +/* + * I2C Controller +*/ +#define MCFI2C_BASE0 (MCF_MBAR + 0x1e0) +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m5206sim_h */ diff --git a/arch/m68k/include/asm/m520xsim.h b/arch/m68k/include/asm/m520xsim.h index db3f8ee4a6c6..6d50cefa76c3 100644 --- a/arch/m68k/include/asm/m520xsim.h +++ b/arch/m68k/include/asm/m520xsim.h @@ -50,6 +50,7 @@ #define MCFINT_UART0 26 /* Interrupt number for UART0 */ #define MCFINT_UART1 27 /* Interrupt number for UART1 */ #define MCFINT_UART2 28 /* Interrupt number for UART2 */ +#define MCFINT_I2C0 30 /* Interrupt number for I2C */ #define MCFINT_QSPI 31 /* Interrupt number for QSPI */ #define MCFINT_FECRX0 36 /* Interrupt number for FEC RX */ #define MCFINT_FECTX0 40 /* Interrupt number for FEC RX */ @@ -67,6 +68,7 @@ #define MCF_IRQ_QSPI (MCFINT_VECBASE + MCFINT_QSPI) #define MCF_IRQ_PIT1 (MCFINT_VECBASE + MCFINT_PIT1) +#define MCF_IRQ_I2C0 (MCFINT_VECBASE + MCFINT_I2C0) /* * SDRAM configuration registers. 
*/ @@ -200,5 +202,11 @@ #define MCFPM_PPMLR0 0xfc040034 #define MCFPM_LPCR 0xfc0a0007 +/* + * I2C module. + */ +#define MCFI2C_BASE0 0xFC058000 +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m520xsim_h */ diff --git a/arch/m68k/include/asm/m523xsim.h b/arch/m68k/include/asm/m523xsim.h index 5e06b4eb57f3..d43f6ab1edc9 100644 --- a/arch/m68k/include/asm/m523xsim.h +++ b/arch/m68k/include/asm/m523xsim.h @@ -37,7 +37,8 @@ #define MCFINT_UART0 13 /* Interrupt number for UART0 */ #define MCFINT_UART1 14 /* Interrupt number for UART1 */ #define MCFINT_UART2 15 /* Interrupt number for UART2 */ -#define MCFINT_QSPI 18 /* Interrupt number for QSPI */ +#define MCFINT_I2C0 17 /* Interrupt number for I2C */ +#define MCFINT_QSPI 18 /* Interrupt number for QSPI */ #define MCFINT_FECRX0 23 /* Interrupt number for FEC */ #define MCFINT_FECTX0 27 /* Interrupt number for FEC */ #define MCFINT_FECENTC0 29 /* Interrupt number for FEC */ @@ -53,6 +54,7 @@ #define MCF_IRQ_QSPI (MCFINT_VECBASE + MCFINT_QSPI) #define MCF_IRQ_PIT1 (MCFINT_VECBASE + MCFINT_PIT1) +#define MCF_IRQ_I2C0 (MCFINT_VECBASE + MCFINT_I2C0) /* * SDRAM configuration registers. @@ -208,5 +210,11 @@ #define MCFDMA_BASE2 (MCF_IPSBAR + 0x180) #define MCFDMA_BASE3 (MCF_IPSBAR + 0x1C0) +/* + * I2C module. 
+ */ +#define MCFI2C_BASE0 (MCF_IPSBAR + 0x300) +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m523xsim_h */ diff --git a/arch/m68k/include/asm/m527xsim.h b/arch/m68k/include/asm/m527xsim.h index 2c648a043f24..35f6fbc89b92 100644 --- a/arch/m68k/include/asm/m527xsim.h +++ b/arch/m68k/include/asm/m527xsim.h @@ -37,6 +37,7 @@ #define MCFINT_UART0 13 /* Interrupt number for UART0 */ #define MCFINT_UART1 14 /* Interrupt number for UART1 */ #define MCFINT_UART2 15 /* Interrupt number for UART2 */ +#define MCFINT_I2C0 17 /* Interrupt number for I2C */ #define MCFINT_QSPI 18 /* Interrupt number for QSPI */ #define MCFINT_FECRX0 23 /* Interrupt number for FEC0 */ #define MCFINT_FECTX0 27 /* Interrupt number for FEC0 */ @@ -61,6 +62,7 @@ #define MCF_IRQ_QSPI (MCFINT_VECBASE + MCFINT_QSPI) #define MCF_IRQ_PIT1 (MCFINT_VECBASE + MCFINT_PIT1) +#define MCF_IRQ_I2C0 (MCFINT_VECBASE + MCFINT_I2C0) /* * SDRAM configuration registers. @@ -353,5 +355,11 @@ #define MCF_RCR_SWRESET 0x80 /* Software reset bit */ #define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ +/* + * I2C module. 
+ */ +#define MCFI2C_BASE0 (MCF_IPSBAR + 0x300) +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m527xsim_h */ diff --git a/arch/m68k/include/asm/m528xsim.h b/arch/m68k/include/asm/m528xsim.h index cf68ca0ac3a5..67f6182d10a4 100644 --- a/arch/m68k/include/asm/m528xsim.h +++ b/arch/m68k/include/asm/m528xsim.h @@ -37,6 +37,7 @@ #define MCFINT_UART0 13 /* Interrupt number for UART0 */ #define MCFINT_UART1 14 /* Interrupt number for UART1 */ #define MCFINT_UART2 15 /* Interrupt number for UART2 */ +#define MCFINT_I2C0 17 /* Interrupt number for I2C */ #define MCFINT_QSPI 18 /* Interrupt number for QSPI */ #define MCFINT_FECRX0 23 /* Interrupt number for FEC */ #define MCFINT_FECTX0 27 /* Interrupt number for FEC */ @@ -53,6 +54,8 @@ #define MCF_IRQ_QSPI (MCFINT_VECBASE + MCFINT_QSPI) #define MCF_IRQ_PIT1 (MCFINT_VECBASE + MCFINT_PIT1) +#define MCF_IRQ_I2C0 (MCFINT_VECBASE + MCFINT_I2C0) + /* * SDRAM configuration registers. 
*/ @@ -242,5 +245,11 @@ #define MCF_RCR_SWRESET 0x80 /* Software reset bit */ #define MCF_RCR_FRCSTOUT 0x40 /* Force external reset */ +/* + * I2C module + */ +#define MCFI2C_BASE0 (MCF_IPSBAR + 0x300) +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m528xsim_h */ diff --git a/arch/m68k/include/asm/m5307sim.h b/arch/m68k/include/asm/m5307sim.h index 5d0bb7ec31f8..d2595e04eb1d 100644 --- a/arch/m68k/include/asm/m5307sim.h +++ b/arch/m68k/include/asm/m5307sim.h @@ -148,6 +148,7 @@ #define MCFSIM_SWDICR MCFSIM_ICR0 /* Watchdog timer ICR */ #define MCFSIM_TIMER1ICR MCFSIM_ICR1 /* Timer 1 ICR */ #define MCFSIM_TIMER2ICR MCFSIM_ICR2 /* Timer 2 ICR */ +#define MCFSIM_I2CICR MCFSIM_ICR3 /* I2C ICR */ #define MCFSIM_UART1ICR MCFSIM_ICR4 /* UART 1 ICR */ #define MCFSIM_UART2ICR MCFSIM_ICR5 /* UART 2 ICR */ #define MCFSIM_DMA0ICR MCFSIM_ICR6 /* DMA 0 ICR */ @@ -155,7 +156,6 @@ #define MCFSIM_DMA2ICR MCFSIM_ICR8 /* DMA 2 ICR */ #define MCFSIM_DMA3ICR MCFSIM_ICR9 /* DMA 3 ICR */ - /* * Some symbol defines for the Parallel Port Pin Assignment Register */ @@ -174,10 +174,17 @@ /* * Define system peripheral IRQ usage. 
*/ +#define MCF_IRQ_I2C0 29 /* I2C, Level 5 */ #define MCF_IRQ_TIMER 30 /* Timer0, Level 6 */ #define MCF_IRQ_PROFILER 31 /* Timer1, Level 7 */ #define MCF_IRQ_UART0 73 /* UART0 */ #define MCF_IRQ_UART1 74 /* UART1 */ +/* + * I2C module + */ +#define MCFI2C_BASE0 (MCF_MBAR + 0x280) +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m5307sim_h */ diff --git a/arch/m68k/include/asm/m53xxsim.h b/arch/m68k/include/asm/m53xxsim.h index faa1a2133bfd..53329ae4d3e3 100644 --- a/arch/m68k/include/asm/m53xxsim.h +++ b/arch/m68k/include/asm/m53xxsim.h @@ -19,6 +19,7 @@ #define MCFINT_UART0 26 /* Interrupt number for UART0 */ #define MCFINT_UART1 27 /* Interrupt number for UART1 */ #define MCFINT_UART2 28 /* Interrupt number for UART2 */ +#define MCFINT_I2C0 30 /* Interrupt number for I2C */ #define MCFINT_QSPI 31 /* Interrupt number for QSPI */ #define MCFINT_FECRX0 36 /* Interrupt number for FEC */ #define MCFINT_FECTX0 40 /* Interrupt number for FEC */ @@ -32,6 +33,7 @@ #define MCF_IRQ_FECTX0 (MCFINT_VECBASE + MCFINT_FECTX0) #define MCF_IRQ_FECENTC0 (MCFINT_VECBASE + MCFINT_FECENTC0) +#define MCF_IRQ_I2C0 (MCFINT_VECBASE + MCFINT_I2C0) #define MCF_IRQ_QSPI (MCFINT_VECBASE + MCFINT_QSPI) #define MCF_WTM_WCR 0xFC098000 @@ -1237,5 +1239,11 @@ #define MCFEPORT_EPPDR (0xFC094005) #define MCFEPORT_EPFR (0xFC094006) +/* + * I2C Module + */ +#define MCFI2C_BASE0 (0xFc058000) +#define MCFI2C_SIZE0 0x40 + /********************************************************************/ #endif /* m53xxsim_h */ diff --git a/arch/m68k/include/asm/m5407sim.h b/arch/m68k/include/asm/m5407sim.h index a7550bc5cd1e..ab40c16ba989 100644 --- a/arch/m68k/include/asm/m5407sim.h +++ b/arch/m68k/include/asm/m5407sim.h @@ -112,6 +112,7 @@ #define MCFSIM_SWDICR MCFSIM_ICR0 /* Watchdog timer ICR */ #define MCFSIM_TIMER1ICR MCFSIM_ICR1 /* Timer 1 ICR */ #define MCFSIM_TIMER2ICR MCFSIM_ICR2 /* Timer 2 ICR */ +#define MCFSIM_I2CICR MCFSIM_ICR3 /* 
I2C ICR */ #define MCFSIM_UART1ICR MCFSIM_ICR4 /* UART 1 ICR */ #define MCFSIM_UART2ICR MCFSIM_ICR5 /* UART 2 ICR */ #define MCFSIM_DMA0ICR MCFSIM_ICR6 /* DMA 0 ICR */ @@ -137,10 +138,17 @@ /* * Define system peripheral IRQ usage. */ +#define MCF_IRQ_I2C0 29 /* I2C, Level 5 */ #define MCF_IRQ_TIMER 30 /* Timer0, Level 6 */ #define MCF_IRQ_PROFILER 31 /* Timer1, Level 7 */ #define MCF_IRQ_UART0 73 /* UART0 */ #define MCF_IRQ_UART1 74 /* UART1 */ +/* + * I2C module + */ +#define MCFI2C_BASE0 (MCF_MBAR + 0x280) +#define MCFI2C_SIZE0 0x40 + /****************************************************************************/ #endif /* m5407sim_h */ diff --git a/arch/m68k/include/asm/m54xxsim.h b/arch/m68k/include/asm/m54xxsim.h index 73d937ff36eb..7758d0a1a84d 100644 --- a/arch/m68k/include/asm/m54xxsim.h +++ b/arch/m68k/include/asm/m54xxsim.h @@ -45,6 +45,7 @@ */ #define MCF_IRQ_TIMER (MCFINT_VECBASE + 54) /* Slice Timer 0 */ #define MCF_IRQ_PROFILER (MCFINT_VECBASE + 53) /* Slice Timer 1 */ +#define MCF_IRQ_I2C0 (MCFINT_VECBASE + 40) #define MCF_IRQ_UART0 (MCFINT_VECBASE + 35) #define MCF_IRQ_UART1 (MCFINT_VECBASE + 34) #define MCF_IRQ_UART2 (MCFINT_VECBASE + 33) @@ -107,4 +108,14 @@ #define MCF_PAR_PSC_RTS_RTS (0x30) #define MCF_PAR_PSC_CANRX (0x40) +#define MCF_PAR_FECI2CIRQ (MCF_MBAR + 0x00000a44) /* FEC/I2C/IRQ */ +#define MCF_PAR_FECI2CIRQ_SDA (1 << 3) +#define MCF_PAR_FECI2CIRQ_SCL (1 << 2) + +/* + * I2C module. 
+ */ +#define MCFI2C_BASE0 (MCF_MBAR + 0x8f00) +#define MCFI2C_SIZE0 0x40 + #endif /* m54xxsim_h */ diff --git a/arch/m68k/include/asm/processor.h b/arch/m68k/include/asm/processor.h index c84a2183b3f0..f5f790c31bf8 100644 --- a/arch/m68k/include/asm/processor.h +++ b/arch/m68k/include/asm/processor.h @@ -156,6 +156,5 @@ unsigned long get_wchan(struct task_struct *p); #define task_pt_regs(tsk) ((struct pt_regs *) ((tsk)->thread.esp0)) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #endif diff --git a/arch/metag/include/asm/Kbuild b/arch/metag/include/asm/Kbuild index 29acb89daaaa..167150c701d1 100644 --- a/arch/metag/include/asm/Kbuild +++ b/arch/metag/include/asm/Kbuild @@ -27,7 +27,6 @@ generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += msgbuf.h -generic-y += mutex.h generic-y += param.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/metag/include/asm/processor.h b/arch/metag/include/asm/processor.h index a0333ebcac35..ec6a49076980 100644 --- a/arch/metag/include/asm/processor.h +++ b/arch/metag/include/asm/processor.h @@ -152,7 +152,6 @@ unsigned long get_wchan(struct task_struct *p); #define user_stack_pointer(regs) ((regs)->ctx.AX[0].U0) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() extern void setup_priv(void); diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 86f65721e629..85885a501dce 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -27,6 +27,7 @@ config MICROBLAZE select HAVE_MEMBLOCK_NODE_MAP select HAVE_OPROFILE select IRQ_DOMAIN + select XILINX_INTC select MODULES_USE_ELF_RELA select OF select OF_EARLY_FLATTREE diff --git a/arch/microblaze/include/asm/irq.h b/arch/microblaze/include/asm/irq.h index bab3b1393ad4..d785defeeed5 100644 --- a/arch/microblaze/include/asm/irq.h +++ b/arch/microblaze/include/asm/irq.h @@ -16,6 +16,6 @@ struct pt_regs; extern void do_IRQ(struct pt_regs *regs); /* should be defined 
in each interrupt controller driver */ -extern unsigned int get_irq(void); +extern unsigned int xintc_get_irq(void); #endif /* _ASM_MICROBLAZE_IRQ_H */ diff --git a/arch/microblaze/include/asm/mutex.h b/arch/microblaze/include/asm/mutex.h deleted file mode 100644 index ff6101aa2c71..000000000000 --- a/arch/microblaze/include/asm/mutex.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/mutex-dec.h> diff --git a/arch/microblaze/include/asm/processor.h b/arch/microblaze/include/asm/processor.h index c38d0dd91134..37ef196e4519 100644 --- a/arch/microblaze/include/asm/processor.h +++ b/arch/microblaze/include/asm/processor.h @@ -22,7 +22,6 @@ extern const struct seq_operations cpuinfo_op; # define cpu_relax() barrier() -# define cpu_relax_lowlatency() cpu_relax() #define task_pt_regs(tsk) \ (((struct pt_regs *)(THREAD_SIZE + task_stack_page(tsk))) - 1) diff --git a/arch/microblaze/kernel/Makefile b/arch/microblaze/kernel/Makefile index f08bacaf8a95..e098381af928 100644 --- a/arch/microblaze/kernel/Makefile +++ b/arch/microblaze/kernel/Makefile @@ -15,7 +15,7 @@ endif extra-y := head.o vmlinux.lds obj-y += dma.o exceptions.o \ - hw_exception_handler.o intc.o irq.o \ + hw_exception_handler.o irq.o \ platform.o process.o prom.o ptrace.o \ reset.o setup.o signal.o sys_microblaze.o timer.o traps.o unwind.o diff --git a/arch/microblaze/kernel/intc.c b/arch/microblaze/kernel/intc.c deleted file mode 100644 index 90bec7d71f85..000000000000 --- a/arch/microblaze/kernel/intc.c +++ /dev/null @@ -1,196 +0,0 @@ -/* - * Copyright (C) 2007-2013 Michal Simek <monstr@monstr.eu> - * Copyright (C) 2012-2013 Xilinx, Inc. - * Copyright (C) 2007-2009 PetaLogix - * Copyright (C) 2006 Atmark Techno, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. 
- */ - -#include <linux/irqdomain.h> -#include <linux/irq.h> -#include <linux/irqchip.h> -#include <linux/of_address.h> -#include <linux/io.h> -#include <linux/bug.h> - -static void __iomem *intc_baseaddr; - -/* No one else should require these constants, so define them locally here. */ -#define ISR 0x00 /* Interrupt Status Register */ -#define IPR 0x04 /* Interrupt Pending Register */ -#define IER 0x08 /* Interrupt Enable Register */ -#define IAR 0x0c /* Interrupt Acknowledge Register */ -#define SIE 0x10 /* Set Interrupt Enable bits */ -#define CIE 0x14 /* Clear Interrupt Enable bits */ -#define IVR 0x18 /* Interrupt Vector Register */ -#define MER 0x1c /* Master Enable Register */ - -#define MER_ME (1<<0) -#define MER_HIE (1<<1) - -static unsigned int (*read_fn)(void __iomem *); -static void (*write_fn)(u32, void __iomem *); - -static void intc_write32(u32 val, void __iomem *addr) -{ - iowrite32(val, addr); -} - -static unsigned int intc_read32(void __iomem *addr) -{ - return ioread32(addr); -} - -static void intc_write32_be(u32 val, void __iomem *addr) -{ - iowrite32be(val, addr); -} - -static unsigned int intc_read32_be(void __iomem *addr) -{ - return ioread32be(addr); -} - -static void intc_enable_or_unmask(struct irq_data *d) -{ - unsigned long mask = 1 << d->hwirq; - - pr_debug("enable_or_unmask: %ld\n", d->hwirq); - - /* ack level irqs because they can't be acked during - * ack function since the handle_level_irq function - * acks the irq before calling the interrupt handler - */ - if (irqd_is_level_type(d)) - write_fn(mask, intc_baseaddr + IAR); - - write_fn(mask, intc_baseaddr + SIE); -} - -static void intc_disable_or_mask(struct irq_data *d) -{ - pr_debug("disable: %ld\n", d->hwirq); - write_fn(1 << d->hwirq, intc_baseaddr + CIE); -} - -static void intc_ack(struct irq_data *d) -{ - pr_debug("ack: %ld\n", d->hwirq); - write_fn(1 << d->hwirq, intc_baseaddr + IAR); -} - -static void intc_mask_ack(struct irq_data *d) -{ - unsigned long mask = 1 << d->hwirq; 
- - pr_debug("disable_and_ack: %ld\n", d->hwirq); - write_fn(mask, intc_baseaddr + CIE); - write_fn(mask, intc_baseaddr + IAR); -} - -static struct irq_chip intc_dev = { - .name = "Xilinx INTC", - .irq_unmask = intc_enable_or_unmask, - .irq_mask = intc_disable_or_mask, - .irq_ack = intc_ack, - .irq_mask_ack = intc_mask_ack, -}; - -static struct irq_domain *root_domain; - -unsigned int get_irq(void) -{ - unsigned int hwirq, irq = -1; - - hwirq = read_fn(intc_baseaddr + IVR); - if (hwirq != -1U) - irq = irq_find_mapping(root_domain, hwirq); - - pr_debug("get_irq: hwirq=%d, irq=%d\n", hwirq, irq); - - return irq; -} - -static int xintc_map(struct irq_domain *d, unsigned int irq, irq_hw_number_t hw) -{ - u32 intr_mask = (u32)d->host_data; - - if (intr_mask & (1 << hw)) { - irq_set_chip_and_handler_name(irq, &intc_dev, - handle_edge_irq, "edge"); - irq_clear_status_flags(irq, IRQ_LEVEL); - } else { - irq_set_chip_and_handler_name(irq, &intc_dev, - handle_level_irq, "level"); - irq_set_status_flags(irq, IRQ_LEVEL); - } - return 0; -} - -static const struct irq_domain_ops xintc_irq_domain_ops = { - .xlate = irq_domain_xlate_onetwocell, - .map = xintc_map, -}; - -static int __init xilinx_intc_of_init(struct device_node *intc, - struct device_node *parent) -{ - u32 nr_irq, intr_mask; - int ret; - - intc_baseaddr = of_iomap(intc, 0); - BUG_ON(!intc_baseaddr); - - ret = of_property_read_u32(intc, "xlnx,num-intr-inputs", &nr_irq); - if (ret < 0) { - pr_err("%s: unable to read xlnx,num-intr-inputs\n", __func__); - return ret; - } - - ret = of_property_read_u32(intc, "xlnx,kind-of-intr", &intr_mask); - if (ret < 0) { - pr_err("%s: unable to read xlnx,kind-of-intr\n", __func__); - return ret; - } - - if (intr_mask >> nr_irq) - pr_warn("%s: mismatch in kind-of-intr param\n", __func__); - - pr_info("%s: num_irq=%d, edge=0x%x\n", - intc->full_name, nr_irq, intr_mask); - - write_fn = intc_write32; - read_fn = intc_read32; - - /* - * Disable all external interrupts until they are - * 
explicity requested. - */ - write_fn(0, intc_baseaddr + IER); - - /* Acknowledge any pending interrupts just in case. */ - write_fn(0xffffffff, intc_baseaddr + IAR); - - /* Turn on the Master Enable. */ - write_fn(MER_HIE | MER_ME, intc_baseaddr + MER); - if (!(read_fn(intc_baseaddr + MER) & (MER_HIE | MER_ME))) { - write_fn = intc_write32_be; - read_fn = intc_read32_be; - write_fn(MER_HIE | MER_ME, intc_baseaddr + MER); - } - - /* Yeah, okay, casting the intr_mask to a void* is butt-ugly, but I'm - * lazy and Michal can clean it up to something nicer when he tests - * and commits this patch. ~~gcl */ - root_domain = irq_domain_add_linear(intc, nr_irq, &xintc_irq_domain_ops, - (void *)intr_mask); - - irq_set_default_host(root_domain); - - return 0; -} - -IRQCHIP_DECLARE(xilinx_intc, "xlnx,xps-intc-1.00.a", xilinx_intc_of_init); diff --git a/arch/microblaze/kernel/irq.c b/arch/microblaze/kernel/irq.c index 11e24de91aa4..903dad822fad 100644 --- a/arch/microblaze/kernel/irq.c +++ b/arch/microblaze/kernel/irq.c @@ -29,12 +29,12 @@ void __irq_entry do_IRQ(struct pt_regs *regs) trace_hardirqs_off(); irq_enter(); - irq = get_irq(); + irq = xintc_get_irq(); next_irq: BUG_ON(!irq); generic_handle_irq(irq); - irq = get_irq(); + irq = xintc_get_irq(); if (irq != -1U) { pr_debug("next irq: %d\n", irq); ++concurrent_irq; diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 9740066cc631..3269b742a75e 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -9,7 +9,6 @@ generic-y += irq_work.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += mutex.h generic-y += parport.h generic-y += percpu.h generic-y += preempt.h diff --git a/arch/mips/include/asm/processor.h b/arch/mips/include/asm/processor.h index 0d36c87acbe2..95b8c471f572 100644 --- a/arch/mips/include/asm/processor.h +++ b/arch/mips/include/asm/processor.h @@ -389,7 +389,6 @@ unsigned long get_wchan(struct task_struct *p); 
#define KSTK_STATUS(tsk) (task_pt_regs(tsk)->cp0_status) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() /* * Return_address is a replacement for __builtin_return_address(count) diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h index 2027240aafbb..566ecdcb5b4b 100644 --- a/arch/mips/include/uapi/asm/socket.h +++ b/arch/mips/include/uapi/asm/socket.h @@ -108,4 +108,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/mips/kernel/time.c b/arch/mips/kernel/time.c index 8d0170969e22..a7f81261c781 100644 --- a/arch/mips/kernel/time.c +++ b/arch/mips/kernel/time.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(rtc_lock); int __weak rtc_mips_set_time(unsigned long sec) { - return 0; + return -ENODEV; } int __weak rtc_mips_set_mmss(unsigned long nowtime) diff --git a/arch/mips/lantiq/falcon/sysctrl.c b/arch/mips/lantiq/falcon/sysctrl.c index 2a1b3021589c..82bbd0e2e298 100644 --- a/arch/mips/lantiq/falcon/sysctrl.c +++ b/arch/mips/lantiq/falcon/sysctrl.c @@ -24,7 +24,7 @@ /* GPE frequency selection */ #define GPPC_OFFSET 24 -#define GPEFREQ_MASK 0x00000C0 +#define GPEFREQ_MASK 0x0000C00 #define GPEFREQ_OFFSET 10 /* Clock status register */ #define SYSCTL_CLKS 0x0000 diff --git a/arch/mn10300/include/asm/mutex.h b/arch/mn10300/include/asm/mutex.h deleted file mode 100644 index 84f5490c6fb4..000000000000 --- a/arch/mn10300/include/asm/mutex.h +++ /dev/null @@ -1,16 +0,0 @@ -/* MN10300 Mutex fastpath - * - * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved. - * Written by David Howells (dhowells@redhat.com) - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public Licence - * as published by the Free Software Foundation; either version - * 2 of the Licence, or (at your option) any later version. 
- * - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ -#include <asm-generic/mutex-null.h> diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h index b10ba121c849..18e17abf7664 100644 --- a/arch/mn10300/include/asm/processor.h +++ b/arch/mn10300/include/asm/processor.h @@ -69,7 +69,6 @@ extern void print_cpu_info(struct mn10300_cpuinfo *); extern void dodgy_tsc(void); #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() /* * User space process size: 1.75GB (default). diff --git a/arch/mn10300/include/uapi/asm/socket.h b/arch/mn10300/include/uapi/asm/socket.h index 5129f23a9ee1..0e12527c4b0e 100644 --- a/arch/mn10300/include/uapi/asm/socket.h +++ b/arch/mn10300/include/uapi/asm/socket.h @@ -90,4 +90,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/mn10300/unit-asb2303/include/unit/smc91111.h b/arch/mn10300/unit-asb2303/include/unit/smc91111.h index dd456e9c513f..dd4e2946438e 100644 --- a/arch/mn10300/unit-asb2303/include/unit/smc91111.h +++ b/arch/mn10300/unit-asb2303/include/unit/smc91111.h @@ -30,7 +30,7 @@ #if SMC_CAN_USE_16BIT #define SMC_inw(a, r) inw((unsigned long) ((a) + (r))) -#define SMC_outw(v, a, r) outw(v, (unsigned long) ((a) + (r))) +#define SMC_outw(lp, v, a, r) outw(v, (unsigned long) ((a) + (r))) #define SMC_insw(a, r, p, l) insw((unsigned long) ((a) + (r)), (p), (l)) #define SMC_outsw(a, r, p, l) outsw((unsigned long) ((a) + (r)), (p), (l)) #endif diff --git a/arch/nios2/include/asm/mutex.h b/arch/nios2/include/asm/mutex.h deleted file mode 100644 index ff6101aa2c71..000000000000 --- a/arch/nios2/include/asm/mutex.h +++ /dev/null @@ -1 +0,0 @@ -#include <asm-generic/mutex-dec.h> diff --git a/arch/nios2/include/asm/processor.h b/arch/nios2/include/asm/processor.h 
index 1c953f0cadbf..3bbbc3d798e5 100644 --- a/arch/nios2/include/asm/processor.h +++ b/arch/nios2/include/asm/processor.h @@ -88,7 +88,6 @@ extern unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk)->thread.kregs->sp) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #endif /* __ASSEMBLY__ */ diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig index 489e7f909286..8d22015fde3e 100644 --- a/arch/openrisc/Kconfig +++ b/arch/openrisc/Kconfig @@ -26,6 +26,7 @@ config OPENRISC select HAVE_DEBUG_STACKOVERFLOW select OR1K_PIC select CPU_NO_EFFICIENT_FFS if !OPENRISC_HAVE_INST_FF1 + select NO_BOOTMEM config MMU def_bool y @@ -98,6 +99,9 @@ config OPENRISC_HAVE_INST_DIV Select this if your implementation has a hardware divide instruction endmenu +config NR_CPUS + int + default "1" source kernel/Kconfig.hz source kernel/Kconfig.preempt diff --git a/arch/openrisc/README.openrisc b/arch/openrisc/README.openrisc index c9f7edf2b9a2..072069ab5100 100644 --- a/arch/openrisc/README.openrisc +++ b/arch/openrisc/README.openrisc @@ -6,7 +6,7 @@ target architecture, specifically, is the 32-bit OpenRISC 1000 family (or1k). For information about OpenRISC processors and ongoing development: - website http://openrisc.net + website http://openrisc.io For more information about Linux on OpenRISC, please contact South Pole AB. @@ -24,17 +24,17 @@ In order to build and run Linux for OpenRISC, you'll need at least a basic toolchain and, perhaps, the architectural simulator. Steps to get these bits in place are outlined here. -1) The toolchain can be obtained from openrisc.net. Instructions for building +1) The toolchain can be obtained from openrisc.io. 
Instructions for building a toolchain can be found at: -http://openrisc.net/toolchain-build.html +https://github.com/openrisc/tutorials 2) or1ksim (optional) or1ksim is the architectural simulator which will allow you to actually run your OpenRISC Linux kernel if you don't have an OpenRISC processor at hand. - git clone git://openrisc.net/jonas/or1ksim-svn + git clone https://github.com/openrisc/or1ksim.git cd or1ksim ./configure --prefix=$OPENRISC_PREFIX diff --git a/arch/openrisc/TODO.openrisc b/arch/openrisc/TODO.openrisc index acfeef9c58e3..0eb04c8240f9 100644 --- a/arch/openrisc/TODO.openrisc +++ b/arch/openrisc/TODO.openrisc @@ -5,9 +5,6 @@ that are due for investigation shortly, i.e. our TODO list: -- Implement the rest of the DMA API... dma_map_sg, etc. --- Consolidate usage of memblock and bootmem... move everything over to - memblock. - -- Finish the renaming cleanup... there are references to or32 in the code which was an older name for the architecture. The name we've settled on is or1k and this change is slowly trickling through the stack. For the time diff --git a/arch/openrisc/include/asm/mutex.h b/arch/openrisc/include/asm/mutex.h deleted file mode 100644 index b85a0cfa9fc9..000000000000 --- a/arch/openrisc/include/asm/mutex.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * OpenRISC Linux - * - * Linux architectural port borrowing liberally from similar works of - * others. All original copyrights apply as per the original source - * declaration. - * - * OpenRISC implementation: - * Copyright (C) 2003 Matjaz Breskvar <phoenix@bsemi.com> - * Copyright (C) 2010-2011 Jonas Bonn <jonas@southpole.se> - * et al. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - */ - -/* - * Pull in the generic implementation for the mutex fastpath. 
- * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/openrisc/include/asm/pgalloc.h b/arch/openrisc/include/asm/pgalloc.h index 87eebd185089..3e1a46615120 100644 --- a/arch/openrisc/include/asm/pgalloc.h +++ b/arch/openrisc/include/asm/pgalloc.h @@ -23,7 +23,6 @@ #include <linux/threads.h> #include <linux/mm.h> #include <linux/memblock.h> -#include <linux/bootmem.h> extern int mem_init_done; diff --git a/arch/openrisc/include/asm/pgtable.h b/arch/openrisc/include/asm/pgtable.h index 69c7df0e1420..3567aa7be555 100644 --- a/arch/openrisc/include/asm/pgtable.h +++ b/arch/openrisc/include/asm/pgtable.h @@ -69,7 +69,7 @@ extern void paging_init(void); */ #define PTRS_PER_PTE (1UL << (PAGE_SHIFT-2)) -#define PTRS_PER_PGD (1UL << (PAGE_SHIFT-2)) +#define PTRS_PER_PGD (1UL << (32-PGDIR_SHIFT)) /* calculate how many PGD entries a user-level program can use * the first mappable virtual address is 0 diff --git a/arch/openrisc/include/asm/processor.h b/arch/openrisc/include/asm/processor.h index 70334c9f7d24..a908e6c30a00 100644 --- a/arch/openrisc/include/asm/processor.h +++ b/arch/openrisc/include/asm/processor.h @@ -92,7 +92,6 @@ extern unsigned long thread_saved_pc(struct task_struct *t); #define init_stack (init_thread_union.stack) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #endif /* __ASSEMBLY__ */ #endif /* __ASM_OPENRISC_PROCESSOR_H */ diff --git a/arch/openrisc/kernel/entry.S b/arch/openrisc/kernel/entry.S index fec8bf97d806..aac0bde3330c 100644 --- a/arch/openrisc/kernel/entry.S +++ b/arch/openrisc/kernel/entry.S @@ -264,7 +264,7 @@ EXCEPTION_ENTRY(_data_page_fault_handler) l.srli r6,r6,26 // check opcode for write access #endif - l.sfgeui r6,0x34 // check opcode for write access + l.sfgeui r6,0x33 // check opcode for write 
access l.bnf 1f l.sfleui r6,0x37 l.bnf 1f @@ -1101,8 +1101,16 @@ ENTRY(__sys_fork) l.addi r3,r1,0 ENTRY(sys_rt_sigreturn) - l.j _sys_rt_sigreturn + l.jal _sys_rt_sigreturn l.addi r3,r1,0 + l.sfne r30,r0 + l.bnf _no_syscall_trace + l.nop + l.jal do_syscall_trace_leave + l.addi r3,r1,0 +_no_syscall_trace: + l.j _resume_userspace + l.nop /* This is a catch-all syscall for atomic instructions for the OpenRISC 1000. * The functions takes a variable number of parameters depending on which diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 7095dfe7666b..277123bb4bf8 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -173,6 +173,19 @@ copy_thread(unsigned long clone_flags, unsigned long usp, if (usp) userregs->sp = usp; + + /* + * For CLONE_SETTLS set "tp" (r10) to the TLS pointer passed to sys_clone. + * + * The kernel entry is: + * int clone (long flags, void *child_stack, int *parent_tid, + * int *child_tid, struct void *tls) + * + * This makes the source r7 in the kernel registers. 
+ */ + if (clone_flags & CLONE_SETTLS) + userregs->gpr[10] = userregs->gpr[7]; + userregs->gpr[11] = 0; /* Result from fork() */ kregs->gpr[20] = 0; /* Userspace thread */ diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index b4ed8b36e078..cb797a3beb47 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -38,7 +38,6 @@ #include <linux/of.h> #include <linux/memblock.h> #include <linux/device.h> -#include <linux/of_platform.h> #include <asm/sections.h> #include <asm/segment.h> @@ -51,18 +50,16 @@ #include "vmlinux.h" -static unsigned long __init setup_memory(void) +static void __init setup_memory(void) { - unsigned long bootmap_size; unsigned long ram_start_pfn; - unsigned long free_ram_start_pfn; unsigned long ram_end_pfn; phys_addr_t memory_start, memory_end; struct memblock_region *region; memory_end = memory_start = 0; - /* Find main memory where is the kernel */ + /* Find main memory where is the kernel, we assume its the only one */ for_each_memblock(memory, region) { memory_start = region->base; memory_end = region->base + region->size; @@ -75,10 +72,11 @@ static unsigned long __init setup_memory(void) } ram_start_pfn = PFN_UP(memory_start); - /* free_ram_start_pfn is first page after kernel */ - free_ram_start_pfn = PFN_UP(__pa(_end)); ram_end_pfn = PFN_DOWN(memblock_end_of_DRAM()); + /* setup bootmem globals (we use no_bootmem, but mm still depends on this) */ + min_low_pfn = ram_start_pfn; + max_low_pfn = ram_end_pfn; max_pfn = ram_end_pfn; /* @@ -86,22 +84,13 @@ static unsigned long __init setup_memory(void) * * This makes the memory from the end of the kernel to the end of * RAM usable. - * init_bootmem sets the global values min_low_pfn, max_low_pfn. 
*/ - bootmap_size = init_bootmem(free_ram_start_pfn, - ram_end_pfn - ram_start_pfn); - free_bootmem(PFN_PHYS(free_ram_start_pfn), - (ram_end_pfn - free_ram_start_pfn) << PAGE_SHIFT); - reserve_bootmem(PFN_PHYS(free_ram_start_pfn), bootmap_size, - BOOTMEM_DEFAULT); - - for_each_memblock(reserved, region) { - printk(KERN_INFO "Reserved - 0x%08x-0x%08x\n", - (u32) region->base, (u32) region->size); - reserve_bootmem(region->base, region->size, BOOTMEM_DEFAULT); - } + memblock_reserve(__pa(_stext), _end - _stext); + + early_init_fdt_reserve_self(); + early_init_fdt_scan_reserved_mem(); - return ram_end_pfn; + memblock_dump_all(); } struct cpuinfo cpuinfo; @@ -219,15 +208,6 @@ void __init or32_early_setup(void *fdt) early_init_devtree(fdt); } -static int __init openrisc_device_probe(void) -{ - of_platform_populate(NULL, NULL, NULL, NULL); - - return 0; -} - -device_initcall(openrisc_device_probe); - static inline unsigned long extract_value_bits(unsigned long reg, short bit_nr, short width) { @@ -282,8 +262,6 @@ void calibrate_delay(void) void __init setup_arch(char **cmdline_p) { - unsigned long max_low_pfn; - unflatten_and_copy_device_tree(); setup_cpuinfo(); @@ -304,8 +282,8 @@ void __init setup_arch(char **cmdline_p) initrd_below_start_ok = 1; #endif - /* setup bootmem allocator */ - max_low_pfn = setup_memory(); + /* setup memblock allocator */ + setup_memory(); /* paging_init() sets up the MMU and marks all pages as reserved */ paging_init(); @@ -317,7 +295,7 @@ void __init setup_arch(char **cmdline_p) *cmdline_p = boot_command_line; - printk(KERN_INFO "OpenRISC Linux -- http://openrisc.net\n"); + printk(KERN_INFO "OpenRISC Linux -- http://openrisc.io\n"); } static int show_cpuinfo(struct seq_file *m, void *v) diff --git a/arch/openrisc/kernel/vmlinux.lds.S b/arch/openrisc/kernel/vmlinux.lds.S index d68b9ede8423..ef31fc24344e 100644 --- a/arch/openrisc/kernel/vmlinux.lds.S +++ b/arch/openrisc/kernel/vmlinux.lds.S @@ -30,7 +30,13 @@ #include <asm/cache.h> #include 
<asm-generic/vmlinux.lds.h> -OUTPUT_FORMAT("elf32-or32", "elf32-or32", "elf32-or32") +#ifdef __OR1K__ +#define __OUTPUT_FORMAT "elf32-or1k" +#else +#define __OUTPUT_FORMAT "elf32-or32" +#endif + +OUTPUT_FORMAT(__OUTPUT_FORMAT, __OUTPUT_FORMAT, __OUTPUT_FORMAT) jiffies = jiffies_64 + 4; SECTIONS diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 7f94652311d7..f67d82b9d22f 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -106,11 +106,11 @@ static void __init map_ram(void) } /* Alloc one page for holding PTE's... */ - pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); + pte = (pte_t *) __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); set_pmd(pme, __pmd(_KERNPG_TABLE + __pa(pte))); /* Fill the newly allocated page with PTE'S */ - for (j = 0; p < e && j < PTRS_PER_PGD; + for (j = 0; p < e && j < PTRS_PER_PTE; v += PAGE_SIZE, p += PAGE_SIZE, j++, pte++) { if (v >= (u32) _e_kernel_ro || v < (u32) _s_kernel_ro) diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index fa60b81aee3e..8705a46218f9 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -124,11 +124,7 @@ pte_t __ref *pte_alloc_one_kernel(struct mm_struct *mm, if (likely(mem_init_done)) { pte = (pte_t *) __get_free_page(GFP_KERNEL); } else { - pte = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); -#if 0 - /* FIXME: use memblock... 
*/ pte = (pte_t *) __va(memblock_alloc(PAGE_SIZE, PAGE_SIZE)); -#endif } if (pte) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index f9b3a81aefcd..91f53c07f410 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -16,7 +16,6 @@ generic-y += local.h generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += mutex.h generic-y += param.h generic-y += percpu.h generic-y += poll.h diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h index c2c43f714684..3a4ed9f91d57 100644 --- a/arch/parisc/include/asm/pgtable.h +++ b/arch/parisc/include/asm/pgtable.h @@ -65,9 +65,9 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr) unsigned long flags; \ spin_lock_irqsave(&pa_tlb_lock, flags); \ old_pte = *ptep; \ - set_pte(ptep, pteval); \ if (pte_inserted(old_pte)) \ purge_tlb_entries(mm, addr); \ + set_pte(ptep, pteval); \ spin_unlock_irqrestore(&pa_tlb_lock, flags); \ } while (0) @@ -478,8 +478,8 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned spin_unlock_irqrestore(&pa_tlb_lock, flags); return 0; } - set_pte(ptep, pte_mkold(pte)); purge_tlb_entries(vma->vm_mm, addr); + set_pte(ptep, pte_mkold(pte)); spin_unlock_irqrestore(&pa_tlb_lock, flags); return 1; } @@ -492,9 +492,9 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, spin_lock_irqsave(&pa_tlb_lock, flags); old_pte = *ptep; - set_pte(ptep, __pte(0)); if (pte_inserted(old_pte)) purge_tlb_entries(mm, addr); + set_pte(ptep, __pte(0)); spin_unlock_irqrestore(&pa_tlb_lock, flags); return old_pte; @@ -504,8 +504,8 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, { unsigned long flags; spin_lock_irqsave(&pa_tlb_lock, flags); - set_pte(ptep, pte_wrprotect(*ptep)); purge_tlb_entries(mm, addr); + set_pte(ptep, pte_wrprotect(*ptep)); spin_unlock_irqrestore(&pa_tlb_lock, flags); } 
diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 2e674e13e005..ca40741378be 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -309,7 +309,6 @@ extern unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) ((tsk)->thread.regs.gr[30]) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() /* * parisc_requires_coherency() is used to identify the combined VIPT/PIPT diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h index 9c935d717df9..7a109b73ddf7 100644 --- a/arch/parisc/include/uapi/asm/socket.h +++ b/arch/parisc/include/uapi/asm/socket.h @@ -89,4 +89,6 @@ #define SO_CNX_ADVICE 0x402E +#define SCM_TIMESTAMPING_OPT_STATS 0x402F + #endif /* _UAPI_ASM_SOCKET_H */ diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index c263301648f3..977f0a4f5ecf 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -393,6 +393,15 @@ void __init parisc_setup_cache_timing(void) /* calculate TLB flush threshold */ + /* On SMP machines, skip the TLB measure of kernel text which + * has been mapped as huge pages. 
*/ + if (num_online_cpus() > 1 && !parisc_requires_coherency()) { + threshold = max(cache_info.it_size, cache_info.dt_size); + threshold *= PAGE_SIZE; + threshold /= num_online_cpus(); + goto set_tlb_threshold; + } + alltime = mfctl(16); flush_tlb_all(); alltime = mfctl(16) - alltime; @@ -411,6 +420,8 @@ void __init parisc_setup_cache_timing(void) alltime, size, rangetime); threshold = PAGE_ALIGN(num_online_cpus() * size * alltime / rangetime); + +set_tlb_threshold: if (threshold) parisc_tlb_flush_threshold = threshold; printk(KERN_INFO "TLB flush threshold set to %lu KiB\n", diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index 1b39a2acaadf..adf7187f8951 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -892,19 +892,10 @@ ENTRY_CFI(flush_dcache_page_asm) fdc,m r31(%r28) fdc,m r31(%r28) fdc,m r31(%r28) - cmpb,COND(<<) %r28, %r25,1b + cmpb,COND(<<) %r28, %r25,1b fdc,m r31(%r28) sync - -#ifdef CONFIG_PA20 - pdtlb,l %r0(%r25) -#else - tlb_lock %r20,%r21,%r22 - pdtlb %r0(%r25) - tlb_unlock %r20,%r21,%r22 -#endif - bv %r0(%r2) nop .exit @@ -979,17 +970,6 @@ ENTRY_CFI(flush_icache_page_asm) fic,m %r31(%sr4,%r28) sync - -#ifdef CONFIG_PA20 - pdtlb,l %r0(%r28) - pitlb,l %r0(%sr4,%r25) -#else - tlb_lock %r20,%r21,%r22 - pdtlb %r0(%r28) - pitlb %r0(%sr4,%r25) - tlb_unlock %r20,%r21,%r22 -#endif - bv %r0(%r2) nop .exit diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 65fba4c34cd7..c7f120aaa98f 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -160,6 +160,7 @@ config PPC select HAVE_LIVEPATCH if HAVE_DYNAMIC_FTRACE_WITH_REGS select GENERIC_CPU_AUTOPROBE select HAVE_VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select HAVE_ARCH_HARDENED_USERCOPY select HAVE_KERNEL_GZIP diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index eae2dc8bc218..9d47f2efa830 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -100,7 +100,8 @@ 
src-wlib-y := string.S crt0.S crtsavres.S stdio.c decompress.c main.c \ ns16550.c serial.c simple_alloc.c div64.S util.S \ elf_util.c $(zlib-y) devtree.c stdlib.c \ oflib.c ofconsole.c cuboot.c mpsc.c cpm-serial.c \ - uartlite.c mpc52xx-psc.c opal.c opal-calls.S + uartlite.c mpc52xx-psc.c opal.c +src-wlib-$(CONFIG_PPC64_BOOT_WRAPPER) += opal-calls.S src-wlib-$(CONFIG_40x) += 4xx.c planetcore.c src-wlib-$(CONFIG_44x) += 4xx.c ebony.c bamboo.c src-wlib-$(CONFIG_8xx) += mpc8xx.c planetcore.c fsl-soc.c diff --git a/arch/powerpc/boot/opal.c b/arch/powerpc/boot/opal.c index d7b4fd47eb44..0272570d02de 100644 --- a/arch/powerpc/boot/opal.c +++ b/arch/powerpc/boot/opal.c @@ -13,7 +13,7 @@ #include <libfdt.h> #include "../include/asm/opal-api.h" -#ifdef __powerpc64__ +#ifdef CONFIG_PPC64_BOOT_WRAPPER /* Global OPAL struct used by opal-call.S */ struct opal { diff --git a/arch/powerpc/configs/dpaa.config b/arch/powerpc/configs/dpaa.config index efa99c048543..2fe76f5e938a 100644 --- a/arch/powerpc/configs/dpaa.config +++ b/arch/powerpc/configs/dpaa.config @@ -1 +1,4 @@ CONFIG_FSL_DPAA=y +CONFIG_FSL_PAMU=y +CONFIG_FSL_FMAN=y +CONFIG_FSL_DPAA_ETH=y diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index e407af2b7333..2e6a823fa502 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -70,7 +70,9 @@ #define HPTE_V_SSIZE_SHIFT 62 #define HPTE_V_AVPN_SHIFT 7 +#define HPTE_V_COMMON_BITS ASM_CONST(0x000fffffffffffff) #define HPTE_V_AVPN ASM_CONST(0x3fffffffffffff80) +#define HPTE_V_AVPN_3_0 ASM_CONST(0x000fffffffffff80) #define HPTE_V_AVPN_VAL(x) (((x) & HPTE_V_AVPN) >> HPTE_V_AVPN_SHIFT) #define HPTE_V_COMPARE(x,y) (!(((x) ^ (y)) & 0xffffffffffffff80UL)) #define HPTE_V_BOLTED ASM_CONST(0x0000000000000010) @@ -80,14 +82,16 @@ #define HPTE_V_VALID ASM_CONST(0x0000000000000001) /* - * ISA 3.0 have a different HPTE format. + * ISA 3.0 has a different HPTE format. 
*/ #define HPTE_R_3_0_SSIZE_SHIFT 58 +#define HPTE_R_3_0_SSIZE_MASK (3ull << HPTE_R_3_0_SSIZE_SHIFT) #define HPTE_R_PP0 ASM_CONST(0x8000000000000000) #define HPTE_R_TS ASM_CONST(0x4000000000000000) #define HPTE_R_KEY_HI ASM_CONST(0x3000000000000000) #define HPTE_R_RPN_SHIFT 12 #define HPTE_R_RPN ASM_CONST(0x0ffffffffffff000) +#define HPTE_R_RPN_3_0 ASM_CONST(0x01fffffffffff000) #define HPTE_R_PP ASM_CONST(0x0000000000000003) #define HPTE_R_PPP ASM_CONST(0x8000000000000003) #define HPTE_R_N ASM_CONST(0x0000000000000004) @@ -316,12 +320,43 @@ static inline unsigned long hpte_encode_avpn(unsigned long vpn, int psize, */ v = (vpn >> (23 - VPN_SHIFT)) & ~(mmu_psize_defs[psize].avpnm); v <<= HPTE_V_AVPN_SHIFT; - if (!cpu_has_feature(CPU_FTR_ARCH_300)) - v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; return v; } /* + * ISA v3.0 defines a new HPTE format, which differs from the old + * format in having smaller AVPN and ARPN fields, and the B field + * in the second dword instead of the first. + */ +static inline unsigned long hpte_old_to_new_v(unsigned long v) +{ + /* trim AVPN, drop B */ + return v & HPTE_V_COMMON_BITS; +} + +static inline unsigned long hpte_old_to_new_r(unsigned long v, unsigned long r) +{ + /* move B field from 1st to 2nd dword, trim ARPN */ + return (r & ~HPTE_R_3_0_SSIZE_MASK) | + (((v) >> HPTE_V_SSIZE_SHIFT) << HPTE_R_3_0_SSIZE_SHIFT); +} + +static inline unsigned long hpte_new_to_old_v(unsigned long v, unsigned long r) +{ + /* insert B field */ + return (v & HPTE_V_COMMON_BITS) | + ((r & HPTE_R_3_0_SSIZE_MASK) << + (HPTE_V_SSIZE_SHIFT - HPTE_R_3_0_SSIZE_SHIFT)); +} + +static inline unsigned long hpte_new_to_old_r(unsigned long r) +{ + /* clear out B field */ + return r & ~HPTE_R_3_0_SSIZE_MASK; +} + +/* * This function sets the AVPN and L fields of the HPTE appropriately * using the base page size and actual page size. 
*/ @@ -341,12 +376,8 @@ static inline unsigned long hpte_encode_v(unsigned long vpn, int base_psize, * aligned for the requested page size */ static inline unsigned long hpte_encode_r(unsigned long pa, int base_psize, - int actual_psize, int ssize) + int actual_psize) { - - if (cpu_has_feature(CPU_FTR_ARCH_300)) - pa |= ((unsigned long) ssize) << HPTE_R_3_0_SSIZE_SHIFT; - /* A 4K page needs no special encoding */ if (actual_psize == MMU_PAGE_4K) return pa & HPTE_R_RPN; diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9fd77f8794a0..0ebfbc8f0449 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl) + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) { if (radix_enabled()) return false; @@ -1020,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } + + +#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit +static inline bool arch_needs_pgtable_deposit(void) +{ + if (radix_enabled()) + return false; + return true; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 4f60db074725..aa2e6a34b872 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -46,26 +46,12 @@ extern cputime_t cputime_one_jiffy; * Convert cputime <-> jiffies */ extern u64 __cputime_jiffies_factor; -DECLARE_PER_CPU(unsigned long, cputime_last_delta); -DECLARE_PER_CPU(unsigned long, cputime_scaled_last_delta); static inline unsigned long cputime_to_jiffies(const 
cputime_t ct) { return mulhdu((__force u64) ct, __cputime_jiffies_factor); } -/* Estimate the scaled cputime by scaling the real cputime based on - * the last scaled to real ratio */ -static inline cputime_t cputime_to_scaled(const cputime_t ct) -{ - if (cpu_has_feature(CPU_FTR_SPURR) && - __this_cpu_read(cputime_last_delta)) - return (__force u64) ct * - __this_cpu_read(cputime_scaled_last_delta) / - __this_cpu_read(cputime_last_delta); - return ct; -} - static inline cputime_t jiffies_to_cputime(const unsigned long jif) { u64 ct; diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 05cabed3d1bd..09a802bb702f 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -99,6 +99,7 @@ #define BOOK3S_INTERRUPT_H_EMUL_ASSIST 0xe40 #define BOOK3S_INTERRUPT_HMI 0xe60 #define BOOK3S_INTERRUPT_H_DOORBELL 0xe80 +#define BOOK3S_INTERRUPT_H_VIRT 0xea0 #define BOOK3S_INTERRUPT_PERFMON 0xf00 #define BOOK3S_INTERRUPT_ALTIVEC 0xf20 #define BOOK3S_INTERRUPT_VSX 0xf40 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 28350a294b1e..e59b172666cd 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -48,7 +48,7 @@ #ifdef CONFIG_KVM_MMIO #define KVM_COALESCED_MMIO_PAGE_OFFSET 1 #endif -#define KVM_HALT_POLL_NS_DEFAULT 500000 +#define KVM_HALT_POLL_NS_DEFAULT 10000 /* 10 us */ /* These values are internal and can be increased later */ #define KVM_NR_IRQCHIPS 1 @@ -244,8 +244,10 @@ struct kvm_arch_memory_slot { struct kvm_arch { unsigned int lpid; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + unsigned int tlb_sets; unsigned long hpt_virt; struct revmap_entry *revmap; + atomic64_t mmio_update; unsigned int host_lpid; unsigned long host_lpcr; unsigned long sdr1; @@ -408,6 +410,24 @@ struct kvmppc_passthru_irqmap { #define KVMPPC_IRQ_MPIC 1 #define KVMPPC_IRQ_XICS 2 +#define MMIO_HPTE_CACHE_SIZE 4 + +struct mmio_hpte_cache_entry { + unsigned long 
hpte_v; + unsigned long hpte_r; + unsigned long rpte; + unsigned long pte_index; + unsigned long eaddr; + unsigned long slb_v; + long mmio_update; + unsigned int slb_base_pshift; +}; + +struct mmio_hpte_cache { + struct mmio_hpte_cache_entry entry[MMIO_HPTE_CACHE_SIZE]; + unsigned int index; +}; + struct openpic; struct kvm_vcpu_arch { @@ -498,6 +518,8 @@ struct kvm_vcpu_arch { ulong tcscr; ulong acop; ulong wort; + ulong tid; + ulong psscr; ulong shadow_srr1; #endif u32 vrsave; /* also USPRG0 */ @@ -546,6 +568,7 @@ struct kvm_vcpu_arch { u64 tfiar; u32 cr_tm; + u64 xer_tm; u64 lr_tm; u64 ctr_tm; u64 amr_tm; @@ -655,9 +678,11 @@ struct kvm_vcpu_arch { #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE struct kvm_vcpu_arch_shared shregs; + struct mmio_hpte_cache mmio_cache; unsigned long pgfault_addr; long pgfault_index; unsigned long pgfault_hpte[2]; + struct mmio_hpte_cache_entry *pgfault_cache; struct task_struct *run_task; struct kvm_run *kvm_run; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index f6e49640dbe1..2da67bf1f2ec 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -483,9 +483,10 @@ extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq, unsigned long host_irq); extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq, unsigned long host_irq); -extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, - struct kvmppc_irq_map *irq_map, - struct kvmppc_passthru_irqmap *pimap); +extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, __be32 xirr, + struct kvmppc_irq_map *irq_map, + struct kvmppc_passthru_irqmap *pimap, + bool *again); extern int h_ipi_redirect; #else static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( @@ -510,6 +511,48 @@ static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) #endif /* + * Prototypes for functions called only from assembler code. 
+ * Having prototypes reduces sparse errors. + */ +long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, + unsigned long ioba, unsigned long tce); +long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_list, unsigned long npages); +long kvmppc_rm_h_stuff_tce(struct kvm_vcpu *vcpu, + unsigned long liobn, unsigned long ioba, + unsigned long tce_value, unsigned long npages); +long int kvmppc_rm_h_confer(struct kvm_vcpu *vcpu, int target, + unsigned int yield_count); +long kvmppc_h_random(struct kvm_vcpu *vcpu); +void kvmhv_commence_exit(int trap); +long kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu); +void kvmppc_subcore_enter_guest(void); +void kvmppc_subcore_exit_guest(void); +long kvmppc_realmode_hmi_handler(void); +long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, + long pte_index, unsigned long pteh, unsigned long ptel); +long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn); +long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu); +long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index, unsigned long avpn, + unsigned long va); +long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index); +long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index); +long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, + unsigned long pte_index); +long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, + unsigned long slb_v, unsigned int status, bool data); +unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu); +int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server, + unsigned long mfrr); +int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr); +int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr); + +/* * Host-side operations we want to set up while running in real * mode 
in the guest operating on the xics. * Currently only VCPU wakeup is supported. diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index e311c25751a4..8d1499334257 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -214,6 +214,11 @@ extern u64 ppc64_rma_size; /* Cleanup function used by kexec */ extern void mmu_cleanup_all(void); extern void radix__mmu_cleanup_all(void); + +/* Functions for creating and updating partition table on POWER9 */ +extern void mmu_partition_table_init(void); +extern void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1); #endif /* CONFIG_PPC64 */ struct mm_struct; diff --git a/arch/powerpc/include/asm/mutex.h b/arch/powerpc/include/asm/mutex.h deleted file mode 100644 index 078155fa1189..000000000000 --- a/arch/powerpc/include/asm/mutex.h +++ /dev/null @@ -1,132 +0,0 @@ -/* - * Optimised mutex implementation of include/asm-generic/mutex-dec.h algorithm - */ -#ifndef _ASM_POWERPC_MUTEX_H -#define _ASM_POWERPC_MUTEX_H - -static inline int __mutex_cmpxchg_lock(atomic_t *v, int old, int new) -{ - int t; - - __asm__ __volatile__ ( -"1: lwarx %0,0,%1 # mutex trylock\n\ - cmpw 0,%0,%2\n\ - bne- 2f\n" - PPC405_ERR77(0,%1) -" stwcx. %3,0,%1\n\ - bne- 1b" - PPC_ACQUIRE_BARRIER - "\n\ -2:" - : "=&r" (t) - : "r" (&v->counter), "r" (old), "r" (new) - : "cc", "memory"); - - return t; -} - -static inline int __mutex_dec_return_lock(atomic_t *v) -{ - int t; - - __asm__ __volatile__( -"1: lwarx %0,0,%1 # mutex lock\n\ - addic %0,%0,-1\n" - PPC405_ERR77(0,%1) -" stwcx. %0,0,%1\n\ - bne- 1b" - PPC_ACQUIRE_BARRIER - : "=&r" (t) - : "r" (&v->counter) - : "cc", "memory"); - - return t; -} - -static inline int __mutex_inc_return_unlock(atomic_t *v) -{ - int t; - - __asm__ __volatile__( - PPC_RELEASE_BARRIER -"1: lwarx %0,0,%1 # mutex unlock\n\ - addic %0,%0,1\n" - PPC405_ERR77(0,%1) -" stwcx. 
%0,0,%1 \n\ - bne- 1b" - : "=&r" (t) - : "r" (&v->counter) - : "cc", "memory"); - - return t; -} - -/** - * __mutex_fastpath_lock - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 1 - * - * Change the count from 1 to a value lower than 1, and call <fail_fn> if - * it wasn't 1 originally. This function MUST leave the value lower than - * 1 even when the "1" assertion wasn't true. - */ -static inline void -__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) -{ - if (unlikely(__mutex_dec_return_lock(count) < 0)) - fail_fn(count); -} - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int -__mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(__mutex_dec_return_lock(count) < 0)) - return -1; - return 0; -} - -/** - * __mutex_fastpath_unlock - try to promote the count from 0 to 1 - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 0 - * - * Try to promote the count from 0 to 1. If it wasn't 0, call <fail_fn>. - * In the failure case, this function is allowed to either set the value to - * 1, or to set it to a value lower than 1. - */ -static inline void -__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) -{ - if (unlikely(__mutex_inc_return_unlock(count) <= 0)) - fail_fn(count); -} - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to 0, and return 1 (success), or if the count - * was not 1, then return 0 (failure). 
- */ -static inline int -__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) -{ - if (likely(atomic_read(count) == 1 && __mutex_cmpxchg_lock(count, 1, 0) == 1)) - return 1; - return 0; -} - -#endif diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index e958b7096f19..5c7db0f1a708 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -220,9 +220,12 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id, int64_t opal_pci_poll2(uint64_t id, uint64_t data); int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll); +int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll); int64_t opal_int_set_cppr(uint8_t cppr); int64_t opal_int_eoi(uint32_t xirr); +int64_t opal_rm_int_eoi(uint32_t xirr); int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr); +int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr); int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, uint32_t pe_num, uint32_t tce_size, uint64_t dma_addr, uint32_t npages); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index c07c31b0e89e..dac83fcb9445 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -404,8 +404,6 @@ static inline unsigned long __pack_fe01(unsigned int fpmode) #define cpu_relax() barrier() #endif -#define cpu_relax_lowlatency() cpu_relax() - /* Check that a certain kernel stack pointer is valid in task_struct p */ int validate_sp(unsigned long sp, struct task_struct *p, unsigned long nbytes); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 9e1499f98def..04aa1ee8cdb6 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -153,6 +153,8 @@ #define PSSCR_EC 0x00100000 /* Exit Criterion */ #define PSSCR_ESL 0x00200000 /* Enable State Loss */ #define PSSCR_SD 0x00400000 /* Status Disable */ +#define PSSCR_PLS 0xf000000000000000 /* Power-saving 
Level Status */ +#define PSSCR_GUEST_VIS 0xf0000000000003ff /* Guest-visible PSSCR fields */ /* Floating Point Status and Control Register (FPSCR) Fields */ #define FPSCR_FX 0x80000000 /* FPU exception summary */ @@ -236,6 +238,7 @@ #define SPRN_TEXASRU 0x83 /* '' '' '' Upper 32 */ #define TEXASR_FS __MASK(63-36) /* TEXASR Failure Summary */ #define SPRN_TFHAR 0x80 /* Transaction Failure Handler Addr */ +#define SPRN_TIDR 144 /* Thread ID register */ #define SPRN_CTRLF 0x088 #define SPRN_CTRLT 0x098 #define CTRL_CT 0xc0000000 /* current thread */ @@ -294,6 +297,7 @@ #define SPRN_HSRR1 0x13B /* Hypervisor Save/Restore 1 */ #define SPRN_LMRR 0x32D /* Load Monitor Region Register */ #define SPRN_LMSER 0x32E /* Load Monitor Section Enable Register */ +#define SPRN_ASDR 0x330 /* Access segment descriptor register */ #define SPRN_IC 0x350 /* Virtual Instruction Count */ #define SPRN_VTB 0x351 /* Virtual Time Base */ #define SPRN_LDBAR 0x352 /* LD Base Address Register */ @@ -305,6 +309,7 @@ /* HFSCR and FSCR bit numbers are the same */ #define FSCR_LM_LG 11 /* Enable Load Monitor Registers */ +#define FSCR_MSGP_LG 10 /* Enable MSGP */ #define FSCR_TAR_LG 8 /* Enable Target Address Register */ #define FSCR_EBB_LG 7 /* Enable Event Based Branching */ #define FSCR_TM_LG 5 /* Enable Transactional Memory */ @@ -320,6 +325,7 @@ #define FSCR_DSCR __MASK(FSCR_DSCR_LG) #define SPRN_HFSCR 0xbe /* HV=1 Facility Status & Control Register */ #define HFSCR_LM __MASK(FSCR_LM_LG) +#define HFSCR_MSGP __MASK(FSCR_MSGP_LG) #define HFSCR_TAR __MASK(FSCR_TAR_LG) #define HFSCR_EBB __MASK(FSCR_EBB_LG) #define HFSCR_TM __MASK(FSCR_TM_LG) @@ -358,6 +364,7 @@ #define LPCR_PECE_HVEE ASM_CONST(0x0000400000000000) /* P9 Wakeup on HV interrupts */ #define LPCR_MER ASM_CONST(0x0000000000000800) /* Mediated External Exception */ #define LPCR_MER_SH 11 +#define LPCR_GTSE ASM_CONST(0x0000000000000400) /* Guest Translation Shootdown Enable */ #define LPCR_TC ASM_CONST(0x0000000000000200) /* Translation 
control */ #define LPCR_LPES 0x0000000c #define LPCR_LPES0 ASM_CONST(0x0000000000000008) /* LPAR Env selector 0 */ @@ -378,6 +385,12 @@ #define PCR_VEC_DIS (1ul << (63-0)) /* Vec. disable (bit NA since POWER8) */ #define PCR_VSX_DIS (1ul << (63-1)) /* VSX disable (bit NA since POWER8) */ #define PCR_TM_DIS (1ul << (63-2)) /* Trans. memory disable (POWER8) */ +/* + * These bits are used in the function kvmppc_set_arch_compat() to specify and + * determine both the compatibility level which we want to emulate and the + * compatibility level which the host is capable of emulating. + */ +#define PCR_ARCH_207 0x8 /* Architecture 2.07 */ #define PCR_ARCH_206 0x4 /* Architecture 2.06 */ #define PCR_ARCH_205 0x2 /* Architecture 2.05 */ #define SPRN_HEIR 0x153 /* Hypervisor Emulated Instruction Register */ @@ -1219,6 +1232,7 @@ #define PVR_ARCH_206 0x0f000003 #define PVR_ARCH_206p 0x0f100003 #define PVR_ARCH_207 0x0f000004 +#define PVR_ARCH_300 0x0f000005 /* Macros for setting and retrieving special purpose registers */ #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h index fa37fe93bc02..8c1b913de6d7 100644 --- a/arch/powerpc/include/asm/spinlock.h +++ b/arch/powerpc/include/asm/spinlock.h @@ -52,6 +52,14 @@ #define SYNC_IO #endif +#ifdef CONFIG_PPC_PSERIES +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(int cpu) +{ + return !!(be32_to_cpu(lppaca_of(cpu).yield_count) & 1); +} +#endif + static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) { return lock.slock == 0; diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 99e1397b71da..609557569f65 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -28,6 +28,7 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry +#define tlb_remove_check_page_size_change 
tlb_remove_check_page_size_change extern void tlb_flush(struct mmu_gather *tlb); @@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ + if (!tlb->page_size) + tlb->page_size = page_size; + else if (tlb->page_size != page_size) { + tlb_flush_mmu(tlb); + /* + * update the page size after flush for the new + * mmu_gather. + */ + tlb->page_size = page_size; + } +} + #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/powerpc/include/asm/xilinx_intc.h b/arch/powerpc/include/asm/xilinx_intc.h index 343612f8fece..3192d7f0a05b 100644 --- a/arch/powerpc/include/asm/xilinx_intc.h +++ b/arch/powerpc/include/asm/xilinx_intc.h @@ -14,7 +14,7 @@ #ifdef __KERNEL__ extern void __init xilinx_intc_init_tree(void); -extern unsigned int xilinx_intc_get_irq(void); +extern unsigned int xintc_get_irq(void); #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_XILINX_INTC_H */ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index c93cf35ce379..3603b6f51b11 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -573,6 +573,10 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_SPRG9 (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xba) #define KVM_REG_PPC_DBSR (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbb) +/* POWER9 registers */ +#define KVM_REG_PPC_TIDR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbc) +#define KVM_REG_PPC_PSSCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbd) + /* Transactional Memory checkpointed state: * This is all GPRs, all VSX regs and a subset of SPRs */ @@ -596,6 +600,7 @@ struct kvm_get_htab_header { #define KVM_REG_PPC_TM_VSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U32 | 0x67) #define KVM_REG_PPC_TM_DSCR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x68) #define KVM_REG_PPC_TM_TAR (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x69) +#define 
KVM_REG_PPC_TM_XER (KVM_REG_PPC_TM | KVM_REG_SIZE_U64 | 0x6a) /* PPC64 eXternal Interrupt Controller Specification */ #define KVM_DEV_XICS_GRP_SOURCES 1 /* 64-bit source attributes */ diff --git a/arch/powerpc/include/uapi/asm/socket.h b/arch/powerpc/include/uapi/asm/socket.h index 1672e3398270..44583a52f882 100644 --- a/arch/powerpc/include/uapi/asm/socket.h +++ b/arch/powerpc/include/uapi/asm/socket.h @@ -97,4 +97,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_POWERPC_SOCKET_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index caec7bf3b99a..195a9fc8f81c 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -487,6 +487,7 @@ int main(void) /* book3s */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + DEFINE(KVM_TLB_SETS, offsetof(struct kvm, arch.tlb_sets)); DEFINE(KVM_SDR1, offsetof(struct kvm, arch.sdr1)); DEFINE(KVM_HOST_LPID, offsetof(struct kvm, arch.host_lpid)); DEFINE(KVM_HOST_LPCR, offsetof(struct kvm, arch.host_lpcr)); @@ -548,6 +549,8 @@ int main(void) DEFINE(VCPU_TCSCR, offsetof(struct kvm_vcpu, arch.tcscr)); DEFINE(VCPU_ACOP, offsetof(struct kvm_vcpu, arch.acop)); DEFINE(VCPU_WORT, offsetof(struct kvm_vcpu, arch.wort)); + DEFINE(VCPU_TID, offsetof(struct kvm_vcpu, arch.tid)); + DEFINE(VCPU_PSSCR, offsetof(struct kvm_vcpu, arch.psscr)); DEFINE(VCORE_ENTRY_EXIT, offsetof(struct kvmppc_vcore, entry_exit_map)); DEFINE(VCORE_IN_GUEST, offsetof(struct kvmppc_vcore, in_guest)); DEFINE(VCORE_NAPPING_THREADS, offsetof(struct kvmppc_vcore, napping_threads)); @@ -569,6 +572,7 @@ int main(void) DEFINE(VCPU_VRS_TM, offsetof(struct kvm_vcpu, arch.vr_tm.vr)); DEFINE(VCPU_VRSAVE_TM, offsetof(struct kvm_vcpu, arch.vrsave_tm)); DEFINE(VCPU_CR_TM, offsetof(struct kvm_vcpu, arch.cr_tm)); + DEFINE(VCPU_XER_TM, offsetof(struct kvm_vcpu, arch.xer_tm)); DEFINE(VCPU_LR_TM, offsetof(struct kvm_vcpu, arch.lr_tm)); DEFINE(VCPU_CTR_TM, offsetof(struct kvm_vcpu, arch.ctr_tm)); 
DEFINE(VCPU_AMR_TM, offsetof(struct kvm_vcpu, arch.amr_tm)); diff --git a/arch/powerpc/kernel/cpu_setup_power.S b/arch/powerpc/kernel/cpu_setup_power.S index 37c027ca83b2..f3e1f5d29dce 100644 --- a/arch/powerpc/kernel/cpu_setup_power.S +++ b/arch/powerpc/kernel/cpu_setup_power.S @@ -174,7 +174,7 @@ __init_FSCR: __init_HFSCR: mfspr r3,SPRN_HFSCR ori r3,r3,HFSCR_TAR|HFSCR_TM|HFSCR_BHRB|HFSCR_PM|\ - HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB + HFSCR_DSCR|HFSCR_VECVSX|HFSCR_FP|HFSCR_EBB|HFSCR_MSGP mtspr SPRN_HFSCR,r3 blr diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index a62be72da274..5c31369435f2 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -671,8 +671,10 @@ static int eeh_reset_device(struct eeh_pe *pe, struct pci_bus *bus, /* Clear frozen state */ rc = eeh_clear_pe_frozen_state(pe, false); - if (rc) + if (rc) { + pci_unlock_rescan_remove(); return rc; + } /* Give the system 5 seconds to finish running the user-space * hotplug shutdown scripts, e.g. ifdown for ethernet. 
Yes, diff --git a/arch/powerpc/kernel/sysfs.c b/arch/powerpc/kernel/sysfs.c index c4f1d1f7bae0..c1fb255a60d6 100644 --- a/arch/powerpc/kernel/sysfs.c +++ b/arch/powerpc/kernel/sysfs.c @@ -703,7 +703,7 @@ static struct device_attribute pa6t_attrs[] = { #endif /* HAS_PPC_PMC_PA6T */ #endif /* HAS_PPC_PMC_CLASSIC */ -static void register_cpu_online(unsigned int cpu) +static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; @@ -782,11 +782,12 @@ static void register_cpu_online(unsigned int cpu) } #endif cacheinfo_cpu_online(cpu); + return 0; } -#ifdef CONFIG_HOTPLUG_CPU -static void unregister_cpu_online(unsigned int cpu) +static int unregister_cpu_online(unsigned int cpu) { +#ifdef CONFIG_HOTPLUG_CPU struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; struct device_attribute *attrs, *pmc_attrs; @@ -863,6 +864,8 @@ static void unregister_cpu_online(unsigned int cpu) } #endif cacheinfo_cpu_offline(cpu); +#endif /* CONFIG_HOTPLUG_CPU */ + return 0; } #ifdef CONFIG_ARCH_CPU_PROBE_RELEASE @@ -883,32 +886,6 @@ ssize_t arch_cpu_release(const char *buf, size_t count) } #endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ -#endif /* CONFIG_HOTPLUG_CPU */ - -static int sysfs_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned int)(long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - register_cpu_online(cpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - unregister_cpu_online(cpu); - break; -#endif - } - return NOTIFY_OK; -} - -static struct notifier_block sysfs_cpu_nb = { - .notifier_call = sysfs_cpu_notify, -}; - static DEFINE_MUTEX(cpu_mutex); int cpu_add_dev_attr(struct device_attribute *attr) @@ -1023,12 +1000,10 @@ static DEVICE_ATTR(physical_id, 0444, show_physical_id, NULL); static int __init topology_init(void) { - int cpu; + int cpu, r; register_nodes(); - 
cpu_notifier_register_begin(); - for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); @@ -1047,15 +1022,10 @@ static int __init topology_init(void) device_create_file(&c->dev, &dev_attr_physical_id); } - - if (cpu_online(cpu)) - register_cpu_online(cpu); } - - __register_cpu_notifier(&sysfs_cpu_nb); - - cpu_notifier_register_done(); - + r = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/topology:online", + register_cpu_online, unregister_cpu_online); + WARN_ON(r < 0); #ifdef CONFIG_PPC64 sysfs_create_dscr_default(); #endif /* CONFIG_PPC64 */ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index bc3f7d0d7b79..be9751f1cb2a 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -164,8 +164,6 @@ u64 __cputime_sec_factor; EXPORT_SYMBOL(__cputime_sec_factor); u64 __cputime_clockt_factor; EXPORT_SYMBOL(__cputime_clockt_factor); -DEFINE_PER_CPU(unsigned long, cputime_last_delta); -DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta); cputime_t cputime_one_jiffy; @@ -360,7 +358,8 @@ void vtime_account_system(struct task_struct *tsk) unsigned long delta, sys_scaled, stolen; delta = vtime_delta(tsk, &sys_scaled, &stolen); - account_system_time(tsk, 0, delta, sys_scaled); + account_system_time(tsk, 0, delta); + tsk->stimescaled += sys_scaled; if (stolen) account_steal_time(stolen); } @@ -393,7 +392,8 @@ void vtime_account_user(struct task_struct *tsk) acct->user_time = 0; acct->user_time_scaled = 0; acct->utime_sspurr = 0; - account_user_time(tsk, utime, utimescaled); + account_user_time(tsk, utime); + tsk->utimescaled += utimescaled; } #ifdef CONFIG_PPC32 diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 8295f51c1a5f..7394b770ae1f 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -94,8 +94,17 @@ SECTIONS * detected, and will result in a crash at boot due to offsets being * wrong. 
*/ +#ifdef CONFIG_PPC64 + /* + * BLOCK(0) overrides the default output section alignment because + * this needs to start right after .head.text in order for fixed + * section placement to work. + */ + .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { +#else .text : AT(ADDR(.text) - LOAD_OFFSET) { ALIGN_FUNCTION(); +#endif /* careful! __ftr_alt_* sections need to be close to .text */ *(.text .fixup __ftr_alt_* .ref.text) SCHED_TEXT diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 05f09ae82587..b795dd1ac2ef 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -88,6 +88,8 @@ long kvmppc_alloc_hpt(struct kvm *kvm, u32 *htab_orderp) /* 128 (2**7) bytes in each HPTEG */ kvm->arch.hpt_mask = (1ul << (order - 7)) - 1; + atomic64_set(&kvm->arch.mmio_update, 0); + /* Allocate reverse map array */ rev = vmalloc(sizeof(struct revmap_entry) * kvm->arch.hpt_npte); if (!rev) { @@ -255,7 +257,7 @@ static void kvmppc_mmu_book3s_64_hv_reset_msr(struct kvm_vcpu *vcpu) kvmppc_set_msr(vcpu, msr); } -long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, +static long kvmppc_virtmode_do_h_enter(struct kvm *kvm, unsigned long flags, long pte_index, unsigned long pteh, unsigned long ptel, unsigned long *pte_idx_ret) { @@ -312,7 +314,7 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, struct kvmppc_slb *slbe; unsigned long slb_v; unsigned long pp, key; - unsigned long v, gr; + unsigned long v, orig_v, gr; __be64 *hptep; int index; int virtmode = vcpu->arch.shregs.msr & (data ? 
MSR_DR : MSR_IR); @@ -337,10 +339,12 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, return -ENOENT; } hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + v = orig_v = be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hptep[1])); gr = kvm->arch.revmap[index].guest_rpte; - unlock_hpte(hptep, v); + unlock_hpte(hptep, orig_v); preempt_enable(); gpte->eaddr = eaddr; @@ -438,6 +442,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, { struct kvm *kvm = vcpu->kvm; unsigned long hpte[3], r; + unsigned long hnow_v, hnow_r; __be64 *hptep; unsigned long mmu_seq, psize, pte_size; unsigned long gpa_base, gfn_base; @@ -451,6 +456,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned int writing, write_ok; struct vm_area_struct *vma; unsigned long rcbits; + long mmio_update; /* * Real-mode code has already searched the HPT and found the @@ -460,6 +466,19 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ if (ea != vcpu->arch.pgfault_addr) return RESUME_GUEST; + + if (vcpu->arch.pgfault_cache) { + mmio_update = atomic64_read(&kvm->arch.mmio_update); + if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) { + r = vcpu->arch.pgfault_cache->rpte; + psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r); + gpa_base = r & HPTE_R_RPN & ~(psize - 1); + gfn_base = gpa_base >> PAGE_SHIFT; + gpa = gpa_base | (ea & (psize - 1)); + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + } + } index = vcpu->arch.pgfault_index; hptep = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); rev = &kvm->arch.revmap[index]; @@ -472,6 +491,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unlock_hpte(hptep, hpte[0]); preempt_enable(); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte[0] = 
hpte_new_to_old_v(hpte[0], hpte[1]); + hpte[1] = hpte_new_to_old_r(hpte[1]); + } if (hpte[0] != vcpu->arch.pgfault_hpte[0] || hpte[1] != vcpu->arch.pgfault_hpte[1]) return RESUME_GUEST; @@ -575,16 +598,22 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, */ if (psize < PAGE_SIZE) psize = PAGE_SIZE; - r = (r & ~(HPTE_R_PP0 - psize)) | ((pfn << PAGE_SHIFT) & ~(psize - 1)); + r = (r & HPTE_R_KEY_HI) | (r & ~(HPTE_R_PP0 - psize)) | + ((pfn << PAGE_SHIFT) & ~(psize - 1)); if (hpte_is_writable(r) && !write_ok) r = hpte_make_readonly(r); ret = RESUME_GUEST; preempt_disable(); while (!try_lock_hpte(hptep, HPTE_V_HVLOCK)) cpu_relax(); - if ((be64_to_cpu(hptep[0]) & ~HPTE_V_HVLOCK) != hpte[0] || - be64_to_cpu(hptep[1]) != hpte[1] || - rev->guest_rpte != hpte[2]) + hnow_v = be64_to_cpu(hptep[0]); + hnow_r = be64_to_cpu(hptep[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); + hnow_r = hpte_new_to_old_r(hnow_r); + } + if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || + rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ goto out_unlock; hpte[0] = (hpte[0] & ~HPTE_V_ABSENT) | HPTE_V_VALID; @@ -615,6 +644,10 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_add_revmap_chain(kvm, rev, rmap, index, 0); } + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + r = hpte_old_to_new_r(hpte[0], r); + hpte[0] = hpte_old_to_new_v(hpte[0]); + } hptep[1] = cpu_to_be64(r); eieio(); __unlock_hpte(hptep, hpte[0]); @@ -758,6 +791,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, hpte_rpn(ptel, psize) == gfn) { hptep[0] |= cpu_to_be64(HPTE_V_ABSENT); kvmppc_invalidate_hpte(kvm, hptep, i); + hptep[1] &= ~cpu_to_be64(HPTE_R_KEY_HI | HPTE_R_KEY_LO); /* Harvest R and C */ rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C); *rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT; @@ -1165,7 +1199,7 @@ static long record_hpte(unsigned long flags, 
__be64 *hptp, unsigned long *hpte, struct revmap_entry *revp, int want_valid, int first_pass) { - unsigned long v, r; + unsigned long v, r, hr; unsigned long rcbits_unset; int ok = 1; int valid, dirty; @@ -1192,6 +1226,11 @@ static long record_hpte(unsigned long flags, __be64 *hptp, while (!try_lock_hpte(hptp, HPTE_V_HVLOCK)) cpu_relax(); v = be64_to_cpu(hptp[0]); + hr = be64_to_cpu(hptp[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, hr); + hr = hpte_new_to_old_r(hr); + } /* re-evaluate valid and dirty from synchronized HPTE value */ valid = !!(v & HPTE_V_VALID); @@ -1199,8 +1238,8 @@ static long record_hpte(unsigned long flags, __be64 *hptp, /* Harvest R and C into guest view if necessary */ rcbits_unset = ~revp->guest_rpte & (HPTE_R_R | HPTE_R_C); - if (valid && (rcbits_unset & be64_to_cpu(hptp[1]))) { - revp->guest_rpte |= (be64_to_cpu(hptp[1]) & + if (valid && (rcbits_unset & hr)) { + revp->guest_rpte |= (hr & (HPTE_R_R | HPTE_R_C)) | HPTE_GR_MODIFIED; dirty = 1; } @@ -1608,7 +1647,7 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf, return ret; } -ssize_t debugfs_htab_write(struct file *file, const char __user *buf, +static ssize_t debugfs_htab_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { return -EACCES; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index d461c440889a..e4c4ea973e57 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -39,7 +39,6 @@ #include <asm/udbg.h> #include <asm/iommu.h> #include <asm/tce.h> -#include <asm/iommu.h> #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 3686471be32b..8dcbe37a4dac 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -54,6 +54,9 @@ #include <asm/dbell.h> #include <asm/hmi.h> #include <asm/pnv-pci.h> +#include <asm/mmu.h> +#include <asm/opal.h> 
+#include <asm/xics.h> #include <linux/gfp.h> #include <linux/vmalloc.h> #include <linux/highmem.h> @@ -62,6 +65,7 @@ #include <linux/irqbypass.h> #include <linux/module.h> #include <linux/compiler.h> +#include <linux/of.h> #include "book3s.h" @@ -104,23 +108,6 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); #endif -/* Maximum halt poll interval defaults to KVM_HALT_POLL_NS_DEFAULT */ -static unsigned int halt_poll_max_ns = KVM_HALT_POLL_NS_DEFAULT; -module_param(halt_poll_max_ns, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(halt_poll_max_ns, "Maximum halt poll time in ns"); - -/* Factor by which the vcore halt poll interval is grown, default is to double - */ -static unsigned int halt_poll_ns_grow = 2; -module_param(halt_poll_ns_grow, int, S_IRUGO); -MODULE_PARM_DESC(halt_poll_ns_grow, "Factor halt poll time is grown by"); - -/* Factor by which the vcore halt poll interval is shrunk, default is to reset - */ -static unsigned int halt_poll_ns_shrink; -module_param(halt_poll_ns_shrink, int, S_IRUGO); -MODULE_PARM_DESC(halt_poll_ns_shrink, "Factor halt poll time is shrunk by"); - static void kvmppc_end_cede(struct kvm_vcpu *vcpu); static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu); @@ -146,12 +133,21 @@ static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc, static bool kvmppc_ipi_thread(int cpu) { + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); + + /* On POWER9 we can use msgsnd to IPI any cpu */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + msg |= get_hard_smp_processor_id(cpu); + smp_mb(); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + return true; + } + /* On POWER8 for IPIs to threads in the same core, use msgsnd */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) { preempt_disable(); if (cpu_first_thread_sibling(cpu) == cpu_first_thread_sibling(smp_processor_id())) { - unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); msg 
|= cpu_thread_in_core(cpu); smp_mb(); __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); @@ -162,8 +158,12 @@ static bool kvmppc_ipi_thread(int cpu) } #if defined(CONFIG_PPC_ICP_NATIVE) && defined(CONFIG_SMP) - if (cpu >= 0 && cpu < nr_cpu_ids && paca[cpu].kvm_hstate.xics_phys) { - xics_wake_cpu(cpu); + if (cpu >= 0 && cpu < nr_cpu_ids) { + if (paca[cpu].kvm_hstate.xics_phys) { + xics_wake_cpu(cpu); + return true; + } + opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); return true; } #endif @@ -299,41 +299,54 @@ static void kvmppc_set_pvr_hv(struct kvm_vcpu *vcpu, u32 pvr) vcpu->arch.pvr = pvr; } +/* Dummy value used in computing PCR value below */ +#define PCR_ARCH_300 (PCR_ARCH_207 << 1) + static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat) { - unsigned long pcr = 0; + unsigned long host_pcr_bit = 0, guest_pcr_bit = 0; struct kvmppc_vcore *vc = vcpu->arch.vcore; + /* We can (emulate) our own architecture version and anything older */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + host_pcr_bit = PCR_ARCH_300; + else if (cpu_has_feature(CPU_FTR_ARCH_207S)) + host_pcr_bit = PCR_ARCH_207; + else if (cpu_has_feature(CPU_FTR_ARCH_206)) + host_pcr_bit = PCR_ARCH_206; + else + host_pcr_bit = PCR_ARCH_205; + + /* Determine lowest PCR bit needed to run guest in given PVR level */ + guest_pcr_bit = host_pcr_bit; if (arch_compat) { switch (arch_compat) { case PVR_ARCH_205: - /* - * If an arch bit is set in PCR, all the defined - * higher-order arch bits also have to be set. 
- */ - pcr = PCR_ARCH_206 | PCR_ARCH_205; + guest_pcr_bit = PCR_ARCH_205; break; case PVR_ARCH_206: case PVR_ARCH_206p: - pcr = PCR_ARCH_206; + guest_pcr_bit = PCR_ARCH_206; break; case PVR_ARCH_207: + guest_pcr_bit = PCR_ARCH_207; + break; + case PVR_ARCH_300: + guest_pcr_bit = PCR_ARCH_300; break; default: return -EINVAL; } - - if (!cpu_has_feature(CPU_FTR_ARCH_207S)) { - /* POWER7 can't emulate POWER8 */ - if (!(pcr & PCR_ARCH_206)) - return -EINVAL; - pcr &= ~PCR_ARCH_206; - } } + /* Check requested PCR bits don't exceed our capabilities */ + if (guest_pcr_bit > host_pcr_bit) + return -EINVAL; + spin_lock(&vc->lock); vc->arch_compat = arch_compat; - vc->pcr = pcr; + /* Set all PCR bits for which guest_pcr_bit <= bit < host_pcr_bit */ + vc->pcr = host_pcr_bit - guest_pcr_bit; spin_unlock(&vc->lock); return 0; @@ -945,6 +958,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, break; case BOOK3S_INTERRUPT_EXTERNAL: case BOOK3S_INTERRUPT_H_DOORBELL: + case BOOK3S_INTERRUPT_H_VIRT: vcpu->stat.ext_intr_exits++; r = RESUME_GUEST; break; @@ -1229,6 +1243,12 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_WORT: *val = get_reg_val(id, vcpu->arch.wort); break; + case KVM_REG_PPC_TIDR: + *val = get_reg_val(id, vcpu->arch.tid); + break; + case KVM_REG_PPC_PSSCR: + *val = get_reg_val(id, vcpu->arch.psscr); + break; case KVM_REG_PPC_VPA_ADDR: spin_lock(&vcpu->arch.vpa_update_lock); *val = get_reg_val(id, vcpu->arch.vpa.next_gpa); @@ -1288,6 +1308,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_TM_CR: *val = get_reg_val(id, vcpu->arch.cr_tm); break; + case KVM_REG_PPC_TM_XER: + *val = get_reg_val(id, vcpu->arch.xer_tm); + break; case KVM_REG_PPC_TM_LR: *val = get_reg_val(id, vcpu->arch.lr_tm); break; @@ -1427,6 +1450,12 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_WORT: vcpu->arch.wort = set_reg_val(id, *val); break; + case 
KVM_REG_PPC_TIDR: + vcpu->arch.tid = set_reg_val(id, *val); + break; + case KVM_REG_PPC_PSSCR: + vcpu->arch.psscr = set_reg_val(id, *val) & PSSCR_GUEST_VIS; + break; case KVM_REG_PPC_VPA_ADDR: addr = set_reg_val(id, *val); r = -EINVAL; @@ -1498,6 +1527,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_TM_CR: vcpu->arch.cr_tm = set_reg_val(id, *val); break; + case KVM_REG_PPC_TM_XER: + vcpu->arch.xer_tm = set_reg_val(id, *val); + break; case KVM_REG_PPC_TM_LR: vcpu->arch.lr_tm = set_reg_val(id, *val); break; @@ -1540,6 +1572,20 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, return r; } +/* + * On POWER9, threads are independent and can be in different partitions. + * Therefore we consider each thread to be a subcore. + * There is a restriction that all threads have to be in the same + * MMU mode (radix or HPT), unfortunately, but since we only support + * HPT guests on a HPT host so far, that isn't an impediment yet. + */ +static int threads_per_vcore(void) +{ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + return 1; + return threads_per_subcore; +} + static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) { struct kvmppc_vcore *vcore; @@ -1554,7 +1600,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core) init_swait_queue_head(&vcore->wq); vcore->preempt_tb = TB_NIL; vcore->lpcr = kvm->arch.lpcr; - vcore->first_vcpuid = core * threads_per_subcore; + vcore->first_vcpuid = core * threads_per_vcore(); vcore->kvm = kvm; INIT_LIST_HEAD(&vcore->preempt_list); @@ -1717,7 +1763,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, int core; struct kvmppc_vcore *vcore; - core = id / threads_per_subcore; + core = id / threads_per_vcore(); if (core >= KVM_MAX_VCORES) goto out; @@ -1935,7 +1981,10 @@ static void kvmppc_wait_for_nap(void) { int cpu = smp_processor_id(); int i, loops; + int n_threads = threads_per_vcore(); + if (n_threads <= 1) + return; for (loops = 
0; loops < 1000000; ++loops) { /* * Check if all threads are finished. @@ -1943,17 +1992,17 @@ static void kvmppc_wait_for_nap(void) * and the thread clears it when finished, so we look * for any threads that still have a non-NULL vcore ptr. */ - for (i = 1; i < threads_per_subcore; ++i) + for (i = 1; i < n_threads; ++i) if (paca[cpu + i].kvm_hstate.kvm_vcore) break; - if (i == threads_per_subcore) { + if (i == n_threads) { HMT_medium(); return; } HMT_low(); } HMT_medium(); - for (i = 1; i < threads_per_subcore; ++i) + for (i = 1; i < n_threads; ++i) if (paca[cpu + i].kvm_hstate.kvm_vcore) pr_err("KVM: CPU %d seems to be stuck\n", cpu + i); } @@ -2019,7 +2068,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc) vc->vcore_state = VCORE_PREEMPT; vc->pcpu = smp_processor_id(); - if (vc->num_threads < threads_per_subcore) { + if (vc->num_threads < threads_per_vcore()) { spin_lock(&lp->lock); list_add_tail(&vc->preempt_list, &lp->list); spin_unlock(&lp->lock); @@ -2123,8 +2172,7 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) cip->subcore_threads[sub] = vc->num_threads; cip->subcore_vm[sub] = vc->kvm; init_master_vcore(vc); - list_del(&vc->preempt_list); - list_add_tail(&vc->preempt_list, &cip->vcs[sub]); + list_move_tail(&vc->preempt_list, &cip->vcs[sub]); return true; } @@ -2254,12 +2302,12 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) * enter the guest. Only do this if it is the primary thread of the * core (not if a subcore) that is entering the guest. 
*/ -static inline void kvmppc_clear_host_core(int cpu) +static inline int kvmppc_clear_host_core(unsigned int cpu) { int core; if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) - return; + return 0; /* * Memory barrier can be omitted here as we will do a smp_wmb() * later in kvmppc_start_thread and we need ensure that state is @@ -2267,6 +2315,7 @@ static inline void kvmppc_clear_host_core(int cpu) */ core = cpu >> threads_shift; kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 0; + return 0; } /* @@ -2274,12 +2323,12 @@ static inline void kvmppc_clear_host_core(int cpu) * Only need to do this if it is the primary thread of the core that is * exiting. */ -static inline void kvmppc_set_host_core(int cpu) +static inline int kvmppc_set_host_core(unsigned int cpu) { int core; if (!kvmppc_host_rm_ops_hv || cpu_thread_in_core(cpu)) - return; + return 0; /* * Memory barrier can be omitted here because we do a spin_unlock @@ -2287,6 +2336,7 @@ static inline void kvmppc_set_host_core(int cpu) */ core = cpu >> threads_shift; kvmppc_host_rm_ops_hv->rm_core[core].rm_state.in_host = 1; + return 0; } /* @@ -2307,6 +2357,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) unsigned long cmd_bit, stat_bit; int pcpu, thr; int target_threads; + int controlled_threads; /* * Remove from the list any threads that have a signal pending @@ -2325,11 +2376,18 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) vc->preempt_tb = TB_NIL; /* + * Number of threads that we will be controlling: the same as + * the number of threads per subcore, except on POWER9, + * where it's 1 because the threads are (mostly) independent. + */ + controlled_threads = threads_per_vcore(); + + /* * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. 
*/ - if ((threads_per_core > 1) && + if ((controlled_threads > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { for_each_runnable_thread(i, vcpu, vc) { vcpu->arch.ret = -EBUSY; @@ -2345,7 +2403,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) */ init_core_info(&core_info, vc); pcpu = smp_processor_id(); - target_threads = threads_per_subcore; + target_threads = controlled_threads; if (target_smt_mode && target_smt_mode < target_threads) target_threads = target_smt_mode; if (vc->num_threads < target_threads) @@ -2381,7 +2439,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) smp_wmb(); } pcpu = smp_processor_id(); - for (thr = 0; thr < threads_per_subcore; ++thr) + for (thr = 0; thr < controlled_threads; ++thr) paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip; /* Initiate micro-threading (split-core) if required */ @@ -2491,7 +2549,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) } /* Let secondaries go back to the offline loop */ - for (i = 0; i < threads_per_subcore; ++i) { + for (i = 0; i < controlled_threads; ++i) { kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); @@ -2543,9 +2601,6 @@ static void grow_halt_poll_ns(struct kvmppc_vcore *vc) vc->halt_poll_ns = 10000; else vc->halt_poll_ns *= halt_poll_ns_grow; - - if (vc->halt_poll_ns > halt_poll_max_ns) - vc->halt_poll_ns = halt_poll_max_ns; } static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) @@ -2556,7 +2611,8 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc) vc->halt_poll_ns /= halt_poll_ns_shrink; } -/* Check to see if any of the runnable vcpus on the vcore have pending +/* + * Check to see if any of the runnable vcpus on the vcore have pending * exceptions or are no longer ceded */ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc) @@ -2655,16 +2711,18 @@ out: } /* Adjust poll time */ - if (halt_poll_max_ns) { + if (halt_poll_ns) { if (block_ns <= vc->halt_poll_ns) 
; /* We slept and blocked for longer than the max halt time */ - else if (vc->halt_poll_ns && block_ns > halt_poll_max_ns) + else if (vc->halt_poll_ns && block_ns > halt_poll_ns) shrink_halt_poll_ns(vc); /* We slept and our poll time is too small */ - else if (vc->halt_poll_ns < halt_poll_max_ns && - block_ns < halt_poll_max_ns) + else if (vc->halt_poll_ns < halt_poll_ns && + block_ns < halt_poll_ns) grow_halt_poll_ns(vc); + if (vc->halt_poll_ns > halt_poll_ns) + vc->halt_poll_ns = halt_poll_ns; } else vc->halt_poll_ns = 0; @@ -2971,6 +3029,15 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, struct kvm_memslots *slots; struct kvm_memory_slot *memslot; + /* + * If we are making a new memslot, it might make + * some address that was previously cached as emulated + * MMIO be no longer emulated MMIO, so invalidate + * all the caches of emulated MMIO translations. + */ + if (npages) + atomic64_inc(&kvm->arch.mmio_update); + if (npages && old->npages) { /* * If modifying a memslot, reset all the rmap dirty bits. 
@@ -3015,6 +3082,22 @@ static void kvmppc_mmu_destroy_hv(struct kvm_vcpu *vcpu) return; } +static void kvmppc_setup_partition_table(struct kvm *kvm) +{ + unsigned long dw0, dw1; + + /* PS field - page size for VRMA */ + dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | + ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); + /* HTABSIZE and HTABORG fields */ + dw0 |= kvm->arch.sdr1; + + /* Second dword has GR=0; other fields are unused since UPRT=0 */ + dw1 = 0; + + mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); +} + static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) { int err = 0; @@ -3066,17 +3149,20 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) psize == 0x1000000)) goto out_srcu; - /* Update VRMASD field in the LPCR */ senc = slb_pgsize_encoding(psize); kvm->arch.vrma_slb_v = senc | SLB_VSID_B_1T | (VRMA_VSID << SLB_VSID_SHIFT_1T); - /* the -4 is to account for senc values starting at 0x10 */ - lpcr = senc << (LPCR_VRMASD_SH - 4); - /* Create HPTEs in the hash page table for the VRMA */ kvmppc_map_vrma(vcpu, memslot, porder); - kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + /* Update VRMASD field in the LPCR */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) { + /* the -4 is to account for senc values starting at 0x10 */ + lpcr = senc << (LPCR_VRMASD_SH - 4); + kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD); + } else { + kvmppc_setup_partition_table(kvm); + } /* Order updates to kvm->arch.lpcr etc. vs. 
hpte_setup_done */ smp_wmb(); @@ -3094,36 +3180,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu) } #ifdef CONFIG_KVM_XICS -static int kvmppc_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - unsigned long cpu = (long)hcpu; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - kvmppc_set_host_core(cpu); - break; - -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - kvmppc_clear_host_core(cpu); - break; -#endif - default: - break; - } - - return NOTIFY_OK; -} - -static struct notifier_block kvmppc_cpu_notifier = { - .notifier_call = kvmppc_cpu_notify, -}; - /* * Allocate a per-core structure for managing state about which cores are * running in the host versus the guest and for exchanging data between @@ -3185,15 +3241,17 @@ void kvmppc_alloc_host_rm_ops(void) return; } - register_cpu_notifier(&kvmppc_cpu_notifier); - + cpuhp_setup_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE, + "ppc/kvm_book3s:prepare", + kvmppc_set_host_core, + kvmppc_clear_host_core); put_online_cpus(); } void kvmppc_free_host_rm_ops(void) { if (kvmppc_host_rm_ops_hv) { - unregister_cpu_notifier(&kvmppc_cpu_notifier); + cpuhp_remove_state_nocalls(CPUHP_KVM_PPC_BOOK3S_PREPARE); kfree(kvmppc_host_rm_ops_hv->rm_core); kfree(kvmppc_host_rm_ops_hv); kvmppc_host_rm_ops_hv = NULL; @@ -3219,14 +3277,18 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) * Since we don't flush the TLB when tearing down a VM, * and this lpid might have previously been used, * make sure we flush on each core before running the new VM. + * On POWER9, the tlbie in mmu_partition_table_set_entry() + * does this flush for us. 
*/ - cpumask_setall(&kvm->arch.need_tlb_flush); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + cpumask_setall(&kvm->arch.need_tlb_flush); /* Start out with the default set of hcalls enabled */ memcpy(kvm->arch.enabled_hcalls, default_enabled_hcalls, sizeof(kvm->arch.enabled_hcalls)); - kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.host_sdr1 = mfspr(SPRN_SDR1); /* Init LPCR for virtual RMA mode */ kvm->arch.host_lpid = mfspr(SPRN_LPID); @@ -3239,9 +3301,29 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* On POWER8 turn on online bit to enable PURR/SPURR */ if (cpu_has_feature(CPU_FTR_ARCH_207S)) lpcr |= LPCR_ONL; + /* + * On POWER9, VPM0 bit is reserved (VPM0=1 behaviour is assumed) + * Set HVICE bit to enable hypervisor virtualization interrupts. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + lpcr &= ~LPCR_VPM0; + lpcr |= LPCR_HVICE; + } + kvm->arch.lpcr = lpcr; /* + * Work out how many sets the TLB has, for the use of + * the TLB invalidation loop in book3s_hv_rmhandlers.S. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ + else if (cpu_has_feature(CPU_FTR_ARCH_207S)) + kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ + else + kvm->arch.tlb_sets = POWER7_TLB_SETS; /* 128 */ + + /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. */ @@ -3305,9 +3387,9 @@ static int kvmppc_core_check_processor_compat_hv(void) !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; /* - * Disable KVM for Power9, untill the required bits merged. + * Disable KVM for Power9 in radix mode. 
*/ - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) return -EIO; return 0; @@ -3661,6 +3743,23 @@ static int kvmppc_book3s_init_hv(void) if (r) return r; + /* + * We need a way of accessing the XICS interrupt controller, + * either directly, via paca[cpu].kvm_hstate.xics_phys, or + * indirectly, via OPAL. + */ +#ifdef CONFIG_SMP + if (!get_paca()->kvm_hstate.xics_phys) { + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc"); + if (!np) { + pr_err("KVM-HV: Cannot determine method for accessing XICS\n"); + return -ENODEV; + } + } +#endif + kvm_ops_hv.owner = THIS_MODULE; kvmppc_hv_ops = &kvm_ops_hv; @@ -3683,3 +3782,4 @@ module_exit(kvmppc_book3s_exit_hv); MODULE_LICENSE("GPL"); MODULE_ALIAS_MISCDEV(KVM_MINOR); MODULE_ALIAS("devname:kvm"); + diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 0c84d6bc8356..5bb24be0b346 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -26,6 +26,8 @@ #include <asm/dbell.h> #include <asm/cputhreads.h> #include <asm/io.h> +#include <asm/opal.h> +#include <asm/smp.h> #define KVM_CMA_CHUNK_ORDER 18 @@ -205,12 +207,18 @@ static inline void rm_writeb(unsigned long paddr, u8 val) void kvmhv_rm_send_ipi(int cpu) { unsigned long xics_phys; + unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); - /* On POWER8 for IPIs to threads in the same core, use msgsnd */ + /* On POWER9 we can use msgsnd for any destination cpu. */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + msg |= get_hard_smp_processor_id(cpu); + __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); + return; + } + /* On POWER8 for IPIs to threads in the same core, use msgsnd. 
*/ if (cpu_has_feature(CPU_FTR_ARCH_207S) && cpu_first_thread_sibling(cpu) == cpu_first_thread_sibling(raw_smp_processor_id())) { - unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); msg |= cpu_thread_in_core(cpu); __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); return; @@ -218,7 +226,11 @@ void kvmhv_rm_send_ipi(int cpu) /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; - rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + if (xics_phys) + rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); + else + opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), + IPI_PRIORITY); } /* @@ -329,7 +341,7 @@ static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap, * saved a copy of the XIRR in the PACA, it will be picked up by * the host ICP driver. */ -static int kvmppc_check_passthru(u32 xisr, __be32 xirr) +static int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again) { struct kvmppc_passthru_irqmap *pimap; struct kvmppc_irq_map *irq_map; @@ -348,11 +360,11 @@ static int kvmppc_check_passthru(u32 xisr, __be32 xirr) /* We're handling this interrupt, generic code doesn't need to */ local_paca->kvm_hstate.saved_xirr = 0; - return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap); + return kvmppc_deliver_irq_passthru(vcpu, xirr, irq_map, pimap, again); } #else -static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) +static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr, bool *again) { return 1; } @@ -367,14 +379,31 @@ static inline int kvmppc_check_passthru(u32 xisr, __be32 xirr) * -1 if there was a guest wakeup IPI (which has now been cleared) * -2 if there is PCI passthrough external interrupt that was handled */ +static long kvmppc_read_one_intr(bool *again); long kvmppc_read_intr(void) { + long ret = 0; + long rc; + bool again; + + do { + again = false; + rc = kvmppc_read_one_intr(&again); + if (rc && (ret == 0 || rc > ret)) + ret = rc; + } while (again); + return ret; +} + +static long 
kvmppc_read_one_intr(bool *again) +{ unsigned long xics_phys; u32 h_xirr; __be32 xirr; u32 xisr; u8 host_ipi; + int64_t rc; /* see if a host IPI is pending */ host_ipi = local_paca->kvm_hstate.host_ipi; @@ -383,8 +412,14 @@ long kvmppc_read_intr(void) /* Now read the interrupt from the ICP */ xics_phys = local_paca->kvm_hstate.xics_phys; - if (unlikely(!xics_phys)) - return 1; + if (!xics_phys) { + /* Use OPAL to read the XIRR */ + rc = opal_rm_int_get_xirr(&xirr, false); + if (rc < 0) + return 1; + } else { + xirr = _lwzcix(xics_phys + XICS_XIRR); + } /* * Save XIRR for later. Since we get control in reverse endian @@ -392,7 +427,6 @@ long kvmppc_read_intr(void) * host endian. Note that xirr is the value read from the * XIRR register, while h_xirr is the host endian version. */ - xirr = _lwzcix(xics_phys + XICS_XIRR); h_xirr = be32_to_cpu(xirr); local_paca->kvm_hstate.saved_xirr = h_xirr; xisr = h_xirr & 0xffffff; @@ -411,8 +445,16 @@ long kvmppc_read_intr(void) * If it is an IPI, clear the MFRR and EOI it. */ if (xisr == XICS_IPI) { - _stbcix(xics_phys + XICS_MFRR, 0xff); - _stwcix(xics_phys + XICS_XIRR, xirr); + if (xics_phys) { + _stbcix(xics_phys + XICS_MFRR, 0xff); + _stwcix(xics_phys + XICS_XIRR, xirr); + } else { + opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); + rc = opal_rm_int_eoi(h_xirr); + /* If rc > 0, there is another interrupt pending */ + *again = rc > 0; + } + /* * Need to ensure side effects of above stores * complete before proceeding. 
@@ -429,7 +471,11 @@ long kvmppc_read_intr(void) /* We raced with the host, * we need to resend that IPI, bummer */ - _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + if (xics_phys) + _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + else + opal_rm_int_set_mfrr(hard_smp_processor_id(), + IPI_PRIORITY); /* Let side effects complete */ smp_mb(); return 1; @@ -440,5 +486,5 @@ long kvmppc_read_intr(void) return -1; } - return kvmppc_check_passthru(xisr, xirr); + return kvmppc_check_passthru(xisr, xirr, again); } diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index 0fa70a9618d7..7ef0993214f3 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -16,6 +16,7 @@ #include <asm/machdep.h> #include <asm/cputhreads.h> #include <asm/hmi.h> +#include <asm/kvm_ppc.h> /* SRR1 bits for machine check on POWER7 */ #define SRR1_MC_LDSTERR (1ul << (63-42)) diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 99b4e9d5dd23..9ef3c4be952f 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -264,8 +264,10 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, if (pa) pteh |= HPTE_V_VALID; - else + else { pteh |= HPTE_V_ABSENT; + ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO); + } /*If we had host pte mapping then Check WIMG */ if (ptep && !hpte_cache_flags_ok(ptel, is_ci)) { @@ -351,6 +353,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, /* inval in progress, write a non-present HPTE */ pteh |= HPTE_V_ABSENT; pteh &= ~HPTE_V_VALID; + ptel &= ~(HPTE_R_KEY_HI | HPTE_R_KEY_LO); unlock_rmap(rmap); } else { kvmppc_add_revmap_chain(kvm, rev, rmap, pte_index, @@ -361,6 +364,11 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, } } + /* Convert to new format on P9 */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + ptel = hpte_old_to_new_r(pteh, ptel); + pteh = hpte_old_to_new_v(pteh); + } hpte[1] = cpu_to_be64(ptel); /* 
Write the first HPTE dword, unlocking the HPTE and making it valid */ @@ -386,6 +394,13 @@ long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags, #define LOCK_TOKEN (*(u32 *)(&get_paca()->paca_index)) #endif +static inline int is_mmio_hpte(unsigned long v, unsigned long r) +{ + return ((v & HPTE_V_ABSENT) && + (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)); +} + static inline int try_lock_tlbie(unsigned int *lock) { unsigned int tmp, old; @@ -409,13 +424,18 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, { long i; + /* + * We use the POWER9 5-operand versions of tlbie and tlbiel here. + * Since we are using RIC=0 PRS=0 R=0, and P7/P8 tlbiel ignores + * the RS field, this is backwards-compatible with P7 and P8. + */ if (global) { while (!try_lock_tlbie(&kvm->arch.tlbie_lock)) cpu_relax(); if (need_sync) asm volatile("ptesync" : : : "memory"); for (i = 0; i < npages; ++i) - asm volatile(PPC_TLBIE(%1,%0) : : + asm volatile(PPC_TLBIE_5(%0,%1,0,0,0) : : "r" (rbvalues[i]), "r" (kvm->arch.lpid)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); kvm->arch.tlbie_lock = 0; @@ -423,7 +443,8 @@ static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues, if (need_sync) asm volatile("ptesync" : : : "memory"); for (i = 0; i < npages; ++i) - asm volatile("tlbiel %0" : : "r" (rbvalues[i])); + asm volatile(PPC_TLBIEL(%0,%1,0,0,0) : : + "r" (rbvalues[i]), "r" (0)); asm volatile("ptesync" : : : "memory"); } } @@ -435,18 +456,23 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, __be64 *hpte; unsigned long v, r, rb; struct revmap_entry *rev; - u64 pte; + u64 pte, orig_pte, pte_r; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(hpte[0]); + pte = orig_pte = be64_to_cpu(hpte[0]); + pte_r = be64_to_cpu(hpte[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + pte = 
hpte_new_to_old_v(pte, pte_r); + pte_r = hpte_new_to_old_r(pte_r); + } if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn) || ((flags & H_ANDCOND) && (pte & avpn) != 0)) { - __unlock_hpte(hpte, pte); + __unlock_hpte(hpte, orig_pte); return H_NOT_FOUND; } @@ -454,7 +480,7 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, v = pte & ~HPTE_V_HVLOCK; if (v & HPTE_V_VALID) { hpte[0] &= ~cpu_to_be64(HPTE_V_VALID); - rb = compute_tlbie_rb(v, be64_to_cpu(hpte[1]), pte_index); + rb = compute_tlbie_rb(v, pte_r, pte_index); do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); /* * The reference (R) and change (C) bits in a HPT @@ -472,6 +498,9 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, note_hpte_modification(kvm, rev); unlock_hpte(hpte, 0); + if (is_mmio_hpte(v, pte_r)) + atomic64_inc(&kvm->arch.mmio_update); + if (v & HPTE_V_ABSENT) v = (v & ~HPTE_V_ABSENT) | HPTE_V_VALID; hpret[0] = v; @@ -498,7 +527,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) int global; long int ret = H_SUCCESS; struct revmap_entry *rev, *revs[4]; - u64 hp0; + u64 hp0, hp1; global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { @@ -531,6 +560,11 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) } found = 0; hp0 = be64_to_cpu(hp[0]); + hp1 = be64_to_cpu(hp[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hp0 = hpte_new_to_old_v(hp0, hp1); + hp1 = hpte_new_to_old_r(hp1); + } if (hp0 & (HPTE_V_ABSENT | HPTE_V_VALID)) { switch (flags & 3) { case 0: /* absolute */ @@ -561,13 +595,14 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) rcbits = rev->guest_rpte & (HPTE_R_R|HPTE_R_C); args[j] |= rcbits << (56 - 5); hp[0] = 0; + if (is_mmio_hpte(hp0, hp1)) + atomic64_inc(&kvm->arch.mmio_update); continue; } /* leave it locked */ hp[0] &= ~cpu_to_be64(HPTE_V_VALID); - tlbrb[n] = compute_tlbie_rb(be64_to_cpu(hp[0]), - be64_to_cpu(hp[1]), pte_index); + tlbrb[n] = compute_tlbie_rb(hp0, hp1, 
pte_index); indexes[n] = j; hptes[n] = hp; revs[n] = rev; @@ -605,7 +640,7 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, __be64 *hpte; struct revmap_entry *rev; unsigned long v, r, rb, mask, bits; - u64 pte; + u64 pte_v, pte_r; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -613,14 +648,16 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); while (!try_lock_hpte(hpte, HPTE_V_HVLOCK)) cpu_relax(); - pte = be64_to_cpu(hpte[0]); - if ((pte & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || - ((flags & H_AVPN) && (pte & ~0x7fUL) != avpn)) { - __unlock_hpte(hpte, pte); + v = pte_v = be64_to_cpu(hpte[0]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hpte[1])); + if ((v & (HPTE_V_ABSENT | HPTE_V_VALID)) == 0 || + ((flags & H_AVPN) && (v & ~0x7fUL) != avpn)) { + __unlock_hpte(hpte, pte_v); return H_NOT_FOUND; } - v = pte; + pte_r = be64_to_cpu(hpte[1]); bits = (flags << 55) & HPTE_R_PP0; bits |= (flags << 48) & HPTE_R_KEY_HI; bits |= flags & (HPTE_R_PP | HPTE_R_N | HPTE_R_KEY_LO); @@ -642,22 +679,26 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, * readonly to writable. If it should be writable, we'll * take a trap and let the page fault code sort it out. 
*/ - pte = be64_to_cpu(hpte[1]); - r = (pte & ~mask) | bits; - if (hpte_is_writable(r) && !hpte_is_writable(pte)) + r = (pte_r & ~mask) | bits; + if (hpte_is_writable(r) && !hpte_is_writable(pte_r)) r = hpte_make_readonly(r); /* If the PTE is changing, invalidate it first */ - if (r != pte) { + if (r != pte_r) { rb = compute_tlbie_rb(v, r, pte_index); - hpte[0] = cpu_to_be64((v & ~HPTE_V_VALID) | + hpte[0] = cpu_to_be64((pte_v & ~HPTE_V_VALID) | HPTE_V_ABSENT); do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true); + /* Don't lose R/C bit updates done by hardware */ + r |= be64_to_cpu(hpte[1]) & (HPTE_R_R | HPTE_R_C); hpte[1] = cpu_to_be64(r); } } - unlock_hpte(hpte, v & ~HPTE_V_HVLOCK); + unlock_hpte(hpte, pte_v & ~HPTE_V_HVLOCK); asm volatile("ptesync" : : : "memory"); + if (is_mmio_hpte(v, pte_r)) + atomic64_inc(&kvm->arch.mmio_update); + return H_SUCCESS; } @@ -681,6 +722,10 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, r); + r = hpte_new_to_old_r(r); + } if (v & HPTE_V_ABSENT) { v &= ~HPTE_V_ABSENT; v |= HPTE_V_VALID; @@ -798,10 +843,16 @@ void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep, unsigned long pte_index) { unsigned long rb; + u64 hp0, hp1; hptep[0] &= ~cpu_to_be64(HPTE_V_VALID); - rb = compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]), - pte_index); + hp0 = be64_to_cpu(hptep[0]); + hp1 = be64_to_cpu(hptep[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hp0 = hpte_new_to_old_v(hp0, hp1); + hp1 = hpte_new_to_old_r(hp1); + } + rb = compute_tlbie_rb(hp0, hp1, pte_index); do_tlbies(kvm, &rb, 1, 1, true); } EXPORT_SYMBOL_GPL(kvmppc_invalidate_hpte); @@ -811,9 +862,15 @@ void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep, { unsigned long rb; unsigned char rbyte; + u64 hp0, hp1; - rb = 
compute_tlbie_rb(be64_to_cpu(hptep[0]), be64_to_cpu(hptep[1]), - pte_index); + hp0 = be64_to_cpu(hptep[0]); + hp1 = be64_to_cpu(hptep[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hp0 = hpte_new_to_old_v(hp0, hp1); + hp1 = hpte_new_to_old_r(hp1); + } + rb = compute_tlbie_rb(hp0, hp1, pte_index); rbyte = (be64_to_cpu(hptep[1]) & ~HPTE_R_R) >> 8; /* modify only the second-last byte, which contains the ref bit */ *((char *)hptep + 14) = rbyte; @@ -828,6 +885,37 @@ static int slb_base_page_shift[4] = { 20, /* 1M, unsupported */ }; +static struct mmio_hpte_cache_entry *mmio_cache_search(struct kvm_vcpu *vcpu, + unsigned long eaddr, unsigned long slb_v, long mmio_update) +{ + struct mmio_hpte_cache_entry *entry = NULL; + unsigned int pshift; + unsigned int i; + + for (i = 0; i < MMIO_HPTE_CACHE_SIZE; i++) { + entry = &vcpu->arch.mmio_cache.entry[i]; + if (entry->mmio_update == mmio_update) { + pshift = entry->slb_base_pshift; + if ((entry->eaddr >> pshift) == (eaddr >> pshift) && + entry->slb_v == slb_v) + return entry; + } + } + return NULL; +} + +static struct mmio_hpte_cache_entry * + next_mmio_cache_entry(struct kvm_vcpu *vcpu) +{ + unsigned int index = vcpu->arch.mmio_cache.index; + + vcpu->arch.mmio_cache.index++; + if (vcpu->arch.mmio_cache.index == MMIO_HPTE_CACHE_SIZE) + vcpu->arch.mmio_cache.index = 0; + + return &vcpu->arch.mmio_cache.entry[index]; +} + /* When called from virtmode, this func should be protected by * preempt_disable(), otherwise, the holding of HPTE_V_HVLOCK * can trigger deadlock issue. @@ -842,7 +930,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long avpn; __be64 *hpte; unsigned long mask, val; - unsigned long v, r; + unsigned long v, r, orig_v; /* Get page shift, work out hash and AVPN etc. 
*/ mask = SLB_VSID_B | HPTE_V_AVPN | HPTE_V_SECONDARY; @@ -877,6 +965,8 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, for (i = 0; i < 16; i += 2) { /* Read the PTE racily */ v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + v = hpte_new_to_old_v(v, be64_to_cpu(hpte[i+1])); /* Check valid/absent, hash, segment size and AVPN */ if (!(v & valid) || (v & mask) != val) @@ -885,8 +975,12 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, /* Lock the PTE and read it under the lock */ while (!try_lock_hpte(&hpte[i], HPTE_V_HVLOCK)) cpu_relax(); - v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; + v = orig_v = be64_to_cpu(hpte[i]) & ~HPTE_V_HVLOCK; r = be64_to_cpu(hpte[i+1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, r); + r = hpte_new_to_old_r(r); + } /* * Check the HPTE again, including base page size @@ -896,7 +990,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, /* Return with the HPTE still locked */ return (hash << 3) + (i >> 1); - __unlock_hpte(&hpte[i], v); + __unlock_hpte(&hpte[i], orig_v); } if (val & HPTE_V_SECONDARY) @@ -924,30 +1018,45 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, { struct kvm *kvm = vcpu->kvm; long int index; - unsigned long v, r, gr; + unsigned long v, r, gr, orig_v; __be64 *hpte; unsigned long valid; struct revmap_entry *rev; unsigned long pp, key; + struct mmio_hpte_cache_entry *cache_entry = NULL; + long mmio_update = 0; /* For protection fault, expect to find a valid HPTE */ valid = HPTE_V_VALID; - if (status & DSISR_NOHPTE) + if (status & DSISR_NOHPTE) { valid |= HPTE_V_ABSENT; - - index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); - if (index < 0) { - if (status & DSISR_NOHPTE) - return status; /* there really was no HPTE */ - return 0; /* for prot fault, HPTE disappeared */ + mmio_update = atomic64_read(&kvm->arch.mmio_update); + 
cache_entry = mmio_cache_search(vcpu, addr, slb_v, mmio_update); } - hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); - v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; - r = be64_to_cpu(hpte[1]); - rev = real_vmalloc_addr(&kvm->arch.revmap[index]); - gr = rev->guest_rpte; + if (cache_entry) { + index = cache_entry->pte_index; + v = cache_entry->hpte_v; + r = cache_entry->hpte_r; + gr = cache_entry->rpte; + } else { + index = kvmppc_hv_find_lock_hpte(kvm, addr, slb_v, valid); + if (index < 0) { + if (status & DSISR_NOHPTE) + return status; /* there really was no HPTE */ + return 0; /* for prot fault, HPTE disappeared */ + } + hpte = (__be64 *)(kvm->arch.hpt_virt + (index << 4)); + v = orig_v = be64_to_cpu(hpte[0]) & ~HPTE_V_HVLOCK; + r = be64_to_cpu(hpte[1]); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + v = hpte_new_to_old_v(v, r); + r = hpte_new_to_old_r(r); + } + rev = real_vmalloc_addr(&kvm->arch.revmap[index]); + gr = rev->guest_rpte; - unlock_hpte(hpte, v); + unlock_hpte(hpte, orig_v); + } /* For not found, if the HPTE is valid by now, retry the instruction */ if ((status & DSISR_NOHPTE) && (v & HPTE_V_VALID)) @@ -985,12 +1094,32 @@ long kvmppc_hpte_hv_fault(struct kvm_vcpu *vcpu, unsigned long addr, vcpu->arch.pgfault_index = index; vcpu->arch.pgfault_hpte[0] = v; vcpu->arch.pgfault_hpte[1] = r; + vcpu->arch.pgfault_cache = cache_entry; /* Check the storage key to see if it is possibly emulated MMIO */ - if (data && (vcpu->arch.shregs.msr & MSR_IR) && - (r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == - (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) - return -2; /* MMIO emulation - load instr word */ + if ((r & (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) == + (HPTE_R_KEY_HI | HPTE_R_KEY_LO)) { + if (!cache_entry) { + unsigned int pshift = 12; + unsigned int pshift_index; + + if (slb_v & SLB_VSID_L) { + pshift_index = ((slb_v & SLB_VSID_LP) >> 4); + pshift = slb_base_page_shift[pshift_index]; + } + cache_entry = next_mmio_cache_entry(vcpu); + cache_entry->eaddr = addr; + 
cache_entry->slb_base_pshift = pshift; + cache_entry->pte_index = index; + cache_entry->hpte_v = v; + cache_entry->hpte_r = r; + cache_entry->rpte = gr; + cache_entry->slb_v = slb_v; + cache_entry->mmio_update = mmio_update; + } + if (data && (vcpu->arch.shregs.msr & MSR_IR)) + return -2; /* MMIO emulation - load instr word */ + } return -1; /* send fault up to host kernel mode */ } diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index a0ea63ac2b52..06edc4366639 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -70,7 +70,11 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - icp_native_cause_ipi_rm(hcpu); + if (paca[hcpu].kvm_hstate.xics_phys) + icp_native_cause_ipi_rm(hcpu); + else + opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu), + IPI_PRIORITY); } #else static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } @@ -737,7 +741,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) unsigned long eoi_rc; -static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) +static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) { unsigned long xics_phys; int64_t rc; @@ -751,7 +755,12 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) /* EOI it */ xics_phys = local_paca->kvm_hstate.xics_phys; - _stwcix(xics_phys + XICS_XIRR, xirr); + if (xics_phys) { + _stwcix(xics_phys + XICS_XIRR, xirr); + } else { + rc = opal_rm_int_eoi(be32_to_cpu(xirr)); + *again = rc > 0; + } } static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu) @@ -809,9 +818,10 @@ static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) } long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, - u32 xirr, + __be32 xirr, struct kvmppc_irq_map *irq_map, - struct 
kvmppc_passthru_irqmap *pimap) + struct kvmppc_passthru_irqmap *pimap, + bool *again) { struct kvmppc_xics *xics; struct kvmppc_icp *icp; @@ -825,7 +835,8 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ - icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr); + icp_eoi(irq_desc_get_chip(irq_map->desc), irq_map->r_hwirq, xirr, + again); if (check_too_hard(xics, icp) == H_TOO_HARD) return 2; diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index c3c1d1bcfc67..9338a818e05c 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -501,17 +501,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) cmpwi r0, 0 beq 57f li r3, (LPCR_PECEDH | LPCR_PECE0) >> 4 - mfspr r4, SPRN_LPCR - rlwimi r4, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) - mtspr SPRN_LPCR, r4 - isync - std r0, HSTATE_SCRATCH0(r13) - ptesync - ld r0, HSTATE_SCRATCH0(r13) -1: cmpd r0, r0 - bne 1b - nap - b . + mfspr r5, SPRN_LPCR + rlwimi r5, r3, 4, (LPCR_PECEDP | LPCR_PECEDH | LPCR_PECE0 | LPCR_PECE1) + b kvm_nap_sequence 57: li r0, 0 stbx r0, r3, r4 @@ -523,6 +515,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * * *****************************************************************************/ +/* Stack frame offsets */ +#define STACK_SLOT_TID (112-16) +#define STACK_SLOT_PSSCR (112-24) + .global kvmppc_hv_entry kvmppc_hv_entry: @@ -581,12 +577,14 @@ kvmppc_hv_entry: ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 bne 10f - ld r6,KVM_SDR1(r9) lwz r7,KVM_LPID(r9) +BEGIN_FTR_SECTION + ld r6,KVM_SDR1(r9) li r0,LPID_RSVD /* switch to reserved LPID */ mtspr SPRN_LPID,r0 ptesync mtspr SPRN_SDR1,r6 /* switch to partition page table */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_LPID,r7 isync @@ -607,12 +605,8 @@ kvmppc_hv_entry: stdcx. 
r7,0,r6 bne 23b /* Flush the TLB of any entries for this LPID */ - /* use arch 2.07S as a proxy for POWER8 */ -BEGIN_FTR_SECTION - li r6,512 /* POWER8 has 512 sets */ -FTR_SECTION_ELSE - li r6,128 /* POWER7 has 128 sets */ -ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_207S) + lwz r6,KVM_TLB_SETS(r9) + li r0,0 /* RS for P9 version of tlbiel */ mtctr r6 li r7,0x800 /* IS field = 0b10 */ ptesync @@ -698,6 +692,14 @@ kvmppc_got_guest: mtspr SPRN_PURR,r7 mtspr SPRN_SPURR,r8 + /* Save host values of some registers */ +BEGIN_FTR_SECTION + mfspr r5, SPRN_TIDR + mfspr r6, SPRN_PSSCR + std r5, STACK_SLOT_TID(r1) + std r6, STACK_SLOT_PSSCR(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + BEGIN_FTR_SECTION /* Set partition DABR */ /* Do this before re-enabling PMU to avoid P7 DABR corruption bug */ @@ -750,14 +752,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG) BEGIN_FTR_SECTION ld r5, VCPU_MMCR + 24(r4) ld r6, VCPU_SIER(r4) + mtspr SPRN_MMCR2, r5 + mtspr SPRN_SIER, r6 +BEGIN_FTR_SECTION_NESTED(96) lwz r7, VCPU_PMC + 24(r4) lwz r8, VCPU_PMC + 28(r4) ld r9, VCPU_MMCR + 32(r4) - mtspr SPRN_MMCR2, r5 - mtspr SPRN_SIER, r6 mtspr SPRN_SPMC1, r7 mtspr SPRN_SPMC2, r8 mtspr SPRN_MMCRS, r9 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_MMCR0, r3 isync @@ -813,20 +817,30 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_EBBHR, r8 ld r5, VCPU_EBBRR(r4) ld r6, VCPU_BESCR(r4) - ld r7, VCPU_CSIGR(r4) - ld r8, VCPU_TACR(r4) + lwz r7, VCPU_GUEST_PID(r4) + ld r8, VCPU_WORT(r4) mtspr SPRN_EBBRR, r5 mtspr SPRN_BESCR, r6 - mtspr SPRN_CSIGR, r7 - mtspr SPRN_TACR, r8 + mtspr SPRN_PID, r7 + mtspr SPRN_WORT, r8 +BEGIN_FTR_SECTION + /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) ld r6, VCPU_ACOP(r4) - lwz r7, VCPU_GUEST_PID(r4) - ld r8, VCPU_WORT(r4) + ld r7, VCPU_CSIGR(r4) + ld r8, VCPU_TACR(r4) mtspr SPRN_TCSCR, r5 mtspr SPRN_ACOP, r6 - mtspr SPRN_PID, r7 - mtspr SPRN_WORT, r8 + mtspr SPRN_CSIGR, r7 + mtspr SPRN_TACR, r8 +FTR_SECTION_ELSE + /* 
POWER9-only registers */ + ld r5, VCPU_TID(r4) + ld r6, VCPU_PSSCR(r4) + oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */ + mtspr SPRN_TIDR, r5 + mtspr SPRN_PSSCR, r6 +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) 8: /* @@ -1341,20 +1355,29 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) std r8, VCPU_EBBHR(r9) mfspr r5, SPRN_EBBRR mfspr r6, SPRN_BESCR - mfspr r7, SPRN_CSIGR - mfspr r8, SPRN_TACR + mfspr r7, SPRN_PID + mfspr r8, SPRN_WORT std r5, VCPU_EBBRR(r9) std r6, VCPU_BESCR(r9) - std r7, VCPU_CSIGR(r9) - std r8, VCPU_TACR(r9) + stw r7, VCPU_GUEST_PID(r9) + std r8, VCPU_WORT(r9) +BEGIN_FTR_SECTION mfspr r5, SPRN_TCSCR mfspr r6, SPRN_ACOP - mfspr r7, SPRN_PID - mfspr r8, SPRN_WORT + mfspr r7, SPRN_CSIGR + mfspr r8, SPRN_TACR std r5, VCPU_TCSCR(r9) std r6, VCPU_ACOP(r9) - stw r7, VCPU_GUEST_PID(r9) - std r8, VCPU_WORT(r9) + std r7, VCPU_CSIGR(r9) + std r8, VCPU_TACR(r9) +FTR_SECTION_ELSE + mfspr r5, SPRN_TIDR + mfspr r6, SPRN_PSSCR + std r5, VCPU_TID(r9) + rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */ + rotldi r6, r6, 60 + std r6, VCPU_PSSCR(r9) +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) /* * Restore various registers to 0, where non-zero values * set by the guest could disrupt the host. 
@@ -1363,12 +1386,14 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_IAMR, r0 mtspr SPRN_CIABR, r0 mtspr SPRN_DAWRX, r0 - mtspr SPRN_TCSCR, r0 mtspr SPRN_WORT, r0 +BEGIN_FTR_SECTION + mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 sldi r0, r0, 31 mtspr SPRN_MMCRS, r0 +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) 8: /* Save and reset AMR and UAMOR before turning on the MMU */ @@ -1502,15 +1527,17 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) stw r8, VCPU_PMC + 20(r9) BEGIN_FTR_SECTION mfspr r5, SPRN_SIER + std r5, VCPU_SIER(r9) +BEGIN_FTR_SECTION_NESTED(96) mfspr r6, SPRN_SPMC1 mfspr r7, SPRN_SPMC2 mfspr r8, SPRN_MMCRS - std r5, VCPU_SIER(r9) stw r6, VCPU_PMC + 24(r9) stw r7, VCPU_PMC + 28(r9) std r8, VCPU_MMCR + 32(r9) lis r4, 0x8000 mtspr SPRN_MMCRS, r4 +END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) 22: /* Clear out SLB */ @@ -1519,6 +1546,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbia ptesync + /* Restore host values of some registers */ +BEGIN_FTR_SECTION + ld r5, STACK_SLOT_TID(r1) + ld r6, STACK_SLOT_PSSCR(r1) + mtspr SPRN_TIDR, r5 + mtspr SPRN_PSSCR, r6 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* * POWER7/POWER8 guest -> host partition switch code. 
* We don't have to lock against tlbies but we do @@ -1552,12 +1587,14 @@ kvmhv_switch_to_host: beq 19f /* Primary thread switches back to host partition */ - ld r6,KVM_HOST_SDR1(r4) lwz r7,KVM_HOST_LPID(r4) +BEGIN_FTR_SECTION + ld r6,KVM_HOST_SDR1(r4) li r8,LPID_RSVD /* switch to reserved LPID */ mtspr SPRN_LPID,r8 ptesync - mtspr SPRN_SDR1,r6 /* switch to partition page table */ + mtspr SPRN_SDR1,r6 /* switch to host page table */ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) mtspr SPRN_LPID,r7 isync @@ -2211,6 +2248,21 @@ BEGIN_FTR_SECTION ori r5, r5, LPCR_PECEDH rlwimi r5, r3, 0, LPCR_PECEDP END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) + +kvm_nap_sequence: /* desired LPCR value in r5 */ +BEGIN_FTR_SECTION + /* + * PSSCR bits: exit criterion = 1 (wakeup based on LPCR at sreset) + * enable state loss = 1 (allow SMT mode switch) + * requested level = 0 (just stop dispatching) + */ + lis r3, (PSSCR_EC | PSSCR_ESL)@h + mtspr SPRN_PSSCR, r3 + /* Set LPCR_PECE_HVEE bit to enable wakeup by HV interrupts */ + li r4, LPCR_PECE_HVEE@higher + sldi r4, r4, 32 + or r5, r5, r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mtspr SPRN_LPCR,r5 isync li r0, 0 @@ -2219,7 +2271,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ld r0, HSTATE_SCRATCH0(r13) 1: cmpd r0, r0 bne 1b +BEGIN_FTR_SECTION nap +FTR_SECTION_ELSE + PPC_STOP +ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) b . 33: mr r4, r3 @@ -2600,11 +2656,13 @@ kvmppc_save_tm: mfctr r7 mfspr r8, SPRN_AMR mfspr r10, SPRN_TAR + mfxer r11 std r5, VCPU_LR_TM(r9) stw r6, VCPU_CR_TM(r9) std r7, VCPU_CTR_TM(r9) std r8, VCPU_AMR_TM(r9) std r10, VCPU_TAR_TM(r9) + std r11, VCPU_XER_TM(r9) /* Restore r12 as trap number. 
*/ lwz r12, VCPU_TRAP(r9) @@ -2697,11 +2755,13 @@ kvmppc_restore_tm: ld r7, VCPU_CTR_TM(r4) ld r8, VCPU_AMR_TM(r4) ld r9, VCPU_TAR_TM(r4) + ld r10, VCPU_XER_TM(r4) mtlr r5 mtcr r6 mtctr r7 mtspr SPRN_AMR, r8 mtspr SPRN_TAR, r9 + mtxer r10 /* * Load up PPR and DSCR values but don't put them in the actual SPRs diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 70963c845e96..efd1183a6b16 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -536,7 +536,6 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: - case KVM_CAP_PPC_ALLOC_HTAB: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: @@ -545,13 +544,20 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) #endif r = 1; break; + + case KVM_CAP_PPC_ALLOC_HTAB: + r = hv_enabled; + break; #endif /* CONFIG_PPC_BOOK3S_64 */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE case KVM_CAP_PPC_SMT: - if (hv_enabled) - r = threads_per_subcore; - else - r = 0; + r = 0; + if (hv_enabled) { + if (cpu_has_feature(CPU_FTR_ARCH_300)) + r = 1; + else + r = threads_per_subcore; + } break; case KVM_CAP_PPC_RMA: r = 0; diff --git a/arch/powerpc/kvm/trace_hv.h b/arch/powerpc/kvm/trace_hv.h index fb21990c0fb4..ebc6dd449556 100644 --- a/arch/powerpc/kvm/trace_hv.h +++ b/arch/powerpc/kvm/trace_hv.h @@ -449,7 +449,7 @@ TRACE_EVENT(kvmppc_vcore_wakeup, __entry->tgid = current->tgid; ), - TP_printk("%s time %lld ns, tgid=%d", + TP_printk("%s time %llu ns, tgid=%d", __entry->waited ? 
"wait" : "poll", __entry->ns, __entry->tgid) ); diff --git a/arch/powerpc/mm/hash64_4k.c b/arch/powerpc/mm/hash64_4k.c index 42c702b3be1f..6fa450c12d6d 100644 --- a/arch/powerpc/mm/hash64_4k.c +++ b/arch/powerpc/mm/hash64_4k.c @@ -55,7 +55,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, */ rflags = htab_convert_pte_flags(new_pte); - if (!cpu_has_feature(CPU_FTR_NOEXECUTE) && + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); diff --git a/arch/powerpc/mm/hash64_64k.c b/arch/powerpc/mm/hash64_64k.c index 3bbbea07378c..1a68cb19b0e3 100644 --- a/arch/powerpc/mm/hash64_64k.c +++ b/arch/powerpc/mm/hash64_64k.c @@ -87,7 +87,7 @@ int __hash_page_4K(unsigned long ea, unsigned long access, unsigned long vsid, subpg_pte = new_pte & ~subpg_prot; rflags = htab_convert_pte_flags(subpg_pte); - if (!cpu_has_feature(CPU_FTR_NOEXECUTE) && + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) { /* @@ -258,7 +258,7 @@ int __hash_page_64K(unsigned long ea, unsigned long access, rflags = htab_convert_pte_flags(new_pte); - if (!cpu_has_feature(CPU_FTR_NOEXECUTE) && + if (cpu_has_feature(CPU_FTR_NOEXECUTE) && !cpu_has_feature(CPU_FTR_COHERENT_ICACHE)) rflags = hash_page_do_lazy_icache(rflags, __pte(old_pte), trap); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index 83ddc0e171b0..ad9fd5245be2 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -221,13 +221,18 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, return -1; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; + hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" i=%x hpte_v=%016lx, hpte_r=%016lx\n", i, hpte_v, hpte_r); } + if 
(cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_r = hpte_old_to_new_r(hpte_v, hpte_r); + hpte_v = hpte_old_to_new_v(hpte_v); + } + hptep->r = cpu_to_be64(hpte_r); /* Guarantee the second dword is visible before the valid bit */ eieio(); @@ -295,6 +300,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, vpn, want_v & HPTE_V_AVPN, slot, newpp); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); /* * We need to invalidate the TLB always because hpte_remove doesn't do * a tlb invalidate. If a hash bucket gets full, we "evict" a more/less @@ -309,6 +316,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, native_lock_hpte(hptep); /* recheck with locks held */ hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); if (unlikely(!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID))) { ret = -1; @@ -350,6 +359,8 @@ static long native_hpte_find(unsigned long vpn, int psize, int ssize) for (i = 0; i < HPTES_PER_GROUP; i++) { hptep = htab_address + slot; hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) /* HPTE matches */ @@ -409,6 +420,8 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, want_v = hpte_encode_avpn(vpn, bpsize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); /* * We need to invalidate the TLB always because hpte_remove doesn't do @@ -467,6 +480,8 @@ static void native_hugepage_invalidate(unsigned long vsid, want_v = hpte_encode_avpn(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = 
hpte_new_to_old_v(hpte_v, be64_to_cpu(hptep->r)); /* Even if we miss, we need to invalidate the TLB */ if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) @@ -504,6 +519,10 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, /* Look at the 8 bit LP value */ unsigned int lp = (hpte_r >> LP_SHIFT) & ((1 << LP_BITS) - 1); + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + hpte_v = hpte_new_to_old_v(hpte_v, hpte_r); + hpte_r = hpte_new_to_old_r(hpte_r); + } if (!(hpte_v & HPTE_V_LARGE)) { size = MMU_PAGE_4K; a_size = MMU_PAGE_4K; @@ -512,11 +531,7 @@ static void hpte_decode(struct hash_pte *hpte, unsigned long slot, a_size = hpte_page_sizes[lp] >> 4; } /* This works for all page sizes, and for 256M and 1T segments */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) - *ssize = hpte_r >> HPTE_R_3_0_SSIZE_SHIFT; - else - *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; - + *ssize = hpte_v >> HPTE_V_SSIZE_SHIFT; shift = mmu_psize_defs[size].shift; avpn = (HPTE_V_AVPN_VAL(hpte_v) & ~mmu_psize_defs[size].avpnm); @@ -639,6 +654,9 @@ static void native_flush_hash_range(unsigned long number, int local) want_v = hpte_encode_avpn(vpn, psize, ssize); native_lock_hpte(hptep); hpte_v = be64_to_cpu(hptep->v); + if (cpu_has_feature(CPU_FTR_ARCH_300)) + hpte_v = hpte_new_to_old_v(hpte_v, + be64_to_cpu(hptep->r)); if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 78dabf065ba9..8410b4bb36ed 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -796,37 +796,17 @@ static void update_hid_for_hash(void) static void __init hash_init_partition_table(phys_addr_t hash_table, unsigned long htab_size) { - unsigned long ps_field; - unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + mmu_partition_table_init(); /* - * slb llp encoding for the page size used in VPM real mode. 
- * We can ignore that for lpid 0 + * PS field (VRMA page size) is not used for LPID 0, hence set to 0. + * For now, UPRT is 0 and we have no segment table. */ - ps_field = 0; htab_size = __ilog2(htab_size) - 18; - - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); - partition_tb = __va(memblock_alloc_base(patb_size, patb_size, - MEMBLOCK_ALLOC_ANYWHERE)); - - /* Initialize the Partition Table with no entries */ - memset((void *)partition_tb, 0, patb_size); - partition_tb->patb0 = cpu_to_be64(ps_field | hash_table | htab_size); - /* - * FIXME!! This should be done via update_partition table - * For now UPRT is 0 for us. - */ - partition_tb->patb1 = 0; + mmu_partition_table_set_entry(0, hash_table | htab_size, 0); pr_info("Partition table %p\n", partition_tb); if (cpu_has_feature(CPU_FTR_POWER9_DD1)) update_hid_for_hash(); - /* - * update partition table control register, - * 64 K size. - */ - mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); - } static void __init htab_initialize(void) diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index a51c188b81f3..0cb6bd8bfccf 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -1085,7 +1085,7 @@ static int hot_add_node_scn_to_nid(unsigned long scn_addr) int hot_add_scn_to_nid(unsigned long scn_addr) { struct device_node *memory = NULL; - int nid, found = 0; + int nid; if (!numa_enabled || (min_common_depth < 0)) return first_online_node; @@ -1101,17 +1101,6 @@ int hot_add_scn_to_nid(unsigned long scn_addr) if (nid < 0 || !node_online(nid)) nid = first_online_node; - if (NODE_DATA(nid)->node_spanned_pages) - return nid; - - for_each_online_node(nid) { - if (NODE_DATA(nid)->node_spanned_pages) { - found = 1; - break; - } - } - - BUG_ON(!found); return nid; } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 688b54517655..8d941c692eb3 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ 
-177,23 +177,15 @@ redo: static void __init radix_init_partition_table(void) { - unsigned long rts_field; + unsigned long rts_field, dw0; + mmu_partition_table_init(); rts_field = radix__get_tree_size(); + dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR; + mmu_partition_table_set_entry(0, dw0, 0); - BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 24), "Partition table size too large."); - partition_tb = early_alloc_pgtable(1UL << PATB_SIZE_SHIFT); - partition_tb->patb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | - RADIX_PGD_INDEX_SIZE | PATB_HR); pr_info("Initializing Radix MMU\n"); pr_info("Partition table %p\n", partition_tb); - - memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); - /* - * update partition table control register, - * 64 K size. - */ - mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); } void __init radix_init_native(void) @@ -378,6 +370,8 @@ void __init radix__early_init_mmu(void) radix_init_partition_table(); } + memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); + radix_init_pgtable(); } diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index f5e8d4edb808..8bca7f58afc4 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -431,3 +431,37 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) } } #endif + +#ifdef CONFIG_PPC_BOOK3S_64 +void __init mmu_partition_table_init(void) +{ + unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + + BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); + partition_tb = __va(memblock_alloc_base(patb_size, patb_size, + MEMBLOCK_ALLOC_ANYWHERE)); + + /* Initialize the Partition Table with no entries */ + memset((void *)partition_tb, 0, patb_size); + + /* + * update partition table control register, + * 64 K size. 
+ */ + mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); +} + +void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, + unsigned long dw1) +{ + partition_tb[lpid].patb0 = cpu_to_be64(dw0); + partition_tb[lpid].patb1 = cpu_to_be64(dw1); + + /* Global flush of TLBs and partition table caches for this lpid */ + asm volatile("ptesync" : : : "memory"); + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + asm volatile("eieio; tlbsync; ptesync" : : : "memory"); +} +EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); +#endif /* CONFIG_PPC_BOOK3S_64 */ diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0fe98a567125..73a5cf18fd84 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -766,7 +766,7 @@ emit_clear: func = (u8 *) __bpf_call_base + imm; /* Save skb pointer if we need to re-cache skb data */ - if (bpf_helper_changes_skb_data(func)) + if (bpf_helper_changes_pkt_data(func)) PPC_BPF_STL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_func_call(image, ctx, (u64)func); @@ -775,7 +775,7 @@ emit_clear: PPC_MR(b2p[BPF_REG_0], 3); /* refresh skb cache */ - if (bpf_helper_changes_skb_data(func)) { + if (bpf_helper_changes_pkt_data(func)) { /* reload skb pointer to r3 */ PPC_BPF_LL(3, 1, bpf_jit_stack_local(ctx)); bpf_jit_emit_skb_loads(image, ctx); diff --git a/arch/powerpc/platforms/40x/Kconfig b/arch/powerpc/platforms/40x/Kconfig index e3257f24a8a1..1d7c1b142bf4 100644 --- a/arch/powerpc/platforms/40x/Kconfig +++ b/arch/powerpc/platforms/40x/Kconfig @@ -64,6 +64,7 @@ config XILINX_VIRTEX_GENERIC_BOARD default n select XILINX_VIRTEX_II_PRO select XILINX_VIRTEX_4_FX + select XILINX_INTC help This option enables generic support for Xilinx Virtex based boards. 
diff --git a/arch/powerpc/platforms/40x/virtex.c b/arch/powerpc/platforms/40x/virtex.c index 91a08ea758a8..e3d5e095846b 100644 --- a/arch/powerpc/platforms/40x/virtex.c +++ b/arch/powerpc/platforms/40x/virtex.c @@ -48,7 +48,7 @@ define_machine(virtex) { .probe = virtex_probe, .setup_arch = xilinx_pci_init, .init_IRQ = xilinx_intc_init_tree, - .get_irq = xilinx_intc_get_irq, + .get_irq = xintc_get_irq, .restart = ppc4xx_reset_system, .calibrate_decr = generic_calibrate_decr, }; diff --git a/arch/powerpc/platforms/44x/Kconfig b/arch/powerpc/platforms/44x/Kconfig index 48fc18041ff6..25b8d641ff9f 100644 --- a/arch/powerpc/platforms/44x/Kconfig +++ b/arch/powerpc/platforms/44x/Kconfig @@ -241,6 +241,7 @@ config XILINX_VIRTEX440_GENERIC_BOARD depends on 44x default n select XILINX_VIRTEX_5_FXT + select XILINX_INTC help This option enables generic support for Xilinx Virtex based boards that use a 440 based processor in the Virtex 5 FXT FPGA architecture. diff --git a/arch/powerpc/platforms/44x/virtex.c b/arch/powerpc/platforms/44x/virtex.c index a7e08026097a..3eb13ed926ee 100644 --- a/arch/powerpc/platforms/44x/virtex.c +++ b/arch/powerpc/platforms/44x/virtex.c @@ -54,7 +54,7 @@ define_machine(virtex) { .probe = virtex_probe, .setup_arch = xilinx_pci_init, .init_IRQ = xilinx_intc_init_tree, - .get_irq = xilinx_intc_get_irq, + .get_irq = xintc_get_irq, .calibrate_decr = generic_calibrate_decr, .restart = ppc4xx_reset_system, }; diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 44d2d842cee7..3aa40f1b20f5 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -304,8 +304,11 @@ OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); 
+OPAL_CALL_REAL(opal_rm_int_get_xirr, OPAL_INT_GET_XIRR); OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); +OPAL_CALL_REAL(opal_rm_int_eoi, OPAL_INT_EOI); OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); +OPAL_CALL_REAL(opal_rm_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); OPAL_CALL_REAL(opal_rm_pci_tce_kill, OPAL_PCI_TCE_KILL); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 6c9a65b52e63..b3b8930ac52f 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -896,3 +896,5 @@ EXPORT_SYMBOL_GPL(opal_leds_get_ind); EXPORT_SYMBOL_GPL(opal_leds_set_ind); /* Export this symbol for PowerNV Operator Panel class driver */ EXPORT_SYMBOL_GPL(opal_write_oppanel_async); +/* Export this for KVM */ +EXPORT_SYMBOL_GPL(opal_int_set_mfrr); diff --git a/arch/powerpc/platforms/ps3/htab.c b/arch/powerpc/platforms/ps3/htab.c index cb3c50328de8..cc2b281a3766 100644 --- a/arch/powerpc/platforms/ps3/htab.c +++ b/arch/powerpc/platforms/ps3/htab.c @@ -63,7 +63,7 @@ static long ps3_hpte_insert(unsigned long hpte_group, unsigned long vpn, vflags &= ~HPTE_V_SECONDARY; hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize, ssize) | rflags; + hpte_r = hpte_encode_r(ps3_mm_phys_to_lpar(pa), psize, apsize) | rflags; spin_lock_irqsave(&ps3_htab_lock, flags); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index aa35245d8d6d..f2c98f6c1c9c 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -145,7 +145,7 @@ static long pSeries_lpar_hpte_insert(unsigned long hpte_group, hpte_group, vpn, pa, rflags, vflags, psize); hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; - hpte_r = hpte_encode_r(pa, psize, apsize, ssize) | rflags; + hpte_r = 
hpte_encode_r(pa, psize, apsize) | rflags; if (!(vflags & HPTE_V_BOLTED)) pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); diff --git a/arch/powerpc/sysdev/xilinx_intc.c b/arch/powerpc/sysdev/xilinx_intc.c index 0f52d7955796..4a86dcff3fcd 100644 --- a/arch/powerpc/sysdev/xilinx_intc.c +++ b/arch/powerpc/sysdev/xilinx_intc.c @@ -29,194 +29,7 @@ #include <asm/processor.h> #include <asm/i8259.h> #include <asm/irq.h> - -/* - * INTC Registers - */ -#define XINTC_ISR 0 /* Interrupt Status */ -#define XINTC_IPR 4 /* Interrupt Pending */ -#define XINTC_IER 8 /* Interrupt Enable */ -#define XINTC_IAR 12 /* Interrupt Acknowledge */ -#define XINTC_SIE 16 /* Set Interrupt Enable bits */ -#define XINTC_CIE 20 /* Clear Interrupt Enable bits */ -#define XINTC_IVR 24 /* Interrupt Vector */ -#define XINTC_MER 28 /* Master Enable */ - -static struct irq_domain *master_irqhost; - -#define XILINX_INTC_MAXIRQS (32) - -/* The following table allows the interrupt type, edge or level, - * to be cached after being read from the device tree until the interrupt - * is mapped - */ -static int xilinx_intc_typetable[XILINX_INTC_MAXIRQS]; - -/* Map the interrupt type from the device tree to the interrupt types - * used by the interrupt subsystem - */ -static unsigned char xilinx_intc_map_senses[] = { - IRQ_TYPE_EDGE_RISING, - IRQ_TYPE_EDGE_FALLING, - IRQ_TYPE_LEVEL_HIGH, - IRQ_TYPE_LEVEL_LOW, -}; - -/* - * The interrupt controller is setup such that it doesn't work well with - * the level interrupt handler in the kernel because the handler acks the - * interrupt before calling the application interrupt handler. To deal with - * that, we use 2 different irq chips so that different functions can be - * used for level and edge type interrupts. 
- * - * IRQ Chip common (across level and edge) operations - */ -static void xilinx_intc_mask(struct irq_data *d) -{ - int irq = irqd_to_hwirq(d); - void * regs = irq_data_get_irq_chip_data(d); - pr_debug("mask: %d\n", irq); - out_be32(regs + XINTC_CIE, 1 << irq); -} - -static int xilinx_intc_set_type(struct irq_data *d, unsigned int flow_type) -{ - return 0; -} - -/* - * IRQ Chip level operations - */ -static void xilinx_intc_level_unmask(struct irq_data *d) -{ - int irq = irqd_to_hwirq(d); - void * regs = irq_data_get_irq_chip_data(d); - pr_debug("unmask: %d\n", irq); - out_be32(regs + XINTC_SIE, 1 << irq); - - /* ack level irqs because they can't be acked during - * ack function since the handle_level_irq function - * acks the irq before calling the inerrupt handler - */ - out_be32(regs + XINTC_IAR, 1 << irq); -} - -static struct irq_chip xilinx_intc_level_irqchip = { - .name = "Xilinx Level INTC", - .irq_mask = xilinx_intc_mask, - .irq_mask_ack = xilinx_intc_mask, - .irq_unmask = xilinx_intc_level_unmask, - .irq_set_type = xilinx_intc_set_type, -}; - -/* - * IRQ Chip edge operations - */ -static void xilinx_intc_edge_unmask(struct irq_data *d) -{ - int irq = irqd_to_hwirq(d); - void *regs = irq_data_get_irq_chip_data(d); - pr_debug("unmask: %d\n", irq); - out_be32(regs + XINTC_SIE, 1 << irq); -} - -static void xilinx_intc_edge_ack(struct irq_data *d) -{ - int irq = irqd_to_hwirq(d); - void * regs = irq_data_get_irq_chip_data(d); - pr_debug("ack: %d\n", irq); - out_be32(regs + XINTC_IAR, 1 << irq); -} - -static struct irq_chip xilinx_intc_edge_irqchip = { - .name = "Xilinx Edge INTC", - .irq_mask = xilinx_intc_mask, - .irq_unmask = xilinx_intc_edge_unmask, - .irq_ack = xilinx_intc_edge_ack, - .irq_set_type = xilinx_intc_set_type, -}; - -/* - * IRQ Host operations - */ - -/** - * xilinx_intc_xlate - translate virq# from device tree interrupts property - */ -static int xilinx_intc_xlate(struct irq_domain *h, struct device_node *ct, - const u32 *intspec, unsigned 
int intsize, - irq_hw_number_t *out_hwirq, - unsigned int *out_flags) -{ - if ((intsize < 2) || (intspec[0] >= XILINX_INTC_MAXIRQS)) - return -EINVAL; - - /* keep a copy of the interrupt type til the interrupt is mapped - */ - xilinx_intc_typetable[intspec[0]] = xilinx_intc_map_senses[intspec[1]]; - - /* Xilinx uses 2 interrupt entries, the 1st being the h/w - * interrupt number, the 2nd being the interrupt type, edge or level - */ - *out_hwirq = intspec[0]; - *out_flags = xilinx_intc_map_senses[intspec[1]]; - - return 0; -} -static int xilinx_intc_map(struct irq_domain *h, unsigned int virq, - irq_hw_number_t irq) -{ - irq_set_chip_data(virq, h->host_data); - - if (xilinx_intc_typetable[irq] == IRQ_TYPE_LEVEL_HIGH || - xilinx_intc_typetable[irq] == IRQ_TYPE_LEVEL_LOW) { - irq_set_chip_and_handler(virq, &xilinx_intc_level_irqchip, - handle_level_irq); - } else { - irq_set_chip_and_handler(virq, &xilinx_intc_edge_irqchip, - handle_edge_irq); - } - return 0; -} - -static const struct irq_domain_ops xilinx_intc_ops = { - .map = xilinx_intc_map, - .xlate = xilinx_intc_xlate, -}; - -struct irq_domain * __init -xilinx_intc_init(struct device_node *np) -{ - struct irq_domain * irq; - void * regs; - - /* Find and map the intc registers */ - regs = of_iomap(np, 0); - if (!regs) { - pr_err("xilinx_intc: could not map registers\n"); - return NULL; - } - - /* Setup interrupt controller */ - out_be32(regs + XINTC_IER, 0); /* disable all irqs */ - out_be32(regs + XINTC_IAR, ~(u32) 0); /* Acknowledge pending irqs */ - out_be32(regs + XINTC_MER, 0x3UL); /* Turn on the Master Enable. */ - - /* Allocate and initialize an irq_domain structure. 
*/ - irq = irq_domain_add_linear(np, XILINX_INTC_MAXIRQS, &xilinx_intc_ops, - regs); - if (!irq) - panic(__FILE__ ": Cannot allocate IRQ host\n"); - - return irq; -} - -int xilinx_intc_get_irq(void) -{ - void * regs = master_irqhost->host_data; - pr_debug("get_irq:\n"); - return irq_linear_revmap(master_irqhost, in_be32(regs + XINTC_IVR)); -} +#include <linux/irqchip.h> #if defined(CONFIG_PPC_I8259) /* @@ -265,31 +78,11 @@ static void __init xilinx_i8259_setup_cascade(void) static inline void xilinx_i8259_setup_cascade(void) { return; } #endif /* defined(CONFIG_PPC_I8259) */ -static const struct of_device_id xilinx_intc_match[] __initconst = { - { .compatible = "xlnx,opb-intc-1.00.c", }, - { .compatible = "xlnx,xps-intc-1.00.a", }, - {} -}; - /* * Initialize master Xilinx interrupt controller */ void __init xilinx_intc_init_tree(void) { - struct device_node *np; - - /* find top level interrupt controller */ - for_each_matching_node(np, xilinx_intc_match) { - if (!of_get_property(np, "interrupts", NULL)) - break; - } - BUG_ON(!np); - - master_irqhost = xilinx_intc_init(np); - BUG_ON(!master_irqhost); - - irq_set_default_host(master_irqhost); - of_node_put(np); - + irqchip_init(); xilinx_i8259_setup_cascade(); } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 426481d4cc86..c6722112527d 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -136,6 +136,7 @@ config S390 select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK select HAVE_DMA_API_DEBUG + select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS select HAVE_EFFICIENT_UNALIGNED_ACCESS @@ -169,8 +170,10 @@ config S390 select OLD_SIGSUSPEND3 select SPARSE_IRQ select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select TTY select VIRT_CPU_ACCOUNTING + select ARCH_HAS_SCALED_CPUTIME select VIRT_TO_BUS select HAVE_NMI diff --git a/arch/s390/boot/compressed/head.S b/arch/s390/boot/compressed/head.S index 28c4f96a2d9c..11f6254c561e 100644 --- 
a/arch/s390/boot/compressed/head.S +++ b/arch/s390/boot/compressed/head.S @@ -46,7 +46,7 @@ mover_end: .align 8 .Lstack: - .quad 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER)) + .quad 0x8000 + (1<<(PAGE_SHIFT+THREAD_SIZE_ORDER)) .Loffset: .quad 0x11000 .Lmvsize: diff --git a/arch/s390/configs/default_defconfig b/arch/s390/configs/default_defconfig index 45968686f918..e659daffe368 100644 --- a/arch/s390/configs/default_defconfig +++ b/arch/s390/configs/default_defconfig @@ -66,6 +66,8 @@ CONFIG_TRANSPARENT_HUGEPAGE=y CONFIG_CLEANCACHE=y CONFIG_FRONTSWAP=y CONFIG_CMA=y +CONFIG_CMA_DEBUG=y +CONFIG_CMA_DEBUGFS=y CONFIG_MEM_SOFT_DIRTY=y CONFIG_ZPOOL=m CONFIG_ZBUD=m @@ -366,6 +368,8 @@ CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m @@ -438,7 +442,6 @@ CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m -CONFIG_VHOST_NET=m # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_INTEL is not set @@ -693,3 +696,4 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m diff --git a/arch/s390/configs/gcov_defconfig b/arch/s390/configs/gcov_defconfig index 1dd05e345c4d..95ceac50bc65 100644 --- a/arch/s390/configs/gcov_defconfig +++ b/arch/s390/configs/gcov_defconfig @@ -362,6 +362,8 @@ CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m @@ -434,7 +436,6 @@ CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m -CONFIG_VHOST_NET=m # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_INTEL is not set @@ -633,3 +634,4 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m diff --git a/arch/s390/configs/performance_defconfig 
b/arch/s390/configs/performance_defconfig index 29d1178666f0..bc7b176f5795 100644 --- a/arch/s390/configs/performance_defconfig +++ b/arch/s390/configs/performance_defconfig @@ -362,6 +362,8 @@ CONFIG_BPF_JIT=y CONFIG_NET_PKTGEN=m CONFIG_NET_TCPPROBE=m CONFIG_DEVTMPFS=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=0 CONFIG_CONNECTOR=y CONFIG_BLK_DEV_LOOP=m CONFIG_BLK_DEV_CRYPTOLOOP=m @@ -434,7 +436,6 @@ CONFIG_TUN=m CONFIG_VETH=m CONFIG_VIRTIO_NET=m CONFIG_NLMON=m -CONFIG_VHOST_NET=m # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_CHELSIO is not set # CONFIG_NET_VENDOR_INTEL is not set @@ -632,3 +633,4 @@ CONFIG_CMM=m CONFIG_APPLDATA_BASE=y CONFIG_KVM=m CONFIG_KVM_S390_UCONTROL=y +CONFIG_VHOST_NET=m diff --git a/arch/s390/crypto/prng.c b/arch/s390/crypto/prng.c index 9cc050f9536c..1113389d0a39 100644 --- a/arch/s390/crypto/prng.c +++ b/arch/s390/crypto/prng.c @@ -507,8 +507,10 @@ static ssize_t prng_tdes_read(struct file *file, char __user *ubuf, prng_data->prngws.byte_counter += n; prng_data->prngws.reseed_counter += n; - if (copy_to_user(ubuf, prng_data->buf, chunk)) - return -EFAULT; + if (copy_to_user(ubuf, prng_data->buf, chunk)) { + ret = -EFAULT; + break; + } nbytes -= chunk; ret += chunk; diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index 09bccb224d03..cf8a2d92467f 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -3,6 +3,7 @@ * * Copyright IBM Corp. 
2006, 2008 * Author(s): Michael Holzheu <holzheu@de.ibm.com> + * License: GPL */ #define KMSG_COMPONENT "hypfs" @@ -18,7 +19,8 @@ #include <linux/time.h> #include <linux/parser.h> #include <linux/sysfs.h> -#include <linux/module.h> +#include <linux/init.h> +#include <linux/kobject.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/uio.h> @@ -443,7 +445,6 @@ static struct file_system_type hypfs_type = { .mount = hypfs_mount, .kill_sb = hypfs_kill_super }; -MODULE_ALIAS_FS("s390_hypfs"); static const struct super_operations hypfs_s_ops = { .statfs = simple_statfs, @@ -497,21 +498,4 @@ fail_dbfs_exit: pr_err("Initialization of hypfs failed with rc=%i\n", rc); return rc; } - -static void __exit hypfs_exit(void) -{ - unregister_filesystem(&hypfs_type); - sysfs_remove_mount_point(hypervisor_kobj, "s390"); - hypfs_diag0c_exit(); - hypfs_sprp_exit(); - hypfs_vm_exit(); - hypfs_diag_exit(); - hypfs_dbfs_exit(); -} - -module_init(hypfs_init) -module_exit(hypfs_exit) - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Michael Holzheu <holzheu@de.ibm.com>"); -MODULE_DESCRIPTION("s390 Hypervisor Filesystem"); +device_initcall(hypfs_init) diff --git a/arch/s390/include/asm/Kbuild b/arch/s390/include/asm/Kbuild index 20f196b82a6e..8aea32fe8bd2 100644 --- a/arch/s390/include/asm/Kbuild +++ b/arch/s390/include/asm/Kbuild @@ -1,6 +1,6 @@ - - +generic-y += asm-offsets.h generic-y += clkdev.h +generic-y += dma-contiguous.h generic-y += export.h generic-y += irq_work.h generic-y += mcs_spinlock.h diff --git a/arch/s390/include/asm/asm-offsets.h b/arch/s390/include/asm/asm-offsets.h deleted file mode 100644 index d370ee36a182..000000000000 --- a/arch/s390/include/asm/asm-offsets.h +++ /dev/null @@ -1 +0,0 @@ -#include <generated/asm-offsets.h> diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h index d28cc2f5b7b2..f7f69dfd2db2 100644 --- a/arch/s390/include/asm/atomic.h +++ b/arch/s390/include/asm/atomic.h @@ -1,13 +1,8 @@ /* - * Copyright IBM Corp. 
1999, 2009 + * Copyright IBM Corp. 1999, 2016 * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>, * Denis Joseph Barrow, - * Arnd Bergmann <arndb@de.ibm.com>, - * - * Atomic operations that C can't guarantee us. - * Useful for resource counting etc. - * s390 uses 'Compare And Swap' for atomicity in SMP environment. - * + * Arnd Bergmann, */ #ifndef __ARCH_S390_ATOMIC__ @@ -15,62 +10,12 @@ #include <linux/compiler.h> #include <linux/types.h> +#include <asm/atomic_ops.h> #include <asm/barrier.h> #include <asm/cmpxchg.h> #define ATOMIC_INIT(i) { (i) } -#define __ATOMIC_NO_BARRIER "\n" - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __ATOMIC_OR "lao" -#define __ATOMIC_AND "lan" -#define __ATOMIC_ADD "laa" -#define __ATOMIC_XOR "lax" -#define __ATOMIC_BARRIER "bcr 14,0\n" - -#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - int old_val; \ - \ - typecheck(atomic_t *, ptr); \ - asm volatile( \ - op_string " %0,%2,%1\n" \ - __barrier \ - : "=d" (old_val), "+Q" ((ptr)->counter) \ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define __ATOMIC_OR "or" -#define __ATOMIC_AND "nr" -#define __ATOMIC_ADD "ar" -#define __ATOMIC_XOR "xr" -#define __ATOMIC_BARRIER "\n" - -#define __ATOMIC_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - int old_val, new_val; \ - \ - typecheck(atomic_t *, ptr); \ - asm volatile( \ - " l %0,%2\n" \ - "0: lr %1,%0\n" \ - op_string " %1,%3\n" \ - " cs %0,%1,%2\n" \ - " jl 0b" \ - : "=&d" (old_val), "=&d" (new_val), "+Q" ((ptr)->counter)\ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - static inline int atomic_read(const atomic_t *v) { int c; @@ -90,27 +35,23 @@ static inline void atomic_set(atomic_t *v, int i) static inline int atomic_add_return(int i, atomic_t *v) { - return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER) + i; + return __atomic_add_barrier(i, &v->counter) + i; } static inline int 
atomic_fetch_add(int i, atomic_t *v) { - return __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_BARRIER); + return __atomic_add_barrier(i, &v->counter); } static inline void atomic_add(int i, atomic_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - asm volatile( - "asi %0,%1\n" - : "+Q" (v->counter) - : "i" (i) - : "cc", "memory"); + __atomic_add_const(i, &v->counter); return; } #endif - __ATOMIC_LOOP(v, i, __ATOMIC_ADD, __ATOMIC_NO_BARRIER); + __atomic_add(i, &v->counter); } #define atomic_add_negative(_i, _v) (atomic_add_return(_i, _v) < 0) @@ -125,19 +66,19 @@ static inline void atomic_add(int i, atomic_t *v) #define atomic_dec_return(_v) atomic_sub_return(1, _v) #define atomic_dec_and_test(_v) (atomic_sub_return(1, _v) == 0) -#define ATOMIC_OPS(op, OP) \ +#define ATOMIC_OPS(op) \ static inline void atomic_##op(int i, atomic_t *v) \ { \ - __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_NO_BARRIER); \ + __atomic_##op(i, &v->counter); \ } \ static inline int atomic_fetch_##op(int i, atomic_t *v) \ { \ - return __ATOMIC_LOOP(v, i, __ATOMIC_##OP, __ATOMIC_BARRIER); \ + return __atomic_##op##_barrier(i, &v->counter); \ } -ATOMIC_OPS(and, AND) -ATOMIC_OPS(or, OR) -ATOMIC_OPS(xor, XOR) +ATOMIC_OPS(and) +ATOMIC_OPS(or) +ATOMIC_OPS(xor) #undef ATOMIC_OPS @@ -145,12 +86,7 @@ ATOMIC_OPS(xor, XOR) static inline int atomic_cmpxchg(atomic_t *v, int old, int new) { - asm volatile( - " cs %0,%2,%1" - : "+d" (old), "+Q" (v->counter) - : "d" (new) - : "cc", "memory"); - return old; + return __atomic_cmpxchg(&v->counter, old, new); } static inline int __atomic_add_unless(atomic_t *v, int a, int u) @@ -168,65 +104,11 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u) return c; } - -#undef __ATOMIC_LOOP - #define ATOMIC64_INIT(i) { (i) } -#define __ATOMIC64_NO_BARRIER "\n" - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __ATOMIC64_OR "laog" -#define __ATOMIC64_AND "lang" -#define __ATOMIC64_ADD "laag" -#define 
__ATOMIC64_XOR "laxg" -#define __ATOMIC64_BARRIER "bcr 14,0\n" - -#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - long long old_val; \ - \ - typecheck(atomic64_t *, ptr); \ - asm volatile( \ - op_string " %0,%2,%1\n" \ - __barrier \ - : "=d" (old_val), "+Q" ((ptr)->counter) \ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define __ATOMIC64_OR "ogr" -#define __ATOMIC64_AND "ngr" -#define __ATOMIC64_ADD "agr" -#define __ATOMIC64_XOR "xgr" -#define __ATOMIC64_BARRIER "\n" - -#define __ATOMIC64_LOOP(ptr, op_val, op_string, __barrier) \ -({ \ - long long old_val, new_val; \ - \ - typecheck(atomic64_t *, ptr); \ - asm volatile( \ - " lg %0,%2\n" \ - "0: lgr %1,%0\n" \ - op_string " %1,%3\n" \ - " csg %0,%1,%2\n" \ - " jl 0b" \ - : "=&d" (old_val), "=&d" (new_val), "+Q" ((ptr)->counter)\ - : "d" (op_val) \ - : "cc", "memory"); \ - old_val; \ -}) - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -static inline long long atomic64_read(const atomic64_t *v) +static inline long atomic64_read(const atomic64_t *v) { - long long c; + long c; asm volatile( " lg %0,%1\n" @@ -234,71 +116,60 @@ static inline long long atomic64_read(const atomic64_t *v) return c; } -static inline void atomic64_set(atomic64_t *v, long long i) +static inline void atomic64_set(atomic64_t *v, long i) { asm volatile( " stg %1,%0\n" : "=Q" (v->counter) : "d" (i)); } -static inline long long atomic64_add_return(long long i, atomic64_t *v) +static inline long atomic64_add_return(long i, atomic64_t *v) { - return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER) + i; + return __atomic64_add_barrier(i, &v->counter) + i; } -static inline long long atomic64_fetch_add(long long i, atomic64_t *v) +static inline long atomic64_fetch_add(long i, atomic64_t *v) { - return __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_BARRIER); + return __atomic64_add_barrier(i, &v->counter); } -static inline void atomic64_add(long long i, 
atomic64_t *v) +static inline void atomic64_add(long i, atomic64_t *v) { #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES if (__builtin_constant_p(i) && (i > -129) && (i < 128)) { - asm volatile( - "agsi %0,%1\n" - : "+Q" (v->counter) - : "i" (i) - : "cc", "memory"); + __atomic64_add_const(i, &v->counter); return; } #endif - __ATOMIC64_LOOP(v, i, __ATOMIC64_ADD, __ATOMIC64_NO_BARRIER); + __atomic64_add(i, &v->counter); } #define atomic64_xchg(v, new) (xchg(&((v)->counter), new)) -static inline long long atomic64_cmpxchg(atomic64_t *v, - long long old, long long new) +static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new) { - asm volatile( - " csg %0,%2,%1" - : "+d" (old), "+Q" (v->counter) - : "d" (new) - : "cc", "memory"); - return old; + return __atomic64_cmpxchg(&v->counter, old, new); } -#define ATOMIC64_OPS(op, OP) \ +#define ATOMIC64_OPS(op) \ static inline void atomic64_##op(long i, atomic64_t *v) \ { \ - __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_NO_BARRIER); \ + __atomic64_##op(i, &v->counter); \ } \ static inline long atomic64_fetch_##op(long i, atomic64_t *v) \ { \ - return __ATOMIC64_LOOP(v, i, __ATOMIC64_##OP, __ATOMIC64_BARRIER); \ + return __atomic64_##op##_barrier(i, &v->counter); \ } -ATOMIC64_OPS(and, AND) -ATOMIC64_OPS(or, OR) -ATOMIC64_OPS(xor, XOR) +ATOMIC64_OPS(and) +ATOMIC64_OPS(or) +ATOMIC64_OPS(xor) #undef ATOMIC64_OPS -#undef __ATOMIC64_LOOP -static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u) +static inline int atomic64_add_unless(atomic64_t *v, long i, long u) { - long long c, old; + long c, old; c = atomic64_read(v); for (;;) { @@ -312,9 +183,9 @@ static inline int atomic64_add_unless(atomic64_t *v, long long i, long long u) return c != u; } -static inline long long atomic64_dec_if_positive(atomic64_t *v) +static inline long atomic64_dec_if_positive(atomic64_t *v) { - long long c, old, dec; + long c, old, dec; c = atomic64_read(v); for (;;) { @@ -333,9 +204,9 @@ static inline long long 
atomic64_dec_if_positive(atomic64_t *v) #define atomic64_inc(_v) atomic64_add(1, _v) #define atomic64_inc_return(_v) atomic64_add_return(1, _v) #define atomic64_inc_and_test(_v) (atomic64_add_return(1, _v) == 0) -#define atomic64_sub_return(_i, _v) atomic64_add_return(-(long long)(_i), _v) -#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long long)(_i), _v) -#define atomic64_sub(_i, _v) atomic64_add(-(long long)(_i), _v) +#define atomic64_sub_return(_i, _v) atomic64_add_return(-(long)(_i), _v) +#define atomic64_fetch_sub(_i, _v) atomic64_fetch_add(-(long)(_i), _v) +#define atomic64_sub(_i, _v) atomic64_add(-(long)(_i), _v) #define atomic64_sub_and_test(_i, _v) (atomic64_sub_return(_i, _v) == 0) #define atomic64_dec(_v) atomic64_sub(1, _v) #define atomic64_dec_return(_v) atomic64_sub_return(1, _v) diff --git a/arch/s390/include/asm/atomic_ops.h b/arch/s390/include/asm/atomic_ops.h new file mode 100644 index 000000000000..ac9e2b939d04 --- /dev/null +++ b/arch/s390/include/asm/atomic_ops.h @@ -0,0 +1,130 @@ +/* + * Low level function for atomic operations + * + * Copyright IBM Corp. 
1999, 2016 + */ + +#ifndef __ARCH_S390_ATOMIC_OPS__ +#define __ARCH_S390_ATOMIC_OPS__ + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define __ATOMIC_OP(op_name, op_type, op_string, op_barrier) \ +static inline op_type op_name(op_type val, op_type *ptr) \ +{ \ + op_type old; \ + \ + asm volatile( \ + op_string " %[old],%[val],%[ptr]\n" \ + op_barrier \ + : [old] "=d" (old), [ptr] "+Q" (*ptr) \ + : [val] "d" (val) : "cc", "memory"); \ + return old; \ +} \ + +#define __ATOMIC_OPS(op_name, op_type, op_string) \ + __ATOMIC_OP(op_name, op_type, op_string, "\n") \ + __ATOMIC_OP(op_name##_barrier, op_type, op_string, "bcr 14,0\n") + +__ATOMIC_OPS(__atomic_add, int, "laa") +__ATOMIC_OPS(__atomic_and, int, "lan") +__ATOMIC_OPS(__atomic_or, int, "lao") +__ATOMIC_OPS(__atomic_xor, int, "lax") + +__ATOMIC_OPS(__atomic64_add, long, "laag") +__ATOMIC_OPS(__atomic64_and, long, "lang") +__ATOMIC_OPS(__atomic64_or, long, "laog") +__ATOMIC_OPS(__atomic64_xor, long, "laxg") + +#undef __ATOMIC_OPS +#undef __ATOMIC_OP + +static inline void __atomic_add_const(int val, int *ptr) +{ + asm volatile( + " asi %[ptr],%[val]\n" + : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc"); +} + +static inline void __atomic64_add_const(long val, long *ptr) +{ + asm volatile( + " agsi %[ptr],%[val]\n" + : [ptr] "+Q" (*ptr) : [val] "i" (val) : "cc"); +} + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define __ATOMIC_OP(op_name, op_string) \ +static inline int op_name(int val, int *ptr) \ +{ \ + int old, new; \ + \ + asm volatile( \ + "0: lr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " cs %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC_OPS(op_name, op_string) \ + __ATOMIC_OP(op_name, op_string) \ + __ATOMIC_OP(op_name##_barrier, op_string) + +__ATOMIC_OPS(__atomic_add, "ar") +__ATOMIC_OPS(__atomic_and, "nr") +__ATOMIC_OPS(__atomic_or, "or") 
+__ATOMIC_OPS(__atomic_xor, "xr") + +#undef __ATOMIC_OPS + +#define __ATOMIC64_OP(op_name, op_string) \ +static inline long op_name(long val, long *ptr) \ +{ \ + long old, new; \ + \ + asm volatile( \ + "0: lgr %[new],%[old]\n" \ + op_string " %[new],%[val]\n" \ + " csg %[old],%[new],%[ptr]\n" \ + " jl 0b" \ + : [old] "=d" (old), [new] "=&d" (new), [ptr] "+Q" (*ptr)\ + : [val] "d" (val), "0" (*ptr) : "cc", "memory"); \ + return old; \ +} + +#define __ATOMIC64_OPS(op_name, op_string) \ + __ATOMIC64_OP(op_name, op_string) \ + __ATOMIC64_OP(op_name##_barrier, op_string) + +__ATOMIC64_OPS(__atomic64_add, "agr") +__ATOMIC64_OPS(__atomic64_and, "ngr") +__ATOMIC64_OPS(__atomic64_or, "ogr") +__ATOMIC64_OPS(__atomic64_xor, "xgr") + +#undef __ATOMIC64_OPS + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +static inline int __atomic_cmpxchg(int *ptr, int old, int new) +{ + asm volatile( + " cs %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) : "cc", "memory"); + return old; +} + +static inline long __atomic64_cmpxchg(long *ptr, long old, long new) +{ + asm volatile( + " csg %[old],%[new],%[ptr]" + : [old] "+d" (old), [ptr] "+Q" (*ptr) + : [new] "d" (new) : "cc", "memory"); + return old; +} + +#endif /* __ARCH_S390_ATOMIC_OPS__ */ diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h index 8043f10da6b5..d92047da5ccb 100644 --- a/arch/s390/include/asm/bitops.h +++ b/arch/s390/include/asm/bitops.h @@ -42,57 +42,9 @@ #include <linux/typecheck.h> #include <linux/compiler.h> +#include <asm/atomic_ops.h> #include <asm/barrier.h> -#define __BITOPS_NO_BARRIER "\n" - -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - -#define __BITOPS_OR "laog" -#define __BITOPS_AND "lang" -#define __BITOPS_XOR "laxg" -#define __BITOPS_BARRIER "bcr 14,0\n" - -#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ -({ \ - unsigned long __old; \ - \ - typecheck(unsigned long *, (__addr)); \ - asm volatile( \ - __op_string " %0,%2,%1\n" \ - __barrier 
\ - : "=d" (__old), "+Q" (*(__addr)) \ - : "d" (__val) \ - : "cc", "memory"); \ - __old; \ -}) - -#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - -#define __BITOPS_OR "ogr" -#define __BITOPS_AND "ngr" -#define __BITOPS_XOR "xgr" -#define __BITOPS_BARRIER "\n" - -#define __BITOPS_LOOP(__addr, __val, __op_string, __barrier) \ -({ \ - unsigned long __old, __new; \ - \ - typecheck(unsigned long *, (__addr)); \ - asm volatile( \ - " lg %0,%2\n" \ - "0: lgr %1,%0\n" \ - __op_string " %1,%3\n" \ - " csg %0,%1,%2\n" \ - " jl 0b" \ - : "=&d" (__old), "=&d" (__new), "+Q" (*(__addr))\ - : "d" (__val) \ - : "cc", "memory"); \ - __old; \ -}) - -#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ - #define __BITOPS_WORDS(bits) (((bits) + BITS_PER_LONG - 1) / BITS_PER_LONG) static inline unsigned long * @@ -128,7 +80,7 @@ static inline void set_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_NO_BARRIER); + __atomic64_or(mask, addr); } static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) @@ -149,7 +101,7 @@ static inline void clear_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_NO_BARRIER); + __atomic64_and(mask, addr); } static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) @@ -170,7 +122,7 @@ static inline void change_bit(unsigned long nr, volatile unsigned long *ptr) } #endif mask = 1UL << (nr & (BITS_PER_LONG - 1)); - __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_NO_BARRIER); + __atomic64_xor(mask, addr); } static inline int @@ -180,7 +132,7 @@ test_and_set_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __BITOPS_LOOP(addr, mask, __BITOPS_OR, __BITOPS_BARRIER); + old = __atomic64_or_barrier(mask, addr); return (old & mask) != 0; } @@ -191,7 +143,7 @@ 
test_and_clear_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = ~(1UL << (nr & (BITS_PER_LONG - 1))); - old = __BITOPS_LOOP(addr, mask, __BITOPS_AND, __BITOPS_BARRIER); + old = __atomic64_and_barrier(mask, addr); return (old & ~mask) != 0; } @@ -202,7 +154,7 @@ test_and_change_bit(unsigned long nr, volatile unsigned long *ptr) unsigned long old, mask; mask = 1UL << (nr & (BITS_PER_LONG - 1)); - old = __BITOPS_LOOP(addr, mask, __BITOPS_XOR, __BITOPS_BARRIER); + old = __atomic64_xor_barrier(mask, addr); return (old & mask) != 0; } diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h index 03516476127b..b69d8bc231a5 100644 --- a/arch/s390/include/asm/cpu_mf.h +++ b/arch/s390/include/asm/cpu_mf.h @@ -104,7 +104,8 @@ struct hws_basic_entry { unsigned int P:1; /* 28 PSW Problem state */ unsigned int AS:2; /* 29-30 PSW address-space control */ unsigned int I:1; /* 31 entry valid or invalid */ - unsigned int:16; + unsigned int CL:2; /* 32-33 Configuration Level */ + unsigned int:14; unsigned int prim_asn:16; /* primary ASN */ unsigned long long ia; /* Instruction Address */ unsigned long long gpp; /* Guest Program Parameter */ diff --git a/arch/s390/include/asm/elf.h b/arch/s390/include/asm/elf.h index 1736c7d3c94c..f4381e1fb19e 100644 --- a/arch/s390/include/asm/elf.h +++ b/arch/s390/include/asm/elf.h @@ -193,7 +193,7 @@ extern char elf_platform[]; do { \ set_personality(PER_LINUX | \ (current->personality & (~PER_MASK))); \ - current_thread_info()->sys_call_table = \ + current->thread.sys_call_table = \ (unsigned long) &sys_call_table; \ } while (0) #else /* CONFIG_COMPAT */ @@ -204,11 +204,11 @@ do { \ (current->personality & ~PER_MASK)); \ if ((ex).e_ident[EI_CLASS] == ELFCLASS32) { \ set_thread_flag(TIF_31BIT); \ - current_thread_info()->sys_call_table = \ + current->thread.sys_call_table = \ (unsigned long) &sys_call_table_emu; \ } else { \ clear_thread_flag(TIF_31BIT); \ - 
current_thread_info()->sys_call_table = \ + current->thread.sys_call_table = \ (unsigned long) &sys_call_table; \ } \ } while (0) diff --git a/arch/s390/include/asm/facilities_src.h b/arch/s390/include/asm/facilities_src.h deleted file mode 100644 index 3b758f66e48b..000000000000 --- a/arch/s390/include/asm/facilities_src.h +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright IBM Corp. 2015 - */ - -#ifndef S390_GEN_FACILITIES_C -#error "This file can only be included by gen_facilities.c" -#endif - -#include <linux/kconfig.h> - -struct facility_def { - char *name; - int *bits; -}; - -static struct facility_def facility_defs[] = { - { - /* - * FACILITIES_ALS contains the list of facilities that are - * required to run a kernel that is compiled e.g. with - * -march=<machine>. - */ - .name = "FACILITIES_ALS", - .bits = (int[]){ -#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES - 0, /* N3 instructions */ - 1, /* z/Arch mode installed */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES - 18, /* long displacement facility */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES - 7, /* stfle */ - 17, /* message security assist */ - 21, /* extended-immediate facility */ - 25, /* store clock fast */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES - 27, /* mvcos */ - 32, /* compare and swap and store */ - 33, /* compare and swap and store 2 */ - 34, /* general extension facility */ - 35, /* execute extensions */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES - 45, /* fast-BCR, etc. */ -#endif -#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES - 49, /* misc-instruction-extensions */ - 52, /* interlocked facility 2 */ -#endif -#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES - 53, /* load-and-zero-rightmost-byte, etc. 
*/ -#endif - -1 /* END */ - } - }, - { - .name = "FACILITIES_KVM", - .bits = (int[]){ - 0, /* N3 instructions */ - 1, /* z/Arch mode installed */ - 2, /* z/Arch mode active */ - 3, /* DAT-enhancement */ - 4, /* idte segment table */ - 5, /* idte region table */ - 6, /* ASN-and-LX reuse */ - 7, /* stfle */ - 8, /* enhanced-DAT 1 */ - 9, /* sense-running-status */ - 10, /* conditional sske */ - 13, /* ipte-range */ - 14, /* nonquiescing key-setting */ - 73, /* transactional execution */ - 75, /* access-exception-fetch/store indication */ - 76, /* msa extension 3 */ - 77, /* msa extension 4 */ - 78, /* enhanced-DAT 2 */ - -1 /* END */ - } - }, -}; diff --git a/arch/s390/include/asm/ipl.h b/arch/s390/include/asm/ipl.h index 4da22b2f0521..edb5161df7e2 100644 --- a/arch/s390/include/asm/ipl.h +++ b/arch/s390/include/asm/ipl.h @@ -97,7 +97,7 @@ void __init save_area_add_vxrs(struct save_area *, __vector128 *vxrs); extern void do_reipl(void); extern void do_halt(void); extern void do_poff(void); -extern void ipl_save_parameters(void); +extern void ipl_verify_parameters(void); extern void ipl_update_parameters(void); extern size_t append_ipl_vmparm(char *, size_t); extern size_t append_ipl_scpdata(char *, size_t); diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h index 7b93b78f423c..9bfad2ad6312 100644 --- a/arch/s390/include/asm/lowcore.h +++ b/arch/s390/include/asm/lowcore.h @@ -95,7 +95,7 @@ struct lowcore { /* Current process. */ __u64 current_task; /* 0x0310 */ - __u64 thread_info; /* 0x0318 */ + __u8 pad_0x318[0x320-0x318]; /* 0x0318 */ __u64 kernel_stack; /* 0x0320 */ /* Interrupt, panic and restart stack. 
*/ @@ -126,7 +126,8 @@ struct lowcore { __u64 percpu_offset; /* 0x0378 */ __u64 vdso_per_cpu_data; /* 0x0380 */ __u64 machine_flags; /* 0x0388 */ - __u8 pad_0x0390[0x0398-0x0390]; /* 0x0390 */ + __u32 preempt_count; /* 0x0390 */ + __u8 pad_0x0394[0x0398-0x0394]; /* 0x0394 */ __u64 gmap; /* 0x0398 */ __u32 spinlock_lockval; /* 0x03a0 */ __u32 fpu_flags; /* 0x03a4 */ diff --git a/arch/s390/include/asm/mutex.h b/arch/s390/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/s390/include/asm/mutex.h +++ /dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/s390/include/asm/pci_clp.h b/arch/s390/include/asm/pci_clp.h index e75c64cbcf08..c232ef9711f5 100644 --- a/arch/s390/include/asm/pci_clp.h +++ b/arch/s390/include/asm/pci_clp.h @@ -46,6 +46,8 @@ struct clp_fh_list_entry { #define CLP_UTIL_STR_LEN 64 #define CLP_PFIP_NR_SEGMENTS 4 +extern bool zpci_unique_uid; + /* List PCI functions request */ struct clp_req_list_pci { struct clp_req_hdr hdr; @@ -59,7 +61,8 @@ struct clp_rsp_list_pci { u64 resume_token; u32 reserved2; u16 max_fn; - u8 reserved3; + u8 : 7; + u8 uid_checking : 1; u8 entry_size; struct clp_fh_list_entry fh_list[CLP_FH_LIST_NR_ENTRIES]; } __packed; diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index f4eb9843eed4..166f703dad7c 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -27,17 +27,17 @@ extern int page_table_allocate_pgste; static inline void clear_table(unsigned long *s, unsigned long val, size_t n) { - typedef struct { char _[n]; } addrtype; - - *s = val; - n = (n / 256) - 1; - asm volatile( - " mvc 8(248,%0),0(%0)\n" - "0: mvc 
256(256,%0),0(%0)\n" - " la %0,256(%0)\n" - " brct %1,0b\n" - : "+a" (s), "+d" (n), "=m" (*(addrtype *) s) - : "m" (*(addrtype *) s)); + struct addrtype { char _[256]; }; + int i; + + for (i = 0; i < n; i += 256) { + *s = val; + asm volatile( + "mvc 8(248,%[s]),0(%[s])\n" + : "+m" (*(struct addrtype *) s) + : [s] "a" (s)); + s += 256 / sizeof(long); + } } static inline void crst_table_init(unsigned long *crst, unsigned long entry) diff --git a/arch/s390/include/asm/preempt.h b/arch/s390/include/asm/preempt.h new file mode 100644 index 000000000000..b0776b2c8dcf --- /dev/null +++ b/arch/s390/include/asm/preempt.h @@ -0,0 +1,137 @@ +#ifndef __ASM_PREEMPT_H +#define __ASM_PREEMPT_H + +#include <asm/current.h> +#include <linux/thread_info.h> +#include <asm/atomic_ops.h> + +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + +#define PREEMPT_ENABLED (0 + PREEMPT_NEED_RESCHED) + +static inline int preempt_count(void) +{ + return READ_ONCE(S390_lowcore.preempt_count) & ~PREEMPT_NEED_RESCHED; +} + +static inline void preempt_count_set(int pc) +{ + int old, new; + + do { + old = READ_ONCE(S390_lowcore.preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (__atomic_cmpxchg(&S390_lowcore.preempt_count, + old, new) != old); +} + +#define init_task_preempt_count(p) do { } while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + S390_lowcore.preempt_count = PREEMPT_ENABLED; \ +} while (0) + +static inline void set_preempt_need_resched(void) +{ + __atomic_and(~PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); +} + +static inline void clear_preempt_need_resched(void) +{ + __atomic_or(PREEMPT_NEED_RESCHED, &S390_lowcore.preempt_count); +} + +static inline bool test_preempt_need_resched(void) +{ + return !(READ_ONCE(S390_lowcore.preempt_count) & PREEMPT_NEED_RESCHED); +} + +static inline void __preempt_count_add(int val) +{ + if (__builtin_constant_p(val) && (val >= -128) && (val <= 127)) + __atomic_add_const(val, &S390_lowcore.preempt_count); 
+ else + __atomic_add(val, &S390_lowcore.preempt_count); +} + +static inline void __preempt_count_sub(int val) +{ + __preempt_count_add(-val); +} + +static inline bool __preempt_count_dec_and_test(void) +{ + return __atomic_add(-1, &S390_lowcore.preempt_count) == 1; +} + +static inline bool should_resched(int preempt_offset) +{ + return unlikely(READ_ONCE(S390_lowcore.preempt_count) == + preempt_offset); +} + +#else /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#define PREEMPT_ENABLED (0) + +static inline int preempt_count(void) +{ + return READ_ONCE(S390_lowcore.preempt_count); +} + +static inline void preempt_count_set(int pc) +{ + S390_lowcore.preempt_count = pc; +} + +#define init_task_preempt_count(p) do { } while (0) + +#define init_idle_preempt_count(p, cpu) do { \ + S390_lowcore.preempt_count = PREEMPT_ENABLED; \ +} while (0) + +static inline void set_preempt_need_resched(void) +{ +} + +static inline void clear_preempt_need_resched(void) +{ +} + +static inline bool test_preempt_need_resched(void) +{ + return false; +} + +static inline void __preempt_count_add(int val) +{ + S390_lowcore.preempt_count += val; +} + +static inline void __preempt_count_sub(int val) +{ + S390_lowcore.preempt_count -= val; +} + +static inline bool __preempt_count_dec_and_test(void) +{ + return !--S390_lowcore.preempt_count && tif_need_resched(); +} + +static inline bool should_resched(int preempt_offset) +{ + return unlikely(preempt_count() == preempt_offset && + tif_need_resched()); +} + +#endif /* CONFIG_HAVE_MARCH_Z196_FEATURES */ + +#ifdef CONFIG_PREEMPT +extern asmlinkage void preempt_schedule(void); +#define __preempt_schedule() preempt_schedule() +extern asmlinkage void preempt_schedule_notrace(void); +#define __preempt_schedule_notrace() preempt_schedule_notrace() +#endif /* CONFIG_PREEMPT */ + +#endif /* __ASM_PREEMPT_H */ diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h index 602af692efdc..6bca916a5ba0 100644 --- 
a/arch/s390/include/asm/processor.h +++ b/arch/s390/include/asm/processor.h @@ -110,14 +110,20 @@ typedef struct { struct thread_struct { unsigned int acrs[NUM_ACRS]; unsigned long ksp; /* kernel stack pointer */ + unsigned long user_timer; /* task cputime in user space */ + unsigned long system_timer; /* task cputime in kernel space */ + unsigned long sys_call_table; /* system call table address */ mm_segment_t mm_segment; unsigned long gmap_addr; /* address of last gmap fault. */ unsigned int gmap_write_flag; /* gmap fault write indication */ unsigned int gmap_int_code; /* int code of last gmap fault */ unsigned int gmap_pfault; /* signal of a pending guest pfault */ + /* Per-thread information related to debugging */ struct per_regs per_user; /* User specified PER registers */ struct per_event per_event; /* Cause of the last PER trap */ unsigned long per_flags; /* Flags to control debug behavior */ + unsigned int system_call; /* system call number in signal */ + unsigned long last_break; /* last breaking-event-address. */ /* pfault_wait is used to block the process on a pfault event */ unsigned long pfault_wait; struct list_head list; @@ -234,9 +240,10 @@ static inline unsigned short stap(void) /* * Give up the time slice of the virtual PU. 
*/ -void cpu_relax(void); +#define cpu_relax_yield cpu_relax_yield +void cpu_relax_yield(void); -#define cpu_relax_lowlatency() barrier() +#define cpu_relax() barrier() #define ECAG_CACHE_ATTRIBUTE 0 #define ECAG_CPU_ATTRIBUTE 1 diff --git a/arch/s390/include/asm/sclp.h b/arch/s390/include/asm/sclp.h index 2ad9c204b1a2..8db92a5b3bf1 100644 --- a/arch/s390/include/asm/sclp.h +++ b/arch/s390/include/asm/sclp.h @@ -101,7 +101,8 @@ struct zpci_report_error_header { u8 data[0]; /* Subsequent Data passed verbatim to SCLP ET 24 */ } __packed; -int sclp_get_core_info(struct sclp_core_info *info); +int _sclp_get_core_info_early(struct sclp_core_info *info); +int _sclp_get_core_info(struct sclp_core_info *info); int sclp_core_configure(u8 core); int sclp_core_deconfigure(u8 core); int sclp_sdias_blk_count(void); @@ -119,4 +120,11 @@ void sclp_early_detect(void); void _sclp_print_early(const char *); void sclp_ocf_cpc_name_copy(char *dst); +static inline int sclp_get_core_info(struct sclp_core_info *info, int early) +{ + if (early) + return _sclp_get_core_info_early(info); + return _sclp_get_core_info(info); +} + #endif /* _ASM_S390_SCLP_H */ diff --git a/arch/s390/include/asm/scsw.h b/arch/s390/include/asm/scsw.h index 4af99cdaddf5..17a7904f001a 100644 --- a/arch/s390/include/asm/scsw.h +++ b/arch/s390/include/asm/scsw.h @@ -96,7 +96,8 @@ struct tm_scsw { u32 dstat:8; u32 cstat:8; u32 fcxs:8; - u32 schxs:8; + u32 ifob:1; + u32 sesq:7; } __attribute__ ((packed)); /** @@ -177,6 +178,9 @@ union scsw { #define SCHN_STAT_INTF_CTRL_CHK 0x02 #define SCHN_STAT_CHAIN_CHECK 0x01 +#define SCSW_SESQ_DEV_NOFCX 3 +#define SCSW_SESQ_PATH_NOFCX 4 + /* * architectured values for first sense byte */ diff --git a/arch/s390/include/asm/smp.h b/arch/s390/include/asm/smp.h index 0cc383b9be7f..3deb134587b7 100644 --- a/arch/s390/include/asm/smp.h +++ b/arch/s390/include/asm/smp.h @@ -36,6 +36,7 @@ extern void smp_yield_cpu(int cpu); extern void smp_cpu_set_polarization(int cpu, int val); extern 
int smp_cpu_get_polarization(int cpu); extern void smp_fill_possible_mask(void); +extern void smp_detect_cpus(void); #else /* CONFIG_SMP */ @@ -56,6 +57,7 @@ static inline int smp_store_status(int cpu) { return 0; } static inline int smp_vcpu_scheduled(int cpu) { return 1; } static inline void smp_yield_cpu(int cpu) { } static inline void smp_fill_possible_mask(void) { } +static inline void smp_detect_cpus(void) { } #endif /* CONFIG_SMP */ @@ -69,6 +71,12 @@ static inline void smp_stop_cpu(void) } } +/* Return thread 0 CPU number as base CPU */ +static inline int smp_get_base_cpu(int cpu) +{ + return cpu - (cpu % (smp_cpu_mtid + 1)); +} + #ifdef CONFIG_HOTPLUG_CPU extern int smp_rescan_cpus(void); extern void __noreturn cpu_die(void); diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h index 7e9e09f600fa..7ecd8902a5c3 100644 --- a/arch/s390/include/asm/spinlock.h +++ b/arch/s390/include/asm/spinlock.h @@ -23,6 +23,14 @@ _raw_compare_and_swap(unsigned int *lock, unsigned int old, unsigned int new) return __sync_bool_compare_and_swap(lock, old, new); } +#ifndef CONFIG_SMP +static inline bool arch_vcpu_is_preempted(int cpu) { return false; } +#else +bool arch_vcpu_is_preempted(int cpu); +#endif + +#define vcpu_is_preempted arch_vcpu_is_preempted + /* * Simple spin lock operations. There are two variants, one clears IRQ's * on the local processor, one does not. 
diff --git a/arch/s390/include/asm/string.h b/arch/s390/include/asm/string.h index 8662f5c8e17f..15a3c005c274 100644 --- a/arch/s390/include/asm/string.h +++ b/arch/s390/include/asm/string.h @@ -14,6 +14,7 @@ #define __HAVE_ARCH_MEMCHR /* inline & arch function */ #define __HAVE_ARCH_MEMCMP /* arch function */ #define __HAVE_ARCH_MEMCPY /* gcc builtin & arch function */ +#define __HAVE_ARCH_MEMMOVE /* gcc builtin & arch function */ #define __HAVE_ARCH_MEMSCAN /* inline & arch function */ #define __HAVE_ARCH_MEMSET /* gcc builtin & arch function */ #define __HAVE_ARCH_STRCAT /* inline & arch function */ @@ -32,6 +33,7 @@ extern int memcmp(const void *, const void *, size_t); extern void *memcpy(void *, const void *, size_t); extern void *memset(void *, int, size_t); +extern void *memmove(void *, const void *, size_t); extern int strcmp(const char *,const char *); extern size_t strlcat(char *, const char *, size_t); extern size_t strlcpy(char *, const char *, size_t); @@ -40,7 +42,6 @@ extern char *strncpy(char *, const char *, size_t); extern char *strrchr(const char *, int); extern char *strstr(const char *, const char *); -#undef __HAVE_ARCH_MEMMOVE #undef __HAVE_ARCH_STRCHR #undef __HAVE_ARCH_STRNCHR #undef __HAVE_ARCH_STRNCMP diff --git a/arch/s390/include/asm/sysinfo.h b/arch/s390/include/asm/sysinfo.h index 2728114d5484..229326c942c7 100644 --- a/arch/s390/include/asm/sysinfo.h +++ b/arch/s390/include/asm/sysinfo.h @@ -107,6 +107,11 @@ struct sysinfo_2_2_2 { char reserved_3[5]; unsigned short cpus_dedicated; unsigned short cpus_shared; + char reserved_4[3]; + unsigned char vsne; + uuid_be uuid; + char reserved_5[160]; + char ext_name[256]; }; #define LPAR_CHAR_DEDICATED (1 << 7) @@ -127,7 +132,7 @@ struct sysinfo_3_2_2 { unsigned int caf; char cpi[16]; char reserved_1[3]; - char ext_name_encoding; + unsigned char evmne; unsigned int reserved_2; uuid_be uuid; } vm[8]; diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h index 
f15c0398c363..a5b54a445eb8 100644 --- a/arch/s390/include/asm/thread_info.h +++ b/arch/s390/include/asm/thread_info.h @@ -12,10 +12,10 @@ /* * Size of kernel stack for each process */ -#define THREAD_ORDER 2 +#define THREAD_SIZE_ORDER 2 #define ASYNC_ORDER 2 -#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) +#define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define ASYNC_SIZE (PAGE_SIZE << ASYNC_ORDER) #ifndef __ASSEMBLY__ @@ -30,15 +30,7 @@ * - if the contents of this structure are changed, the assembly constants must also be changed */ struct thread_info { - struct task_struct *task; /* main task structure */ unsigned long flags; /* low level flags */ - unsigned long sys_call_table; /* System call table address */ - unsigned int cpu; /* current CPU */ - int preempt_count; /* 0 => preemptable, <0 => BUG */ - unsigned int system_call; - __u64 user_timer; - __u64 system_timer; - unsigned long last_break; /* last breaking-event-address. */ }; /* @@ -46,26 +38,14 @@ struct thread_info { */ #define INIT_THREAD_INFO(tsk) \ { \ - .task = &tsk, \ .flags = 0, \ - .cpu = 0, \ - .preempt_count = INIT_PREEMPT_COUNT, \ } -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) -/* how to get the thread information struct from C */ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *) S390_lowcore.thread_info; -} - void arch_release_task_struct(struct task_struct *tsk); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src); -#define THREAD_SIZE_ORDER THREAD_ORDER - #endif /* diff --git a/arch/s390/include/asm/timex.h b/arch/s390/include/asm/timex.h index 0bb08f341c09..de8298800722 100644 --- a/arch/s390/include/asm/timex.h +++ b/arch/s390/include/asm/timex.h @@ -52,11 +52,9 @@ static inline void store_clock_comparator(__u64 *time) void clock_comparator_work(void); -void __init ptff_init(void); +void __init time_early_init(void); extern unsigned char 
ptff_function_mask[16]; -extern unsigned long lpar_offset; -extern unsigned long initial_leap_seconds; /* Function codes for the ptff instruction. */ #define PTFF_QAF 0x00 /* query available functions */ @@ -100,21 +98,28 @@ struct ptff_qui { unsigned int pad_0x5c[41]; } __packed; -static inline int ptff(void *ptff_block, size_t len, unsigned int func) -{ - typedef struct { char _[len]; } addrtype; - register unsigned int reg0 asm("0") = func; - register unsigned long reg1 asm("1") = (unsigned long) ptff_block; - int rc; - - asm volatile( - " .word 0x0104\n" - " ipm %0\n" - " srl %0,28\n" - : "=d" (rc), "+m" (*(addrtype *) ptff_block) - : "d" (reg0), "d" (reg1) : "cc"); - return rc; -} +/* + * ptff - Perform timing facility function + * @ptff_block: Pointer to ptff parameter block + * @len: Length of parameter block + * @func: Function code + * Returns: Condition code (0 on success) + */ +#define ptff(ptff_block, len, func) \ +({ \ + struct addrtype { char _[len]; }; \ + register unsigned int reg0 asm("0") = func; \ + register unsigned long reg1 asm("1") = (unsigned long) (ptff_block);\ + int rc; \ + \ + asm volatile( \ + " .word 0x0104\n" \ + " ipm %0\n" \ + " srl %0,28\n" \ + : "=d" (rc), "+m" (*(struct addrtype *) reg1) \ + : "d" (reg0), "d" (reg1) : "cc"); \ + rc; \ +}) static inline unsigned long long local_tick_disable(void) { diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 15711de10403..853b2a3d8dee 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -162,5 +156,13 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define 
tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) #define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} #endif /* _S390_TLB_H */ diff --git a/arch/s390/include/asm/topology.h b/arch/s390/include/asm/topology.h index f15f5571ca2b..fa1bfce10370 100644 --- a/arch/s390/include/asm/topology.h +++ b/arch/s390/include/asm/topology.h @@ -22,21 +22,22 @@ struct cpu_topology_s390 { cpumask_t drawer_mask; }; -DECLARE_PER_CPU(struct cpu_topology_s390, cpu_topology); - -#define topology_physical_package_id(cpu) (per_cpu(cpu_topology, cpu).socket_id) -#define topology_thread_id(cpu) (per_cpu(cpu_topology, cpu).thread_id) -#define topology_sibling_cpumask(cpu) \ - (&per_cpu(cpu_topology, cpu).thread_mask) -#define topology_core_id(cpu) (per_cpu(cpu_topology, cpu).core_id) -#define topology_core_cpumask(cpu) (&per_cpu(cpu_topology, cpu).core_mask) -#define topology_book_id(cpu) (per_cpu(cpu_topology, cpu).book_id) -#define topology_book_cpumask(cpu) (&per_cpu(cpu_topology, cpu).book_mask) -#define topology_drawer_id(cpu) (per_cpu(cpu_topology, cpu).drawer_id) -#define topology_drawer_cpumask(cpu) (&per_cpu(cpu_topology, cpu).drawer_mask) +extern struct cpu_topology_s390 cpu_topology[NR_CPUS]; +extern cpumask_t cpus_with_topology; + +#define topology_physical_package_id(cpu) (cpu_topology[cpu].socket_id) +#define topology_thread_id(cpu) (cpu_topology[cpu].thread_id) +#define topology_sibling_cpumask(cpu) (&cpu_topology[cpu].thread_mask) +#define topology_core_id(cpu) (cpu_topology[cpu].core_id) +#define topology_core_cpumask(cpu) (&cpu_topology[cpu].core_mask) +#define topology_book_id(cpu) (cpu_topology[cpu].book_id) +#define 
topology_book_cpumask(cpu) (&cpu_topology[cpu].book_mask) +#define topology_drawer_id(cpu) (cpu_topology[cpu].drawer_id) +#define topology_drawer_cpumask(cpu) (&cpu_topology[cpu].drawer_mask) #define mc_capable() 1 +void topology_init_early(void); int topology_cpu_init(struct cpu *); int topology_set_cpu_management(int fc); void topology_schedule_update(void); @@ -46,6 +47,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu); #else /* CONFIG_SCHED_TOPOLOGY */ +static inline void topology_init_early(void) { } static inline void topology_schedule_update(void) { } static inline int topology_cpu_init(struct cpu *cpu) { return 0; } static inline void topology_expect_change(void) { } @@ -65,7 +67,7 @@ static inline void topology_expect_change(void) { } #define cpu_to_node cpu_to_node static inline int cpu_to_node(int cpu) { - return per_cpu(cpu_topology, cpu).node_id; + return cpu_topology[cpu].node_id; } /* Returns a pointer to the cpumask of CPUs on node 'node'. */ diff --git a/arch/s390/include/asm/uaccess.h b/arch/s390/include/asm/uaccess.h index 52d7c8709279..f82b04e85a21 100644 --- a/arch/s390/include/asm/uaccess.h +++ b/arch/s390/include/asm/uaccess.h @@ -37,14 +37,14 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.mm_segment) -#define set_fs(x) \ -({ \ +#define set_fs(x) \ +{ \ unsigned long __pto; \ current->thread.mm_segment = (x); \ __pto = current->thread.mm_segment.ar4 ? \ S390_lowcore.user_asce : S390_lowcore.kernel_asce; \ __ctl_load(__pto, 7, 7); \ -}) +} #define segment_eq(a,b) ((a).ar4 == (b).ar4) diff --git a/arch/s390/include/asm/vdso.h b/arch/s390/include/asm/vdso.h index d0a2dbf2433d..88bdc477a843 100644 --- a/arch/s390/include/asm/vdso.h +++ b/arch/s390/include/asm/vdso.h @@ -33,6 +33,8 @@ struct vdso_data { __u32 ectg_available; /* ECTG instruction present 0x58 */ __u32 tk_mult; /* Mult. 
used for xtime_nsec 0x5c */ __u32 tk_shift; /* Shift used for xtime_nsec 0x60 */ + __u32 ts_dir; /* TOD steering direction 0x64 */ + __u64 ts_end; /* TOD steering end 0x68 */ }; struct vdso_per_cpu_data { diff --git a/arch/s390/include/uapi/asm/Kbuild b/arch/s390/include/uapi/asm/Kbuild index cc44b09c25fc..bf736e764cb4 100644 --- a/arch/s390/include/uapi/asm/Kbuild +++ b/arch/s390/include/uapi/asm/Kbuild @@ -12,6 +12,7 @@ header-y += dasd.h header-y += debug.h header-y += errno.h header-y += fcntl.h +header-y += hypfs.h header-y += ioctl.h header-y += ioctls.h header-y += ipcbuf.h @@ -29,16 +30,16 @@ header-y += ptrace.h header-y += qeth.h header-y += resource.h header-y += schid.h +header-y += sclp_ctl.h header-y += sembuf.h header-y += setup.h header-y += shmbuf.h +header-y += sie.h header-y += sigcontext.h header-y += siginfo.h header-y += signal.h header-y += socket.h header-y += sockios.h -header-y += sclp_ctl.h -header-y += sie.h header-y += stat.h header-y += statfs.h header-y += swab.h diff --git a/arch/s390/include/uapi/asm/socket.h b/arch/s390/include/uapi/asm/socket.h index 41b51c2f4f1b..b24a64cbfeb1 100644 --- a/arch/s390/include/uapi/asm/socket.h +++ b/arch/s390/include/uapi/asm/socket.h @@ -96,4 +96,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _ASM_SOCKET_H */ diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 1f0fe98f6db9..36b5101c8606 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -2,20 +2,47 @@ # Makefile for the linux kernel. 
# -KCOV_INSTRUMENT_early.o := n -KCOV_INSTRUMENT_sclp.o := n -KCOV_INSTRUMENT_als.o := n - ifdef CONFIG_FUNCTION_TRACER -# Don't trace early setup code and tracing code -CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) + +# Do not trace tracer code +CFLAGS_REMOVE_ftrace.o = $(CC_FLAGS_FTRACE) + +# Do not trace early setup code +CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_early.o = $(CC_FLAGS_FTRACE) +CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE) + +endif + +GCOV_PROFILE_als.o := n +GCOV_PROFILE_early.o := n +GCOV_PROFILE_sclp.o := n + +KCOV_INSTRUMENT_als.o := n +KCOV_INSTRUMENT_early.o := n +KCOV_INSTRUMENT_sclp.o := n + +UBSAN_SANITIZE_als.o := n +UBSAN_SANITIZE_early.o := n +UBSAN_SANITIZE_sclp.o := n + +# +# Use -march=z900 for sclp.c and als.c to be able to print an error +# message if the kernel is started on a machine which is too old +# +ifneq ($(CC_FLAGS_MARCH),-march=z900) +CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) +CFLAGS_als.o += -march=z900 +CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH) +CFLAGS_sclp.o += -march=z900 +AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) +AFLAGS_head.o += -march=z900 endif # # Passing null pointers is ok for smp code, since we access the lowcore here. 
# -CFLAGS_smp.o := -Wno-nonnull +CFLAGS_smp.o := -Wno-nonnull # # Disable tailcall optimizations for stack / callchain walking functions @@ -30,27 +57,7 @@ CFLAGS_dumpstack.o += -fno-optimize-sibling-calls # CFLAGS_ptrace.o += -DUTS_MACHINE='"$(UTS_MACHINE)"' -CFLAGS_sysinfo.o += -w - -# -# Use -march=z900 for sclp.c and als.c to be able to print an error -# message if the kernel is started on a machine which is too old -# -CFLAGS_REMOVE_sclp.o = $(CC_FLAGS_FTRACE) -CFLAGS_REMOVE_als.o = $(CC_FLAGS_FTRACE) -ifneq ($(CC_FLAGS_MARCH),-march=z900) -CFLAGS_REMOVE_sclp.o += $(CC_FLAGS_MARCH) -CFLAGS_sclp.o += -march=z900 -CFLAGS_REMOVE_als.o += $(CC_FLAGS_MARCH) -CFLAGS_als.o += -march=z900 -AFLAGS_REMOVE_head.o += $(CC_FLAGS_MARCH) -AFLAGS_head.o += -march=z900 -endif -GCOV_PROFILE_sclp.o := n -GCOV_PROFILE_als.o := n -UBSAN_SANITIZE_als.o := n -UBSAN_SANITIZE_early.o := n -UBSAN_SANITIZE_sclp.o := n +CFLAGS_sysinfo.o += -w obj-y := traps.o time.o process.o base.o early.o setup.o idle.o vtime.o obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index f3df9e0a5dec..c4b3570ded5b 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -25,12 +25,14 @@ int main(void) { /* task struct offsets */ - OFFSET(__TASK_thread_info, task_struct, stack); + OFFSET(__TASK_stack, task_struct, stack); OFFSET(__TASK_thread, task_struct, thread); OFFSET(__TASK_pid, task_struct, pid); BLANK(); /* thread struct offsets */ OFFSET(__THREAD_ksp, thread_struct, ksp); + OFFSET(__THREAD_sysc_table, thread_struct, sys_call_table); + OFFSET(__THREAD_last_break, thread_struct, last_break); OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc); OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs); OFFSET(__THREAD_per_cause, thread_struct, per_event.cause); @@ -39,14 +41,7 @@ int main(void) OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb); BLANK(); /* thread info offsets */ - 
OFFSET(__TI_task, thread_info, task); - OFFSET(__TI_flags, thread_info, flags); - OFFSET(__TI_sysc_table, thread_info, sys_call_table); - OFFSET(__TI_cpu, thread_info, cpu); - OFFSET(__TI_precount, thread_info, preempt_count); - OFFSET(__TI_user_timer, thread_info, user_timer); - OFFSET(__TI_system_timer, thread_info, system_timer); - OFFSET(__TI_last_break, thread_info, last_break); + OFFSET(__TI_flags, task_struct, thread_info.flags); BLANK(); /* pt_regs offsets */ OFFSET(__PT_ARGS, pt_regs, args); @@ -79,6 +74,8 @@ int main(void) OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available); OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult); OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift); + OFFSET(__VDSO_TS_DIR, vdso_data, ts_dir); + OFFSET(__VDSO_TS_END, vdso_data, ts_end); OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base); OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time); OFFSET(__VDSO_CPU_NR, vdso_per_cpu_data, cpu_nr); @@ -159,7 +156,6 @@ int main(void) OFFSET(__LC_INT_CLOCK, lowcore, int_clock); OFFSET(__LC_MCCK_CLOCK, lowcore, mcck_clock); OFFSET(__LC_CURRENT, lowcore, current_task); - OFFSET(__LC_THREAD_INFO, lowcore, thread_info); OFFSET(__LC_KERNEL_STACK, lowcore, kernel_stack); OFFSET(__LC_ASYNC_STACK, lowcore, async_stack); OFFSET(__LC_PANIC_STACK, lowcore, panic_stack); @@ -173,6 +169,7 @@ int main(void) OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); OFFSET(__LC_VDSO_PER_CPU, lowcore, vdso_per_cpu_data); OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); + OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); OFFSET(__LC_PASTE, lowcore, paste); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c index 4af60374eba0..6f2a6ab13cb5 100644 --- a/arch/s390/kernel/compat_signal.c +++ b/arch/s390/kernel/compat_signal.c @@ -446,7 +446,7 @@ static int setup_frame32(struct ksignal *ksig, sigset_t *set, /* set extra registers 
only for synchronous signals */ regs->gprs[4] = regs->int_code & 127; regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = task_thread_info(current)->last_break; + regs->gprs[6] = current->thread.last_break; } return 0; @@ -523,7 +523,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set, regs->gprs[2] = ksig->sig; regs->gprs[3] = (__force __u64) &frame->info; regs->gprs[4] = (__force __u64) &frame->uc; - regs->gprs[5] = task_thread_info(current)->last_break; + regs->gprs[5] = current->thread.last_break; return 0; } diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c index 2374c5b46bbc..d038c8cea6cb 100644 --- a/arch/s390/kernel/early.c +++ b/arch/s390/kernel/early.c @@ -293,6 +293,7 @@ static noinline __init void setup_lowcore_early(void) psw.addr = (unsigned long) s390_base_pgm_handler; S390_lowcore.program_new_psw = psw; s390_base_pgm_handler_fn = early_pgm_check_handler; + S390_lowcore.preempt_count = INIT_PREEMPT_COUNT; } static noinline __init void setup_facility_list(void) @@ -391,7 +392,49 @@ static int __init cad_init(void) } early_initcall(cad_init); -static __init void rescue_initrd(void) +static __init void memmove_early(void *dst, const void *src, size_t n) +{ + unsigned long addr; + long incr; + psw_t old; + + if (!n) + return; + incr = 1; + if (dst > src) { + incr = -incr; + dst += n - 1; + src += n - 1; + } + old = S390_lowcore.program_new_psw; + S390_lowcore.program_new_psw.mask = __extract_psw(); + asm volatile( + " larl %[addr],1f\n" + " stg %[addr],%[psw_pgm_addr]\n" + "0: mvc 0(1,%[dst]),0(%[src])\n" + " agr %[dst],%[incr]\n" + " agr %[src],%[incr]\n" + " brctg %[n],0b\n" + "1:\n" + : [addr] "=&d" (addr), + [psw_pgm_addr] "=&Q" (S390_lowcore.program_new_psw.addr), + [dst] "+&a" (dst), [src] "+&a" (src), [n] "+d" (n) + : [incr] "d" (incr) + : "cc", "memory"); + S390_lowcore.program_new_psw = old; +} + +static __init noinline void ipl_save_parameters(void) +{ + void *src, *dst; + + src = (void *)(unsigned long) 
S390_lowcore.ipl_parmblock_ptr; + dst = (void *) IPL_PARMBLOCK_ORIGIN; + memmove_early(dst, src, PAGE_SIZE); + S390_lowcore.ipl_parmblock_ptr = IPL_PARMBLOCK_ORIGIN; +} + +static __init noinline void rescue_initrd(void) { #ifdef CONFIG_BLK_DEV_INITRD unsigned long min_initrd_addr = (unsigned long) _end + (4UL << 20); @@ -405,7 +448,7 @@ static __init void rescue_initrd(void) return; if (INITRD_START >= min_initrd_addr) return; - memmove((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE); + memmove_early((void *) min_initrd_addr, (void *) INITRD_START, INITRD_SIZE); INITRD_START = min_initrd_addr; #endif } @@ -467,7 +510,8 @@ void __init startup_init(void) ipl_save_parameters(); rescue_initrd(); clear_bss_section(); - ptff_init(); + ipl_verify_parameters(); + time_early_init(); init_kernel_storage_key(); lockdep_off(); setup_lowcore_early(); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index 49a30737adde..97298c58b2be 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -42,7 +42,7 @@ __PT_R13 = __PT_GPRS + 104 __PT_R14 = __PT_GPRS + 112 __PT_R15 = __PT_GPRS + 120 -STACK_SHIFT = PAGE_SHIFT + THREAD_ORDER +STACK_SHIFT = PAGE_SHIFT + THREAD_SIZE_ORDER STACK_SIZE = 1 << STACK_SHIFT STACK_INIT = STACK_SIZE - STACK_FRAME_OVERHEAD - __PT_SIZE @@ -123,8 +123,14 @@ _PIF_WORK = (_PIF_PER_TRAP) .macro LAST_BREAK scratch srag \scratch,%r10,23 +#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES jz .+10 - stg %r10,__TI_last_break(%r12) + stg %r10,__TASK_thread+__THREAD_last_break(%r12) +#else + jz .+14 + lghi \scratch,__TASK_thread + stg %r10,__THREAD_last_break(\scratch,%r12) +#endif .endm .macro REENABLE_IRQS @@ -186,14 +192,13 @@ ENTRY(__switch_to) stmg %r6,%r15,__SF_GPRS(%r15) # store gprs of prev task lgr %r1,%r2 aghi %r1,__TASK_thread # thread_struct of prev task - lg %r5,__TASK_thread_info(%r3) # get thread_info of next + lg %r5,__TASK_stack(%r3) # start of kernel stack of next stg %r15,__THREAD_ksp(%r1) # store kernel stack of prev 
lgr %r1,%r3 aghi %r1,__TASK_thread # thread_struct of next task lgr %r15,%r5 aghi %r15,STACK_INIT # end of kernel stack of next stg %r3,__LC_CURRENT # store task struct of next - stg %r5,__LC_THREAD_INFO # store thread info of next stg %r15,__LC_KERNEL_STACK # store end of kernel stack lg %r15,__THREAD_ksp(%r1) # load kernel stack of next /* c4 is used in guest detection: arch/s390/kernel/perf_cpum_sf.c */ @@ -274,7 +279,7 @@ ENTRY(system_call) .Lsysc_stmg: stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT lghi %r14,_PIF_SYSCALL .Lsysc_per: lg %r15,__LC_KERNEL_STACK @@ -288,7 +293,13 @@ ENTRY(system_call) mvc __PT_INT_CODE(4,%r11),__LC_SVC_ILC stg %r14,__PT_FLAGS(%r11) .Lsysc_do_svc: - lg %r10,__TI_sysc_table(%r12) # address of system call table + # load address of system call table +#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES + lg %r10,__TASK_thread+__THREAD_sysc_table(%r12) +#else + lghi %r13,__TASK_thread + lg %r10,__THREAD_sysc_table(%r13,%r12) +#endif llgh %r8,__PT_INT_CODE+2(%r11) slag %r8,%r8,2 # shift and test for svc 0 jnz .Lsysc_nr_ok @@ -389,7 +400,6 @@ ENTRY(system_call) TSTMSK __PT_FLAGS(%r11),_PIF_SYSCALL jno .Lsysc_return lmg %r2,%r7,__PT_R2(%r11) # load svc arguments - lg %r10,__TI_sysc_table(%r12) # address of system call table lghi %r8,0 # svc 0 returns -ENOSYS llgh %r1,__PT_INT_CODE+2(%r11) # load new svc number cghi %r1,NR_syscalls @@ -457,7 +467,7 @@ ENTRY(system_call) # ENTRY(ret_from_fork) la %r11,STACK_FRAME_OVERHEAD(%r15) - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT brasl %r14,schedule_tail TRACE_IRQS_ON ssm __LC_SVC_NEW_PSW # reenable interrupts @@ -478,7 +488,7 @@ ENTRY(pgm_check_handler) stpt __LC_SYNC_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit @@ -501,7 +511,7 @@ ENTRY(pgm_check_handler) 2: LAST_BREAK %r14 
UPDATE_VTIME %r14,%r15,__LC_SYNC_ENTER_TIMER lg %r15,__LC_KERNEL_STACK - lg %r14,__TI_task(%r12) + lgr %r14,%r12 aghi %r14,__TASK_thread # pointer to thread_struct lghi %r13,__LC_PGM_TDB tm __LC_PGM_ILC+2,0x02 # check for transaction abort @@ -567,7 +577,7 @@ ENTRY(io_int_handler) stpt __LC_ASYNC_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_IO_OLD_PSW SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER @@ -626,7 +636,7 @@ ENTRY(io_int_handler) jo .Lio_work_user # yes -> do resched & signal #ifdef CONFIG_PREEMPT # check for preemptive scheduling - icm %r0,15,__TI_precount(%r12) + icm %r0,15,__LC_PREEMPT_COUNT jnz .Lio_restore # preemption is disabled TSTMSK __TI_flags(%r12),_TIF_NEED_RESCHED jno .Lio_restore @@ -741,7 +751,7 @@ ENTRY(ext_int_handler) stpt __LC_ASYNC_ENTER_TIMER stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_EXT_OLD_PSW SWITCH_ASYNC __LC_SAVE_AREA_ASYNC,__LC_ASYNC_ENTER_TIMER @@ -798,13 +808,10 @@ ENTRY(save_fpu_regs) TSTMSK __LC_CPU_FLAGS,_CIF_FPU bor %r14 stfpc __THREAD_FPU_fpc(%r2) -.Lsave_fpu_regs_fpc_end: lg %r3,__THREAD_FPU_regs(%r2) TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX jz .Lsave_fpu_regs_fp # no -> store FP regs -.Lsave_fpu_regs_vx_low: VSTM %v0,%v15,0,%r3 # vstm 0,15,0(3) -.Lsave_fpu_regs_vx_high: VSTM %v16,%v31,256,%r3 # vstm 16,31,256(3) j .Lsave_fpu_regs_done # -> set CIF_FPU flag .Lsave_fpu_regs_fp: @@ -851,9 +858,7 @@ load_fpu_regs: TSTMSK __LC_MACHINE_FLAGS,MACHINE_FLAG_VX lg %r4,__THREAD_FPU_regs(%r4) # %r4 <- reg save area jz .Lload_fpu_regs_fp # -> no VX, load FP regs -.Lload_fpu_regs_vx: VLM %v0,%v15,0,%r4 -.Lload_fpu_regs_vx_high: VLM %v16,%v31,256,%r4 j .Lload_fpu_regs_done .Lload_fpu_regs_fp: @@ -889,7 +894,7 @@ ENTRY(mcck_int_handler) spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer 
lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs lg %r10,__LC_LAST_BREAK - lg %r12,__LC_THREAD_INFO + lg %r12,__LC_CURRENT larl %r13,cleanup_critical lmg %r8,%r9,__LC_MCK_OLD_PSW TSTMSK __LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE @@ -948,7 +953,7 @@ ENTRY(mcck_int_handler) .Lmcck_panic: lg %r15,__LC_PANIC_STACK - aghi %r15,-(STACK_FRAME_OVERHEAD + __PT_SIZE) + la %r11,STACK_FRAME_OVERHEAD(%r15) j .Lmcck_skip # @@ -1085,7 +1090,7 @@ cleanup_critical: jhe 0f # set up saved registers r10 and r12 stg %r10,16(%r11) # r10 last break - stg %r12,32(%r11) # r12 thread-info pointer + stg %r12,32(%r11) # r12 task struct pointer 0: # check if the user time update has been done clg %r9,BASED(.Lcleanup_system_call_insn+24) jh 0f @@ -1106,7 +1111,9 @@ cleanup_critical: lg %r9,16(%r11) srag %r9,%r9,23 jz 0f - mvc __TI_last_break(8,%r12),16(%r11) + lgr %r9,%r12 + aghi %r9,__TASK_thread + mvc __THREAD_last_break(8,%r9),16(%r11) 0: # set up saved register r11 lg %r15,__LC_KERNEL_STACK la %r9,STACK_FRAME_OVERHEAD(%r15) diff --git a/arch/s390/kernel/head.S b/arch/s390/kernel/head.S index 4431905f8cfa..0b5ebf8a3d30 100644 --- a/arch/s390/kernel/head.S +++ b/arch/s390/kernel/head.S @@ -315,7 +315,7 @@ ENTRY(startup_kdump) jg startup_continue .Lstack: - .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_ORDER)) + .long 0x8000 + (1<<(PAGE_SHIFT+THREAD_SIZE_ORDER)) .align 8 6: .long 0x7fffffff,0xffffffff diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 03c2b469c472..482d3526e32b 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -32,11 +32,10 @@ ENTRY(startup_continue) # # Setup stack # - larl %r15,init_thread_union - stg %r15,__LC_THREAD_INFO # cache thread info in lowcore - lg %r14,__TI_task(%r15) # cache current in lowcore + larl %r14,init_task stg %r14,__LC_CURRENT - aghi %r15,1<<(PAGE_SHIFT+THREAD_ORDER) # init_task_union + THREAD_SIZE + larl %r15,init_thread_union + aghi %r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) # init_task_union + THREAD_SIZE 
stg %r15,__LC_KERNEL_STACK # set end of kernel stack aghi %r15,-160 # diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c index 295bfb7124bc..ff3364a067ff 100644 --- a/arch/s390/kernel/ipl.c +++ b/arch/s390/kernel/ipl.c @@ -1991,10 +1991,9 @@ void __init ipl_update_parameters(void) diag308_set_works = 1; } -void __init ipl_save_parameters(void) +void __init ipl_verify_parameters(void) { struct cio_iplinfo iplinfo; - void *src, *dst; if (cio_get_iplinfo(&iplinfo)) return; @@ -2005,10 +2004,6 @@ void __init ipl_save_parameters(void) if (!iplinfo.is_qdio) return; ipl_flags |= IPL_PARMBLOCK_VALID; - src = (void *)(unsigned long)S390_lowcore.ipl_parmblock_ptr; - dst = (void *)IPL_PARMBLOCK_ORIGIN; - memmove(dst, src, PAGE_SIZE); - S390_lowcore.ipl_parmblock_ptr = IPL_PARMBLOCK_ORIGIN; } static LIST_HEAD(rcall); diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c index 285d6561076d..ef60f4177331 100644 --- a/arch/s390/kernel/irq.c +++ b/arch/s390/kernel/irq.c @@ -168,7 +168,7 @@ void do_softirq_own_stack(void) old = current_stack_pointer(); /* Check against async. stack address range. */ new = S390_lowcore.async_stack; - if (((new - old) >> (PAGE_SHIFT + THREAD_ORDER)) != 0) { + if (((new - old) >> (PAGE_SHIFT + THREAD_SIZE_ORDER)) != 0) { /* Need to switch to the async. stack. 
*/ new -= STACK_FRAME_OVERHEAD; ((struct stack_frame *) new)->back_chain = old; diff --git a/arch/s390/kernel/lgr.c b/arch/s390/kernel/lgr.c index 6ea6d69339b5..ae7dff110054 100644 --- a/arch/s390/kernel/lgr.c +++ b/arch/s390/kernel/lgr.c @@ -5,7 +5,8 @@ * Author(s): Michael Holzheu <holzheu@linux.vnet.ibm.com> */ -#include <linux/module.h> +#include <linux/init.h> +#include <linux/export.h> #include <linux/timer.h> #include <linux/slab.h> #include <asm/facility.h> @@ -183,4 +184,4 @@ static int __init lgr_init(void) lgr_timer_set(); return 0; } -module_init(lgr_init); +device_initcall(lgr_init); diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index fcc634c1479a..763dec18edcd 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -995,39 +995,36 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) regs.int_parm = CPU_MF_INT_SF_PRA; sde_regs = (struct perf_sf_sde_regs *) &regs.int_parm_long; - regs.psw.addr = sfr->basic.ia; - if (sfr->basic.T) - regs.psw.mask |= PSW_MASK_DAT; - if (sfr->basic.W) - regs.psw.mask |= PSW_MASK_WAIT; - if (sfr->basic.P) - regs.psw.mask |= PSW_MASK_PSTATE; - switch (sfr->basic.AS) { - case 0x0: - regs.psw.mask |= PSW_ASC_PRIMARY; - break; - case 0x1: - regs.psw.mask |= PSW_ASC_ACCREG; - break; - case 0x2: - regs.psw.mask |= PSW_ASC_SECONDARY; - break; - case 0x3: - regs.psw.mask |= PSW_ASC_HOME; - break; - } + psw_bits(regs.psw).ia = sfr->basic.ia; + psw_bits(regs.psw).t = sfr->basic.T; + psw_bits(regs.psw).w = sfr->basic.W; + psw_bits(regs.psw).p = sfr->basic.P; + psw_bits(regs.psw).as = sfr->basic.AS; /* - * A non-zero guest program parameter indicates a guest - * sample. - * Note that some early samples or samples from guests without + * Use the hardware provided configuration level to decide if the + * sample belongs to a guest or host.
If that is not available, + * fall back to the following heuristics: + * A non-zero guest program parameter always indicates a guest + * sample. Some early samples or samples from guests without * lpp usage would be misaccounted to the host. We use the asn - * value as a heuristic to detect most of these guest samples. - * If the value differs from the host hpp value, we assume - * it to be a KVM guest. + * value as an addon heuristic to detect most of these guest samples. + * If the value differs from the host hpp value, we assume to be a + * KVM guest. */ - if (sfr->basic.gpp || sfr->basic.prim_asn != (u16) sfr->basic.hpp) + switch (sfr->basic.CL) { + case 1: /* logical partition */ + sde_regs->in_guest = 0; + break; + case 2: /* virtual machine */ sde_regs->in_guest = 1; + break; + default: /* old machine, use heuristics */ + if (sfr->basic.gpp || + sfr->basic.prim_asn != (u16)sfr->basic.hpp) + sde_regs->in_guest = 1; + break; + } overflow = 0; if (perf_exclude_event(event, &regs, sde_regs)) diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index bba4fa74b321..400d14f0b9f5 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -103,7 +103,6 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) int copy_thread(unsigned long clone_flags, unsigned long new_stackp, unsigned long arg, struct task_struct *p) { - struct thread_info *ti; struct fake_frame { struct stack_frame sf; @@ -121,9 +120,8 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, memset(&p->thread.per_event, 0, sizeof(p->thread.per_event)); clear_tsk_thread_flag(p, TIF_SINGLE_STEP); /* Initialize per thread user and system timer values */ - ti = task_thread_info(p); - ti->user_timer = 0; - ti->system_timer = 0; + p->thread.user_timer = 0; + p->thread.system_timer = 0; frame->sf.back_chain = 0; /* new return point is ret_from_fork */ diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c index
81d0808085e6..9e60ef144d03 100644 --- a/arch/s390/kernel/processor.c +++ b/arch/s390/kernel/processor.c @@ -53,7 +53,7 @@ void s390_update_cpu_mhz(void) on_each_cpu(update_cpu_mhz, NULL, 0); } -void notrace cpu_relax(void) +void notrace cpu_relax_yield(void) { if (!smp_cpu_mtid && MACHINE_HAS_DIAG44) { diag_stat_inc(DIAG_STAT_X044); @@ -61,7 +61,7 @@ void notrace cpu_relax(void) } barrier(); } -EXPORT_SYMBOL(cpu_relax); +EXPORT_SYMBOL(cpu_relax_yield); /* * cpu_init - initializes state that is per-CPU. diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 9336e824e2db..b81ab8882e2e 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -461,7 +461,7 @@ long arch_ptrace(struct task_struct *child, long request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(task_thread_info(child)->last_break, + put_user(child->thread.last_break, (unsigned long __user *) data); return 0; case PTRACE_ENABLE_TE: @@ -811,7 +811,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, } return 0; case PTRACE_GET_LAST_BREAK: - put_user(task_thread_info(child)->last_break, + put_user(child->thread.last_break, (unsigned int __user *) data); return 0; } @@ -997,10 +997,10 @@ static int s390_last_break_get(struct task_struct *target, if (count > 0) { if (kbuf) { unsigned long *k = kbuf; - *k = task_thread_info(target)->last_break; + *k = target->thread.last_break; } else { unsigned long __user *u = ubuf; - if (__put_user(task_thread_info(target)->last_break, u)) + if (__put_user(target->thread.last_break, u)) return -EFAULT; } } @@ -1113,7 +1113,7 @@ static int s390_system_call_get(struct task_struct *target, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { - unsigned int *data = &task_thread_info(target)->system_call; + unsigned int *data = &target->thread.system_call; return user_regset_copyout(&pos, &count, &kbuf, &ubuf, data, 0, sizeof(unsigned int)); } @@ -1123,7 +1123,7 @@ static int 
s390_system_call_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - unsigned int *data = &task_thread_info(target)->system_call; + unsigned int *data = &target->thread.system_call; return user_regset_copyin(&pos, &count, &kbuf, &ubuf, data, 0, sizeof(unsigned int)); } @@ -1327,7 +1327,7 @@ static int s390_compat_last_break_get(struct task_struct *target, compat_ulong_t last_break; if (count > 0) { - last_break = task_thread_info(target)->last_break; + last_break = target->thread.last_break; if (kbuf) { unsigned long *k = kbuf; *k = last_break; diff --git a/arch/s390/kernel/setup.c b/arch/s390/kernel/setup.c index 7f7ba5f23f13..adfac9f0a89f 100644 --- a/arch/s390/kernel/setup.c +++ b/arch/s390/kernel/setup.c @@ -35,6 +35,7 @@ #include <linux/root_dev.h> #include <linux/console.h> #include <linux/kernel_stat.h> +#include <linux/dma-contiguous.h> #include <linux/device.h> #include <linux/notifier.h> #include <linux/pfn.h> @@ -303,7 +304,7 @@ static void __init setup_lowcore(void) * Setup lowcore for boot cpu */ BUILD_BUG_ON(sizeof(struct lowcore) != LC_PAGES * 4096); - lc = __alloc_bootmem_low(LC_PAGES * PAGE_SIZE, LC_PAGES * PAGE_SIZE, 0); + lc = memblock_virt_alloc_low(sizeof(*lc), sizeof(*lc)); lc->restart_psw.mask = PSW_KERNEL_BITS; lc->restart_psw.addr = (unsigned long) restart_int_handler; lc->external_new_psw.mask = PSW_KERNEL_BITS | @@ -324,15 +325,15 @@ static void __init setup_lowcore(void) lc->kernel_stack = ((unsigned long) &init_thread_union) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->async_stack = (unsigned long) - __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0) + memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE) + ASYNC_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); lc->panic_stack = (unsigned long) - __alloc_bootmem(PAGE_SIZE, PAGE_SIZE, 0) + memblock_virt_alloc(PAGE_SIZE, PAGE_SIZE) + PAGE_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->current_task = 
(unsigned long) init_thread_union.thread_info.task; - lc->thread_info = (unsigned long) &init_thread_union; + lc->current_task = (unsigned long)&init_task; lc->lpp = LPP_MAGIC; lc->machine_flags = S390_lowcore.machine_flags; + lc->preempt_count = S390_lowcore.preempt_count; lc->stfl_fac_list = S390_lowcore.stfl_fac_list; memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, MAX_FACILITY_BIT/8); @@ -349,7 +350,7 @@ static void __init setup_lowcore(void) lc->last_update_timer = S390_lowcore.last_update_timer; lc->last_update_clock = S390_lowcore.last_update_clock; - restart_stack = __alloc_bootmem(ASYNC_SIZE, ASYNC_SIZE, 0); + restart_stack = memblock_virt_alloc(ASYNC_SIZE, ASYNC_SIZE); restart_stack += ASYNC_SIZE; /* @@ -412,7 +413,7 @@ static void __init setup_resources(void) bss_resource.end = (unsigned long) &__bss_stop - 1; for_each_memblock(memory, reg) { - res = alloc_bootmem_low(sizeof(*res)); + res = memblock_virt_alloc(sizeof(*res), 8); res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; res->name = "System RAM"; @@ -426,7 +427,7 @@ static void __init setup_resources(void) std_res->start > res->end) continue; if (std_res->end > res->end) { - sub_res = alloc_bootmem_low(sizeof(*sub_res)); + sub_res = memblock_virt_alloc(sizeof(*sub_res), 8); *sub_res = *std_res; sub_res->end = res->end; std_res->start = res->end + 1; @@ -445,7 +446,7 @@ static void __init setup_resources(void) * part of the System RAM resource. 
*/ if (crashk_res.end) { - memblock_add(crashk_res.start, resource_size(&crashk_res)); + memblock_add_node(crashk_res.start, resource_size(&crashk_res), 0); memblock_reserve(crashk_res.start, resource_size(&crashk_res)); insert_resource(&iomem_resource, &crashk_res); } @@ -903,6 +904,7 @@ void __init setup_arch(char **cmdline_p) setup_memory_end(); setup_memory(); + dma_contiguous_reserve(memory_end); check_initrd(); reserve_crashkernel(); @@ -921,6 +923,8 @@ void __init setup_arch(char **cmdline_p) cpu_detect_mhz_feature(); cpu_init(); numa_setup(); + smp_detect_cpus(); + topology_init_early(); /* * Create kernel page tables and switch to virtual addressing. diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c index d82562cf0a0e..9f241d1efeda 100644 --- a/arch/s390/kernel/signal.c +++ b/arch/s390/kernel/signal.c @@ -359,7 +359,7 @@ static int setup_frame(int sig, struct k_sigaction *ka, /* set extra registers only for synchronous signals */ regs->gprs[4] = regs->int_code & 127; regs->gprs[5] = regs->int_parm_long; - regs->gprs[6] = task_thread_info(current)->last_break; + regs->gprs[6] = current->thread.last_break; } return 0; } @@ -430,7 +430,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set, regs->gprs[2] = ksig->sig; regs->gprs[3] = (unsigned long) &frame->info; regs->gprs[4] = (unsigned long) &frame->uc; - regs->gprs[5] = task_thread_info(current)->last_break; + regs->gprs[5] = current->thread.last_break; return 0; } @@ -467,13 +467,13 @@ void do_signal(struct pt_regs *regs) * the debugger may change all our registers, including the system * call information. */ - current_thread_info()->system_call = + current->thread.system_call = test_pt_regs_flag(regs, PIF_SYSCALL) ? regs->int_code : 0; if (get_signal(&ksig)) { /* Whee! Actually deliver the signal. 
*/ - if (current_thread_info()->system_call) { - regs->int_code = current_thread_info()->system_call; + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; /* Check for system call restarting. */ switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: @@ -506,8 +506,8 @@ void do_signal(struct pt_regs *regs) /* No handlers present - check for system call restart */ clear_pt_regs_flag(regs, PIF_SYSCALL); - if (current_thread_info()->system_call) { - regs->int_code = current_thread_info()->system_call; + if (current->thread.system_call) { + regs->int_code = current->thread.system_call; switch (regs->gprs[2]) { case -ERESTART_RESTARTBLOCK: /* Restart with sys_restart_syscall */ diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index 35531fe1c5ea..e49f61aadaf9 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -19,6 +19,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/workqueue.h> +#include <linux/bootmem.h> #include <linux/module.h> #include <linux/init.h> #include <linux/mm.h> @@ -259,16 +260,14 @@ static void pcpu_prepare_secondary(struct pcpu *pcpu, int cpu) static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) { struct lowcore *lc = pcpu->lowcore; - struct thread_info *ti = task_thread_info(tsk); lc->kernel_stack = (unsigned long) task_stack_page(tsk) + THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs); - lc->thread_info = (unsigned long) task_thread_info(tsk); lc->current_task = (unsigned long) tsk; lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; - lc->user_timer = ti->user_timer; - lc->system_timer = ti->system_timer; + lc->user_timer = tsk->thread.user_timer; + lc->system_timer = tsk->thread.system_timer; lc->steal_timer = 0; } @@ -368,10 +367,15 @@ int smp_find_processor_id(u16 address) return -1; } -int smp_vcpu_scheduled(int cpu) +bool arch_vcpu_is_preempted(int cpu) { - return pcpu_running(pcpu_devices + cpu); + if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) 
+ return false; + if (pcpu_running(pcpu_devices + cpu)) + return false; + return true; } +EXPORT_SYMBOL(arch_vcpu_is_preempted); void smp_yield_cpu(int cpu) { @@ -657,14 +661,12 @@ int smp_cpu_get_polarization(int cpu) return pcpu_devices[cpu].polarization; } -static struct sclp_core_info *smp_get_core_info(void) +static void __ref smp_get_core_info(struct sclp_core_info *info, int early) { static int use_sigp_detection; - struct sclp_core_info *info; int address; - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (info && (use_sigp_detection || sclp_get_core_info(info))) { + if (use_sigp_detection || sclp_get_core_info(info, early)) { use_sigp_detection = 1; for (address = 0; address < (SCLP_MAX_CORES << smp_cpu_mt_shift); @@ -678,7 +680,6 @@ static struct sclp_core_info *smp_get_core_info(void) } info->combined = info->configured; } - return info; } static int smp_add_present_cpu(int cpu); @@ -719,17 +720,15 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, int sysfs_add) return nr; } -static void __init smp_detect_cpus(void) +void __init smp_detect_cpus(void) { unsigned int cpu, mtid, c_cpus, s_cpus; struct sclp_core_info *info; u16 address; /* Get CPU information */ - info = smp_get_core_info(); - if (!info) - panic("smp_detect_cpus failed to allocate memory\n"); - + info = memblock_virt_alloc(sizeof(*info), 8); + smp_get_core_info(info, 1); /* Find boot CPU type */ if (sclp.has_core_type) { address = stap(); @@ -765,7 +764,7 @@ static void __init smp_detect_cpus(void) get_online_cpus(); __smp_rescan_cpus(info, 0); put_online_cpus(); - kfree(info); + memblock_free_early((unsigned long)info, sizeof(*info)); } /* @@ -802,7 +801,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) pcpu = pcpu_devices + cpu; if (pcpu->state != CPU_STATE_CONFIGURED) return -EIO; - base = cpu - (cpu % (smp_cpu_mtid + 1)); + base = smp_get_base_cpu(cpu); for (i = 0; i <= smp_cpu_mtid; i++) { if (base + i < nr_cpu_ids) if (cpu_online(base + i)) @@ -902,7 +901,6 @@ 
void __init smp_prepare_cpus(unsigned int max_cpus) /* request the 0x1202 external call external interrupt */ if (register_external_irq(EXT_IRQ_EXTERNAL_CALL, do_ext_call_interrupt)) panic("Couldn't request external interrupt 0x1202"); - smp_detect_cpus(); } void __init smp_prepare_boot_cpu(void) @@ -968,7 +966,7 @@ static ssize_t cpu_configure_store(struct device *dev, rc = -EBUSY; /* disallow configuration changes of online cpus and cpu 0 */ cpu = dev->id; - cpu -= cpu % (smp_cpu_mtid + 1); + cpu = smp_get_base_cpu(cpu); if (cpu == 0) goto out; for (i = 0; i <= smp_cpu_mtid; i++) @@ -1047,22 +1045,18 @@ static struct attribute_group cpu_online_attr_group = { .attrs = cpu_online_attrs, }; -static int smp_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) +static int smp_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned int)(long)hcpu; struct device *s = &per_cpu(cpu_device, cpu)->dev; - int err = 0; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - err = sysfs_create_group(&s->kobj, &cpu_online_attr_group); - break; - case CPU_DEAD: - sysfs_remove_group(&s->kobj, &cpu_online_attr_group); - break; - } - return notifier_from_errno(err); + return sysfs_create_group(&s->kobj, &cpu_online_attr_group); +} +static int smp_cpu_pre_down(unsigned int cpu) +{ + struct device *s = &per_cpu(cpu_device, cpu)->dev; + + sysfs_remove_group(&s->kobj, &cpu_online_attr_group); + return 0; } static int smp_add_present_cpu(int cpu) @@ -1083,20 +1077,12 @@ static int smp_add_present_cpu(int cpu) rc = sysfs_create_group(&s->kobj, &cpu_common_attr_group); if (rc) goto out_cpu; - if (cpu_online(cpu)) { - rc = sysfs_create_group(&s->kobj, &cpu_online_attr_group); - if (rc) - goto out_online; - } rc = topology_cpu_init(c); if (rc) goto out_topology; return 0; out_topology: - if (cpu_online(cpu)) - sysfs_remove_group(&s->kobj, &cpu_online_attr_group); -out_online: sysfs_remove_group(&s->kobj, &cpu_common_attr_group); out_cpu: #ifdef 
CONFIG_HOTPLUG_CPU @@ -1113,9 +1099,10 @@ int __ref smp_rescan_cpus(void) struct sclp_core_info *info; int nr; - info = smp_get_core_info(); + info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; + smp_get_core_info(info, 0); get_online_cpus(); mutex_lock(&smp_cpu_state_mutex); nr = __smp_rescan_cpus(info, 1); @@ -1149,17 +1136,15 @@ static int __init s390_smp_init(void) if (rc) return rc; #endif - cpu_notifier_register_begin(); for_each_present_cpu(cpu) { rc = smp_add_present_cpu(cpu); if (rc) goto out; } - __hotcpu_notifier(smp_cpu_notify, 0); - + rc = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "s390/smp:online", + smp_cpu_online, smp_cpu_pre_down); out: - cpu_notifier_register_done(); return rc; } subsys_initcall(s390_smp_init); diff --git a/arch/s390/kernel/swsusp.S b/arch/s390/kernel/swsusp.S index 2d6b6e81f812..1ff21f05d7dd 100644 --- a/arch/s390/kernel/swsusp.S +++ b/arch/s390/kernel/swsusp.S @@ -194,7 +194,7 @@ pgm_check_entry: /* Suspend CPU not available -> panic */ larl %r15,init_thread_union - ahi %r15,1<<(PAGE_SHIFT+THREAD_ORDER) + ahi %r15,1<<(PAGE_SHIFT+THREAD_SIZE_ORDER) larl %r2,.Lpanic_string larl %r3,_sclp_print_early lghi %r1,0 diff --git a/arch/s390/kernel/sysinfo.c b/arch/s390/kernel/sysinfo.c index bfda6aa40280..24021c1e3ecb 100644 --- a/arch/s390/kernel/sysinfo.c +++ b/arch/s390/kernel/sysinfo.c @@ -56,6 +56,20 @@ int stsi(void *sysinfo, int fc, int sel1, int sel2) } EXPORT_SYMBOL(stsi); +static bool convert_ext_name(unsigned char encoding, char *name, size_t len) +{ + switch (encoding) { + case 1: /* EBCDIC */ + EBCASC(name, len); + break; + case 2: /* UTF-8 */ + break; + default: + return false; + } + return true; +} + static void stsi_1_1_1(struct seq_file *m, struct sysinfo_1_1_1 *info) { int i; @@ -207,24 +221,19 @@ static void stsi_2_2_2(struct seq_file *m, struct sysinfo_2_2_2 *info) seq_printf(m, "LPAR CPUs S-MTID: %d\n", info->mt_stid); seq_printf(m, "LPAR CPUs PS-MTID: %d\n", info->mt_psmtid); } + if 
(convert_ext_name(info->vsne, info->ext_name, sizeof(info->ext_name))) { + seq_printf(m, "LPAR Extended Name: %-.256s\n", info->ext_name); + seq_printf(m, "LPAR UUID: %pUb\n", &info->uuid); + } } static void print_ext_name(struct seq_file *m, int lvl, struct sysinfo_3_2_2 *info) { - if (info->vm[lvl].ext_name_encoding == 0) - return; - if (info->ext_names[lvl][0] == 0) - return; - switch (info->vm[lvl].ext_name_encoding) { - case 1: /* EBCDIC */ - EBCASC(info->ext_names[lvl], sizeof(info->ext_names[lvl])); - break; - case 2: /* UTF-8 */ - break; - default: + size_t len = sizeof(info->ext_names[lvl]); + + if (!convert_ext_name(info->vm[lvl].evmne, info->ext_names[lvl], len)) return; - } seq_printf(m, "VM%02d Extended Name: %-.256s\n", lvl, info->ext_names[lvl]); } diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c index 0bfcc492987e..867d0a057046 100644 --- a/arch/s390/kernel/time.c +++ b/arch/s390/kernel/time.c @@ -59,19 +59,27 @@ ATOMIC_NOTIFIER_HEAD(s390_epoch_delta_notifier); EXPORT_SYMBOL(s390_epoch_delta_notifier); unsigned char ptff_function_mask[16]; -unsigned long lpar_offset; -unsigned long initial_leap_seconds; + +static unsigned long long lpar_offset; +static unsigned long long initial_leap_seconds; +static unsigned long long tod_steering_end; +static long long tod_steering_delta; /* * Get time offsets with PTFF */ -void __init ptff_init(void) +void __init time_early_init(void) { struct ptff_qto qto; struct ptff_qui qui; + /* Initialize TOD steering parameters */ + tod_steering_end = sched_clock_base_cc; + vdso_data->ts_end = tod_steering_end; + if (!test_facility(28)) return; + ptff(&ptff_function_mask, sizeof(ptff_function_mask), PTFF_QAF); /* get LPAR offset */ @@ -80,7 +88,7 @@ void __init ptff_init(void) /* get initial leap seconds */ if (ptff_query(PTFF_QUI) && ptff(&qui, sizeof(qui), PTFF_QUI) == 0) - initial_leap_seconds = (unsigned long) + initial_leap_seconds = (unsigned long long) ((long) qui.old_leap * 4096000000L); } @@ -123,18 
+131,6 @@ void clock_comparator_work(void) cd->event_handler(cd); } -/* - * Fixup the clock comparator. - */ -static void fixup_clock_comparator(unsigned long long delta) -{ - /* If nobody is waiting there's nothing to fix. */ - if (S390_lowcore.clock_comparator == -1ULL) - return; - S390_lowcore.clock_comparator += delta; - set_clock_comparator(S390_lowcore.clock_comparator); -} - static int s390_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -215,7 +211,21 @@ void read_boot_clock64(struct timespec64 *ts) static cycle_t read_tod_clock(struct clocksource *cs) { - return get_tod_clock(); + unsigned long long now, adj; + + preempt_disable(); /* protect from changes to steering parameters */ + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* + * manually steer by 1 cycle every 2^16 cycles. This + * corresponds to shifting the tod delta by 15. 1s is + * therefore steered in ~9h. The adjust will decrease + * over time, until it finally reaches 0. + */ + now += (tod_steering_delta < 0) ? (adj >> 15) : -(adj >> 15); + preempt_enable(); + return now; } static struct clocksource clocksource_tod = { @@ -384,6 +394,55 @@ static inline int check_sync_clock(void) return rc; } +/* + * Apply clock delta to the global data structures. + * This is called once on the CPU that performed the clock sync. + */ +static void clock_sync_global(unsigned long long delta) +{ + unsigned long now, adj; + struct ptff_qto qto; + + /* Fixup the monotonic sched clock. */ + sched_clock_base_cc += delta; + /* Adjust TOD steering parameters. */ + vdso_data->tb_update_count++; + now = get_tod_clock(); + adj = tod_steering_end - now; + if (unlikely((s64) adj >= 0)) + /* Calculate how much of the old adjustment is left. */ + tod_steering_delta = (tod_steering_delta < 0) ? 
+ -(adj >> 15) : (adj >> 15); + tod_steering_delta += delta; + if ((abs(tod_steering_delta) >> 48) != 0) + panic("TOD clock sync offset %lli is too large to drift\n", + tod_steering_delta); + tod_steering_end = now + (abs(tod_steering_delta) << 15); + vdso_data->ts_dir = (tod_steering_delta < 0) ? 0 : 1; + vdso_data->ts_end = tod_steering_end; + vdso_data->tb_update_count++; + /* Update LPAR offset. */ + if (ptff_query(PTFF_QTO) && ptff(&qto, sizeof(qto), PTFF_QTO) == 0) + lpar_offset = qto.tod_epoch_difference; + /* Call the TOD clock change notifier. */ + atomic_notifier_call_chain(&s390_epoch_delta_notifier, 0, &delta); +} + +/* + * Apply clock delta to the per-CPU data structures of this CPU. + * This is called for each online CPU after the call to clock_sync_global. + */ +static void clock_sync_local(unsigned long long delta) +{ + /* Add the delta to the clock comparator. */ + if (S390_lowcore.clock_comparator != -1ULL) { + S390_lowcore.clock_comparator += delta; + set_clock_comparator(S390_lowcore.clock_comparator); + } + /* Adjust the last_update_clock time-stamp. */ + S390_lowcore.last_update_clock += delta; +} + /* Single threaded workqueue used for stp sync events */ static struct workqueue_struct *time_sync_wq; @@ -397,31 +456,9 @@ static void __init time_init_wq(void) struct clock_sync_data { atomic_t cpus; int in_sync; - unsigned long long fixup_cc; + unsigned long long clock_delta; }; -static void clock_sync_cpu(struct clock_sync_data *sync) -{ - atomic_dec(&sync->cpus); - enable_sync_clock(); - while (sync->in_sync == 0) { - __udelay(1); - /* - * A different cpu changes *in_sync. Therefore use - * barrier() to force memory access. - */ - barrier(); - } - if (sync->in_sync != 1) - /* Didn't work. Clear per-cpu in sync bit again. */ - disable_sync_clock(NULL); - /* - * This round of TOD syncing is done. Set the clock comparator - * to the next tick and let the processor continue. 
- */ - fixup_clock_comparator(sync->fixup_cc); -} - /* * Server Time Protocol (STP) code. */ @@ -523,54 +560,46 @@ void stp_queue_work(void) static int stp_sync_clock(void *data) { - static int first; + struct clock_sync_data *sync = data; unsigned long long clock_delta; - struct clock_sync_data *stp_sync; - struct ptff_qto qto; + static int first; int rc; - stp_sync = data; - - if (xchg(&first, 1) == 1) { - /* Slave */ - clock_sync_cpu(stp_sync); - return 0; - } - - /* Wait until all other cpus entered the sync function. */ - while (atomic_read(&stp_sync->cpus) != 0) - cpu_relax(); - enable_sync_clock(); - - rc = 0; - if (stp_info.todoff[0] || stp_info.todoff[1] || - stp_info.todoff[2] || stp_info.todoff[3] || - stp_info.tmd != 2) { - rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, &clock_delta); - if (rc == 0) { - /* fixup the monotonic sched clock */ - sched_clock_base_cc += clock_delta; - if (ptff_query(PTFF_QTO) && - ptff(&qto, sizeof(qto), PTFF_QTO) == 0) - /* Update LPAR offset */ - lpar_offset = qto.tod_epoch_difference; - atomic_notifier_call_chain(&s390_epoch_delta_notifier, - 0, &clock_delta); - stp_sync->fixup_cc = clock_delta; - fixup_clock_comparator(clock_delta); - rc = chsc_sstpi(stp_page, &stp_info, - sizeof(struct stp_sstpi)); - if (rc == 0 && stp_info.tmd != 2) - rc = -EAGAIN; + if (xchg(&first, 1) == 0) { + /* Wait until all other cpus entered the sync function. */ + while (atomic_read(&sync->cpus) != 0) + cpu_relax(); + rc = 0; + if (stp_info.todoff[0] || stp_info.todoff[1] || + stp_info.todoff[2] || stp_info.todoff[3] || + stp_info.tmd != 2) { + rc = chsc_sstpc(stp_page, STP_OP_SYNC, 0, + &clock_delta); + if (rc == 0) { + sync->clock_delta = clock_delta; + clock_sync_global(clock_delta); + rc = chsc_sstpi(stp_page, &stp_info, + sizeof(struct stp_sstpi)); + if (rc == 0 && stp_info.tmd != 2) + rc = -EAGAIN; + } } + sync->in_sync = rc ? -EAGAIN : 1; + xchg(&first, 0); + } else { + /* Slave */ + atomic_dec(&sync->cpus); + /* Wait for in_sync to be set. 
*/ + while (READ_ONCE(sync->in_sync) == 0) + __udelay(1); } - if (rc) { + if (sync->in_sync != 1) + /* Didn't work. Clear per-cpu in sync bit again. */ disable_sync_clock(NULL); - stp_sync->in_sync = -EAGAIN; - } else - stp_sync->in_sync = 1; - xchg(&first, 0); + /* Apply clock delta to per-CPU fields of this CPU. */ + clock_sync_local(sync->clock_delta); + return 0; } diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index e959c02e0cac..93dcbae1e98d 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt #include <linux/workqueue.h> +#include <linux/bootmem.h> #include <linux/cpuset.h> #include <linux/device.h> #include <linux/export.h> @@ -41,15 +42,17 @@ static bool topology_enabled = true; static DECLARE_WORK(topology_work, topology_work_fn); /* - * Socket/Book linked lists and per_cpu(cpu_topology) updates are + * Socket/Book linked lists and cpu_topology updates are * protected by "sched_domains_mutex". 
*/ static struct mask_info socket_info; static struct mask_info book_info; static struct mask_info drawer_info; -DEFINE_PER_CPU(struct cpu_topology_s390, cpu_topology); -EXPORT_PER_CPU_SYMBOL_GPL(cpu_topology); +struct cpu_topology_s390 cpu_topology[NR_CPUS]; +EXPORT_SYMBOL_GPL(cpu_topology); + +cpumask_t cpus_with_topology; static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) { @@ -97,7 +100,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, if (lcpu < 0) continue; for (i = 0; i <= smp_cpu_mtid; i++) { - topo = &per_cpu(cpu_topology, lcpu + i); + topo = &cpu_topology[lcpu + i]; topo->drawer_id = drawer->id; topo->book_id = book->id; topo->socket_id = socket->id; @@ -106,6 +109,7 @@ static void add_cpus_to_mask(struct topology_core *tl_core, cpumask_set_cpu(lcpu + i, &drawer->mask); cpumask_set_cpu(lcpu + i, &book->mask); cpumask_set_cpu(lcpu + i, &socket->mask); + cpumask_set_cpu(lcpu + i, &cpus_with_topology); smp_cpu_set_polarization(lcpu + i, tl_core->pp); } } @@ -220,7 +224,7 @@ static void update_cpu_masks(void) int cpu; for_each_possible_cpu(cpu) { - topo = &per_cpu(cpu_topology, cpu); + topo = &cpu_topology[cpu]; topo->thread_mask = cpu_thread_map(cpu); topo->core_mask = cpu_group_map(&socket_info, cpu); topo->book_mask = cpu_group_map(&book_info, cpu); @@ -231,6 +235,8 @@ static void update_cpu_masks(void) topo->socket_id = cpu; topo->book_id = cpu; topo->drawer_id = cpu; + if (cpu_present(cpu)) + cpumask_set_cpu(cpu, &cpus_with_topology); } } numa_update_cpu_topology(); @@ -241,12 +247,12 @@ void store_topology(struct sysinfo_15_1_x *info) stsi(info, 15, 1, min(topology_max_mnest, 4)); } -int arch_update_cpu_topology(void) +static int __arch_update_cpu_topology(void) { struct sysinfo_15_1_x *info = tl_info; - struct device *dev; - int cpu, rc = 0; + int rc = 0; + cpumask_clear(&cpus_with_topology); if (MACHINE_HAS_TOPOLOGY) { rc = 1; store_topology(info); @@ -255,6 +261,15 @@ int arch_update_cpu_topology(void) 
update_cpu_masks(); if (!MACHINE_HAS_TOPOLOGY) topology_update_polarization_simple(); + return rc; +} + +int arch_update_cpu_topology(void) +{ + struct device *dev; + int cpu, rc; + + rc = __arch_update_cpu_topology(); for_each_online_cpu(cpu) { dev = get_cpu_device(cpu); kobject_uevent(&dev->kobj, KOBJ_CHANGE); @@ -394,23 +409,23 @@ int topology_cpu_init(struct cpu *cpu) static const struct cpumask *cpu_thread_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).thread_mask; + return &cpu_topology[cpu].thread_mask; } const struct cpumask *cpu_coregroup_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).core_mask; + return &cpu_topology[cpu].core_mask; } static const struct cpumask *cpu_book_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).book_mask; + return &cpu_topology[cpu].book_mask; } static const struct cpumask *cpu_drawer_mask(int cpu) { - return &per_cpu(cpu_topology, cpu).drawer_mask; + return &cpu_topology[cpu].drawer_mask; } static int __init early_parse_topology(char *p) @@ -438,19 +453,20 @@ static void __init alloc_masks(struct sysinfo_15_1_x *info, nr_masks *= info->mag[TOPOLOGY_NR_MAG - offset - 1 - i]; nr_masks = max(nr_masks, 1); for (i = 0; i < nr_masks; i++) { - mask->next = kzalloc(sizeof(*mask->next), GFP_KERNEL); + mask->next = memblock_virt_alloc(sizeof(*mask->next), 8); mask = mask->next; } } -static int __init s390_topology_init(void) +void __init topology_init_early(void) { struct sysinfo_15_1_x *info; int i; + set_sched_topology(s390_topology); if (!MACHINE_HAS_TOPOLOGY) - return 0; - tl_info = (struct sysinfo_15_1_x *)__get_free_page(GFP_KERNEL); + goto out; + tl_info = memblock_virt_alloc(sizeof(*tl_info), PAGE_SIZE); info = tl_info; store_topology(info); pr_info("The CPU configuration topology of the machine is:"); @@ -460,10 +476,9 @@ static int __init s390_topology_init(void) alloc_masks(info, &socket_info, 1); alloc_masks(info, &book_info, 2); alloc_masks(info, &drawer_info, 3); - set_sched_topology(s390_topology); - return 
0; +out: + __arch_update_cpu_topology(); } -early_initcall(s390_topology_init); static int __init topology_init(void) { diff --git a/arch/s390/kernel/vdso32/clock_gettime.S b/arch/s390/kernel/vdso32/clock_gettime.S index 5eec9afbb5b5..a5769b83d90e 100644 --- a/arch/s390/kernel/vdso32/clock_gettime.S +++ b/arch/s390/kernel/vdso32/clock_gettime.S @@ -99,8 +99,27 @@ __kernel_clock_gettime: tml %r4,0x0001 /* pending update ? loop */ jnz 11b stcke 0(%r15) /* Store TOD clock */ - lm %r0,%r1,1(%r15) - s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + lm %r0,%r1,__VDSO_TS_END(%r5) /* TOD steering end time */ + s %r0,1(%r15) /* no - ts_steering_end */ + sl %r1,5(%r15) + brc 3,22f + ahi %r0,-1 +22: ltr %r0,%r0 /* past end of steering? */ + jm 24f + srdl %r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 23f + lcr %r0,%r0 /* negative TOD offset */ + lcr %r1,%r1 + je 23f + ahi %r0,-1 +23: a %r0,1(%r15) /* add TOD timestamp */ + al %r1,5(%r15) + brc 12,25f + ahi %r0,1 + j 25f +24: lm %r0,%r1,1(%r15) /* load TOD timestamp */ +25: s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,12f ahi %r0,-1 diff --git a/arch/s390/kernel/vdso32/gettimeofday.S b/arch/s390/kernel/vdso32/gettimeofday.S index 719de6186b20..63b86dceb0bf 100644 --- a/arch/s390/kernel/vdso32/gettimeofday.S +++ b/arch/s390/kernel/vdso32/gettimeofday.S @@ -31,8 +31,27 @@ __kernel_gettimeofday: tml %r4,0x0001 /* pending update ? loop */ jnz 1b stcke 0(%r15) /* Store TOD clock */ - lm %r0,%r1,1(%r15) - s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + lm %r0,%r1,__VDSO_TS_END(%r5) /* TOD steering end time */ + s %r0,1(%r15) + sl %r1,5(%r15) + brc 3,14f + ahi %r0,-1 +14: ltr %r0,%r0 /* past end of steering? */ + jm 16f + srdl %r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? 
*/ + jz 15f + lcr %r0,%r0 /* negative TOD offset */ + lcr %r1,%r1 + je 15f + ahi %r0,-1 +15: a %r0,1(%r15) /* add TOD timestamp */ + al %r1,5(%r15) + brc 12,17f + ahi %r0,1 + j 17f +16: lm %r0,%r1,1(%r15) /* load TOD timestamp */ +17: s %r0,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ sl %r1,__VDSO_XTIME_STAMP+4(%r5) brc 3,3f ahi %r0,-1 diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 61541fb93dc6..9c3b12626dba 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -83,8 +83,17 @@ __kernel_clock_gettime: tmll %r4,0x0001 /* pending update ? loop */ jnz 5b stcke 0(%r15) /* Store TOD clock */ - lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ lg %r1,1(%r15) + lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ + slgr %r0,%r1 /* now - ts_steering_end */ + ltgr %r0,%r0 /* past end of steering ? */ + jm 17f + srlg %r0,%r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? */ + jz 18f + lcgr %r0,%r0 /* negative TOD offset */ +18: algr %r1,%r0 /* add steering offset */ +17: lgf %r2,__VDSO_TK_SHIFT(%r5) /* Timekeeper shift */ sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ diff --git a/arch/s390/kernel/vdso64/gettimeofday.S b/arch/s390/kernel/vdso64/gettimeofday.S index 6ce46707663c..b02e62f3bc12 100644 --- a/arch/s390/kernel/vdso64/gettimeofday.S +++ b/arch/s390/kernel/vdso64/gettimeofday.S @@ -31,7 +31,16 @@ __kernel_gettimeofday: jnz 0b stcke 0(%r15) /* Store TOD clock */ lg %r1,1(%r15) - sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ + lg %r0,__VDSO_TS_END(%r5) /* TOD steering end time */ + slgr %r0,%r1 /* now - ts_steering_end */ + ltgr %r0,%r0 /* past end of steering ? */ + jm 6f + srlg %r0,%r0,15 /* 1 per 2^16 */ + tm __VDSO_TS_DIR+3(%r5),0x01 /* steering direction? 
*/ + jz 7f + lcgr %r0,%r0 /* negative TOD offset */ +7: algr %r1,%r0 /* add steering offset */ +6: sg %r1,__VDSO_XTIME_STAMP(%r5) /* TOD - cycle_last */ msgf %r1,__VDSO_TK_MULT(%r5) /* * tk->mult */ alg %r1,__VDSO_XTIME_NSEC(%r5) /* + tk->xtime_nsec */ lg %r0,__VDSO_XTIME_SEC(%r5) /* tk->xtime_sec */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 856e30d8463f..6b246aadf311 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -96,7 +96,6 @@ static void update_mt_scaling(void) */ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) { - struct thread_info *ti = task_thread_info(tsk); u64 timer, clock, user, system, steal; u64 user_scaled, system_scaled; @@ -119,13 +118,13 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); - user = S390_lowcore.user_timer - ti->user_timer; + user = S390_lowcore.user_timer - tsk->thread.user_timer; S390_lowcore.steal_timer -= user; - ti->user_timer = S390_lowcore.user_timer; + tsk->thread.user_timer = S390_lowcore.user_timer; - system = S390_lowcore.system_timer - ti->system_timer; + system = S390_lowcore.system_timer - tsk->thread.system_timer; S390_lowcore.steal_timer -= system; - ti->system_timer = S390_lowcore.system_timer; + tsk->thread.system_timer = S390_lowcore.system_timer; user_scaled = user; system_scaled = system; @@ -137,8 +136,10 @@ static int do_account_vtime(struct task_struct *tsk, int hardirq_offset) user_scaled = (user_scaled * mult) / div; system_scaled = (system_scaled * mult) / div; } - account_user_time(tsk, user, user_scaled); - account_system_time(tsk, hardirq_offset, system, system_scaled); + account_user_time(tsk, user); + tsk->utimescaled += user_scaled; + account_system_time(tsk, hardirq_offset, system); + tsk->stimescaled += system_scaled; steal = S390_lowcore.steal_timer; if ((s64) steal > 0) { @@ -151,15 +152,11 @@ static int 
do_account_vtime(struct task_struct *tsk, int hardirq_offset) void vtime_task_switch(struct task_struct *prev) { - struct thread_info *ti; - do_account_vtime(prev, 0); - ti = task_thread_info(prev); - ti->user_timer = S390_lowcore.user_timer; - ti->system_timer = S390_lowcore.system_timer; - ti = task_thread_info(current); - S390_lowcore.user_timer = ti->user_timer; - S390_lowcore.system_timer = ti->system_timer; + prev->thread.user_timer = S390_lowcore.user_timer; + prev->thread.system_timer = S390_lowcore.system_timer; + S390_lowcore.user_timer = current->thread.user_timer; + S390_lowcore.system_timer = current->thread.system_timer; } /* @@ -179,7 +176,6 @@ void vtime_account_user(struct task_struct *tsk) */ void vtime_account_irq_enter(struct task_struct *tsk) { - struct thread_info *ti = task_thread_info(tsk); u64 timer, system, system_scaled; timer = S390_lowcore.last_update_timer; @@ -191,9 +187,9 @@ void vtime_account_irq_enter(struct task_struct *tsk) time_after64(jiffies_64, this_cpu_read(mt_scaling_jiffies))) update_mt_scaling(); - system = S390_lowcore.system_timer - ti->system_timer; + system = S390_lowcore.system_timer - tsk->thread.system_timer; S390_lowcore.steal_timer -= system; - ti->system_timer = S390_lowcore.system_timer; + tsk->thread.system_timer = S390_lowcore.system_timer; system_scaled = system; /* Do MT utilization scaling */ if (smp_cpu_mtid) { @@ -202,7 +198,8 @@ void vtime_account_irq_enter(struct task_struct *tsk) system_scaled = (system_scaled * mult) / div; } - account_system_time(tsk, 0, system, system_scaled); + account_system_time(tsk, 0, system); + tsk->stimescaled += system_scaled; virt_timer_forward(system); } diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index be4db07f70d3..af13f1a135b6 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -415,7 +415,7 @@ static int __write_machine_check(struct kvm_vcpu *vcpu, int rc; mci.val = mchk->mcic; - /* take care of lazy register loading via 
vcpu load/put */ + /* take care of lazy register loading */ save_fpu_regs(); save_access_regs(vcpu->run->s.regs.acrs); diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index 9c7a1ecfe6bd..bec71e902be3 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -1812,22 +1812,7 @@ __u64 kvm_s390_get_cpu_timer(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) { - /* Save host register state */ - save_fpu_regs(); - vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; - vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; - - if (MACHINE_HAS_VX) - current->thread.fpu.regs = vcpu->run->s.regs.vrs; - else - current->thread.fpu.regs = vcpu->run->s.regs.fprs; - current->thread.fpu.fpc = vcpu->run->s.regs.fpc; - if (test_fp_ctl(current->thread.fpu.fpc)) - /* User space provided an invalid FPC, let's clear it */ - current->thread.fpu.fpc = 0; - save_access_regs(vcpu->arch.host_acrs); - restore_access_regs(vcpu->run->s.regs.acrs); gmap_enable(vcpu->arch.enabled_gmap); atomic_or(CPUSTAT_RUNNING, &vcpu->arch.sie_block->cpuflags); if (vcpu->arch.cputm_enabled && !is_vcpu_idle(vcpu)) @@ -1844,16 +1829,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) vcpu->arch.enabled_gmap = gmap_get_enabled(); gmap_disable(vcpu->arch.enabled_gmap); - /* Save guest register state */ - save_fpu_regs(); - vcpu->run->s.regs.fpc = current->thread.fpu.fpc; - - /* Restore host register state */ - current->thread.fpu.fpc = vcpu->arch.host_fpregs.fpc; - current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; - - save_access_regs(vcpu->run->s.regs.acrs); - restore_access_regs(vcpu->arch.host_acrs); } static void kvm_s390_vcpu_initial_reset(struct kvm_vcpu *vcpu) @@ -2243,7 +2218,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, { memcpy(&vcpu->run->s.regs.acrs, &sregs->acrs, sizeof(sregs->acrs)); memcpy(&vcpu->arch.sie_block->gcr, &sregs->crs, sizeof(sregs->crs)); - restore_access_regs(vcpu->run->s.regs.acrs); return 0; } @@ 
-2257,11 +2231,9 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) { - /* make sure the new values will be lazily loaded */ - save_fpu_regs(); if (test_fp_ctl(fpu->fpc)) return -EINVAL; - current->thread.fpu.fpc = fpu->fpc; + vcpu->run->s.regs.fpc = fpu->fpc; if (MACHINE_HAS_VX) convert_fp_to_vx((__vector128 *) vcpu->run->s.regs.vrs, (freg_t *) fpu->fprs); @@ -2279,7 +2251,7 @@ int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) (__vector128 *) vcpu->run->s.regs.vrs); else memcpy(fpu->fprs, vcpu->run->s.regs.fprs, sizeof(fpu->fprs)); - fpu->fpc = current->thread.fpu.fpc; + fpu->fpc = vcpu->run->s.regs.fpc; return 0; } @@ -2740,6 +2712,20 @@ static void sync_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) if (riccb->valid) vcpu->arch.sie_block->ecb3 |= 0x01; } + save_access_regs(vcpu->arch.host_acrs); + restore_access_regs(vcpu->run->s.regs.acrs); + /* save host (userspace) fprs/vrs */ + save_fpu_regs(); + vcpu->arch.host_fpregs.fpc = current->thread.fpu.fpc; + vcpu->arch.host_fpregs.regs = current->thread.fpu.regs; + if (MACHINE_HAS_VX) + current->thread.fpu.regs = vcpu->run->s.regs.vrs; + else + current->thread.fpu.regs = vcpu->run->s.regs.fprs; + current->thread.fpu.fpc = vcpu->run->s.regs.fpc; + if (test_fp_ctl(current->thread.fpu.fpc)) + /* User space provided an invalid FPC, let's clear it */ + current->thread.fpu.fpc = 0; kvm_run->kvm_dirty_regs = 0; } @@ -2758,6 +2744,15 @@ static void store_regs(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; + save_access_regs(vcpu->run->s.regs.acrs); + restore_access_regs(vcpu->arch.host_acrs); + /* Save guest register state */ + save_fpu_regs(); + vcpu->run->s.regs.fpc = current->thread.fpu.fpc; + /* Restore will be done lazily at return */ + current->thread.fpu.fpc 
= vcpu->arch.host_fpregs.fpc; + current->thread.fpu.regs = vcpu->arch.host_fpregs.regs; + } int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) @@ -2874,7 +2869,7 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr) { /* * The guest FPRS and ACRS are in the host FPRS/ACRS due to the lazy - * copying in vcpu load/put. Lets update our copies before we save + * switch in the run ioctl. Let's update our copies before we save * it into the save area */ save_fpu_regs(); diff --git a/arch/s390/lib/mem.S b/arch/s390/lib/mem.S index be9fa65bfac4..7422a706f310 100644 --- a/arch/s390/lib/mem.S +++ b/arch/s390/lib/mem.S @@ -8,6 +8,45 @@ #include <asm/export.h> /* + * void *memmove(void *dest, const void *src, size_t n) + */ +ENTRY(memmove) + ltgr %r4,%r4 + lgr %r1,%r2 + bzr %r14 + clgr %r2,%r3 + jnh .Lmemmove_forward + la %r5,0(%r4,%r3) + clgr %r2,%r5 + jl .Lmemmove_reverse +.Lmemmove_forward: + aghi %r4,-1 + srlg %r0,%r4,8 + ltgr %r0,%r0 + jz .Lmemmove_rest +.Lmemmove_loop: + mvc 0(256,%r1),0(%r3) + la %r1,256(%r1) + la %r3,256(%r3) + brctg %r0,.Lmemmove_loop +.Lmemmove_rest: + larl %r5,.Lmemmove_mvc + ex %r4,0(%r5) + br %r14 +.Lmemmove_reverse: + aghi %r4,-1 +.Lmemmove_reverse_loop: + ic %r0,0(%r4,%r3) + stc %r0,0(%r4,%r1) + brctg %r4,.Lmemmove_reverse_loop + ic %r0,0(%r4,%r3) + stc %r0,0(%r4,%r1) + br %r14 +.Lmemmove_mvc: + mvc 0(1,%r1),0(%r3) +EXPORT_SYMBOL(memmove) + +/* * memset implementation * * This code corresponds to the C construct below. 
We do distinguish diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c index e5f50a7d2f4e..e48a48ec24bc 100644 --- a/arch/s390/lib/spinlock.c +++ b/arch/s390/lib/spinlock.c @@ -37,15 +37,6 @@ static inline void _raw_compare_and_delay(unsigned int *lock, unsigned int old) asm(".insn rsy,0xeb0000000022,%0,0,%1" : : "d" (old), "Q" (*lock)); } -static inline int cpu_is_preempted(int cpu) -{ - if (test_cpu_flag_of(CIF_ENABLED_WAIT, cpu)) - return 0; - if (smp_vcpu_scheduled(cpu)) - return 0; - return 1; -} - void arch_spin_lock_wait(arch_spinlock_t *lp) { unsigned int cpu = SPINLOCK_LOCKVAL; @@ -62,7 +53,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp) continue; } /* First iteration: check if the lock owner is running. */ - if (first_diag && cpu_is_preempted(~owner)) { + if (first_diag && arch_vcpu_is_preempted(~owner)) { smp_yield_cpu(~owner); first_diag = 0; continue; @@ -81,7 +72,7 @@ void arch_spin_lock_wait(arch_spinlock_t *lp) * yield the CPU unconditionally. For LPAR rely on the * sense running status. */ - if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { smp_yield_cpu(~owner); first_diag = 0; } @@ -108,7 +99,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) continue; } /* Check if the lock owner is running. */ - if (first_diag && cpu_is_preempted(~owner)) { + if (first_diag && arch_vcpu_is_preempted(~owner)) { smp_yield_cpu(~owner); first_diag = 0; continue; @@ -127,7 +118,7 @@ void arch_spin_lock_wait_flags(arch_spinlock_t *lp, unsigned long flags) * yield the CPU unconditionally. For LPAR rely on the * sense running status. 
*/ - if (!MACHINE_IS_LPAR || cpu_is_preempted(~owner)) { + if (!MACHINE_IS_LPAR || arch_vcpu_is_preempted(~owner)) { smp_yield_cpu(~owner); first_diag = 0; } @@ -165,7 +156,7 @@ void _raw_read_lock_wait(arch_rwlock_t *rw) owner = 0; while (1) { if (count-- <= 0) { - if (owner && cpu_is_preempted(~owner)) + if (owner && arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); count = spin_retry; } @@ -211,7 +202,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev) owner = 0; while (1) { if (count-- <= 0) { - if (owner && cpu_is_preempted(~owner)) + if (owner && arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); count = spin_retry; } @@ -241,7 +232,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw) owner = 0; while (1) { if (count-- <= 0) { - if (owner && cpu_is_preempted(~owner)) + if (owner && arch_vcpu_is_preempted(~owner)) smp_yield_cpu(~owner); count = spin_retry; } @@ -285,7 +276,7 @@ void arch_lock_relax(unsigned int cpu) { if (!cpu) return; - if (MACHINE_IS_LPAR && !cpu_is_preempted(~cpu)) + if (MACHINE_IS_LPAR && !arch_vcpu_is_preempted(~cpu)) return; smp_yield_cpu(~cpu); } diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c index 661d9fe63c43..d1faae5cdd12 100644 --- a/arch/s390/mm/fault.c +++ b/arch/s390/mm/fault.c @@ -733,6 +733,7 @@ block: * return to userspace schedule() to block. 
*/ __set_current_state(TASK_UNINTERRUPTIBLE); set_tsk_need_resched(tsk); + set_preempt_need_resched(); } } out: diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3ba622702ce4..ec1f0dedb948 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); - radix_tree_replace_slot(slot, rmap); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 1848292766ef..45becc8a44ec 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -34,7 +34,7 @@ static void __ref *vmem_alloc_pages(unsigned int order) if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return alloc_bootmem_align(size, size); + return (void *) memblock_alloc(size, size); } static inline pud_t *vmem_pud_alloc(void) @@ -61,17 +61,16 @@ pmd_t *vmem_pmd_alloc(void) pte_t __ref *vmem_pte_alloc(void) { + unsigned long size = PTRS_PER_PTE * sizeof(pte_t); pte_t *pte; if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = alloc_bootmem_align(PTRS_PER_PTE * sizeof(pte_t), - PTRS_PER_PTE * sizeof(pte_t)); + pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; - clear_table((unsigned long *) pte, _PAGE_INVALID, - PTRS_PER_PTE * sizeof(pte_t)); + clear_table((unsigned long *) pte, _PAGE_INVALID, size); return pte; } diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index bee281f3163d..167b31b186c1 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -981,7 +981,7 @@ static noinline int bpf_jit_insn(struct bpf_jit *jit, struct bpf_prog *fp, int i EMIT2(0x0d00, REG_14, REG_W1); /* lgr %b0,%r2: load return value into %b0 */ EMIT4(0xb9040000, BPF_REG_0, REG_2); - 
if (bpf_helper_changes_skb_data((void *)func)) { + if (bpf_helper_changes_pkt_data((void *)func)) { jit->seen |= SEEN_SKB_CHANGE; /* lg %b1,ST_OFF_SKBP(%r15) */ EMIT6_DISP_LH(0xe3000000, 0x0004, BPF_REG_1, REG_0, diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c index 37e0bb835516..cfd08384f0ab 100644 --- a/arch/s390/numa/mode_emu.c +++ b/arch/s390/numa/mode_emu.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/cpumask.h> #include <linux/memblock.h> +#include <linux/bootmem.h> #include <linux/node.h> #include <linux/memory.h> #include <linux/slab.h> @@ -307,13 +308,11 @@ fail: /* * Allocate and initialize core to node mapping */ -static void create_core_to_node_map(void) +static void __ref create_core_to_node_map(void) { int i; - emu_cores = kzalloc(sizeof(*emu_cores), GFP_KERNEL); - if (emu_cores == NULL) - panic("Could not allocate cores to node memory"); + emu_cores = memblock_virt_alloc(sizeof(*emu_cores), 8); for (i = 0; i < ARRAY_SIZE(emu_cores->to_node_id); i++) emu_cores->to_node_id[i] = NODE_ID_FREE; } @@ -354,13 +353,13 @@ static struct toptree *toptree_from_topology(void) phys = toptree_new(TOPTREE_ID_PHYS, 1); - for_each_online_cpu(cpu) { - top = &per_cpu(cpu_topology, cpu); + for_each_cpu(cpu, &cpus_with_topology) { + top = &cpu_topology[cpu]; node = toptree_get_child(phys, 0); drawer = toptree_get_child(node, top->drawer_id); book = toptree_get_child(drawer, top->book_id); mc = toptree_get_child(book, top->socket_id); - core = toptree_get_child(mc, top->core_id); + core = toptree_get_child(mc, smp_get_base_cpu(cpu)); if (!drawer || !book || !mc || !core) panic("NUMA emulation could not allocate memory"); cpumask_set_cpu(cpu, &core->mask); @@ -378,7 +377,7 @@ static void topology_add_core(struct toptree *core) int cpu; for_each_cpu(cpu, &core->mask) { - top = &per_cpu(cpu_topology, cpu); + top = &cpu_topology[cpu]; cpumask_copy(&top->thread_mask, &core->mask); cpumask_copy(&top->core_mask, &core_mc(core)->mask); 
cpumask_copy(&top->book_mask, &core_book(core)->mask); @@ -425,6 +424,27 @@ static void print_node_to_core_map(void) } } +static void pin_all_possible_cpus(void) +{ + int core_id, node_id, cpu; + static int initialized; + + if (initialized) + return; + print_node_to_core_map(); + node_id = 0; + for_each_possible_cpu(cpu) { + core_id = smp_get_base_cpu(cpu); + if (emu_cores->to_node_id[core_id] != NODE_ID_FREE) + continue; + pin_core_to_node(core_id, node_id); + cpu_topology[cpu].node_id = node_id; + node_id = (node_id + 1) % emu_nodes; + } + print_node_to_core_map(); + initialized = 1; +} + /* * Transfer physical topology into a NUMA topology and modify CPU masks * according to the NUMA topology. @@ -442,7 +462,7 @@ static void emu_update_cpu_topology(void) toptree_free(phys); toptree_to_topology(numa); toptree_free(numa); - print_node_to_core_map(); + pin_all_possible_cpus(); } /* diff --git a/arch/s390/numa/toptree.c b/arch/s390/numa/toptree.c index 902d350d859a..26f622b1cd11 100644 --- a/arch/s390/numa/toptree.c +++ b/arch/s390/numa/toptree.c @@ -7,6 +7,7 @@ */ #include <linux/kernel.h> +#include <linux/bootmem.h> #include <linux/cpumask.h> #include <linux/list.h> #include <linux/list_sort.h> @@ -25,10 +26,14 @@ * RETURNS: * Pointer to the new tree node or NULL on error */ -struct toptree *toptree_alloc(int level, int id) +struct toptree __ref *toptree_alloc(int level, int id) { - struct toptree *res = kzalloc(sizeof(struct toptree), GFP_KERNEL); + struct toptree *res; + if (slab_is_available()) + res = kzalloc(sizeof(*res), GFP_KERNEL); + else + res = memblock_virt_alloc(sizeof(*res), 8); if (!res) return res; @@ -65,7 +70,7 @@ static void toptree_remove(struct toptree *cand) * cleanly using toptree_remove. Possible children are freed * recursively. In the end @cand itself is freed. 
*/ -void toptree_free(struct toptree *cand) +void __ref toptree_free(struct toptree *cand) { struct toptree *child, *tmp; @@ -73,7 +78,10 @@ void toptree_free(struct toptree *cand) toptree_remove(cand); toptree_for_each_child_safe(child, tmp, cand) toptree_free(child); - kfree(cand); + if (slab_is_available()) + kfree(cand); + else + memblock_free_early((unsigned long)cand, sizeof(*cand)); } /** diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 15ffc19c8c0c..64e1734bebb7 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -722,6 +722,11 @@ struct dev_pm_ops pcibios_pm_ops = { static int zpci_alloc_domain(struct zpci_dev *zdev) { + if (zpci_unique_uid) { + zdev->domain = (u16) zdev->uid; + return 0; + } + spin_lock(&zpci_domain_lock); zdev->domain = find_first_zero_bit(zpci_domain, ZPCI_NR_DEVICES); if (zdev->domain == ZPCI_NR_DEVICES) { @@ -735,6 +740,9 @@ static int zpci_alloc_domain(struct zpci_dev *zdev) static void zpci_free_domain(struct zpci_dev *zdev) { + if (zpci_unique_uid) + return; + spin_lock(&zpci_domain_lock); clear_bit(zdev->domain, zpci_domain); spin_unlock(&zpci_domain_lock); diff --git a/arch/s390/pci/pci_clp.c b/arch/s390/pci/pci_clp.c index 1a4512c8544a..e3ef63b36b5a 100644 --- a/arch/s390/pci/pci_clp.c +++ b/arch/s390/pci/pci_clp.c @@ -22,6 +22,8 @@ #include <asm/clp.h> #include <uapi/asm/clp.h> +bool zpci_unique_uid; + static inline void zpci_err_clp(unsigned int rsp, int rc) { struct { @@ -315,6 +317,7 @@ static int clp_list_pci(struct clp_req_rsp_list_pci *rrb, goto out; } + zpci_unique_uid = rrb->response.uid_checking; WARN_ON_ONCE(rrb->response.entry_size != sizeof(struct clp_fh_list_entry)); diff --git a/arch/s390/pci/pci_debug.c b/arch/s390/pci/pci_debug.c index 38993b156924..c2f786f0ea06 100644 --- a/arch/s390/pci/pci_debug.c +++ b/arch/s390/pci/pci_debug.c @@ -69,7 +69,7 @@ static void pci_sw_counter_show(struct seq_file *m) int i; for (i = 0; i < ARRAY_SIZE(pci_sw_names); i++, counter++) - seq_printf(m, 
"%26s:\t%llu\n", pci_sw_names[i], + seq_printf(m, "%26s:\t%lu\n", pci_sw_names[i], atomic64_read(counter)); } diff --git a/arch/s390/pci/pci_dma.c b/arch/s390/pci/pci_dma.c index 6b2f72f523b9..1d7a9c71944a 100644 --- a/arch/s390/pci/pci_dma.c +++ b/arch/s390/pci/pci_dma.c @@ -181,14 +181,17 @@ static int __dma_purge_tlb(struct zpci_dev *zdev, dma_addr_t dma_addr, /* * With zdev->tlb_refresh == 0, rpcit is not required to establish new * translations when previously invalid translation-table entries are - * validated. With lazy unmap, it also is skipped for previously valid + * validated. With lazy unmap, rpcit is skipped for previously valid * entries, but a global rpcit is then required before any address can * be re-used, i.e. after each iommu bitmap wrap-around. */ - if (!zdev->tlb_refresh && - (!s390_iommu_strict || - ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID))) - return 0; + if ((flags & ZPCI_PTE_VALID_MASK) == ZPCI_PTE_VALID) { + if (!zdev->tlb_refresh) + return 0; + } else { + if (!s390_iommu_strict) + return 0; + } return zpci_refresh_trans((u64) zdev->fh << 32, dma_addr, PAGE_ALIGN(size)); @@ -257,7 +260,7 @@ static dma_addr_t dma_alloc_address(struct device *dev, int size) spin_lock_irqsave(&zdev->iommu_bitmap_lock, flags); offset = __dma_alloc_iommu(dev, zdev->next_bit, size); if (offset == -1) { - if (!zdev->tlb_refresh && !s390_iommu_strict) { + if (!s390_iommu_strict) { /* global flush before DMA addresses are reused */ if (zpci_refresh_global(zdev)) goto out_error; @@ -292,7 +295,7 @@ static void dma_free_address(struct device *dev, dma_addr_t dma_addr, int size) if (!zdev->iommu_bitmap) goto out; - if (zdev->tlb_refresh || s390_iommu_strict) + if (s390_iommu_strict) bitmap_clear(zdev->iommu_bitmap, offset, size); else bitmap_set(zdev->lazy_bitmap, offset, size); @@ -388,8 +391,6 @@ static void *s390_dma_alloc(struct device *dev, size_t size, return NULL; pa = page_to_phys(page); - memset((void *) pa, 0, size); - map = s390_dma_map_pages(dev, 
page, 0, size, DMA_BIDIRECTIONAL, 0); if (dma_mapping_error(dev, map)) { free_pages(pa, get_order(size)); @@ -419,6 +420,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, size_t size, dma_addr_t *handle, enum dma_data_direction dir) { + unsigned long nr_pages = PAGE_ALIGN(size) >> PAGE_SHIFT; struct zpci_dev *zdev = to_zpci(to_pci_dev(dev)); dma_addr_t dma_addr_base, dma_addr; int flags = ZPCI_PTE_VALID; @@ -426,8 +428,7 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, unsigned long pa = 0; int ret; - size = PAGE_ALIGN(size); - dma_addr_base = dma_alloc_address(dev, size >> PAGE_SHIFT); + dma_addr_base = dma_alloc_address(dev, nr_pages); if (dma_addr_base == DMA_ERROR_CODE) return -ENOMEM; @@ -436,26 +437,27 @@ static int __s390_dma_map_sg(struct device *dev, struct scatterlist *sg, flags |= ZPCI_TABLE_PROTECTED; for (s = sg; dma_addr < dma_addr_base + size; s = sg_next(s)) { - pa = page_to_phys(sg_page(s)) + s->offset; - ret = __dma_update_trans(zdev, pa, dma_addr, s->length, flags); + pa = page_to_phys(sg_page(s)); + ret = __dma_update_trans(zdev, pa, dma_addr, + s->offset + s->length, flags); if (ret) goto unmap; - dma_addr += s->length; + dma_addr += s->offset + s->length; } ret = __dma_purge_tlb(zdev, dma_addr_base, size, flags); if (ret) goto unmap; *handle = dma_addr_base; - atomic64_add(size >> PAGE_SHIFT, &zdev->mapped_pages); + atomic64_add(nr_pages, &zdev->mapped_pages); return ret; unmap: dma_update_trans(zdev, 0, dma_addr_base, dma_addr - dma_addr_base, ZPCI_PTE_INVALID); - dma_free_address(dev, dma_addr_base, size >> PAGE_SHIFT); + dma_free_address(dev, dma_addr_base, nr_pages); zpci_err("map error:\n"); zpci_err_dma(ret, pa); return ret; @@ -564,7 +566,7 @@ int zpci_dma_init_device(struct zpci_dev *zdev) rc = -ENOMEM; goto free_dma_table; } - if (!zdev->tlb_refresh && !s390_iommu_strict) { + if (!s390_iommu_strict) { zdev->lazy_bitmap = vzalloc(zdev->iommu_pages / 8); if (!zdev->lazy_bitmap) { rc 
= -ENOMEM; diff --git a/arch/s390/tools/Makefile b/arch/s390/tools/Makefile index 6d9814c9df2b..4b5e1e499527 100644 --- a/arch/s390/tools/Makefile +++ b/arch/s390/tools/Makefile @@ -9,7 +9,5 @@ define filechk_facilities.h $(obj)/gen_facilities endef -$(obj)/gen_facilities.o: $(srctree)/arch/s390/tools/gen_facilities.c - include/generated/facilities.h: $(obj)/gen_facilities FORCE $(call filechk,facilities.h) diff --git a/arch/s390/tools/gen_facilities.c b/arch/s390/tools/gen_facilities.c index fe4e6c910dd7..8cc53b1e6d03 100644 --- a/arch/s390/tools/gen_facilities.c +++ b/arch/s390/tools/gen_facilities.c @@ -7,13 +7,83 @@ * */ -#define S390_GEN_FACILITIES_C - #include <strings.h> #include <string.h> #include <stdlib.h> #include <stdio.h> -#include <asm/facilities_src.h> + +struct facility_def { + char *name; + int *bits; +}; + +static struct facility_def facility_defs[] = { + { + /* + * FACILITIES_ALS contains the list of facilities that are + * required to run a kernel that is compiled e.g. with + * -march=<machine>. + */ + .name = "FACILITIES_ALS", + .bits = (int[]){ +#ifdef CONFIG_HAVE_MARCH_Z900_FEATURES + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z990_FEATURES + 18, /* long displacement facility */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z9_109_FEATURES + 7, /* stfle */ + 17, /* message security assist */ + 21, /* extended-immediate facility */ + 25, /* store clock fast */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z10_FEATURES + 27, /* mvcos */ + 32, /* compare and swap and store */ + 33, /* compare and swap and store 2 */ + 34, /* general extension facility */ + 35, /* execute extensions */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z196_FEATURES + 45, /* fast-BCR, etc. */ +#endif +#ifdef CONFIG_HAVE_MARCH_ZEC12_FEATURES + 49, /* misc-instruction-extensions */ + 52, /* interlocked facility 2 */ +#endif +#ifdef CONFIG_HAVE_MARCH_Z13_FEATURES + 53, /* load-and-zero-rightmost-byte, etc. 
*/ +#endif + -1 /* END */ + } + }, + { + .name = "FACILITIES_KVM", + .bits = (int[]){ + 0, /* N3 instructions */ + 1, /* z/Arch mode installed */ + 2, /* z/Arch mode active */ + 3, /* DAT-enhancement */ + 4, /* idte segment table */ + 5, /* idte region table */ + 6, /* ASN-and-LX reuse */ + 7, /* stfle */ + 8, /* enhanced-DAT 1 */ + 9, /* sense-running-status */ + 10, /* conditional sske */ + 13, /* ipte-range */ + 14, /* nonquiescing key-setting */ + 73, /* transactional execution */ + 75, /* access-exception-fetch/store indication */ + 76, /* msa extension 3 */ + 77, /* msa extension 4 */ + 78, /* enhanced-DAT 2 */ + -1 /* END */ + } + }, +}; static void print_facility_list(struct facility_def *def) { diff --git a/arch/score/include/asm/mutex.h b/arch/score/include/asm/mutex.h deleted file mode 100644 index 10d48fe4db97..000000000000 --- a/arch/score/include/asm/mutex.h +++ /dev/null @@ -1,6 +0,0 @@ -#ifndef _ASM_SCORE_MUTEX_H -#define _ASM_SCORE_MUTEX_H - -#include <asm-generic/mutex-dec.h> - -#endif /* _ASM_SCORE_MUTEX_H */ diff --git a/arch/score/include/asm/processor.h b/arch/score/include/asm/processor.h index 851f441991d2..d9a922d8711b 100644 --- a/arch/score/include/asm/processor.h +++ b/arch/score/include/asm/processor.h @@ -24,7 +24,6 @@ extern unsigned long get_wchan(struct task_struct *p); #define current_text_addr() ({ __label__ _l; _l: &&_l; }) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #define release_thread(thread) do {} while (0) /* diff --git a/arch/sh/include/asm/mutex-llsc.h b/arch/sh/include/asm/mutex-llsc.h deleted file mode 100644 index dad29b687bd3..000000000000 --- a/arch/sh/include/asm/mutex-llsc.h +++ /dev/null @@ -1,109 +0,0 @@ -/* - * arch/sh/include/asm/mutex-llsc.h - * - * SH-4A optimized mutex locking primitives - * - * Please look into asm-generic/mutex-xchg.h for a formal definition. 
- */ -#ifndef __ASM_SH_MUTEX_LLSC_H -#define __ASM_SH_MUTEX_LLSC_H - -/* - * Attempting to lock a mutex on SH4A is done like in ARMv6+ architecure. - * with a bastardized atomic decrement (it is not a reliable atomic decrement - * but it satisfies the defined semantics for our purpose, while being - * smaller and faster than a real atomic decrement or atomic swap. - * The idea is to attempt decrementing the lock value only once. If once - * decremented it isn't zero, or if its store-back fails due to a dispute - * on the exclusive store, we simply bail out immediately through the slow - * path where the lock will be reattempted until it succeeds. - */ -static inline void -__mutex_fastpath_lock(atomic_t *count, void (*fail_fn)(atomic_t *)) -{ - int __done, __res; - - __asm__ __volatile__ ( - "movli.l @%2, %0 \n" - "add #-1, %0 \n" - "movco.l %0, @%2 \n" - "movt %1 \n" - : "=&z" (__res), "=&r" (__done) - : "r" (&(count)->counter) - : "t"); - - if (unlikely(!__done || __res != 0)) - fail_fn(count); -} - -static inline int -__mutex_fastpath_lock_retval(atomic_t *count) -{ - int __done, __res; - - __asm__ __volatile__ ( - "movli.l @%2, %0 \n" - "add #-1, %0 \n" - "movco.l %0, @%2 \n" - "movt %1 \n" - : "=&z" (__res), "=&r" (__done) - : "r" (&(count)->counter) - : "t"); - - if (unlikely(!__done || __res != 0)) - __res = -1; - - return __res; -} - -static inline void -__mutex_fastpath_unlock(atomic_t *count, void (*fail_fn)(atomic_t *)) -{ - int __done, __res; - - __asm__ __volatile__ ( - "movli.l @%2, %0 \n\t" - "add #1, %0 \n\t" - "movco.l %0, @%2 \n\t" - "movt %1 \n\t" - : "=&z" (__res), "=&r" (__done) - : "r" (&(count)->counter) - : "t"); - - if (unlikely(!__done || __res <= 0)) - fail_fn(count); -} - -/* - * If the unlock was done on a contended lock, or if the unlock simply fails - * then the mutex remains locked. 
- */ -#define __mutex_slowpath_needs_to_unlock() 1 - -/* - * For __mutex_fastpath_trylock we do an atomic decrement and check the - * result and put it in the __res variable. - */ -static inline int -__mutex_fastpath_trylock(atomic_t *count, int (*fail_fn)(atomic_t *)) -{ - int __res, __orig; - - __asm__ __volatile__ ( - "1: movli.l @%2, %0 \n\t" - "dt %0 \n\t" - "movco.l %0,@%2 \n\t" - "bf 1b \n\t" - "cmp/eq #0,%0 \n\t" - "bt 2f \n\t" - "mov #0, %1 \n\t" - "bf 3f \n\t" - "2: mov #1, %1 \n\t" - "3: " - : "=&z" (__orig), "=&r" (__res) - : "r" (&count->counter) - : "t"); - - return __res; -} -#endif /* __ASM_SH_MUTEX_LLSC_H */ diff --git a/arch/sh/include/asm/mutex.h b/arch/sh/include/asm/mutex.h deleted file mode 100644 index d8e37716a4a0..000000000000 --- a/arch/sh/include/asm/mutex.h +++ /dev/null @@ -1,12 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. 
(see asm-generic/mutex-xchg.h for details) - */ -#if defined(CONFIG_CPU_SH4A) -#include <asm/mutex-llsc.h> -#else -#include <asm-generic/mutex-dec.h> -#endif diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h index f9a09942a32d..5addd69f70ef 100644 --- a/arch/sh/include/asm/processor.h +++ b/arch/sh/include/asm/processor.h @@ -97,7 +97,6 @@ extern struct sh_cpuinfo cpu_data[]; #define cpu_sleep() __asm__ __volatile__ ("sleep" : : : "memory") #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() void default_idle(void); void stop_this_cpu(void *); diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 025cdb1032f6..46e0d635e36f 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) tlb->end = address + PAGE_SIZE; } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. 
When we're doing a munmap, @@ -115,18 +118,18 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { return tlb_remove_page(tlb, page); } +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 165ecdd24d22..cf4034c66362 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -155,6 +155,9 @@ config PGTABLE_LEVELS default 4 if 64BIT default 3 +config ARCH_SUPPORTS_UPROBES + def_bool y if SPARC64 + source "init/Kconfig" source "kernel/Kconfig.freezer" diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig index 3583d676a916..b2e650d1764f 100644 --- a/arch/sparc/configs/sparc64_defconfig +++ b/arch/sparc/configs/sparc64_defconfig @@ -213,6 +213,7 @@ CONFIG_SCHEDSTATS=y # CONFIG_RCU_CPU_STALL_DETECTOR is not set CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_UPROBE_EVENTS=y CONFIG_KEYS=y CONFIG_CRYPTO_NULL=m CONFIG_CRYPTO_TEST=m diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index cfc918067f80..0569bfac4afb 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -15,7 +15,6 @@ generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += module.h -generic-y += mutex.h generic-y += preempt.h generic-y += rwsem.h generic-y += serial.h diff --git 
a/arch/sparc/include/asm/kdebug_64.h b/arch/sparc/include/asm/kdebug_64.h index 04465de8f3b5..867286bf7b1a 100644 --- a/arch/sparc/include/asm/kdebug_64.h +++ b/arch/sparc/include/asm/kdebug_64.h @@ -10,6 +10,8 @@ enum die_val { DIE_OOPS = 1, DIE_DEBUG, /* ta 0x70 */ DIE_DEBUG_2, /* ta 0x71 */ + DIE_BPT, /* ta 0x73 */ + DIE_SSTEP, /* ta 0x74 */ DIE_DIE, DIE_TRAP, DIE_TRAP_TL1, diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 1fb317fbc0b3..314b66851348 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -826,7 +826,7 @@ static inline unsigned long __pmd_page(pmd_t pmd) #define pgd_page_vaddr(pgd) \ ((unsigned long) __va(pgd_val(pgd))) #define pgd_present(pgd) (pgd_val(pgd) != 0U) -#define pgd_clear(pgdp) (pgd_val(*(pgd)) = 0UL) +#define pgd_clear(pgdp) (pgd_val(*(pgdp)) = 0UL) static inline unsigned long pud_large(pud_t pud) { diff --git a/arch/sparc/include/asm/processor_32.h b/arch/sparc/include/asm/processor_32.h index 812fd08f3e62..365d4cb267b4 100644 --- a/arch/sparc/include/asm/processor_32.h +++ b/arch/sparc/include/asm/processor_32.h @@ -119,7 +119,6 @@ extern struct task_struct *last_task_used_math; int do_mathemu(struct pt_regs *regs, struct task_struct *fpt); #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() extern void (*sparc_idle)(void); diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h index ce2595c89471..6448cfc8292f 100644 --- a/arch/sparc/include/asm/processor_64.h +++ b/arch/sparc/include/asm/processor_64.h @@ -216,7 +216,6 @@ unsigned long get_wchan(struct task_struct *task); "nop\n\t" \ ".previous" \ ::: "memory") -#define cpu_relax_lowlatency() cpu_relax() /* Prefetch support. This is tuned for UltraSPARC-III and later. 
* UltraSPARC-I will treat these as nops, and UltraSPARC-II has diff --git a/arch/sparc/include/asm/ptrace.h b/arch/sparc/include/asm/ptrace.h index bac6a946ee00..ca57f08bd3db 100644 --- a/arch/sparc/include/asm/ptrace.h +++ b/arch/sparc/include/asm/ptrace.h @@ -61,7 +61,10 @@ extern union global_cpu_snapshot global_cpu_snapshot[NR_CPUS]; #define force_successful_syscall_return() set_thread_noerror(1) #define user_mode(regs) (!((regs)->tstate & TSTATE_PRIV)) #define instruction_pointer(regs) ((regs)->tpc) -#define instruction_pointer_set(regs, val) ((regs)->tpc = (val)) +#define instruction_pointer_set(regs, val) do { \ + (regs)->tpc = (val); \ + (regs)->tnpc = (val)+4; \ + } while (0) #define user_stack_pointer(regs) ((regs)->u_regs[UREG_FP]) static inline int is_syscall_success(struct pt_regs *regs) { @@ -77,6 +80,36 @@ unsigned long profile_pc(struct pt_regs *); #else #define profile_pc(regs) instruction_pointer(regs) #endif + +#define MAX_REG_OFFSET (offsetof(struct pt_regs, magic)) + +extern int regs_query_register_offset(const char *name); + +/** + * regs_get_register() - get register value from its offset + * @regs: pt_regs from which register value is gotten + * @offset: offset number of the register. + * + * regs_get_register returns the value of a register whose + * offset from @regs. The @offset is the offset of the register + * in struct pt_regs. If @offset is bigger than MAX_REG_OFFSET, + * this returns 0. + */ +static inline unsigned long regs_get_register(struct pt_regs *regs, + unsigned long offset) +{ + if (unlikely(offset >= MAX_REG_OFFSET)) + return 0; + if (offset == PT_V9_Y) + return *(unsigned int *)((unsigned long)regs + offset); + return *(unsigned long *)((unsigned long)regs + offset); +} + +/* Valid only for Kernel mode traps. 
*/ +static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) +{ + return regs->u_regs[UREG_I6]; +} #else /* __ASSEMBLY__ */ #endif /* __ASSEMBLY__ */ #else /* (defined(__sparc__) && defined(__arch64__)) */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 3d7b925f6516..38a24f257b85 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -180,7 +180,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ /* flag bit 4 is available */ #define TIF_UNALIGNED 5 /* allowed to do unaligned accesses */ -/* flag bit 6 is available */ +#define TIF_UPROBE 6 /* breakpointed or singlestepped */ #define TIF_32BIT 7 /* 32-bit binary */ #define TIF_NOHZ 8 /* in adaptive nohz mode */ #define TIF_SECCOMP 9 /* secure computing */ @@ -199,6 +199,7 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_SIGPENDING (1<<TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1<<TIF_NEED_RESCHED) #define _TIF_UNALIGNED (1<<TIF_UNALIGNED) +#define _TIF_UPROBE (1<<TIF_UPROBE) #define _TIF_32BIT (1<<TIF_32BIT) #define _TIF_NOHZ (1<<TIF_NOHZ) #define _TIF_SECCOMP (1<<TIF_SECCOMP) @@ -209,7 +210,8 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_USER_WORK_MASK ((0xff << TI_FLAG_WSAVED_SHIFT) | \ _TIF_DO_NOTIFY_RESUME_MASK | \ _TIF_NEED_RESCHED) -#define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | _TIF_SIGPENDING) +#define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ + _TIF_SIGPENDING | _TIF_UPROBE) #define is_32bit_task() (test_thread_flag(TIF_32BIT)) diff --git a/arch/sparc/include/asm/topology_64.h b/arch/sparc/include/asm/topology_64.h index 7b4898a36eee..225543000122 100644 --- a/arch/sparc/include/asm/topology_64.h +++ b/arch/sparc/include/asm/topology_64.h @@ -4,6 +4,7 @@ #ifdef CONFIG_NUMA #include <asm/mmzone.h> +#include <asm/cpudata.h> static inline int 
cpu_to_node(int cpu) { diff --git a/arch/sparc/include/asm/ttable.h b/arch/sparc/include/asm/ttable.h index 781b9f1dbdc2..82e7df296abc 100644 --- a/arch/sparc/include/asm/ttable.h +++ b/arch/sparc/include/asm/ttable.h @@ -186,6 +186,12 @@ #define KPROBES_TRAP(lvl) TRAP_ARG(bad_trap, lvl) #endif +#ifdef CONFIG_UPROBES +#define UPROBES_TRAP(lvl) TRAP_ARG(uprobe_trap, lvl) +#else +#define UPROBES_TRAP(lvl) TRAP_ARG(bad_trap, lvl) +#endif + #ifdef CONFIG_KGDB #define KGDB_TRAP(lvl) TRAP_IRQ(kgdb_trap, lvl) #else diff --git a/arch/sparc/include/asm/uprobes.h b/arch/sparc/include/asm/uprobes.h new file mode 100644 index 000000000000..f87aae5a908e --- /dev/null +++ b/arch/sparc/include/asm/uprobes.h @@ -0,0 +1,59 @@ +#ifndef _ASM_UPROBES_H +#define _ASM_UPROBES_H +/* + * User-space Probes (UProbes) for sparc + * + * Copyright (C) 2013 Oracle, Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Authors: + * Jose E. 
Marchesi <jose.marchesi@oracle.com> + * Eric Saint Etienne <eric.saint.etienne@oracle.com> + */ + +typedef u32 uprobe_opcode_t; + +#define MAX_UINSN_BYTES 4 +#define UPROBE_XOL_SLOT_BYTES (MAX_UINSN_BYTES * 2) + +#define UPROBE_SWBP_INSN_SIZE 4 +#define UPROBE_SWBP_INSN 0x91d02073 /* ta 0x73 */ +#define UPROBE_STP_INSN 0x91d02074 /* ta 0x74 */ + +#define ANNUL_BIT (1 << 29) + +struct arch_uprobe { + union { + u8 insn[MAX_UINSN_BYTES]; + u32 ixol; + }; +}; + +struct arch_uprobe_task { + u32 saved_tpc; + u32 saved_tnpc; +}; + +struct task_struct; +struct notifier_block; + +extern int arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); +extern int arch_uprobe_pre_xol(struct arch_uprobe *aup, struct pt_regs *regs); +extern int arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs); +extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk); +extern int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data); +extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs); + +#endif /* _ASM_UPROBES_H */ diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h index 31aede3af088..a25dc32f5d6a 100644 --- a/arch/sparc/include/uapi/asm/socket.h +++ b/arch/sparc/include/uapi/asm/socket.h @@ -86,6 +86,8 @@ #define SO_CNX_ADVICE 0x0037 +#define SCM_TIMESTAMPING_OPT_STATS 0x0038 + /* Security levels - as per NRL IPv6 - don't actually do anything */ #define SO_SECURITY_AUTHENTICATION 0x5001 #define SO_SECURITY_ENCRYPTION_TRANSPORT 0x5002 diff --git a/arch/sparc/kernel/Makefile b/arch/sparc/kernel/Makefile index fa3c02d41138..aac609889ee4 100644 --- a/arch/sparc/kernel/Makefile +++ b/arch/sparc/kernel/Makefile @@ -116,4 +116,5 @@ obj-$(CONFIG_COMPAT) += $(audit--y) pc--$(CONFIG_PERF_EVENTS) := perf_event.o obj-$(CONFIG_SPARC64) += $(pc--y) +obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_SPARC64) += jump_label.o diff --git 
a/arch/sparc/kernel/leon_kernel.c b/arch/sparc/kernel/leon_kernel.c index 33cd171d933e..afcdd5e4f43f 100644 --- a/arch/sparc/kernel/leon_kernel.c +++ b/arch/sparc/kernel/leon_kernel.c @@ -349,37 +349,37 @@ void __init leon_init_timers(void) /* Find GPTIMER Timer Registers base address otherwise bail out. */ nnp = rootnp; - do { - np = of_find_node_by_name(nnp, "GAISLER_GPTIMER"); - if (!np) { - np = of_find_node_by_name(nnp, "01_011"); - if (!np) - goto bad; - } - ampopts = 0; - pp = of_find_property(np, "ampopts", &len); - if (pp) { - ampopts = *(int *)pp->value; - if (ampopts == 0) { - /* Skip this instance, resource already - * allocated by other OS */ - nnp = np; - continue; - } +retry: + np = of_find_node_by_name(nnp, "GAISLER_GPTIMER"); + if (!np) { + np = of_find_node_by_name(nnp, "01_011"); + if (!np) + goto bad; + } + + ampopts = 0; + pp = of_find_property(np, "ampopts", &len); + if (pp) { + ampopts = *(int *)pp->value; + if (ampopts == 0) { + /* Skip this instance, resource already + * allocated by other OS */ + nnp = np; + goto retry; } + } + + /* Select Timer-Instance on Timer Core. Default is zero */ + leon3_gptimer_idx = ampopts & 0x7; - /* Select Timer-Instance on Timer Core. 
Default is zero */ - leon3_gptimer_idx = ampopts & 0x7; - - pp = of_find_property(np, "reg", &len); - if (pp) - leon3_gptimer_regs = *(struct leon3_gptimer_regs_map **) - pp->value; - pp = of_find_property(np, "interrupts", &len); - if (pp) - leon3_gptimer_irq = *(unsigned int *)pp->value; - } while (0); + pp = of_find_property(np, "reg", &len); + if (pp) + leon3_gptimer_regs = *(struct leon3_gptimer_regs_map **) + pp->value; + pp = of_find_property(np, "interrupts", &len); + if (pp) + leon3_gptimer_irq = *(unsigned int *)pp->value; if (!(leon3_gptimer_regs && leon3_irqctrl_regs && leon3_gptimer_irq)) goto bad; diff --git a/arch/sparc/kernel/pci_sun4v.c b/arch/sparc/kernel/pci_sun4v.c index 06981cc716b6..f4daccd12bf5 100644 --- a/arch/sparc/kernel/pci_sun4v.c +++ b/arch/sparc/kernel/pci_sun4v.c @@ -242,6 +242,7 @@ static void *dma_4v_alloc_coherent(struct device *dev, size_t size, return ret; iommu_map_fail: + local_irq_restore(flags); iommu_tbl_range_free(tbl, *dma_addrp, npages, IOMMU_ERROR_CODE); range_alloc_fail: @@ -414,6 +415,7 @@ bad: return DMA_ERROR_CODE; iommu_map_fail: + local_irq_restore(flags); iommu_tbl_range_free(tbl, bus_addr, npages, IOMMU_ERROR_CODE); return DMA_ERROR_CODE; } @@ -478,11 +480,10 @@ static int dma_4v_map_sg(struct device *dev, struct scatterlist *sglist, BUG_ON(direction == DMA_NONE); iommu = dev->archdata.iommu; - atu = iommu->atu; - if (nelems == 0 || !iommu) return 0; - + atu = iommu->atu; + prot = HV_PCI_MAP_ATTR_READ; if (direction != DMA_TO_DEVICE) prot |= HV_PCI_MAP_ATTR_WRITE; diff --git a/arch/sparc/kernel/power.c b/arch/sparc/kernel/power.c index 1836cb965ff8..4b60f385c98f 100644 --- a/arch/sparc/kernel/power.c +++ b/arch/sparc/kernel/power.c @@ -67,9 +67,4 @@ static struct platform_driver power_driver = { }, }; -static int __init power_init(void) -{ - return platform_driver_register(&power_driver); -} - -device_initcall(power_init); +builtin_platform_driver(power_driver); diff --git a/arch/sparc/kernel/ptrace_64.c 
b/arch/sparc/kernel/ptrace_64.c index ac082dd8c67d..96494b2ef41f 100644 --- a/arch/sparc/kernel/ptrace_64.c +++ b/arch/sparc/kernel/ptrace_64.c @@ -46,6 +46,43 @@ /* #define ALLOW_INIT_TRACING */ +struct pt_regs_offset { + const char *name; + int offset; +}; + +#define REG_OFFSET_NAME(n, r) \ + {.name = n, .offset = (PT_V9_##r)} +#define REG_OFFSET_END {.name = NULL, .offset = 0} + +static const struct pt_regs_offset regoffset_table[] = { + REG_OFFSET_NAME("g0", G0), + REG_OFFSET_NAME("g1", G1), + REG_OFFSET_NAME("g2", G2), + REG_OFFSET_NAME("g3", G3), + REG_OFFSET_NAME("g4", G4), + REG_OFFSET_NAME("g5", G5), + REG_OFFSET_NAME("g6", G6), + REG_OFFSET_NAME("g7", G7), + + REG_OFFSET_NAME("i0", I0), + REG_OFFSET_NAME("i1", I1), + REG_OFFSET_NAME("i2", I2), + REG_OFFSET_NAME("i3", I3), + REG_OFFSET_NAME("i4", I4), + REG_OFFSET_NAME("i5", I5), + REG_OFFSET_NAME("i6", I6), + REG_OFFSET_NAME("i7", I7), + + REG_OFFSET_NAME("tstate", TSTATE), + REG_OFFSET_NAME("pc", TPC), + REG_OFFSET_NAME("npc", TNPC), + REG_OFFSET_NAME("y", Y), + REG_OFFSET_NAME("lr", I7), + + REG_OFFSET_END, +}; + /* * Called by kernel/ptrace.c when detaching.. * @@ -1107,3 +1144,20 @@ asmlinkage void syscall_trace_leave(struct pt_regs *regs) if (test_thread_flag(TIF_NOHZ)) user_enter(); } + +/** + * regs_query_register_offset() - query register offset from its name + * @name: the name of a register + * + * regs_query_register_offset() returns the offset of a register in struct + * pt_regs from its name. 
If the name is invalid, this returns -EINVAL; + */ +int regs_query_register_offset(const char *name) +{ + const struct pt_regs_offset *roff; + + for (roff = regoffset_table; roff->name != NULL; roff++) + if (!strcmp(roff->name, name)) + return roff->offset; + return -EINVAL; +} diff --git a/arch/sparc/kernel/signal_64.c b/arch/sparc/kernel/signal_64.c index 5ee930c48f4c..c782c9b716db 100644 --- a/arch/sparc/kernel/signal_64.c +++ b/arch/sparc/kernel/signal_64.c @@ -545,6 +545,8 @@ static void do_signal(struct pt_regs *regs, unsigned long orig_i0) void do_notify_resume(struct pt_regs *regs, unsigned long orig_i0, unsigned long thread_info_flags) { user_exit(); + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); if (thread_info_flags & _TIF_SIGPENDING) do_signal(regs, orig_i0); if (thread_info_flags & _TIF_NOTIFY_RESUME) { diff --git a/arch/sparc/kernel/sysfs.c b/arch/sparc/kernel/sysfs.c index fa8e21abb5e0..4808b6d23455 100644 --- a/arch/sparc/kernel/sysfs.c +++ b/arch/sparc/kernel/sysfs.c @@ -221,7 +221,7 @@ static struct device_attribute cpu_core_attrs[] = { static DEFINE_PER_CPU(struct cpu, cpu_devices); -static void register_cpu_online(unsigned int cpu) +static int register_cpu_online(unsigned int cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; @@ -231,11 +231,12 @@ static void register_cpu_online(unsigned int cpu) device_create_file(s, &cpu_core_attrs[i]); register_mmu_stats(s); + return 0; } -#ifdef CONFIG_HOTPLUG_CPU -static void unregister_cpu_online(unsigned int cpu) +static int unregister_cpu_online(unsigned int cpu) { +#ifdef CONFIG_HOTPLUG_CPU struct cpu *c = &per_cpu(cpu_devices, cpu); struct device *s = &c->dev; int i; @@ -243,33 +244,10 @@ static void unregister_cpu_online(unsigned int cpu) unregister_mmu_stats(s); for (i = 0; i < ARRAY_SIZE(cpu_core_attrs); i++) device_remove_file(s, &cpu_core_attrs[i]); -} -#endif - -static int sysfs_cpu_notify(struct notifier_block *self, - unsigned long action, void 
*hcpu) -{ - unsigned int cpu = (unsigned int)(long)hcpu; - - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - register_cpu_online(cpu); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_DEAD: - case CPU_DEAD_FROZEN: - unregister_cpu_online(cpu); - break; #endif - } - return NOTIFY_OK; + return 0; } -static struct notifier_block sysfs_cpu_nb = { - .notifier_call = sysfs_cpu_notify, -}; - static void __init check_mmu_stats(void) { unsigned long dummy1, err; @@ -294,26 +272,21 @@ static void register_nodes(void) static int __init topology_init(void) { - int cpu; + int cpu, ret; register_nodes(); check_mmu_stats(); - cpu_notifier_register_begin(); - for_each_possible_cpu(cpu) { struct cpu *c = &per_cpu(cpu_devices, cpu); register_cpu(c, cpu); - if (cpu_online(cpu)) - register_cpu_online(cpu); } - __register_cpu_notifier(&sysfs_cpu_nb); - - cpu_notifier_register_done(); - + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "sparc/topology:online", + register_cpu_online, unregister_cpu_online); + WARN_ON(ret < 0); return 0; } diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c index 4094a51b1970..496fa926e1e0 100644 --- a/arch/sparc/kernel/traps_64.c +++ b/arch/sparc/kernel/traps_64.c @@ -85,7 +85,7 @@ static void dump_tl1_traplog(struct tl1_traplog *p) void bad_trap(struct pt_regs *regs, long lvl) { - char buffer[32]; + char buffer[36]; siginfo_t info; if (notify_die(DIE_TRAP, "bad trap", regs, @@ -116,7 +116,7 @@ void bad_trap(struct pt_regs *regs, long lvl) void bad_trap_tl1(struct pt_regs *regs, long lvl) { - char buffer[32]; + char buffer[36]; if (notify_die(DIE_TRAP_TL1, "bad trap tl1", regs, 0, lvl, SIGTRAP) == NOTIFY_STOP) diff --git a/arch/sparc/kernel/ttable_64.S b/arch/sparc/kernel/ttable_64.S index c6dfdaa29e20..7bd8f6556352 100644 --- a/arch/sparc/kernel/ttable_64.S +++ b/arch/sparc/kernel/ttable_64.S @@ -165,7 +165,7 @@ tl0_resv169: BTRAP(0x169) BTRAP(0x16a) BTRAP(0x16b) BTRAP(0x16c) tl0_linux64: LINUX_64BIT_SYSCALL_TRAP 
tl0_gsctx: TRAP(sparc64_get_context) TRAP(sparc64_set_context) tl0_resv170: KPROBES_TRAP(0x170) KPROBES_TRAP(0x171) KGDB_TRAP(0x172) -tl0_resv173: BTRAP(0x173) BTRAP(0x174) BTRAP(0x175) BTRAP(0x176) BTRAP(0x177) +tl0_resv173: UPROBES_TRAP(0x173) UPROBES_TRAP(0x174) BTRAP(0x175) BTRAP(0x176) BTRAP(0x177) tl0_resv178: BTRAP(0x178) BTRAP(0x179) BTRAP(0x17a) BTRAP(0x17b) BTRAP(0x17c) tl0_resv17d: BTRAP(0x17d) BTRAP(0x17e) BTRAP(0x17f) #define BTRAPS(x) BTRAP(x) BTRAP(x+1) BTRAP(x+2) BTRAP(x+3) BTRAP(x+4) BTRAP(x+5) BTRAP(x+6) BTRAP(x+7) diff --git a/arch/sparc/kernel/uprobes.c b/arch/sparc/kernel/uprobes.c new file mode 100644 index 000000000000..b68314050602 --- /dev/null +++ b/arch/sparc/kernel/uprobes.c @@ -0,0 +1,331 @@ +/* + * User-space Probes (UProbes) for sparc + * + * Copyright (C) 2013 Oracle Inc. + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Authors: + * Jose E. Marchesi <jose.marchesi@oracle.com> + * Eric Saint Etienne <eric.saint.etienne@oracle.com> + */ + +#include <linux/kernel.h> +#include <linux/highmem.h> +#include <linux/uprobes.h> +#include <linux/uaccess.h> +#include <linux/sched.h> /* For struct task_struct */ +#include <linux/kdebug.h> + +#include <asm/cacheflush.h> +#include <asm/uaccess.h> + +/* Compute the address of the breakpoint instruction and return it. 
+ * + * Note that uprobe_get_swbp_addr is defined as a weak symbol in + * kernel/events/uprobe.c. + */ +unsigned long uprobe_get_swbp_addr(struct pt_regs *regs) +{ + return instruction_pointer(regs); +} + +static void copy_to_page(struct page *page, unsigned long vaddr, + const void *src, int len) +{ + void *kaddr = kmap_atomic(page); + + memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len); + kunmap_atomic(kaddr); +} + +/* Fill in the xol area with the probed instruction followed by the + * single-step trap. Some fixups in the copied instruction are + * performed at this point. + * + * Note that uprobe_xol_copy is defined as a weak symbol in + * kernel/events/uprobe.c. + */ +void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, + void *src, unsigned long len) +{ + const u32 stp_insn = UPROBE_STP_INSN; + u32 insn = *(u32 *) src; + + /* Branches annulling their delay slot must be fixed to not do + * so. Clearing the annul bit on these instructions we can be + * sure the single-step breakpoint in the XOL slot will be + * executed. + */ + + u32 op = (insn >> 30) & 0x3; + u32 op2 = (insn >> 22) & 0x7; + + if (op == 0 && + (op2 == 1 || op2 == 2 || op2 == 3 || op2 == 5 || op2 == 6) && + (insn & ANNUL_BIT) == ANNUL_BIT) + insn &= ~ANNUL_BIT; + + copy_to_page(page, vaddr, &insn, len); + copy_to_page(page, vaddr+len, &stp_insn, 4); +} + + +/* Instruction analysis/validity. + * + * This function returns 0 on success or a -ve number on error. + */ +int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe, + struct mm_struct *mm, unsigned long addr) +{ + /* Any unsupported instruction? Then return -EINVAL */ + return 0; +} + +/* If INSN is a relative control transfer instruction, return the + * corrected branch destination value. + * + * Note that regs->tpc and regs->tnpc still hold the values of the + * program counters at the time of the single-step trap due to the + * execution of the UPROBE_STP_INSN at utask->xol_vaddr + 4. 
+ * + */ +static unsigned long relbranch_fixup(u32 insn, struct uprobe_task *utask, + struct pt_regs *regs) +{ + /* Branch not taken, no mods necessary. */ + if (regs->tnpc == regs->tpc + 0x4UL) + return utask->autask.saved_tnpc + 0x4UL; + + /* The three cases are call, branch w/prediction, + * and traditional branch. + */ + if ((insn & 0xc0000000) == 0x40000000 || + (insn & 0xc1c00000) == 0x00400000 || + (insn & 0xc1c00000) == 0x00800000) { + unsigned long real_pc = (unsigned long) utask->vaddr; + unsigned long ixol_addr = utask->xol_vaddr; + + /* The instruction did all the work for us + * already, just apply the offset to the correct + * instruction location. + */ + return (real_pc + (regs->tnpc - ixol_addr)); + } + + /* It is jmpl or some other absolute PC modification instruction, + * leave NPC as-is. + */ + return regs->tnpc; +} + +/* If INSN is an instruction which writes its PC location + * into a destination register, fix that up. + */ +static int retpc_fixup(struct pt_regs *regs, u32 insn, + unsigned long real_pc) +{ + unsigned long *slot = NULL; + int rc = 0; + + /* Simplest case is 'call', which always uses %o7 */ + if ((insn & 0xc0000000) == 0x40000000) + slot = &regs->u_regs[UREG_I7]; + + /* 'jmpl' encodes the register inside of the opcode */ + if ((insn & 0xc1f80000) == 0x81c00000) { + unsigned long rd = ((insn >> 25) & 0x1f); + + if (rd <= 15) { + slot = &regs->u_regs[rd]; + } else { + unsigned long fp = regs->u_regs[UREG_FP]; + /* Hard case, it goes onto the stack. */ + flushw_all(); + + rd -= 16; + if (test_thread_64bit_stack(fp)) { + unsigned long __user *uslot = + (unsigned long __user *) (fp + STACK_BIAS) + rd; + rc = __put_user(real_pc, uslot); + } else { + unsigned int __user *uslot = (unsigned int + __user *) fp + rd; + rc = __put_user((u32) real_pc, uslot); + } + } + } + if (slot != NULL) + *slot = real_pc; + return rc; +} + +/* Single-stepping can be avoided for certain instructions: NOPs and + * instructions that can be emulated.
This function determines + * whether the instruction where the uprobe is installed falls in one + * of these cases and emulates it. + * + * This function returns true if the single-stepping can be skipped, + * false otherwise. + */ +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + /* We currently only emulate NOP instructions. + */ + + if (auprobe->ixol == (1 << 24)) { + regs->tnpc += 4; + regs->tpc += 4; + return true; + } + + return false; +} + +/* Prepare to execute out of line. At this point + * current->utask->xol_vaddr points to an allocated XOL slot properly + * initialized with the original instruction and the single-stepping + * trap instruction. + * + * This function returns 0 on success, any other number on error. + */ +int arch_uprobe_pre_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + struct arch_uprobe_task *autask = &current->utask->autask; + + /* Save the current program counters so they can be restored + * later. + */ + autask->saved_tpc = regs->tpc; + autask->saved_tnpc = regs->tnpc; + + /* Adjust PC and NPC so the first instruction in the XOL slot + * will be executed by the user task. + */ + instruction_pointer_set(regs, utask->xol_vaddr); + + return 0; +} + +/* Prepare to resume execution after the single-step. Called after + * single-stepping. To avoid the SMP problems that can occur when we + * temporarily put back the original opcode to single-step, we + * single-stepped a copy of the instruction. + * + * This function returns 0 on success, any other number on error.
+ */ +int arch_uprobe_post_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + struct arch_uprobe_task *autask = &utask->autask; + u32 insn = auprobe->ixol; + int rc = 0; + + if (utask->state == UTASK_SSTEP_ACK) { + regs->tnpc = relbranch_fixup(insn, utask, regs); + regs->tpc = autask->saved_tnpc; + rc = retpc_fixup(regs, insn, (unsigned long) utask->vaddr); + } else { + regs->tnpc = utask->vaddr+4; + regs->tpc = autask->saved_tnpc+4; + } + return rc; +} + +/* Handler for uprobe traps. This is called from the traps table and + * triggers the proper die notification. + */ +asmlinkage void uprobe_trap(struct pt_regs *regs, + unsigned long trap_level) +{ + BUG_ON(trap_level != 0x173 && trap_level != 0x174); + + /* We are only interested in user-mode code. Uprobe traps + * shall not be present in kernel code. + */ + if (!user_mode(regs)) { + local_irq_enable(); + bad_trap(regs, trap_level); + return; + } + + /* trap_level == 0x173 --> ta 0x73 + * trap_level == 0x174 --> ta 0x74 + */ + if (notify_die((trap_level == 0x173) ? DIE_BPT : DIE_SSTEP, + (trap_level == 0x173) ? "bpt" : "sstep", + regs, 0, trap_level, SIGTRAP) != NOTIFY_STOP) + bad_trap(regs, trap_level); +} + +/* Callback routine for handling die notifications. 
+*/ +int arch_uprobe_exception_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + int ret = NOTIFY_DONE; + struct die_args *args = (struct die_args *)data; + + /* We are only interested in userspace traps */ + if (args->regs && !user_mode(args->regs)) + return NOTIFY_DONE; + + switch (val) { + case DIE_BPT: + if (uprobe_pre_sstep_notifier(args->regs)) + ret = NOTIFY_STOP; + break; + + case DIE_SSTEP: + if (uprobe_post_sstep_notifier(args->regs)) + ret = NOTIFY_STOP; + + default: + break; + } + + return ret; +} + +/* This function gets called when a XOL instruction either gets + * trapped or the thread has a fatal signal, so reset the instruction + * pointer to its probed address. + */ +void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ + struct uprobe_task *utask = current->utask; + + instruction_pointer_set(regs, utask->vaddr); +} + +/* If xol insn itself traps and generates a signal(Say, + * SIGILL/SIGSEGV/etc), then detect the case where a singlestepped + * instruction jumps back to its own address. 
+ */ +bool arch_uprobe_xol_was_trapped(struct task_struct *t) +{ + return false; +} + +unsigned long +arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, + struct pt_regs *regs) +{ + unsigned long orig_ret_vaddr = regs->u_regs[UREG_I7]; + + regs->u_regs[UREG_I7] = trampoline_vaddr-8; + + return orig_ret_vaddr + 8; +} diff --git a/arch/tile/include/asm/Kbuild b/arch/tile/include/asm/Kbuild index ba35c41c71ff..2d1f5638974c 100644 --- a/arch/tile/include/asm/Kbuild +++ b/arch/tile/include/asm/Kbuild @@ -21,7 +21,6 @@ generic-y += local64.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h generic-y += msgbuf.h -generic-y += mutex.h generic-y += param.h generic-y += parport.h generic-y += poll.h diff --git a/arch/tile/include/asm/processor.h b/arch/tile/include/asm/processor.h index 0684e88aacd8..0bc9968b97a1 100644 --- a/arch/tile/include/asm/processor.h +++ b/arch/tile/include/asm/processor.h @@ -264,8 +264,6 @@ static inline void cpu_relax(void) barrier(); } -#define cpu_relax_lowlatency() cpu_relax() - /* Info on this processor (see fs/proc/cpuinfo.c) */ struct seq_operations; extern const struct seq_operations cpuinfo_op; diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c index 2cd5b6874c7b..1669240c7a25 100644 --- a/arch/um/drivers/net_kern.c +++ b/arch/um/drivers/net_kern.c @@ -256,13 +256,6 @@ static void uml_net_tx_timeout(struct net_device *dev) netif_wake_queue(dev); } -static int uml_net_change_mtu(struct net_device *dev, int new_mtu) -{ - dev->mtu = new_mtu; - - return 0; -} - #ifdef CONFIG_NET_POLL_CONTROLLER static void uml_net_poll_controller(struct net_device *dev) { @@ -374,7 +367,6 @@ static const struct net_device_ops uml_netdev_ops = { .ndo_set_rx_mode = uml_net_set_multicast_list, .ndo_tx_timeout = uml_net_tx_timeout, .ndo_set_mac_address = eth_mac_addr, - .ndo_change_mtu = uml_net_change_mtu, .ndo_validate_addr = eth_validate_addr, #ifdef CONFIG_NET_POLL_CONTROLLER .ndo_poll_controller = 
uml_net_poll_controller, diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild index 904f3ebf4220..052f7f6d0551 100644 --- a/arch/um/include/asm/Kbuild +++ b/arch/um/include/asm/Kbuild @@ -17,7 +17,6 @@ generic-y += irq_work.h generic-y += kdebug.h generic-y += mcs_spinlock.h generic-y += mm-arch-hooks.h -generic-y += mutex.h generic-y += param.h generic-y += pci.h generic-y += percpu.h diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 821ff0acfe17..600a2e9bfee2 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { @@ -141,6 +135,15 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) diff --git a/arch/unicore32/include/asm/mutex.h b/arch/unicore32/include/asm/mutex.h deleted file mode 100644 index fab7d0e8adf6..000000000000 --- a/arch/unicore32/include/asm/mutex.h +++ /dev/null @@ -1,20 +0,0 @@ -/* - * linux/arch/unicore32/include/asm/mutex.h - * - * Code specific to PKUnity SoC and UniCore ISA - * - * Copyright (C) 2001-2010 GUAN Xue-tao - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General 
Public License version 2 as - * published by the Free Software Foundation. - * - * UniCore optimized mutex locking primitives - * - * Please look into asm-generic/mutex-xchg.h for a formal definition. - */ -#ifndef __UNICORE_MUTEX_H__ -#define __UNICORE_MUTEX_H__ - -# include <asm-generic/mutex-xchg.h> -#endif diff --git a/arch/unicore32/include/asm/processor.h b/arch/unicore32/include/asm/processor.h index 8d21b7adf26b..4eaa42167667 100644 --- a/arch/unicore32/include/asm/processor.h +++ b/arch/unicore32/include/asm/processor.h @@ -71,7 +71,6 @@ extern void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() #define task_pt_regs(p) \ ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bada636d1065..dd47e60aabf5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -9,28 +9,50 @@ config 64BIT config X86_32 def_bool y depends on !64BIT + # Options that are inherently 32-bit kernel only: + select ARCH_WANT_IPC_PARSE_VERSION + select CLKSRC_I8253 + select CLONE_BACKWARDS + select HAVE_AOUT + select HAVE_GENERIC_DMA_COHERENT + select MODULES_USE_ELF_REL + select OLD_SIGACTION config X86_64 def_bool y depends on 64BIT + # Options that are inherently 64-bit kernel only: + select ARCH_HAS_GIGANTIC_PAGE + select ARCH_SUPPORTS_INT128 + select ARCH_USE_CMPXCHG_LOCKREF + select HAVE_ARCH_SOFT_DIRTY + select MODULES_USE_ELF_RELA + select X86_DEV_DMA_OPS -### Arch settings +# +# Arch settings +# +# ( Note that options that are marked 'if X86_64' could in principle be +# ported to 32-bit as well. 
) +# config X86 def_bool y + # + # Note: keep this list sorted alphabetically + # select ACPI_LEGACY_TABLES_LOOKUP if ACPI select ACPI_SYSTEM_POWER_STATES_SUPPORT if ACPI select ANON_INODES select ARCH_CLOCKSOURCE_DATA select ARCH_DISCARD_MEMBLOCK - select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI + select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_GCOV_PROFILE_ALL - select ARCH_HAS_GIGANTIC_PAGE if X86_64 select ARCH_HAS_KCOV if X86_64 - select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_MMIO_FLUSH + select ARCH_HAS_PMEM_API if X86_64 select ARCH_HAS_SG_CHAIN select ARCH_HAS_UBSAN_SANITIZE_ALL select ARCH_HAVE_NMI_SAFE_CMPXCHG @@ -39,23 +61,17 @@ config X86 select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP - select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS - select ARCH_WANT_IPC_PARSE_VERSION if X86_32 + select ARCH_WANTS_DYNAMIC_TASK_STRUCT select BUILDTIME_EXTABLE_SORT select CLKEVT_I8253 - select CLKSRC_I8253 if X86_32 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG - select CLONE_BACKWARDS if X86_32 - select COMPAT_OLD_SIGACTION if IA32_EMULATION select DCACHE_WORD_ACCESS select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT @@ -77,7 +93,6 @@ config X86 select HAVE_ACPI_APEI if ACPI select HAVE_ACPI_APEI_NMI if ACPI select HAVE_ALIGNED_STRUCT_PAGE if SLUB - select HAVE_AOUT if X86_32 select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HARDENED_USERCOPY select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE @@ -88,12 +103,10 @@ config X86 select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if 
MMU && COMPAT select HAVE_ARCH_SECCOMP_FILTER - select HAVE_ARCH_SOFT_DIRTY if X86_64 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE - select HAVE_ARCH_WITHIN_STACK_FRAMES - select HAVE_EBPF_JIT if X86_64 select HAVE_ARCH_VMAP_STACK if X86_64 + select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL @@ -106,6 +119,7 @@ config X86 select HAVE_DMA_CONTIGUOUS select HAVE_DYNAMIC_FTRACE select HAVE_DYNAMIC_FTRACE_WITH_REGS + select HAVE_EBPF_JIT if X86_64 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 @@ -113,7 +127,6 @@ config X86 select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS - select HAVE_GENERIC_DMA_COHERENT if X86_32 select HAVE_HW_BREAKPOINT select HAVE_IDE select HAVE_IOREMAP_PROT @@ -142,15 +155,11 @@ config X86 select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API + select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING - select MODULES_USE_ELF_RELA if X86_64 - select MODULES_USE_ELF_REL if X86_32 - select OLD_SIGACTION if X86_32 - select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select PERF_EVENTS select RTC_LIB select RTC_MC146818_LIB @@ -160,11 +169,7 @@ config X86 select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS - select X86_DEV_DMA_OPS if X86_64 select X86_FEATURE_NAMES if PROC_FS - select HAVE_STACK_VALIDATION if X86_64 - select ARCH_USES_HIGH_VMA_FLAGS if X86_INTEL_MEMORY_PROTECTION_KEYS - select ARCH_HAS_PKEYS if X86_INTEL_MEMORY_PROTECTION_KEYS config INSTRUCTION_DECODER def_bool y @@ -939,6 +944,27 @@ config SCHED_MC making when dealing with multi-core CPU chips at a cost of slightly increased overhead in some places. If unsure say N here. 
+config SCHED_MC_PRIO + bool "CPU core priorities scheduler support" + depends on SCHED_MC && CPU_SUP_INTEL + select X86_INTEL_PSTATE + select CPU_FREQ + default y + ---help--- + Intel Turbo Boost Max Technology 3.0 enabled CPUs have a + core ordering determined at manufacturing time, which allows + certain cores to reach higher turbo frequencies (when running + single threaded workloads) than others. + + Enabling this kernel feature teaches the scheduler about + the TBM3 (aka ITMT) priority order of the CPU cores and adjusts the + scheduler's CPU selection logic accordingly, so that higher + overall system performance can be achieved. + + This feature will have no effect on CPUs without this feature. + + If unsure say Y here. + source "kernel/Kconfig.preempt" config UP_LATE_INIT @@ -1025,7 +1051,7 @@ config X86_MCE_INTEL config X86_MCE_AMD def_bool y prompt "AMD MCE features" - depends on X86_MCE && X86_LOCAL_APIC + depends on X86_MCE && X86_LOCAL_APIC && AMD_NB ---help--- Additional support for AMD specific MCE features such as the DRAM Error Threshold. @@ -1525,7 +1551,7 @@ config X86_CHECK_BIOS_CORRUPTION line. By default it scans the low 64k of memory every 60 seconds; see the memory_corruption_check_size and memory_corruption_check_period parameters in - Documentation/kernel-parameters.txt to adjust this. + Documentation/admin-guide/kernel-parameters.rst to adjust this. 
When enabled with the default parameters, this option has almost no overhead, as it reserves a relatively small amount @@ -1737,6 +1763,8 @@ config X86_INTEL_MEMORY_PROTECTION_KEYS def_bool y # Note: only available in 64-bit mode depends on CPU_SUP_INTEL && X86_64 + select ARCH_USES_HIGH_VMA_FLAGS + select ARCH_HAS_PKEYS ---help--- Memory Protection Keys provides a mechanism for enforcing page-based protections, but without requiring modification of the @@ -2092,7 +2120,7 @@ config DEBUG_HOTPLUG_CPU0 config COMPAT_VDSO def_bool n prompt "Disable the 32-bit vDSO (needed for glibc 2.3.3)" - depends on X86_32 || IA32_EMULATION + depends on COMPAT_32 ---help--- Certain buggy versions of glibc will crash if they are presented with a 32-bit vDSO that is not mapped at the address @@ -2694,9 +2722,10 @@ source "fs/Kconfig.binfmt" config IA32_EMULATION bool "IA32 Emulation" depends on X86_64 + select ARCH_WANT_OLD_COMPAT_IPC select BINFMT_ELF select COMPAT_BINFMT_ELF - select ARCH_WANT_OLD_COMPAT_IPC + select COMPAT_OLD_SIGACTION ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2721,6 +2750,12 @@ config X86_X32 elf32_x86_64 support enabled to compile a kernel with this option set. 
+config COMPAT_32 + def_bool y + depends on IA32_EMULATION || X86_32 + select HAVE_UID16 + select OLD_SIGSUSPEND3 + config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 12ea8f8384f4..0d810fb15eac 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -65,7 +65,7 @@ clean-files += cpustr.h # --------------------------------------------------------------------------- -KBUILD_CFLAGS := $(USERINCLUDE) $(REALMODE_CFLAGS) -D_SETUP +KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n UBSAN_SANITIZE := n diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index cc69e37548db..ff01c8fc76f7 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -537,6 +537,69 @@ free_handle: efi_call_early(free_pool, pci_handle); } +static void retrieve_apple_device_properties(struct boot_params *boot_params) +{ + efi_guid_t guid = APPLE_PROPERTIES_PROTOCOL_GUID; + struct setup_data *data, *new; + efi_status_t status; + u32 size = 0; + void *p; + + status = efi_call_early(locate_protocol, &guid, NULL, &p); + if (status != EFI_SUCCESS) + return; + + if (efi_table_attr(apple_properties_protocol, version, p) != 0x10000) { + efi_printk(sys_table, "Unsupported properties proto version\n"); + return; + } + + efi_call_proto(apple_properties_protocol, get_all, p, NULL, &size); + if (!size) + return; + + do { + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, + size + sizeof(struct setup_data), &new); + if (status != EFI_SUCCESS) { + efi_printk(sys_table, + "Failed to alloc mem for properties\n"); + return; + } + + status = efi_call_proto(apple_properties_protocol, get_all, p, + new->data, &size); + + if (status == EFI_BUFFER_TOO_SMALL) + efi_call_early(free_pool, new); + } while (status == EFI_BUFFER_TOO_SMALL); + + new->type = SETUP_APPLE_PROPERTIES; + new->len = size; + new->next = 0; 
+ + data = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data; + if (!data) + boot_params->hdr.setup_data = (unsigned long)new; + else { + while (data->next) + data = (struct setup_data *)(unsigned long)data->next; + data->next = (unsigned long)new; + } +} + +static void setup_quirks(struct boot_params *boot_params) +{ + efi_char16_t const apple[] = { 'A', 'p', 'p', 'l', 'e', 0 }; + efi_char16_t *fw_vendor = (efi_char16_t *)(unsigned long) + efi_table_attr(efi_system_table, fw_vendor, sys_table); + + if (!memcmp(fw_vendor, apple, sizeof(apple))) { + if (IS_ENABLED(CONFIG_APPLE_PROPERTIES)) + retrieve_apple_device_properties(boot_params); + } +} + static efi_status_t setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) { @@ -1098,6 +1161,8 @@ struct boot_params *efi_main(struct efi_config *c, setup_efi_pci(boot_params); + setup_quirks(boot_params); + status = efi_call_early(allocate_pool, EFI_LOADER_DATA, sizeof(*gdt), (void **)&gdt); if (status != EFI_SUCCESS) { diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index efdfba21a5b2..4d85e600db78 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -119,8 +119,7 @@ ENTRY(startup_32) */ /* Load new GDT with the 64bit segments using 32bit descriptor */ - leal gdt(%ebp), %eax - movl %eax, gdt+2(%ebp) + addl %ebp, gdt+2(%ebp) lgdt gdt(%ebp) /* Enable PAE mode */ diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c index 0857b1a1de3b..c194d5717ae5 100644 --- a/arch/x86/crypto/crc32c-intel_glue.c +++ b/arch/x86/crypto/crc32c-intel_glue.c @@ -48,26 +48,13 @@ #ifdef CONFIG_X86_64 /* * use carryless multiply version of crc32c when buffer - * size is >= 512 (when eager fpu is enabled) or - * >= 1024 (when eager fpu is disabled) to account + * size is >= 512 to account * for fpu state save/restore overhead. 
*/ -#define CRC32C_PCL_BREAKEVEN_EAGERFPU 512 -#define CRC32C_PCL_BREAKEVEN_NOEAGERFPU 1024 +#define CRC32C_PCL_BREAKEVEN 512 asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, unsigned int crc_init); -static int crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_EAGERFPU; -#if defined(X86_FEATURE_EAGER_FPU) -#define set_pcl_breakeven_point() \ -do { \ - if (!use_eager_fpu()) \ - crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU; \ -} while (0) -#else -#define set_pcl_breakeven_point() \ - (crc32c_pcl_breakeven = CRC32C_PCL_BREAKEVEN_NOEAGERFPU) -#endif #endif /* CONFIG_X86_64 */ static u32 crc32c_intel_le_hw_byte(u32 crc, unsigned char const *data, size_t length) @@ -190,7 +177,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, * use faster PCL version if datasize is large enough to * overcome kernel fpu state save/restore overhead */ - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *crcp = crc_pcl(data, len, *crcp); kernel_fpu_end(); @@ -202,7 +189,7 @@ static int crc32c_pcl_intel_update(struct shash_desc *desc, const u8 *data, static int __crc32c_pcl_intel_finup(u32 *crcp, const u8 *data, unsigned int len, u8 *out) { - if (len >= crc32c_pcl_breakeven && irq_fpu_usable()) { + if (len >= CRC32C_PCL_BREAKEVEN && irq_fpu_usable()) { kernel_fpu_begin(); *(__le32 *)out = ~cpu_to_le32(crc_pcl(data, len, *crcp)); kernel_fpu_end(); @@ -261,7 +248,6 @@ static int __init crc32c_intel_mod_init(void) alg.update = crc32c_pcl_intel_update; alg.finup = crc32c_pcl_intel_finup; alg.digest = crc32c_pcl_intel_digest; - set_pcl_breakeven_point(); } #endif return crypto_register_shash(&alg); diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9a9e5884066c..05ed3d393da7 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -90,8 +90,8 @@ For 32-bit we have the following conventions - kernel is built with #define SIZEOF_PTREGS 
21*8 - .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 - addq $-(15*8+\addskip), %rsp + .macro ALLOC_PT_GPREGS_ON_STACK + addq $-(15*8), %rsp .endm .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 @@ -147,15 +147,6 @@ For 32-bit we have the following conventions - kernel is built with movq 5*8+\offset(%rsp), %rbx .endm - .macro ZERO_EXTRA_REGS - xorl %r15d, %r15d - xorl %r14d, %r14d - xorl %r13d, %r13d - xorl %r12d, %r12d - xorl %ebp, %ebp - xorl %ebx, %ebx - .endm - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 movq 6*8(%rsp), %r11 @@ -201,6 +192,26 @@ For 32-bit we have the following conventions - kernel is built with .byte 0xf1 .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_EXTRA_REGS because it corrupts + * the original rbp. + */ +.macro ENCODE_FRAME_POINTER ptregs_offset=0 +#ifdef CONFIG_FRAME_POINTER + .if \ptregs_offset + leaq \ptregs_offset(%rsp), %rbp + .else + mov %rsp, %rbp + .endif + orq $0x1, %rbp +#endif +.endm + #endif /* CONFIG_X86_64 */ /* diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 21b352a11b49..acc0c6f36f3f 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -45,6 +45,7 @@ #include <asm/asm.h> #include <asm/smap.h> #include <asm/export.h> +#include <asm/frame.h> .section .entry.text, "ax" @@ -175,6 +176,22 @@ SET_KERNEL_GS %edx .endm +/* + * This is a sneaky trick to help the unwinder find pt_regs on the stack. The + * frame pointer is replaced with an encoded pointer to pt_regs. 
The encoding + * is just setting the LSB, which makes it an invalid stack address and is also + * a signal to the unwinder that it's a pt_regs pointer in disguise. + * + * NOTE: This macro must be used *after* SAVE_ALL because it corrupts the + * original ebp. + */ +.macro ENCODE_FRAME_POINTER +#ifdef CONFIG_FRAME_POINTER + mov %esp, %ebp + orl $0x1, %ebp +#endif +.endm + .macro RESTORE_INT_REGS popl %ebx popl %ecx @@ -238,6 +255,23 @@ ENTRY(__switch_to_asm) END(__switch_to_asm) /* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because it's an + * asmlinkage function so its argument has to be pushed on the stack. This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) +/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -245,9 +279,7 @@ END(__switch_to_asm) * edi: kernel thread arg */ ENTRY(ret_from_fork) - pushl %eax - call schedule_tail - popl %eax + call schedule_tail_wrapper testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ @@ -307,13 +339,13 @@ END(ret_from_exception) #ifdef CONFIG_PREEMPT ENTRY(resume_kernel) DISABLE_INTERRUPTS(CLBR_ANY) -need_resched: +.Lneed_resched: cmpl $0, PER_CPU_VAR(__preempt_count) jnz restore_all testl $X86_EFLAGS_IF, PT_EFLAGS(%esp) # interrupts off (exception path) ? 
jz restore_all call preempt_schedule_irq - jmp need_resched + jmp .Lneed_resched END(resume_kernel) #endif @@ -334,7 +366,7 @@ GLOBAL(__begin_SYSENTER_singlestep_region) */ ENTRY(xen_sysenter_target) addl $5*4, %esp /* remove xen-provided frame */ - jmp sysenter_past_esp + jmp .Lsysenter_past_esp #endif /* @@ -371,7 +403,7 @@ ENTRY(xen_sysenter_target) */ ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp -sysenter_past_esp: +.Lsysenter_past_esp: pushl $__USER_DS /* pt_regs->ss */ pushl %ebp /* pt_regs->sp (stashed in bp) */ pushfl /* pt_regs->flags (except IF = 0) */ @@ -504,9 +536,9 @@ ENTRY(entry_INT80_32) restore_all: TRACE_IRQS_IRET -restore_all_notrace: +.Lrestore_all_notrace: #ifdef CONFIG_X86_ESPFIX32 - ALTERNATIVE "jmp restore_nocheck", "", X86_BUG_ESPFIX + ALTERNATIVE "jmp .Lrestore_nocheck", "", X86_BUG_ESPFIX movl PT_EFLAGS(%esp), %eax # mix EFLAGS, SS and CS /* @@ -518,22 +550,23 @@ restore_all_notrace: movb PT_CS(%esp), %al andl $(X86_EFLAGS_VM | (SEGMENT_TI_MASK << 8) | SEGMENT_RPL_MASK), %eax cmpl $((SEGMENT_LDT << 8) | USER_RPL), %eax - je ldt_ss # returning to user-space with LDT SS + je .Lldt_ss # returning to user-space with LDT SS #endif -restore_nocheck: +.Lrestore_nocheck: RESTORE_REGS 4 # skip orig_eax/error_code -irq_return: +.Lirq_return: INTERRUPT_RETURN + .section .fixup, "ax" ENTRY(iret_exc ) pushl $0 # no error code pushl $do_iret_error - jmp error_code + jmp common_exception .previous - _ASM_EXTABLE(irq_return, iret_exc) + _ASM_EXTABLE(.Lirq_return, iret_exc) #ifdef CONFIG_X86_ESPFIX32 -ldt_ss: +.Lldt_ss: /* * Setup and switch to ESPFIX stack * @@ -562,7 +595,7 @@ ldt_ss: */ DISABLE_INTERRUPTS(CLBR_EAX) lss (%esp), %esp /* switch to espfix segment */ - jmp restore_nocheck + jmp .Lrestore_nocheck #endif ENDPROC(entry_INT80_32) @@ -624,6 +657,7 @@ common_interrupt: ASM_CLAC addl $-0x80, (%esp) /* Adjust vector into the [-256, -1] range */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF movl %esp, %eax call do_IRQ @@ -635,6 +669,7 
@@ ENTRY(name) \ ASM_CLAC; \ pushl $~(nr); \ SAVE_ALL; \ + ENCODE_FRAME_POINTER; \ TRACE_IRQS_OFF \ movl %esp, %eax; \ call fn; \ @@ -659,7 +694,7 @@ ENTRY(coprocessor_error) ASM_CLAC pushl $0 pushl $do_coprocessor_error - jmp error_code + jmp common_exception END(coprocessor_error) ENTRY(simd_coprocessor_error) @@ -673,14 +708,14 @@ ENTRY(simd_coprocessor_error) #else pushl $do_simd_coprocessor_error #endif - jmp error_code + jmp common_exception END(simd_coprocessor_error) ENTRY(device_not_available) ASM_CLAC pushl $-1 # mark this as an int pushl $do_device_not_available - jmp error_code + jmp common_exception END(device_not_available) #ifdef CONFIG_PARAVIRT @@ -694,59 +729,59 @@ ENTRY(overflow) ASM_CLAC pushl $0 pushl $do_overflow - jmp error_code + jmp common_exception END(overflow) ENTRY(bounds) ASM_CLAC pushl $0 pushl $do_bounds - jmp error_code + jmp common_exception END(bounds) ENTRY(invalid_op) ASM_CLAC pushl $0 pushl $do_invalid_op - jmp error_code + jmp common_exception END(invalid_op) ENTRY(coprocessor_segment_overrun) ASM_CLAC pushl $0 pushl $do_coprocessor_segment_overrun - jmp error_code + jmp common_exception END(coprocessor_segment_overrun) ENTRY(invalid_TSS) ASM_CLAC pushl $do_invalid_TSS - jmp error_code + jmp common_exception END(invalid_TSS) ENTRY(segment_not_present) ASM_CLAC pushl $do_segment_not_present - jmp error_code + jmp common_exception END(segment_not_present) ENTRY(stack_segment) ASM_CLAC pushl $do_stack_segment - jmp error_code + jmp common_exception END(stack_segment) ENTRY(alignment_check) ASM_CLAC pushl $do_alignment_check - jmp error_code + jmp common_exception END(alignment_check) ENTRY(divide_error) ASM_CLAC pushl $0 # no error code pushl $do_divide_error - jmp error_code + jmp common_exception END(divide_error) #ifdef CONFIG_X86_MCE @@ -754,7 +789,7 @@ ENTRY(machine_check) ASM_CLAC pushl $0 pushl machine_check_vector - jmp error_code + jmp common_exception END(machine_check) #endif @@ -762,13 +797,14 @@ 
ENTRY(spurious_interrupt_bug) ASM_CLAC pushl $0 pushl $do_spurious_interrupt_bug - jmp error_code + jmp common_exception END(spurious_interrupt_bug) #ifdef CONFIG_XEN ENTRY(xen_hypervisor_callback) pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF /* @@ -823,6 +859,7 @@ ENTRY(xen_failsafe_callback) jmp iret_exc 5: pushl $-1 /* orig_ax = -1 => not a system call */ SAVE_ALL + ENCODE_FRAME_POINTER jmp ret_from_exception .section .fixup, "ax" @@ -882,7 +919,7 @@ ftrace_call: popl %edx popl %ecx popl %eax -ftrace_ret: +.Lftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call ftrace_graph_call: @@ -952,7 +989,7 @@ GLOBAL(ftrace_regs_call) popl %gs addl $8, %esp /* Skip orig_ax and ip */ popf /* Pop flags at end (no addl to corrupt flags) */ - jmp ftrace_ret + jmp .Lftrace_ret popf jmp ftrace_stub @@ -963,7 +1000,7 @@ ENTRY(mcount) jb ftrace_stub /* Paging not enabled yet? */ cmpl $ftrace_stub, ftrace_trace_function - jnz trace + jnz .Ltrace #ifdef CONFIG_FUNCTION_GRAPH_TRACER cmpl $ftrace_stub, ftrace_graph_return jnz ftrace_graph_caller @@ -976,7 +1013,7 @@ ftrace_stub: ret /* taken from glibc */ -trace: +.Ltrace: pushl %eax pushl %ecx pushl %edx @@ -1027,7 +1064,7 @@ return_to_handler: ENTRY(trace_page_fault) ASM_CLAC pushl $trace_do_page_fault - jmp error_code + jmp common_exception END(trace_page_fault) #endif @@ -1035,7 +1072,10 @@ ENTRY(page_fault) ASM_CLAC pushl $do_page_fault ALIGN -error_code: + jmp common_exception +END(page_fault) + +common_exception: /* the function address is in %gs's slot on the stack */ pushl %fs pushl %es @@ -1047,6 +1087,7 @@ error_code: pushl %edx pushl %ecx pushl %ebx + ENCODE_FRAME_POINTER cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs @@ -1064,7 +1105,7 @@ error_code: movl %esp, %eax # pt_regs pointer call *%edi jmp ret_from_exception -END(page_fault) +END(common_exception) ENTRY(debug) /* @@ -1079,6 +1120,7 @@ ENTRY(debug) ASM_CLAC pushl $-1 # mark this as an int 
SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # error code 0 movl %esp, %eax # pt_regs pointer @@ -1094,11 +1136,11 @@ ENTRY(debug) .Ldebug_from_sysenter_stack: /* We're on the SYSENTER stack. Switch off. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp TRACE_IRQS_OFF call do_debug - movl %ebp, %esp + movl %ebx, %esp jmp ret_from_exception END(debug) @@ -1116,11 +1158,12 @@ ENTRY(nmi) movl %ss, %eax cmpw $__ESPFIX_SS, %ax popl %eax - je nmi_espfix_stack + je .Lnmi_espfix_stack #endif pushl %eax # pt_regs->orig_ax SAVE_ALL + ENCODE_FRAME_POINTER xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1132,21 +1175,21 @@ ENTRY(nmi) /* Not on SYSENTER stack. */ call do_nmi - jmp restore_all_notrace + jmp .Lrestore_all_notrace .Lnmi_from_sysenter_stack: /* * We're on the SYSENTER stack. Switch off. No one (not even debug) * is using the thread stack right now, so it's safe for us to use it. */ - movl %esp, %ebp + movl %esp, %ebx movl PER_CPU_VAR(cpu_current_top_of_stack), %esp call do_nmi - movl %ebp, %esp - jmp restore_all_notrace + movl %ebx, %esp + jmp .Lrestore_all_notrace #ifdef CONFIG_X86_ESPFIX32 -nmi_espfix_stack: +.Lnmi_espfix_stack: /* * create the pointer to lss back */ @@ -1159,12 +1202,13 @@ nmi_espfix_stack: .endr pushl %eax SAVE_ALL + ENCODE_FRAME_POINTER FIXUP_ESPFIX_STACK # %eax == %esp xorl %edx, %edx # zero error code call do_nmi RESTORE_REGS lss 12+4(%esp), %esp # back to espfix stack - jmp irq_return + jmp .Lirq_return #endif END(nmi) @@ -1172,6 +1216,7 @@ ENTRY(int3) ASM_CLAC pushl $-1 # mark this as an int SAVE_ALL + ENCODE_FRAME_POINTER TRACE_IRQS_OFF xorl %edx, %edx # zero error code movl %esp, %eax # pt_regs pointer @@ -1181,14 +1226,14 @@ END(int3) ENTRY(general_protection) pushl $do_general_protection - jmp error_code + jmp common_exception END(general_protection) #ifdef CONFIG_KVM_GUEST ENTRY(async_page_fault) ASM_CLAC pushl $do_async_page_fault - jmp error_code + jmp common_exception 
END(async_page_fault) #endif diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ef766a358b37..5b219707c2f2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -38,12 +38,6 @@ #include <asm/export.h> #include <linux/err.h> -/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ -#include <linux/elf-em.h> -#define AUDIT_ARCH_X86_64 (EM_X86_64|__AUDIT_ARCH_64BIT|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_64BIT 0x80000000 -#define __AUDIT_ARCH_LE 0x40000000 - .code64 .section .entry.text, "ax" @@ -469,6 +463,7 @@ END(irq_entries_start) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER testb $3, CS(%rsp) jz 1f @@ -985,6 +980,7 @@ ENTRY(xen_failsafe_callback) ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + ENCODE_FRAME_POINTER jmp error_exit END(xen_failsafe_callback) @@ -1028,6 +1024,7 @@ ENTRY(paranoid_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 movl $1, %ebx movl $MSR_GS_BASE, %ecx rdmsr @@ -1075,6 +1072,7 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + ENCODE_FRAME_POINTER 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1257,6 +1255,7 @@ ENTRY(nmi) pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + ENCODE_FRAME_POINTER /* * At this point we no longer need to worry about stack damage @@ -1270,11 +1269,10 @@ ENTRY(nmi) /* * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. Fortunately, - * do_nmi doesn't modify pt_regs. + * work, because we don't want to enable interrupts. 
*/ SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret .Lnmi_from_kernel: /* diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 23c881caabd1..e739002427ed 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -161,8 +161,6 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) } text_start = addr - image->sym_vvar_start; - current->mm->context.vdso = (void __user *)text_start; - current->mm->context.vdso_image = image; /* * MAYWRITE to allow gdb to COW and set breakpoints @@ -189,14 +187,12 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) if (IS_ERR(vma)) { ret = PTR_ERR(vma); do_munmap(mm, text_start, image->size); + } else { + current->mm->context.vdso = (void __user *)text_start; + current->mm->context.vdso_image = image; } up_fail: - if (ret) { - current->mm->context.vdso = NULL; - current->mm->context.vdso_image = NULL; - } - up_write(&mm->mmap_sem); return ret; } diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 9d4bf3ab049e..f1c22584a46f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -69,7 +69,7 @@ u64 x86_perf_event_update(struct perf_event *event) int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; int idx = hwc->idx; - s64 delta; + u64 delta; if (idx == INTEL_PMC_IDX_FIXED_BTS) return 0; @@ -365,7 +365,11 @@ int x86_add_exclusive(unsigned int what) { int i; - if (x86_pmu.lbr_pt_coexist) + /* + * When lbr_pt_coexist we allow PT to coexist with either LBR or BTS. + * LBR and BTS are still mutually exclusive. 
+ */ + if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return 0; if (!atomic_inc_not_zero(&x86_pmu.lbr_exclusive[what])) { @@ -388,7 +392,7 @@ fail_unlock: void x86_del_exclusive(unsigned int what) { - if (x86_pmu.lbr_pt_coexist) + if (x86_pmu.lbr_pt_coexist && what == x86_lbr_exclusive_pt) return; atomic_dec(&x86_pmu.lbr_exclusive[what]); @@ -2299,7 +2303,7 @@ valid_user_frame(const void __user *fp, unsigned long size) static unsigned long get_segment_base(unsigned int segment) { struct desc_struct *desc; - int idx = segment >> 3; + unsigned int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { #ifdef CONFIG_MODIFY_LDT_SYSCALL diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a74a2dbc0180..cb8522290e6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4034,7 +4034,7 @@ __init int intel_pmu_init(void) /* Support full width counters using alternative MSR range */ if (x86_pmu.intel_cap.full_width_write) { - x86_pmu.max_period = x86_pmu.cntval_mask; + x86_pmu.max_period = x86_pmu.cntval_mask >> 1; x86_pmu.perfctr = MSR_IA32_PMC0; pr_cont("full-width counters, "); } diff --git a/arch/x86/events/intel/cstate.c b/arch/x86/events/intel/cstate.c index 4f5ac726335f..da51e5a3e2ff 100644 --- a/arch/x86/events/intel/cstate.c +++ b/arch/x86/events/intel/cstate.c @@ -540,6 +540,7 @@ static const struct x86_cpu_id intel_cstates_match[] __initconst = { X86_CSTATES_MODEL(INTEL_FAM6_SKYLAKE_DESKTOP, snb_cstates), X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNL, knl_cstates), + X86_CSTATES_MODEL(INTEL_FAM6_XEON_PHI_KNM, knl_cstates), { }, }; MODULE_DEVICE_TABLE(x86cpu, intel_cstates_match); diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index c5047b8f777b..1c1b9fe705c8 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -36,13 +36,6 @@ static DEFINE_PER_CPU(struct pt, pt_ctx); static struct pt_pmu pt_pmu; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - 
CR_EDX, - CR_EBX -}; - /* * Capabilities of Intel PT hardware, such as number of address bits or * supported output schemes, are cached and exported to userspace as "caps" @@ -64,21 +57,21 @@ static struct pt_cap_desc { u8 reg; u32 mask; } pt_caps[] = { - PT_CAP(max_subleaf, 0, CR_EAX, 0xffffffff), - PT_CAP(cr3_filtering, 0, CR_EBX, BIT(0)), - PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), - PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), - PT_CAP(mtc, 0, CR_EBX, BIT(3)), - PT_CAP(ptwrite, 0, CR_EBX, BIT(4)), - PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)), - PT_CAP(topa_output, 0, CR_ECX, BIT(0)), - PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), - PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), - PT_CAP(payloads_lip, 0, CR_ECX, BIT(31)), - PT_CAP(num_address_ranges, 1, CR_EAX, 0x3), - PT_CAP(mtc_periods, 1, CR_EAX, 0xffff0000), - PT_CAP(cycle_thresholds, 1, CR_EBX, 0xffff), - PT_CAP(psb_periods, 1, CR_EBX, 0xffff0000), + PT_CAP(max_subleaf, 0, CPUID_EAX, 0xffffffff), + PT_CAP(cr3_filtering, 0, CPUID_EBX, BIT(0)), + PT_CAP(psb_cyc, 0, CPUID_EBX, BIT(1)), + PT_CAP(ip_filtering, 0, CPUID_EBX, BIT(2)), + PT_CAP(mtc, 0, CPUID_EBX, BIT(3)), + PT_CAP(ptwrite, 0, CPUID_EBX, BIT(4)), + PT_CAP(power_event_trace, 0, CPUID_EBX, BIT(5)), + PT_CAP(topa_output, 0, CPUID_ECX, BIT(0)), + PT_CAP(topa_multiple_entries, 0, CPUID_ECX, BIT(1)), + PT_CAP(single_range_output, 0, CPUID_ECX, BIT(2)), + PT_CAP(payloads_lip, 0, CPUID_ECX, BIT(31)), + PT_CAP(num_address_ranges, 1, CPUID_EAX, 0x3), + PT_CAP(mtc_periods, 1, CPUID_EAX, 0xffff0000), + PT_CAP(cycle_thresholds, 1, CPUID_EBX, 0xffff), + PT_CAP(psb_periods, 1, CPUID_EBX, 0xffff0000), }; static u32 pt_cap_get(enum pt_capabilities cap) @@ -213,10 +206,10 @@ static int __init pt_pmu_hw_init(void) for (i = 0; i < PT_CPUID_LEAVES; i++) { cpuid_count(20, i, - &pt_pmu.caps[CR_EAX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EBX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_ECX + i*PT_CPUID_REGS_NUM], - &pt_pmu.caps[CR_EDX + i*PT_CPUID_REGS_NUM]); + 
&pt_pmu.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM], + &pt_pmu.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM]); } ret = -ENOMEM; diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index a77ee026643d..bcbb1d2ae10b 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -604,7 +604,7 @@ struct x86_pmu { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ const int *lbr_sel_map; /* lbr_select mappings */ bool lbr_double_abort; /* duplicated lbr aborts */ - bool lbr_pt_coexist; /* LBR may coexist with PT */ + bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ /* * Intel PT/LBR/BTS are exclusive diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild index 2cfed174e3c9..2b892e2313a9 100644 --- a/arch/x86/include/asm/Kbuild +++ b/arch/x86/include/asm/Kbuild @@ -6,10 +6,6 @@ generated-y += unistd_32_ia32.h generated-y += unistd_64_x32.h generated-y += xen-hypercalls.h -genhdr-y += unistd_32.h -genhdr-y += unistd_64.h -genhdr-y += unistd_x32.h - generic-y += clkdev.h generic-y += cputime.h generic-y += dma-contiguous.h diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h index 5391b0ae7cc3..395b69551fce 100644 --- a/arch/x86/include/asm/acpi.h +++ b/arch/x86/include/asm/acpi.h @@ -94,7 +94,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate) boot_cpu_data.x86_model <= 0x05 && boot_cpu_data.x86_mask < 0x0A) return 1; - else if (amd_e400_c1e_detected) + else if (boot_cpu_has(X86_BUG_AMD_APIC_C1E)) return 1; else return max_cstate; diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 5e828da2e18f..00c88a01301d 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -21,6 +21,10 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, unsigned long); +extern int amd_smn_read(u16 node, 
u32 address, u32 *value); +extern int amd_smn_write(u16 node, u32 address, u32 value); +extern int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo); + struct amd_l3_cache { unsigned indices; u8 subcaches[4]; @@ -55,6 +59,7 @@ struct threshold_bank { }; struct amd_northbridge { + struct pci_dev *root; struct pci_dev *misc; struct pci_dev *link; struct amd_l3_cache l3_cache; @@ -66,7 +71,6 @@ struct amd_northbridge_info { u64 flags; struct amd_northbridge *nb; }; -extern struct amd_northbridge_info amd_northbridges; #define AMD_NB_GART BIT(0) #define AMD_NB_L3_INDEX_DISABLE BIT(1) @@ -74,20 +78,9 @@ extern struct amd_northbridge_info amd_northbridges; #ifdef CONFIG_AMD_NB -static inline u16 amd_nb_num(void) -{ - return amd_northbridges.num; -} - -static inline bool amd_nb_has_feature(unsigned feature) -{ - return ((amd_northbridges.flags & feature) == feature); -} - -static inline struct amd_northbridge *node_to_amd_nb(int node) -{ - return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL; -} +u16 amd_nb_num(void); +bool amd_nb_has_feature(unsigned int feature); +struct amd_northbridge *node_to_amd_nb(int node); static inline u16 amd_pci_dev_to_node_id(struct pci_dev *pdev) { diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index f5aaf6c83222..0c5fbc68e82d 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -11,7 +11,6 @@ #include <asm/fixmap.h> #include <asm/mpspec.h> #include <asm/msr.h> -#include <asm/idle.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 @@ -196,7 +195,7 @@ static inline void native_apic_msr_write(u32 reg, u32 v) static inline void native_apic_msr_eoi_write(u32 reg, u32 v) { - wrmsr(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); + wrmsr_notrace(APIC_BASE_MSR + (APIC_EOI >> 4), APIC_EOI_ACK, 0); } static inline u32 native_apic_msr_read(u32 reg) @@ -332,6 +331,7 @@ struct apic { * on write for EOI. 
*/ void (*eoi_write)(u32 reg, u32 v); + void (*native_eoi_write)(u32 reg, u32 v); u64 (*icr_read)(void); void (*icr_write)(u32 low, u32 high); void (*wait_icr_idle)(void); @@ -639,7 +639,6 @@ extern void irq_exit(void); static inline void entering_irq(void) { irq_enter(); - exit_idle(); } static inline void entering_ack_irq(void) diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 03d269bed941..24118c0b4640 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -272,7 +272,6 @@ struct compat_shmid64_ds { /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. */ -#ifdef CONFIG_X86_X32_ABI typedef struct user_regs_struct compat_elf_gregset_t; /* Full regset -- prstatus on x32, otherwise on ia32 */ @@ -281,10 +280,9 @@ typedef struct user_regs_struct compat_elf_gregset_t; do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ while (0) +#ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) -#else -typedef struct user_regs_struct32 compat_elf_gregset_t; #endif /* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 1d2b69fc0ceb..d59c15c3defd 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -204,6 +204,7 @@ static __always_inline __pure bool _static_cpu_has(u16 bit) #define static_cpu_has_bug(bit) static_cpu_has((bit)) #define boot_cpu_has_bug(bit) cpu_has_bug(&boot_cpu_data, (bit)) +#define boot_cpu_set_bug(bit) set_cpu_cap(&boot_cpu_data, (bit)) #define MAX_CPU_FEATURES (NCAPINTS * 32) #define cpu_have_feature boot_cpu_has diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a39629206864..59ac427960d4 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -104,7 +104,6 @@ #define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ #define X86_FEATURE_AMD_DCM ( 3*32+27) 
/* multi-node processor */ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ @@ -193,6 +192,7 @@ #define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ #define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ #define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ #define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ @@ -226,6 +226,7 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ @@ -279,8 +280,10 @@ #define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */ +#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/ #define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */ #define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */ +#define X86_FEATURE_RDPID (16*32+ 22) /* RDPID instruction */ /* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */ #define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */ @@ -311,4 +314,6 @@ #define X86_BUG_NULL_SEG 
X86_BUG(10) /* Nulling a selector preserves the base */ #define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */ #define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */ +#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among those affected by Erratum 400 */ + #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 476b574de99e..ec23d8e1297c 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -1,13 +1,17 @@ #ifndef _ASM_X86_E820_H #define _ASM_X86_E820_H -#ifdef CONFIG_EFI +/* + * E820_X_MAX is the maximum size of the extended E820 table. The extended + * table may contain up to 3 extra E820 entries per possible NUMA node, so we + * make room for 3 * MAX_NUMNODES possible entries, beyond the standard 128. + * Also note that E820_X_MAX *must* be defined before we include uapi/asm/e820.h. + */ #include <linux/numa.h> #define E820_X_MAX (E820MAX + 3 * MAX_NUMNODES) -#else /* ! CONFIG_EFI */ -#define E820_X_MAX E820MAX -#endif + #include <uapi/asm/e820.h> + #ifndef __ASSEMBLY__ /* see comment in arch/x86/kernel/e820.c */ extern struct e820map *e820; diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 389d700b961e..e99675b9c861 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -210,12 +210,18 @@ static inline bool efi_is_64bit(void) return __efi_early()->is64; } +#define efi_table_attr(table, attr, instance) \ + (efi_is_64bit() ? \ + ((table##_64_t *)(unsigned long)instance)->attr : \ + ((table##_32_t *)(unsigned long)instance)->attr) + +#define efi_call_proto(protocol, f, instance, ...) \ + __efi_early()->call(efi_table_attr(protocol, f, instance), \ + instance, ##__VA_ARGS__) + #define efi_call_early(f, ...) \ - __efi_early()->call(efi_is_64bit() ? 
\ - ((efi_boot_services_64_t *)(unsigned long) \ - __efi_early()->boot_services)->f : \ - ((efi_boot_services_32_t *)(unsigned long) \ - __efi_early()->boot_services)->f, __VA_ARGS__) + __efi_early()->call(efi_table_attr(efi_boot_services, f, \ + __efi_early()->boot_services), __VA_ARGS__) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 1429a7c736db..0877ae018fc9 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -27,16 +27,6 @@ extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); /* - * Some instructions like VIA's padlock instructions generate a spurious - * DNA fault but don't modify SSE registers. And these instructions - * get used from interrupt context as well. To prevent these kernel instructions - * in interrupt context interacting wrongly with other user/kernel fpu usage, we - * should use them only in the context of irq_ts_save/restore() - */ -extern int irq_ts_save(void); -extern void irq_ts_restore(int TS_state); - -/* * Query the presence of one or more xfeatures. Works on any legacy CPU as well. 
* * If 'feature_name' is set then put a human-readable description of diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h index 2737366ea583..d4a684997497 100644 --- a/arch/x86/include/asm/fpu/internal.h +++ b/arch/x86/include/asm/fpu/internal.h @@ -60,11 +60,6 @@ extern u64 fpu__get_supported_xfeatures_mask(void); /* * FPU related CPU feature flag helper routines: */ -static __always_inline __pure bool use_eager_fpu(void) -{ - return static_cpu_has(X86_FEATURE_EAGER_FPU); -} - static __always_inline __pure bool use_xsaveopt(void) { return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -484,42 +479,42 @@ extern int copy_fpstate_to_sigframe(void __user *buf, void __user *fp, int size) DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); /* - * Must be run with preemption disabled: this clears the fpu_fpregs_owner_ctx, - * on this CPU. + * The in-register FPU state for an FPU context on a CPU is assumed to be + * valid if the fpu->last_cpu matches the CPU, and the fpu_fpregs_owner_ctx + * matches the FPU. * - * This will disable any lazy FPU state restore of the current FPU state, - * but if the current thread owns the FPU, it will still be saved by. + * If the FPU register state is valid, the kernel can skip restoring the + * FPU state from memory. + * + * Any code that clobbers the FPU registers or updates the in-memory + * FPU state for a task MUST let the rest of the kernel know that the + * FPU registers are no longer valid for this task. + * + * Either one of these invalidation functions is enough. Invalidate + * a resource you control: CPU if using the CPU for something else + * (with preemption disabled), FPU for the current task, or a task that + * is prevented from running by the current task. 
*/ -static inline void __cpu_disable_lazy_restore(unsigned int cpu) +static inline void __cpu_invalidate_fpregs_state(void) { - per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; + __this_cpu_write(fpu_fpregs_owner_ctx, NULL); } -static inline int fpu_want_lazy_restore(struct fpu *fpu, unsigned int cpu) -{ - return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; -} - - -/* - * Wrap lazy FPU TS handling in a 'hw fpregs activation/deactivation' - * idiom, which is then paired with the sw-flag (fpregs_active) later on: - */ - -static inline void __fpregs_activate_hw(void) +static inline void __fpu_invalidate_fpregs_state(struct fpu *fpu) { - if (!use_eager_fpu()) - clts(); + fpu->last_cpu = -1; } -static inline void __fpregs_deactivate_hw(void) +static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu) { - if (!use_eager_fpu()) - stts(); + return fpu == this_cpu_read_stable(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu; } -/* Must be paired with an 'stts' (fpregs_deactivate_hw()) after! */ -static inline void __fpregs_deactivate(struct fpu *fpu) +/* + * These generally need preemption protection to work, + * do try to avoid using these on their own: + */ +static inline void fpregs_deactivate(struct fpu *fpu) { WARN_ON_FPU(!fpu->fpregs_active); @@ -528,8 +523,7 @@ static inline void __fpregs_deactivate(struct fpu *fpu) trace_x86_fpu_regs_deactivated(fpu); } -/* Must be paired with a 'clts' (fpregs_activate_hw()) before! */ -static inline void __fpregs_activate(struct fpu *fpu) +static inline void fpregs_activate(struct fpu *fpu) { WARN_ON_FPU(fpu->fpregs_active); @@ -554,51 +548,19 @@ static inline int fpregs_active(void) } /* - * Encapsulate the CR0.TS handling together with the - * software flag. - * - * These generally need preemption protection to work, - * do try to avoid using these on their own. 
- */ -static inline void fpregs_activate(struct fpu *fpu) -{ - __fpregs_activate_hw(); - __fpregs_activate(fpu); -} - -static inline void fpregs_deactivate(struct fpu *fpu) -{ - __fpregs_deactivate(fpu); - __fpregs_deactivate_hw(); -} - -/* * FPU state switching for scheduling. * * This is a two-stage process: * - * - switch_fpu_prepare() saves the old state and - * sets the new state of the CR0.TS bit. This is - * done within the context of the old process. + * - switch_fpu_prepare() saves the old state. + * This is done within the context of the old process. * * - switch_fpu_finish() restores the new state as * necessary. */ -typedef struct { int preload; } fpu_switch_t; - -static inline fpu_switch_t -switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) +static inline void +switch_fpu_prepare(struct fpu *old_fpu, int cpu) { - fpu_switch_t fpu; - - /* - * If the task has used the math, pre-load the FPU on xsave processors - * or if the past 5 consecutive context-switches used math. - */ - fpu.preload = static_cpu_has(X86_FEATURE_FPU) && - new_fpu->fpstate_active && - (use_eager_fpu() || new_fpu->counter > 5); - if (old_fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(old_fpu)) old_fpu->last_cpu = -1; @@ -608,29 +570,8 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) /* But leave fpu_fpregs_owner_ctx! */ old_fpu->fpregs_active = 0; trace_x86_fpu_regs_deactivated(old_fpu); - - /* Don't change CR0.TS if we just switch! 
*/ - if (fpu.preload) { - new_fpu->counter++; - __fpregs_activate(new_fpu); - trace_x86_fpu_regs_activated(new_fpu); - prefetch(&new_fpu->state); - } else { - __fpregs_deactivate_hw(); - } - } else { - old_fpu->counter = 0; + } else old_fpu->last_cpu = -1; - if (fpu.preload) { - new_fpu->counter++; - if (fpu_want_lazy_restore(new_fpu, cpu)) - fpu.preload = 0; - else - prefetch(&new_fpu->state); - fpregs_activate(new_fpu); - } - } - return fpu; } /* @@ -638,15 +579,19 @@ switch_fpu_prepare(struct fpu *old_fpu, struct fpu *new_fpu, int cpu) */ /* - * By the time this gets called, we've already cleared CR0.TS and - * given the process the FPU if we are going to preload the FPU - * state - all we need to do is to conditionally restore the register - * state itself. + * Set up the userspace FPU context for the new task, if the task + * has used the FPU. */ -static inline void switch_fpu_finish(struct fpu *new_fpu, fpu_switch_t fpu_switch) +static inline void switch_fpu_finish(struct fpu *new_fpu, int cpu) { - if (fpu_switch.preload) - copy_kernel_to_fpregs(&new_fpu->state); + bool preload = static_cpu_has(X86_FEATURE_FPU) && + new_fpu->fpstate_active; + + if (preload) { + if (!fpregs_state_valid(new_fpu, cpu)) + copy_kernel_to_fpregs(&new_fpu->state); + fpregs_activate(new_fpu); + } } /* diff --git a/arch/x86/include/asm/fpu/types.h b/arch/x86/include/asm/fpu/types.h index 48df486b02f9..3c80f5b9c09d 100644 --- a/arch/x86/include/asm/fpu/types.h +++ b/arch/x86/include/asm/fpu/types.h @@ -322,17 +322,6 @@ struct fpu { unsigned char fpregs_active; /* - * @counter: - * - * This counter contains the number of consecutive context switches - * during which the FPU stays used. If this is over a threshold, the - * lazy FPU restore logic becomes eager, to save the trap overhead. 
- * This is an unsigned char so that after 256 iterations the counter - * wraps and the context switch behavior turns lazy again; this is to - * deal with bursty apps that only use the FPU for a short time: - */ - unsigned char counter; - /* * @state: * * In-memory copy of all FPU registers that we save/restore @@ -340,29 +329,6 @@ struct fpu { * the registers in the FPU are more recent than this state * copy. If the task context-switches away then they get * saved here and represent the FPU state. - * - * After context switches there may be a (short) time period - * during which the in-FPU hardware registers are unchanged - * and still perfectly match this state, if the tasks - * scheduled afterwards are not using the FPU. - * - * This is the 'lazy restore' window of optimization, which - * we track though 'fpu_fpregs_owner_ctx' and 'fpu->last_cpu'. - * - * We detect whether a subsequent task uses the FPU via setting - * CR0::TS to 1, which causes any FPU use to raise a #NM fault. - * - * During this window, if the task gets scheduled again, we - * might be able to skip having to do a restore from this - * memory buffer to the hardware registers - at the cost of - * incurring the overhead of #NM fault traps. - * - * Note that on modern CPUs that support the XSAVEOPT (or other - * optimized XSAVE instructions), we don't use #NM traps anymore, - * as the hardware can track whether FPU registers need saving - * or not. On such CPUs we activate the non-lazy ('eagerfpu') - * logic, which unconditionally saves/restores all FPU state - * across context switches. (if FPU state exists.) 
*/ union fpregs_state state; /* diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index 430bacf73074..1b2799e0699a 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -21,21 +21,16 @@ /* Supervisor features */ #define XFEATURE_MASK_SUPERVISOR (XFEATURE_MASK_PT) -/* Supported features which support lazy state saving */ -#define XFEATURE_MASK_LAZY (XFEATURE_MASK_FP | \ +/* All currently supported features */ +#define XCNTXT_MASK (XFEATURE_MASK_FP | \ XFEATURE_MASK_SSE | \ XFEATURE_MASK_YMM | \ XFEATURE_MASK_OPMASK | \ XFEATURE_MASK_ZMM_Hi256 | \ - XFEATURE_MASK_Hi16_ZMM) - -/* Supported features which require eager state saving */ -#define XFEATURE_MASK_EAGER (XFEATURE_MASK_BNDREGS | \ - XFEATURE_MASK_BNDCSR | \ - XFEATURE_MASK_PKRU) - -/* All currently supported features */ -#define XCNTXT_MASK (XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER) + XFEATURE_MASK_Hi16_ZMM | \ + XFEATURE_MASK_PKRU | \ + XFEATURE_MASK_BNDREGS | \ + XFEATURE_MASK_BNDCSR) #ifdef CONFIG_X86_64 #define REX_PREFIX "0x48, " diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h deleted file mode 100644 index c5d1785373ed..000000000000 --- a/arch/x86/include/asm/idle.h +++ /dev/null @@ -1,22 +0,0 @@ -#ifndef _ASM_X86_IDLE_H -#define _ASM_X86_IDLE_H - -#define IDLE_START 1 -#define IDLE_END 2 - -struct notifier_block; -void idle_notifier_register(struct notifier_block *n); -void idle_notifier_unregister(struct notifier_block *n); - -#ifdef CONFIG_X86_64 -void enter_idle(void); -void exit_idle(void); -#else /* !CONFIG_X86_64 */ -static inline void enter_idle(void) { } -static inline void exit_idle(void) { } -static inline void __exit_idle(void) { } -#endif /* CONFIG_X86_64 */ - -void amd_e400_remove_cpu(int cpu); - -#endif /* _ASM_X86_IDLE_H */ diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index d31881188431..29a594a3b82a 100644 --- a/arch/x86/include/asm/kdebug.h +++ 
b/arch/x86/include/asm/kdebug.h @@ -21,7 +21,6 @@ enum die_val { DIE_NMIUNKNOWN, }; -extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index bdde80731f49..7892530cbacf 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -191,6 +191,8 @@ enum { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_GUEST_FINAL_BIT 32 +#define PFERR_GUEST_PAGE_BIT 33 #define PFERR_PRESENT_MASK (1U << PFERR_PRESENT_BIT) #define PFERR_WRITE_MASK (1U << PFERR_WRITE_BIT) @@ -198,6 +200,13 @@ enum { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) +#define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) + +#define PFERR_NESTED_GUEST_PAGE (PFERR_GUEST_PAGE_MASK | \ + PFERR_USER_MASK | \ + PFERR_WRITE_MASK | \ + PFERR_PRESENT_MASK) /* apic attention bits */ #define KVM_APIC_CHECK_VAPIC 0 @@ -1062,6 +1071,7 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3); +bool pdptrs_changed(struct kvm_vcpu *vcpu); int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, const void *val, int bytes); @@ -1124,7 +1134,8 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr); struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port); +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int 
kvm_emulate_halt(struct kvm_vcpu *vcpu); int kvm_vcpu_halt(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); @@ -1203,7 +1214,7 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu); int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u64 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); void kvm_mmu_new_cr3(struct kvm_vcpu *vcpu); @@ -1358,7 +1369,8 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu); extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu); +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err); int kvm_is_in_guest(void); diff --git a/arch/x86/include/asm/kvm_page_track.h b/arch/x86/include/asm/kvm_page_track.h index c2b8d24a235c..d74747b031ec 100644 --- a/arch/x86/include/asm/kvm_page_track.h +++ b/arch/x86/include/asm/kvm_page_track.h @@ -29,9 +29,20 @@ struct kvm_page_track_notifier_node { * @gpa: the physical address written by guest. * @new: the data was written to the address. * @bytes: the written length. 
+ * @node: this node */ void (*track_write)(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, - int bytes); + int bytes, struct kvm_page_track_notifier_node *node); + /* + * It is called when memory slot is being moved or removed + * users can drop write-protection for the pages in that memory slot + * + * @kvm: the kvm where memory slot being moved or removed + * @slot: the memory slot being moved or removed + * @node: this node + */ + void (*track_flush_slot)(struct kvm *kvm, struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node); }; void kvm_page_track_init(struct kvm *kvm); @@ -58,4 +69,5 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, struct kvm_page_track_notifier_node *n); void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, int bytes); +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot); #endif diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index ef01fef3eebc..6c119cfae218 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -9,7 +9,6 @@ #define LHCALL_FLUSH_TLB 5 #define LHCALL_LOAD_IDT_ENTRY 6 #define LHCALL_SET_STACK 7 -#define LHCALL_TS 8 #define LHCALL_SET_CLOCKEVENT 9 #define LHCALL_HALT 10 #define LHCALL_SET_PMD 13 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 9bd7ff5ffbcc..5132f2a6c0a2 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -252,8 +252,10 @@ static inline void cmci_recheck(void) {} #ifdef CONFIG_X86_MCE_AMD void mce_amd_feature_init(struct cpuinfo_x86 *c); +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr); #else static inline void mce_amd_feature_init(struct cpuinfo_x86 *c) { } +static inline int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) { return -EINVAL; }; #endif int mce_available(struct cpuinfo_x86 *c); @@ -293,9 +295,7 @@ void do_machine_check(struct pt_regs *, long); /* * 
Threshold handler */ - extern void (*mce_threshold_vector)(void); -extern void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); /* Deferred error interrupt handler */ extern void (*deferred_error_int_vector)(void); @@ -356,27 +356,31 @@ enum smca_bank_types { N_SMCA_BANK_TYPES }; -struct smca_bank_name { - const char *name; /* Short name for sysfs */ - const char *long_name; /* Long name for pretty-printing */ -}; - -extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES]; +#define HWID_MCATYPE(hwid, mcatype) (((hwid) << 16) | (mcatype)) -#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype) - -struct smca_hwid_mcatype { +struct smca_hwid { unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ u32 hwid_mcatype; /* (hwid,mcatype) tuple */ u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. */ }; -struct smca_bank_info { - struct smca_hwid_mcatype *type; - u32 type_instance; +struct smca_bank { + struct smca_hwid *hwid; + /* Instance ID */ + u32 id; }; -extern struct smca_bank_info smca_banks[MAX_NR_BANKS]; +extern struct smca_bank smca_banks[MAX_NR_BANKS]; + +extern const char *smca_get_long_name(enum smca_bank_types t); + +extern int mce_threshold_create_device(unsigned int cpu); +extern int mce_threshold_remove_device(unsigned int cpu); + +#else + +static inline int mce_threshold_create_device(unsigned int cpu) { return 0; }; +static inline int mce_threshold_remove_device(unsigned int cpu) { return 0; }; #endif diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index da0d81fa0b54..38711df3bcb5 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -20,6 +20,15 @@ do { \ (u32)((u64)(val)), \ (u32)((u64)(val) >> 32)) +struct ucode_patch { + struct list_head plist; + void *data; /* Intel uses only this one */ + u32 patch_id; + u16 equiv_cpu; +}; + +extern struct list_head microcode_cache; + struct cpu_signature { unsigned int sig; 
unsigned int pf; @@ -55,12 +64,7 @@ struct ucode_cpu_info { void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; - -#ifdef CONFIG_MICROCODE -int __init microcode_init(void); -#else -static inline int __init microcode_init(void) { return 0; }; -#endif +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa); #ifdef CONFIG_MICROCODE_INTEL extern struct microcode_ops * __init init_intel_microcode(void); @@ -131,11 +135,13 @@ static inline unsigned int x86_cpuid_family(void) } #ifdef CONFIG_MICROCODE +int __init microcode_init(void); extern void __init load_ucode_bsp(void); extern void load_ucode_ap(void); void reload_early_microcode(void); extern bool get_builtin_firmware(struct cpio_data *cd, const char *name); #else +static inline int __init microcode_init(void) { return 0; }; static inline void __init load_ucode_bsp(void) { } static inline void load_ucode_ap(void) { } static inline void reload_early_microcode(void) { } diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index 15eb75484cc0..3e3e20be829a 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -40,38 +40,18 @@ struct microcode_amd { unsigned int mpb[0]; }; -static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, - unsigned int sig) -{ - int i = 0; - - if (!equiv_cpu_table) - return 0; - - while (equiv_cpu_table[i].installed_cpu != 0) { - if (sig == equiv_cpu_table[i].installed_cpu) - return equiv_cpu_table[i].equiv_cpu; - - i++; - } - return 0; -} - -extern int __apply_microcode_amd(struct microcode_amd *mc_amd); -extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); - #define PATCH_MAX_SIZE PAGE_SIZE #ifdef CONFIG_MICROCODE_AMD extern void __init load_ucode_amd_bsp(unsigned int family); -extern void load_ucode_amd_ap(void); -extern int __init save_microcode_in_initrd_amd(void); +extern void 
load_ucode_amd_ap(unsigned int family); +extern int __init save_microcode_in_initrd_amd(unsigned int family); void reload_ucode_amd(void); #else static inline void __init load_ucode_amd_bsp(unsigned int family) {} -static inline void load_ucode_amd_ap(void) {} -static inline int __init save_microcode_in_initrd_amd(void) { return -EINVAL; } +static inline void load_ucode_amd_ap(unsigned int family) {} +static inline int __init +save_microcode_in_initrd_amd(unsigned int family) { return -EINVAL; } void reload_ucode_amd(void) {} #endif diff --git a/arch/x86/include/asm/microcode_intel.h b/arch/x86/include/asm/microcode_intel.h index 5e69154c9f07..195becc6f780 100644 --- a/arch/x86/include/asm/microcode_intel.h +++ b/arch/x86/include/asm/microcode_intel.h @@ -52,10 +52,6 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -extern int has_newer_microcode(void *mc, unsigned int csig, int cpf, int rev); -extern int microcode_sanity_check(void *mc, int print_err); -extern int find_matching_signature(void *mc, unsigned int csig, int cpf); - #ifdef CONFIG_MICROCODE_INTEL extern void __init load_ucode_intel_bsp(void); extern void load_ucode_intel_ap(void); diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 8e0a9fe86de4..306c7e12af55 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -47,7 +47,7 @@ struct ldt_struct { * allocations, but it's not worth trying to optimize. */ struct desc_struct *entries; - int size; + unsigned int size; }; /* diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 78f3760ca1f2..710273c617b8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -37,6 +37,10 @@ #define EFER_FFXSR (1<<_EFER_FFXSR) /* Intel MSRs. 
Some also available on other CPUs */ + +#define MSR_PPIN_CTL 0x0000004e +#define MSR_PPIN 0x0000004f + #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index b5fee97813cd..db0b90c3b03e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -70,14 +70,14 @@ extern struct tracepoint __tracepoint_read_msr; extern struct tracepoint __tracepoint_write_msr; extern struct tracepoint __tracepoint_rdpmc; #define msr_tracepoint_active(t) static_key_false(&(t).key) -extern void do_trace_write_msr(unsigned msr, u64 val, int failed); -extern void do_trace_read_msr(unsigned msr, u64 val, int failed); -extern void do_trace_rdpmc(unsigned msr, u64 val, int failed); +extern void do_trace_write_msr(unsigned int msr, u64 val, int failed); +extern void do_trace_read_msr(unsigned int msr, u64 val, int failed); +extern void do_trace_rdpmc(unsigned int msr, u64 val, int failed); #else #define msr_tracepoint_active(t) false -static inline void do_trace_write_msr(unsigned msr, u64 val, int failed) {} -static inline void do_trace_read_msr(unsigned msr, u64 val, int failed) {} -static inline void do_trace_rdpmc(unsigned msr, u64 val, int failed) {} +static inline void do_trace_write_msr(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_read_msr(unsigned int msr, u64 val, int failed) {} +static inline void do_trace_rdpmc(unsigned int msr, u64 val, int failed) {} #endif static inline unsigned long long native_read_msr(unsigned int msr) @@ -115,22 +115,36 @@ static inline unsigned long long native_read_msr_safe(unsigned int msr, } /* Can be uninlined because referenced by paravirt */ -notrace static inline void native_write_msr(unsigned int msr, - unsigned low, unsigned high) +static inline void notrace +__native_write_msr_notrace(unsigned int msr, u32 low, u32 high) { asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_HANDLE(1b, 2b, 
ex_handler_wrmsr_unsafe) : : "c" (msr), "a"(low), "d" (high) : "memory"); +} + +/* Can be uninlined because referenced by paravirt */ +static inline void notrace +native_write_msr(unsigned int msr, u32 low, u32 high) +{ + __native_write_msr_notrace(msr, low, high); if (msr_tracepoint_active(__tracepoint_write_msr)) do_trace_write_msr(msr, ((u64)high << 32 | low), 0); } +static inline void +wrmsr_notrace(unsigned int msr, u32 low, u32 high) +{ + __native_write_msr_notrace(msr, low, high); +} + /* Can be uninlined because referenced by paravirt */ -notrace static inline int native_write_msr_safe(unsigned int msr, - unsigned low, unsigned high) +static inline int notrace +native_write_msr_safe(unsigned int msr, u32 low, u32 high) { int err; + asm volatile("2: wrmsr ; xor %[err],%[err]\n" "1:\n\t" ".section .fixup,\"ax\"\n\t" @@ -223,7 +237,7 @@ do { \ (void)((high) = (u32)(__val >> 32)); \ } while (0) -static inline void wrmsr(unsigned msr, unsigned low, unsigned high) +static inline void wrmsr(unsigned int msr, u32 low, u32 high) { native_write_msr(msr, low, high); } @@ -231,13 +245,13 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -static inline void wrmsrl(unsigned msr, u64 val) +static inline void wrmsrl(unsigned int msr, u64 val) { native_write_msr(msr, (u32)(val & 0xffffffffULL), (u32)(val >> 32)); } /* wrmsr with exception handling */ -static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) +static inline int wrmsr_safe(unsigned int msr, u32 low, u32 high) { return native_write_msr_safe(msr, low, high); } @@ -252,7 +266,7 @@ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) __err; \ }) -static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) +static inline int rdmsrl_safe(unsigned int msr, unsigned long long *p) { int err; @@ -325,12 +339,12 @@ static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) static inline void 
rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { - rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); + rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); } static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { - wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); + wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); } static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) diff --git a/arch/x86/include/asm/mutex.h b/arch/x86/include/asm/mutex.h deleted file mode 100644 index 7d3a48275394..000000000000 --- a/arch/x86/include/asm/mutex.h +++ /dev/null @@ -1,5 +0,0 @@ -#ifdef CONFIG_X86_32 -# include <asm/mutex_32.h> -#else -# include <asm/mutex_64.h> -#endif diff --git a/arch/x86/include/asm/mutex_32.h b/arch/x86/include/asm/mutex_32.h deleted file mode 100644 index e9355a84fc67..000000000000 --- a/arch/x86/include/asm/mutex_32.h +++ /dev/null @@ -1,110 +0,0 @@ -/* - * Assembly implementation of the mutex fastpath, based on atomic - * decrement/increment. - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#ifndef _ASM_X86_MUTEX_32_H -#define _ASM_X86_MUTEX_32_H - -#include <asm/alternative.h> - -/** - * __mutex_fastpath_lock - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * @fn: function to call if the original value was not 1 - * - * Change the count from 1 to a value lower than 1, and call <fn> if it - * wasn't 1 originally. This function MUST leave the value lower than 1 - * even when the "1" assertion wasn't true. 
- */ -#define __mutex_fastpath_lock(count, fail_fn) \ -do { \ - unsigned int dummy; \ - \ - typecheck(atomic_t *, count); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " decl (%%eax)\n" \ - " jns 1f \n" \ - " call " #fail_fn "\n" \ - "1:\n" \ - : "=a" (dummy) \ - : "a" (count) \ - : "memory", "ecx", "edx"); \ -} while (0) - - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(atomic_dec_return(count) < 0)) - return -1; - else - return 0; -} - -/** - * __mutex_fastpath_unlock - try to promote the mutex from 0 to 1 - * @count: pointer of type atomic_t - * @fail_fn: function to call if the original value was not 0 - * - * try to promote the mutex from 0 to 1. if it wasn't 0, call <fail_fn>. - * In the failure case, this function is allowed to either set the value - * to 1, or to set it to a value lower than 1. - * - * If the implementation sets it to a value of lower than 1, the - * __mutex_slowpath_needs_to_unlock() macro needs to return 1, it needs - * to return 0 otherwise. 
- */ -#define __mutex_fastpath_unlock(count, fail_fn) \ -do { \ - unsigned int dummy; \ - \ - typecheck(atomic_t *, count); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " incl (%%eax)\n" \ - " jg 1f\n" \ - " call " #fail_fn "\n" \ - "1:\n" \ - : "=a" (dummy) \ - : "a" (count) \ - : "memory", "ecx", "edx"); \ -} while (0) - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to a value lower than 1, and return 0 (failure) - * if it wasn't 1 originally, or return 1 (success) otherwise. This function - * MUST leave the value lower than 1 even when the "1" assertion wasn't true. - * Additionally, if the value was < 0 originally, this function must not leave - * it to 0 on failure. - */ -static inline int __mutex_fastpath_trylock(atomic_t *count, - int (*fail_fn)(atomic_t *)) -{ - /* cmpxchg because it never induces a false contention state. */ - if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) - return 1; - - return 0; -} - -#endif /* _ASM_X86_MUTEX_32_H */ diff --git a/arch/x86/include/asm/mutex_64.h b/arch/x86/include/asm/mutex_64.h deleted file mode 100644 index d9850758464e..000000000000 --- a/arch/x86/include/asm/mutex_64.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Assembly implementation of the mutex fastpath, based on atomic - * decrement/increment. - * - * started by Ingo Molnar: - * - * Copyright (C) 2004, 2005, 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - */ -#ifndef _ASM_X86_MUTEX_64_H -#define _ASM_X86_MUTEX_64_H - -/** - * __mutex_fastpath_lock - decrement and call function if negative - * @v: pointer of type atomic_t - * @fail_fn: function to call if the result is negative - * - * Atomically decrements @v and calls <fail_fn> if the result is negative. 
- */ -#ifdef CC_HAVE_ASM_GOTO -static inline void __mutex_fastpath_lock(atomic_t *v, - void (*fail_fn)(atomic_t *)) -{ - asm_volatile_goto(LOCK_PREFIX " decl %0\n" - " jns %l[exit]\n" - : : "m" (v->counter) - : "memory", "cc" - : exit); - fail_fn(v); -exit: - return; -} -#else -#define __mutex_fastpath_lock(v, fail_fn) \ -do { \ - unsigned long dummy; \ - \ - typecheck(atomic_t *, v); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " decl (%%rdi)\n" \ - " jns 1f \n" \ - " call " #fail_fn "\n" \ - "1:" \ - : "=D" (dummy) \ - : "D" (v) \ - : "rax", "rsi", "rdx", "rcx", \ - "r8", "r9", "r10", "r11", "memory"); \ -} while (0) -#endif - -/** - * __mutex_fastpath_lock_retval - try to take the lock by moving the count - * from 1 to a 0 value - * @count: pointer of type atomic_t - * - * Change the count from 1 to a value lower than 1. This function returns 0 - * if the fastpath succeeds, or -1 otherwise. - */ -static inline int __mutex_fastpath_lock_retval(atomic_t *count) -{ - if (unlikely(atomic_dec_return(count) < 0)) - return -1; - else - return 0; -} - -/** - * __mutex_fastpath_unlock - increment and call function if nonpositive - * @v: pointer of type atomic_t - * @fail_fn: function to call if the result is nonpositive - * - * Atomically increments @v and calls <fail_fn> if the result is nonpositive. 
- */ -#ifdef CC_HAVE_ASM_GOTO -static inline void __mutex_fastpath_unlock(atomic_t *v, - void (*fail_fn)(atomic_t *)) -{ - asm_volatile_goto(LOCK_PREFIX " incl %0\n" - " jg %l[exit]\n" - : : "m" (v->counter) - : "memory", "cc" - : exit); - fail_fn(v); -exit: - return; -} -#else -#define __mutex_fastpath_unlock(v, fail_fn) \ -do { \ - unsigned long dummy; \ - \ - typecheck(atomic_t *, v); \ - typecheck_fn(void (*)(atomic_t *), fail_fn); \ - \ - asm volatile(LOCK_PREFIX " incl (%%rdi)\n" \ - " jg 1f\n" \ - " call " #fail_fn "\n" \ - "1:" \ - : "=D" (dummy) \ - : "D" (v) \ - : "rax", "rsi", "rdx", "rcx", \ - "r8", "r9", "r10", "r11", "memory"); \ -} while (0) -#endif - -#define __mutex_slowpath_needs_to_unlock() 1 - -/** - * __mutex_fastpath_trylock - try to acquire the mutex, without waiting - * - * @count: pointer of type atomic_t - * @fail_fn: fallback function - * - * Change the count from 1 to 0 and return 1 (success), or return 0 (failure) - * if it wasn't 1 originally. [the fallback function is never used on - * x86_64, because all x86_64 CPUs have a CMPXCHG instruction.] 
- */ -static inline int __mutex_fastpath_trylock(atomic_t *count, - int (*fail_fn)(atomic_t *)) -{ - if (likely(atomic_read(count) == 1 && atomic_cmpxchg(count, 1, 0) == 1)) - return 1; - - return 0; -} - -#endif /* _ASM_X86_MUTEX_64_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index ce932812f142..1eea6ca40694 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -41,11 +41,6 @@ static inline void set_debugreg(unsigned long val, int reg) PVOP_VCALL2(pv_cpu_ops.set_debugreg, reg, val); } -static inline void clts(void) -{ - PVOP_VCALL0(pv_cpu_ops.clts); -} - static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr0); @@ -678,6 +673,11 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } +static __always_inline bool pv_vcpu_is_preempted(int cpu) +{ + return PVOP_CALLEE1(bool, pv_lock_ops.vcpu_is_preempted, cpu); +} + #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 0f400c0e4979..bb2de45a60f2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -103,8 +103,6 @@ struct pv_cpu_ops { unsigned long (*get_debugreg)(int regno); void (*set_debugreg)(int regno, unsigned long value); - void (*clts)(void); - unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); @@ -310,6 +308,8 @@ struct pv_lock_ops { void (*wait)(u8 *ptr, u8 val); void (*kick)(int cpu); + + struct paravirt_callee_save vcpu_is_preempted; }; /* This contains all the paravirt structures: we get a convenient @@ -508,6 +508,18 @@ int paravirt_disable_iospace(void); #define PVOP_TEST_NULL(op) ((void)op) #endif +#define PVOP_RETMASK(rettype) \ + ({ unsigned long __mask = ~0UL; \ + switch (sizeof(rettype)) { \ + case 1: __mask = 0xffUL; break; \ + case 2: __mask = 0xffffUL; break; \ + case 4: __mask = 0xffffffffUL; 
break; \ + default: break; \ + } \ + __mask; \ + }) + + #define ____PVOP_CALL(rettype, op, clbr, call_clbr, extra_clbr, \ pre, post, ...) \ ({ \ @@ -535,7 +547,7 @@ int paravirt_disable_iospace(void); paravirt_clobber(clbr), \ ##__VA_ARGS__ \ : "memory", "cc" extra_clbr); \ - __ret = (rettype)__eax; \ + __ret = (rettype)(__eax & PVOP_RETMASK(rettype)); \ } \ __ret; \ }) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 84f58de08c2b..9fa03604b2b3 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -507,17 +507,6 @@ do { \ #endif -/* This is not atomic against other CPUs -- CPU preemption needs to be off */ -#define x86_test_and_clear_bit_percpu(bit, var) \ -({ \ - bool old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\t" \ - CC_SET(c) \ - : CC_OUT(c) (old__), "+m" (var) \ - : "dIr" (bit)); \ - old__; \ -}) - static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 17f218645701..ec1f3c651150 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -24,7 +24,13 @@ static __always_inline int preempt_count(void) static __always_inline void preempt_count_set(int pc) { - raw_cpu_write_4(__preempt_count, pc); + int old, new; + + do { + old = raw_cpu_read_4(__preempt_count); + new = (old & PREEMPT_NEED_RESCHED) | + (pc & ~PREEMPT_NEED_RESCHED); + } while (raw_cpu_cmpxchg_4(__preempt_count, old, new) != old); } /* diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 984a7bf17f6a..6aa741fbe1df 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -137,6 +137,17 @@ struct cpuinfo_x86 { u32 microcode; }; +struct cpuid_regs { + u32 eax, ebx, ecx, edx; +}; + +enum cpuid_regs_idx { + CPUID_EAX = 0, + CPUID_EBX, + CPUID_ECX, + CPUID_EDX, +}; + #define X86_VENDOR_INTEL 0 #define 
X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 @@ -178,6 +189,9 @@ extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); +extern u32 get_scattered_cpuid_leaf(unsigned int level, + unsigned int sub_leaf, + enum cpuid_regs_idx reg); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern void init_amd_cacheinfo(struct cpuinfo_x86 *c); @@ -588,8 +602,6 @@ static __always_inline void cpu_relax(void) rep_nop(); } -#define cpu_relax_lowlatency() cpu_relax() - /* Stop speculative execution and prefetching of modified code. */ static inline void sync_core(void) { @@ -621,10 +633,9 @@ static inline void sync_core(void) } extern void select_idle_routine(const struct cpuinfo_x86 *c); -extern void init_amd_e400_c1e_mask(void); +extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; -extern bool amd_e400_c1e_detected; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index eaba08076030..c343ab52579f 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -32,6 +32,12 @@ static inline void queued_spin_unlock(struct qspinlock *lock) { pv_queued_spin_unlock(lock); } + +#define vcpu_is_preempted vcpu_is_preempted +static inline bool vcpu_is_preempted(int cpu) +{ + return pv_vcpu_is_preempted(cpu); +} #else static inline void queued_spin_unlock(struct qspinlock *lock) { diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 19a2224f9e16..12af3e35edfa 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -6,11 +6,6 @@ #include <asm/nops.h> -static inline void native_clts(void) -{ - asm volatile("clts"); -} - /* * Volatile isn't enough to prevent the 
compiler from reordering the * read/write functions for the control registers and messing everything up. @@ -208,16 +203,8 @@ static inline void load_gs_index(unsigned selector) #endif -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - #endif/* CONFIG_PARAVIRT */ -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - static inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 37f2e0b377ad..a3269c897ec5 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -30,8 +30,7 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); -void stack_type_str(enum stack_type type, const char **begin, - const char **end); +const char *stack_type_name(enum stack_type type); static inline bool on_stack(struct stack_info *info, void *addr, size_t len) { @@ -43,8 +42,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) addr + len > begin && addr + len <= end); } -extern int kstack_depth_to_print; - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else @@ -86,9 +83,6 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl); -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl); - extern unsigned int code_bytes; /* The form of the top of the frame on the stack */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index cf75871d2f81..6358a85e2270 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -146,4 +146,36 @@ struct pci_bus; int x86_pci_root_bus_node(int bus); void 
x86_pci_root_bus_resources(int bus, struct list_head *resources); +extern bool x86_topology_update; + +#ifdef CONFIG_SCHED_MC_PRIO +#include <asm/percpu.h> + +DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); +extern unsigned int __read_mostly sysctl_sched_itmt_enabled; + +/* Interface to set priority of a cpu */ +void sched_set_itmt_core_prio(int prio, int core_cpu); + +/* Interface to notify scheduler that system supports ITMT */ +int sched_set_itmt_support(void); + +/* Interface to notify scheduler that system revokes ITMT support */ +void sched_clear_itmt_support(void); + +#else /* CONFIG_SCHED_MC_PRIO */ + +#define sysctl_sched_itmt_enabled 0 +static inline void sched_set_itmt_core_prio(int prio, int core_cpu) +{ +} +static inline int sched_set_itmt_support(void) +{ + return 0; +} +static inline void sched_clear_itmt_support(void) +{ +} +#endif /* CONFIG_SCHED_MC_PRIO */ + #endif /* _ASM_X86_TOPOLOGY_H */ diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h index 9217ab1f5bf6..342e59789fcd 100644 --- a/arch/x86/include/asm/trace/fpu.h +++ b/arch/x86/include/asm/trace/fpu.h @@ -14,7 +14,6 @@ DECLARE_EVENT_CLASS(x86_fpu, __field(struct fpu *, fpu) __field(bool, fpregs_active) __field(bool, fpstate_active) - __field(int, counter) __field(u64, xfeatures) __field(u64, xcomp_bv) ), @@ -23,17 +22,15 @@ DECLARE_EVENT_CLASS(x86_fpu, __entry->fpu = fpu; __entry->fpregs_active = fpu->fpregs_active; __entry->fpstate_active = fpu->fpstate_active; - __entry->counter = fpu->counter; if (boot_cpu_has(X86_FEATURE_OSXSAVE)) { __entry->xfeatures = fpu->state.xsave.header.xfeatures; __entry->xcomp_bv = fpu->state.xsave.header.xcomp_bv; } ), - TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d counter: %d xfeatures: %llx xcomp_bv: %llx", + TP_printk("x86/fpu: %p fpregs_active: %d fpstate_active: %d xfeatures: %llx xcomp_bv: %llx", __entry->fpu, __entry->fpregs_active, __entry->fpstate_active, - __entry->counter, __entry->xfeatures, 
__entry->xcomp_bv ) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index faf3687f1035..ea148313570f 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -68,6 +68,12 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP +# define WARN_ON_IN_IRQ() WARN_ON_ONCE(!in_task()) +#else +# define WARN_ON_IN_IRQ() +#endif + /** * access_ok: - Checks if a user space pointer is valid * @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that @@ -88,8 +94,11 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. */ -#define access_ok(type, addr, size) \ - likely(!__range_not_ok(addr, size, user_addr_max())) +#define access_ok(type, addr, size) \ +({ \ + WARN_ON_IN_IRQ(); \ + likely(!__range_not_ok(addr, size, user_addr_max())); \ +}) /* * These are the main single-value transfer routines. They automatically diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 46de9ac4b990..c5a7f3a930dd 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -13,6 +13,7 @@ struct unwind_state { int graph_idx; #ifdef CONFIG_FRAME_POINTER unsigned long *bp; + struct pt_regs *regs; #else unsigned long *sp; #endif @@ -47,7 +48,15 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) if (unwind_done(state)) return NULL; - return state->bp + 1; + return state->regs ? 
&state->regs->ip : state->bp + 1; +} + +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->regs; } #else /* !CONFIG_FRAME_POINTER */ @@ -58,6 +67,11 @@ unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) return NULL; } +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +{ + return NULL; +} + #endif /* CONFIG_FRAME_POINTER */ #endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index e728699db774..3a01996db58f 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -89,8 +89,13 @@ static inline unsigned int __getcpu(void) * works on all CPUs. This is volatile so that it orders * correctly wrt barrier() and to keep gcc from cleverly * hoisting it out of the calling function. + * + * If RDPID is available, use it. */ - asm volatile ("lsl %1,%0" : "=r" (p) : "r" (__PER_CPU_SEG)); + alternative_io ("lsl %[p],%[seg]", + ".byte 0xf3,0x0f,0xc7,0xf8", /* RDPID %eax/rax */ + X86_FEATURE_RDPID, + [p] "=a" (p), [seg] "r" (__PER_CPU_SEG)); return p; } diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a002b07a7099..2b5b2d4b924e 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -25,6 +25,7 @@ #define VMX_H +#include <linux/bitops.h> #include <linux/types.h> #include <uapi/asm/vmx.h> @@ -60,6 +61,7 @@ */ #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 #define SECONDARY_EXEC_ENABLE_EPT 0x00000002 +#define SECONDARY_EXEC_DESC 0x00000004 #define SECONDARY_EXEC_RDTSCP 0x00000008 #define SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE 0x00000010 #define SECONDARY_EXEC_ENABLE_VPID 0x00000020 @@ -110,6 +112,36 @@ #define VMX_MISC_SAVE_EFER_LMA 0x00000020 #define VMX_MISC_ACTIVITY_HLT 0x00000040 +static inline u32 vmx_basic_vmcs_revision_id(u64 vmx_basic) +{ + return vmx_basic & GENMASK_ULL(30, 0); +} + +static inline u32 
vmx_basic_vmcs_size(u64 vmx_basic) +{ + return (vmx_basic & GENMASK_ULL(44, 32)) >> 32; +} + +static inline int vmx_misc_preemption_timer_rate(u64 vmx_misc) +{ + return vmx_misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK; +} + +static inline int vmx_misc_cr3_count(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(24, 16)) >> 16; +} + +static inline int vmx_misc_max_msr(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(27, 25)) >> 25; +} + +static inline int vmx_misc_mseg_revid(u64 vmx_misc) +{ + return (vmx_misc & GENMASK_ULL(63, 32)) >> 32; +} + /* VMCS Encodings */ enum vmcs_field { VIRTUAL_PROCESSOR_ID = 0x00000000, @@ -399,10 +431,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -419,8 +452,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 diff --git a/arch/x86/include/uapi/asm/bootparam.h b/arch/x86/include/uapi/asm/bootparam.h index c18ce67495fa..b10bf319ed20 100644 --- a/arch/x86/include/uapi/asm/bootparam.h +++ b/arch/x86/include/uapi/asm/bootparam.h @@ -7,6 +7,7 @@ #define SETUP_DTB 2 #define SETUP_PCI 3 #define SETUP_EFI 4 +#define SETUP_APPLE_PROPERTIES 5 /* ram_size flags */ #define RAMDISK_IMAGE_START_MASK 0x07FF diff --git a/arch/x86/include/uapi/asm/kvm_para.h 
b/arch/x86/include/uapi/asm/kvm_para.h index 94dc8ca434e0..1421a6585126 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -45,7 +45,9 @@ struct kvm_steal_time { __u64 steal; __u32 version; __u32 flags; - __u32 pad[12]; + __u8 preempted; + __u8 u8_pad[3]; + __u32 pad[11]; }; #define KVM_STEAL_ALIGNMENT_BITS 5 diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 69a6e07e3149..eb6247a7009b 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -28,6 +28,7 @@ struct mce { __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ + __u64 ppin; /* Protected Processor Inventory Number */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/include/uapi/asm/prctl.h b/arch/x86/include/uapi/asm/prctl.h index ae135de547f5..835aa51c7f6e 100644 --- a/arch/x86/include/uapi/asm/prctl.h +++ b/arch/x86/include/uapi/asm/prctl.h @@ -6,10 +6,8 @@ #define ARCH_GET_FS 0x1003 #define ARCH_GET_GS 0x1004 -#ifdef CONFIG_CHECKPOINT_RESTORE -# define ARCH_MAP_VDSO_X32 0x2001 -# define ARCH_MAP_VDSO_32 0x2002 -# define ARCH_MAP_VDSO_64 0x2003 -#endif +#define ARCH_MAP_VDSO_X32 0x2001 +#define ARCH_MAP_VDSO_32 0x2002 +#define ARCH_MAP_VDSO_64 0x2003 #endif /* _ASM_X86_PRCTL_H */ diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index 37fee272618f..14458658e988 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -65,6 +65,8 @@ #define EXIT_REASON_TPR_BELOW_THRESHOLD 43 #define EXIT_REASON_APIC_ACCESS 44 #define EXIT_REASON_EOI_INDUCED 45 +#define EXIT_REASON_GDTR_IDTR 46 +#define EXIT_REASON_LDTR_TR 47 #define EXIT_REASON_EPT_VIOLATION 48 #define EXIT_REASON_EPT_MISCONFIG 49 #define EXIT_REASON_INVEPT 50 @@ -113,6 +115,8 @@ { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ { 
EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_GDTR_IDTR, "GDTR_IDTR" }, \ + { EXIT_REASON_LDTR_TR, "LDTR_TR" }, \ { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ { EXIT_REASON_INVEPT, "INVEPT" }, \ @@ -129,6 +133,7 @@ { EXIT_REASON_XRSTORS, "XRSTORS" } #define VMX_ABORT_SAVE_GUEST_MSR_FAIL 1 +#define VMX_ABORT_LOAD_HOST_PDPTE_FAIL 2 #define VMX_ABORT_LOAD_HOST_MSR_FAIL 4 #endif /* _UAPIVMX_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 79076d75bdbf..05110c1097ae 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -123,6 +123,7 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o ifdef CONFIG_FRAME_POINTER obj-y += unwind_frame.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index c280df6b2aa2..ea3046e0b0cf 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -24,9 +24,6 @@ int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) struct acpi_hest_ia_corrected *cmc; struct acpi_hest_ia_error_bank *mc_bank; - if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) - return 0; - cmc = (struct acpi_hest_ia_corrected *)hest_hdr; if (!cmc->enabled) return 0; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 931ced8ca345..4764fa56924d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -76,6 +76,7 @@ int acpi_fix_pin2_polarity __initdata; static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; #endif +#ifdef CONFIG_X86_IO_APIC /* * Locks related to IOAPIC hotplug * Hotplug side: @@ -88,6 +89,7 @@ static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; * ->ioapic_lock */ static DEFINE_MUTEX(acpi_ioapic_lock); +#endif /* 
-------------------------------------------------------------------------- Boot-time Configuration diff --git a/arch/x86/kernel/acpi/wakeup_64.S b/arch/x86/kernel/acpi/wakeup_64.S index 169963f471bb..50b8ed0317a3 100644 --- a/arch/x86/kernel/acpi/wakeup_64.S +++ b/arch/x86/kernel/acpi/wakeup_64.S @@ -109,6 +109,15 @@ ENTRY(do_suspend_lowlevel) movq pt_regs_r14(%rax), %r14 movq pt_regs_r15(%rax), %r15 +#ifdef CONFIG_KASAN + /* + * The suspend path may have poisoned some areas deeper in the stack, + * which we now need to unpoison. + */ + movq %rsp, %rdi + call kasan_unpoison_task_stack_below +#endif + xorl %eax, %eax addq $8, %rsp FRAME_END diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4fdf6230d93c..458da8509b75 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,8 +13,20 @@ #include <linux/spinlock.h> #include <asm/amd_nb.h> +#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450 +#define PCI_DEVICE_ID_AMD_17H_DF_F3 0x1463 +#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464 + +/* Protect the PCI config register pairs used for SMN and DF indirect access. 
*/ +static DEFINE_MUTEX(smn_mutex); + static u32 *flush_words; +static const struct pci_device_id amd_root_ids[] = { + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) }, + {} +}; + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -24,9 +36,10 @@ const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, {} }; -EXPORT_SYMBOL(amd_nb_misc_ids); +EXPORT_SYMBOL_GPL(amd_nb_misc_ids); static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) }, @@ -34,6 +47,7 @@ static const struct pci_device_id amd_nb_link_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_M60H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, {} }; @@ -44,8 +58,25 @@ const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = { { } }; -struct amd_northbridge_info amd_northbridges; -EXPORT_SYMBOL(amd_northbridges); +static struct amd_northbridge_info amd_northbridges; + +u16 amd_nb_num(void) +{ + return amd_northbridges.num; +} +EXPORT_SYMBOL_GPL(amd_nb_num); + +bool amd_nb_has_feature(unsigned int feature) +{ + return ((amd_northbridges.flags & feature) == feature); +} +EXPORT_SYMBOL_GPL(amd_nb_has_feature); + +struct amd_northbridge *node_to_amd_nb(int node) +{ + return (node < amd_northbridges.num) ? 
&amd_northbridges.nb[node] : NULL; +} +EXPORT_SYMBOL_GPL(node_to_amd_nb); static struct pci_dev *next_northbridge(struct pci_dev *dev, const struct pci_device_id *ids) @@ -58,13 +89,106 @@ static struct pci_dev *next_northbridge(struct pci_dev *dev, return dev; } +static int __amd_smn_rw(u16 node, u32 address, u32 *value, bool write) +{ + struct pci_dev *root; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + root = node_to_amd_nb(node)->root; + if (!root) + goto out; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(root, 0x60, address); + if (err) { + pr_warn("Error programming SMN address 0x%x.\n", address); + goto out_unlock; + } + + err = (write ? pci_write_config_dword(root, 0x64, *value) + : pci_read_config_dword(root, 0x64, value)); + if (err) + pr_warn("Error %s SMN address 0x%x.\n", + (write ? "writing to" : "reading from"), address); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} + +int amd_smn_read(u16 node, u32 address, u32 *value) +{ + return __amd_smn_rw(node, address, value, false); +} +EXPORT_SYMBOL_GPL(amd_smn_read); + +int amd_smn_write(u16 node, u32 address, u32 value) +{ + return __amd_smn_rw(node, address, &value, true); +} +EXPORT_SYMBOL_GPL(amd_smn_write); + +/* + * Data Fabric Indirect Access uses FICAA/FICAD. + * + * Fabric Indirect Configuration Access Address (FICAA): Constructed based + * on the device's Instance Id and the PCI function and register offset of + * the desired register. + * + * Fabric Indirect Configuration Access Data (FICAD): There are FICAD LO + * and FICAD HI registers but so far we only need the LO register. 
+ */ +int amd_df_indirect_read(u16 node, u8 func, u16 reg, u8 instance_id, u32 *lo) +{ + struct pci_dev *F4; + u32 ficaa; + int err = -ENODEV; + + if (node >= amd_northbridges.num) + goto out; + + F4 = node_to_amd_nb(node)->link; + if (!F4) + goto out; + + ficaa = 1; + ficaa |= reg & 0x3FC; + ficaa |= (func & 0x7) << 11; + ficaa |= instance_id << 16; + + mutex_lock(&smn_mutex); + + err = pci_write_config_dword(F4, 0x5C, ficaa); + if (err) { + pr_warn("Error writing DF Indirect FICAA, FICAA=0x%x\n", ficaa); + goto out_unlock; + } + + err = pci_read_config_dword(F4, 0x98, lo); + if (err) + pr_warn("Error reading DF Indirect FICAD LO, FICAA=0x%x.\n", ficaa); + +out_unlock: + mutex_unlock(&smn_mutex); + +out: + return err; +} +EXPORT_SYMBOL_GPL(amd_df_indirect_read); + int amd_cache_northbridges(void) { u16 i = 0; struct amd_northbridge *nb; - struct pci_dev *misc, *link; + struct pci_dev *root, *misc, *link; - if (amd_nb_num()) + if (amd_northbridges.num) return 0; misc = NULL; @@ -74,15 +198,17 @@ int amd_cache_northbridges(void) if (!i) return -ENODEV; - nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL); + nb = kcalloc(i, sizeof(struct amd_northbridge), GFP_KERNEL); if (!nb) return -ENOMEM; amd_northbridges.nb = nb; amd_northbridges.num = i; - link = misc = NULL; - for (i = 0; i != amd_nb_num(); i++) { + link = misc = root = NULL; + for (i = 0; i != amd_northbridges.num; i++) { + node_to_amd_nb(i)->root = root = + next_northbridge(root, amd_root_ids); node_to_amd_nb(i)->misc = misc = next_northbridge(misc, amd_nb_misc_ids); node_to_amd_nb(i)->link = link = @@ -139,13 +265,13 @@ struct resource *amd_get_mmconfig_range(struct resource *res) { u32 address; u64 base, msr; - unsigned segn_busn_bits; + unsigned int segn_busn_bits; if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) return NULL; /* assume all cpus from fam10h have mmconfig */ - if (boot_cpu_data.x86 < 0x10) + if (boot_cpu_data.x86 < 0x10) return NULL; address = MSR_FAM10H_MMIO_CONF_BASE; @@ -226,14 
+352,14 @@ static void amd_cache_gart(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL); + flush_words = kmalloc_array(amd_northbridges.num, sizeof(u32), GFP_KERNEL); if (!flush_words) { amd_northbridges.flags &= ~AMD_NB_GART; pr_notice("Cannot initialize GART flush words, GART support disabled\n"); return; } - for (i = 0; i != amd_nb_num(); i++) + for (i = 0; i != amd_northbridges.num; i++) pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c, &flush_words[i]); } @@ -246,18 +372,20 @@ void amd_flush_garts(void) if (!amd_nb_has_feature(AMD_NB_GART)) return; - /* Avoid races between AGP and IOMMU. In theory it's not needed - but I'm not sure if the hardware won't lose flush requests - when another is pending. This whole thing is so expensive anyways - that it doesn't matter to serialize more. -AK */ + /* + * Avoid races between AGP and IOMMU. In theory it's not needed + * but I'm not sure if the hardware won't lose flush requests + * when another is pending. This whole thing is so expensive anyways + * that it doesn't matter to serialize more. 
-AK + */ spin_lock_irqsave(&gart_lock, flags); flushed = 0; - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c, flush_words[i] | 1); flushed++; } - for (i = 0; i < amd_nb_num(); i++) { + for (i = 0; i < amd_northbridges.num; i++) { u32 w; /* Make sure the hardware actually executed the flush*/ for (;;) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 88c657b057e2..bb47e5eacd44 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -48,7 +48,6 @@ #include <asm/io_apic.h> #include <asm/desc.h> #include <asm/hpet.h> -#include <asm/idle.h> #include <asm/mtrr.h> #include <asm/time.h> #include <asm/smp.h> @@ -894,11 +893,13 @@ void __init setup_boot_APIC_clock(void) /* Setup the lapic or request the broadcast */ setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } void setup_secondary_APIC_clock(void) { setup_APIC_timer(); + amd_e400_c1e_apic_setup(); } /* @@ -2263,6 +2264,7 @@ void __init apic_set_eoi_write(void (*eoi_write)(u32 reg, u32 v)) for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { /* Should happen once for each apic */ WARN_ON((*drv)->eoi_write == eoi_write); + (*drv)->native_eoi_write = (*drv)->eoi_write; (*drv)->eoi_write = eoi_write; } } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 48e6d84f173e..945e512a112a 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -48,7 +48,6 @@ #include <linux/bootmem.h> #include <asm/irqdomain.h> -#include <asm/idle.h> #include <asm/io.h> #include <asm/smp.h> #include <asm/cpu.h> diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index aeef53ce93e1..35690a168cf7 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -815,9 +815,9 @@ static __init void map_mmioh_high_uv3(int index, int min_pnode, int max_pnode) l = li; } addr1 = (base << 
shift) + - f * (unsigned long)(1 << m_io); + f * (1ULL << m_io); addr2 = (base << shift) + - (l + 1) * (unsigned long)(1 << m_io); + (l + 1) * (1ULL << m_io); pr_info("UV: %s[%03d..%03d] NASID 0x%04x ADDR 0x%016lx - 0x%016lx\n", id, fi, li, lnasid, addr1, addr2); if (max_io < l) diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 51287cd90bf6..643818a7688b 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -906,14 +906,14 @@ static int apm_cpu_idle(struct cpuidle_device *dev, static int use_apm_idle; /* = 0 */ static unsigned int last_jiffies; /* = 0 */ static unsigned int last_stime; /* = 0 */ - cputime_t stime; + cputime_t stime, utime; int apm_idle_done = 0; unsigned int jiffies_since_last_check = jiffies - last_jiffies; unsigned int bucket; recalc: - task_cputime(current, NULL, &stime); + task_cputime(current, &utime, &stime); if (jiffies_since_last_check > IDLE_CALC_LIMIT) { use_apm_idle = 0; } else if (jiffies_since_last_check > idle_period) { diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4a8697f7d4ef..33b63670bf09 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -20,13 +20,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o +obj-y += bugs.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_X86_32) += bugs.o -obj-$(CONFIG_X86_64) += bugs_64.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 1e81a37c034e..71cae73a5076 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -20,6 +20,10 @@ #include "cpu.h" +static const int amd_erratum_383[]; +static const int amd_erratum_400[]; +static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* * nodes_per_socket: Stores the 
number of nodes per socket. * Refer to Fam15h Models 00-0fh BKDG - CPUID Fn8000_001E_ECX @@ -314,11 +318,30 @@ static void amd_get_topology(struct cpuinfo_x86 *c) smp_num_siblings = ((ebx >> 8) & 3) + 1; c->x86_max_cores /= smp_num_siblings; c->cpu_core_id = ebx & 0xff; + + /* + * We may have multiple LLCs if L3 caches exist, so check if we + * have an L3 cache by looking at the L3 cache CPUID leaf. + */ + if (cpuid_edx(0x80000006)) { + if (c->x86 == 0x17) { + /* + * LLC is at the core complex level. + * Core complex id is ApicId[3]. + */ + per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; + } else { + /* LLC is at the node level. */ + per_cpu(cpu_llc_id, cpu) = node_id; + } + } } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) { u64 value; rdmsrl(MSR_FAM10H_NODE_ID, value); node_id = value & 7; + + per_cpu(cpu_llc_id, cpu) = node_id; } else return; @@ -329,9 +352,6 @@ static void amd_get_topology(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_AMD_DCM); cus_per_node = c->x86_max_cores / nodes_per_socket; - /* store NodeID, use llc_shared_map to store sibling info */ - per_cpu(cpu_llc_id, cpu) = node_id; - /* core id has to be in the [0 .. cores_per_node - 1] range */ c->cpu_core_id %= cus_per_node; } @@ -356,15 +376,6 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); - - /* - * Fix percpu cpu_llc_id here as LLC topology is different - * for Fam17h systems. 
- */ - if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) - return; - - per_cpu(cpu_llc_id, cpu) = c->apicid >> 3; #endif } @@ -585,11 +596,16 @@ static void early_init_amd(struct cpuinfo_x86 *c) /* F16h erratum 793, CVE-2013-6885 */ if (c->x86 == 0x16 && c->x86_model <= 0xf) msr_set_bit(MSR_AMD64_LS_CFG, 15); -} -static const int amd_erratum_383[]; -static const int amd_erratum_400[]; -static bool cpu_has_amd_erratum(struct cpuinfo_x86 *cpu, const int *erratum); + /* + * Check whether the machine is affected by erratum 400. This is + * used to select the proper idle routine and to enable the check + * whether the machine is affected in arch_post_acpi_init(), which + * sets the X86_BUG_AMD_APIC_C1E bug depending on the MSR check. + */ + if (cpu_has_amd_erratum(c, amd_erratum_400)) + set_cpu_bug(c, X86_BUG_AMD_E400); +} static void init_amd_k8(struct cpuinfo_x86 *c) { @@ -770,9 +786,6 @@ static void init_amd(struct cpuinfo_x86 *c) if (c->x86 > 0x11) set_cpu_cap(c, X86_FEATURE_ARAT); - if (cpu_has_amd_erratum(c, amd_erratum_400)) - set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* 3DNow or LM implies PREFETCHW */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bd17db15a2c1..a44ef52184df 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,15 +16,19 @@ #include <asm/msr.h> #include <asm/paravirt.h> #include <asm/alternative.h> +#include <asm/pgtable.h> +#include <asm/cacheflush.h> void __init check_bugs(void) { identify_boot_cpu(); -#ifndef CONFIG_SMP - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + +#ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. 
* @@ -40,4 +44,18 @@ void __init check_bugs(void) alternative_instructions(); fpu__init_check_bugs(); +#else /* CONFIG_X86_64 */ + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c deleted file mode 100644 index a972ac4c7e7d..000000000000 --- a/arch/x86/kernel/cpu/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include <linux/kernel.h> -#include <linux/init.h> -#include <asm/alternative.h> -#include <asm/bugs.h> -#include <asm/processor.h> -#include <asm/mtrr.h> -#include <asm/cacheflush.h> - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. 
- */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc9e980c68ec..729f92ba8224 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1172,7 +1172,6 @@ void enable_sep_cpu(void) void __init identify_boot_cpu(void) { identify_cpu(&boot_cpu_data); - init_amd_e400_c1e_mask(); #ifdef CONFIG_X86_32 sysenter_setup(); enable_sep_cpu(); @@ -1190,51 +1189,6 @@ void identify_secondary_cpu(struct cpuinfo_x86 *c) mtrr_ap_init(); } -struct msr_range { - unsigned min; - unsigned max; -}; - -static const struct msr_range msr_range_array[] = { - { 0x00000000, 0x00000418}, - { 0xc0000000, 0xc000040b}, - { 0xc0010000, 0xc0010142}, - { 0xc0011000, 0xc001103b}, -}; - -static void __print_cpu_msr(void) -{ - unsigned index_min, index_max; - unsigned index; - u64 val; - int i; - - for (i = 0; i < ARRAY_SIZE(msr_range_array); i++) { - index_min = msr_range_array[i].min; - index_max = msr_range_array[i].max; - - for (index = index_min; index < index_max; index++) { - if (rdmsrl_safe(index, &val)) - continue; - pr_info(" MSR%08x: %016llx\n", index, val); - } - } -} - -static int show_msr; - -static __init int setup_show_msr(char *arg) -{ - int num; - - get_option(&arg, &num); - - if (num > 0) - show_msr = num; - return 1; -} -__setup("show_msr=", setup_show_msr); - static __init int setup_noclflush(char *arg) { setup_clear_cpu_cap(X86_FEATURE_CLFLUSH); @@ -1268,14 +1222,6 @@ void print_cpu_info(struct cpuinfo_x86 *c) pr_cont(", stepping: 0x%x)\n", c->x86_mask); else pr_cont(")\n"); - - print_cpu_msr(c); -} - -void print_cpu_msr(struct cpuinfo_x86 *c) -{ - if (c->cpu_index < show_msr) - __print_cpu_msr(); } static __init int setup_disablecpuid(char *arg) @@ -1490,11 +1436,8 @@ void cpu_init(void) */ cr4_init_shadow(); - /* - * Load microcode on this cpu if a valid microcode is available. - * This is early microcode loading procedure. 
- */ - load_ucode_ap(); + if (cpu) + load_ucode_ap(); t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index de6626c18e42..be6337156502 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 631356c8cca4..c7efbcfbeda6 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -311,7 +311,7 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e *msg = s->msg; s->covered = 1; if (s->sev >= MCE_UC_SEVERITY && ctx == IN_KERNEL) { - if (panic_on_oops || tolerant < 1) + if (tolerant < 1) return MCE_PANIC_SEVERITY; } return s->sev; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a7fdf453d895..00ef43233e03 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -43,6 +43,7 @@ #include <linux/export.h> #include <linux/jump_label.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/traps.h> #include <asm/tlbflush.h> @@ -135,6 +136,9 @@ void mce_setup(struct mce *m) m->socketid = cpu_data(m->extcpu).phys_proc_id; m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); + + if (this_cpu_has(X86_FEATURE_INTEL_PPIN)) + rdmsrl(MSR_PPIN, m->ppin); } DEFINE_PER_CPU(struct mce, injectm); @@ -207,8 +211,12 @@ EXPORT_SYMBOL_GPL(mce_inject_log); static struct notifier_block mce_srao_nb; +static atomic_t num_notifiers; + void mce_register_decode_chain(struct notifier_block *nb) { + atomic_inc(&num_notifiers); + /* Ensure SRAO notifier has the 
highest priority in the decode chain. */ if (nb != &mce_srao_nb && nb->priority == INT_MAX) nb->priority -= 1; @@ -219,6 +227,8 @@ EXPORT_SYMBOL_GPL(mce_register_decode_chain); void mce_unregister_decode_chain(struct notifier_block *nb) { + atomic_dec(&num_notifiers); + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); } EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); @@ -270,17 +280,17 @@ struct mca_msr_regs msr_ops = { .misc = misc_reg }; -static void print_mce(struct mce *m) +static void __print_mce(struct mce *m) { - int ret = 0; - - pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", - m->extcpu, m->mcgstatus, m->bank, m->status); + pr_emerg(HW_ERR "CPU %d: Machine Check%s: %Lx Bank %d: %016Lx\n", + m->extcpu, + (m->mcgstatus & MCG_STATUS_MCIP ? " Exception" : ""), + m->mcgstatus, m->bank, m->status); if (m->ip) { pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", - m->cs, m->ip); + m->cs, m->ip); if (m->cs == __KERNEL_CS) print_symbol("{%s}", m->ip); @@ -308,6 +318,13 @@ static void print_mce(struct mce *m) pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, cpu_data(m->extcpu).microcode); +} + +static void print_mce(struct mce *m) +{ + int ret = 0; + + __print_mce(m); /* * Print out human-readable details about the MCE error, @@ -499,7 +516,7 @@ int mce_available(struct cpuinfo_x86 *c) static void mce_schedule_work(void) { - if (!mce_gen_pool_empty() && keventd_up()) + if (!mce_gen_pool_empty()) schedule_work(&mce_work); } @@ -569,6 +586,32 @@ static struct notifier_block mce_srao_nb = { .priority = INT_MAX, }; +static int mce_default_notifier(struct notifier_block *nb, unsigned long val, + void *data) +{ + struct mce *m = (struct mce *)data; + + if (!m) + return NOTIFY_DONE; + + /* + * Run the default notifier if we have only the SRAO + * notifier and us registered. 
+ */ + if (atomic_read(&num_notifiers) > 2) + return NOTIFY_DONE; + + __print_mce(m); + + return NOTIFY_DONE; +} + +static struct notifier_block mce_default_nb = { + .notifier_call = mce_default_notifier, + /* lowest prio, we want it to run last. */ + .priority = 0, +}; + /* * Read ADDR and MISC registers. */ @@ -667,6 +710,15 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_gather_info(&m, NULL); + /* + * m.tsc was set in mce_setup(). Clear it if not requested. + * + * FIXME: Propagate @flags to mce_gather_info/mce_setup() to avoid + * that dance. + */ + if (!(flags & MCP_TIMESTAMP)) + m.tsc = 0; + for (i = 0; i < mca_cfg.banks; i++) { if (!mce_banks[i].ctl || !test_bit(i, *b)) continue; @@ -674,14 +726,12 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m.misc = 0; m.addr = 0; m.bank = i; - m.tsc = 0; barrier(); m.status = mce_rdmsrl(msr_ops.status(i)); if (!(m.status & MCI_STATUS_VAL)) continue; - /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. 
@@ -696,9 +746,6 @@ bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) mce_read_aux(&m, i); - if (!(flags & MCP_TIMESTAMP)) - m.tsc = 0; - severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) @@ -1355,7 +1402,7 @@ static void mce_timer_fn(unsigned long data) iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); + machine_check_poll(0, this_cpu_ptr(&mce_poll_banks)); if (mce_intel_cmci_poll()) { iv = mce_adjust_timer(iv); @@ -1745,6 +1792,14 @@ static void mce_start_timer(unsigned int cpu, struct timer_list *t) add_timer_on(t, cpu); } +static void __mcheck_cpu_setup_timer(void) +{ + struct timer_list *t = this_cpu_ptr(&mce_timer); + unsigned int cpu = smp_processor_id(); + + setup_pinned_timer(t, mce_timer_fn, cpu); +} + static void __mcheck_cpu_init_timer(void) { struct timer_list *t = this_cpu_ptr(&mce_timer); @@ -1796,7 +1851,7 @@ void mcheck_cpu_init(struct cpuinfo_x86 *c) __mcheck_cpu_init_generic(); __mcheck_cpu_init_vendor(c); __mcheck_cpu_init_clear_banks(); - __mcheck_cpu_init_timer(); + __mcheck_cpu_setup_timer(); } /* @@ -2138,6 +2193,7 @@ int __init mcheck_init(void) { mcheck_intel_therm_init(); mce_register_decode_chain(&mce_srao_nb); + mce_register_decode_chain(&mce_default_nb); mcheck_vendor_init_severity(); INIT_WORK(&mce_work, mce_process_work); @@ -2255,8 +2311,6 @@ static struct bus_type mce_subsys = { DEFINE_PER_CPU(struct device *, mce_device); -void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); - static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); @@ -2409,6 +2463,10 @@ static int mce_device_create(unsigned int cpu) if (!mce_available(&boot_cpu_data)) return -EIO; + dev = per_cpu(mce_device, cpu); + if (dev) + return 0; + dev = kzalloc(sizeof *dev, GFP_KERNEL); if (!dev) 
return -ENOMEM; @@ -2468,28 +2526,25 @@ static void mce_device_remove(unsigned int cpu) } /* Make sure there are no machine checks on offlined CPUs. */ -static void mce_disable_cpu(void *h) +static void mce_disable_cpu(void) { - unsigned long action = *(unsigned long *)h; - if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_clear(); vendor_disable_error_reporting(); } -static void mce_reenable_cpu(void *h) +static void mce_reenable_cpu(void) { - unsigned long action = *(unsigned long *)h; int i; if (!mce_available(raw_cpu_ptr(&cpu_info))) return; - if (!(action & CPU_TASKS_FROZEN)) + if (!cpuhp_tasks_frozen) cmci_reenable(); for (i = 0; i < mca_cfg.banks; i++) { struct mce_bank *b = &mce_banks[i]; @@ -2499,45 +2554,43 @@ static void mce_reenable_cpu(void *h) } } -/* Get notified when a cpu comes on/off. Be hotplug friendly. */ -static int -mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) +static int mce_cpu_dead(unsigned int cpu) +{ + mce_intel_hcpu_update(cpu); + + /* intentionally ignoring frozen here */ + if (!cpuhp_tasks_frozen) + cmci_rediscover(); + return 0; +} + +static int mce_cpu_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; struct timer_list *t = &per_cpu(mce_timer, cpu); + int ret; - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_ONLINE: - mce_device_create(cpu); - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - break; - case CPU_DEAD: - if (threshold_cpu_callback) - threshold_cpu_callback(action, cpu); - mce_device_remove(cpu); - mce_intel_hcpu_update(cpu); + mce_device_create(cpu); - /* intentionally ignoring frozen here */ - if (!(action & CPU_TASKS_FROZEN)) - cmci_rediscover(); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, mce_disable_cpu, &action, 1); - del_timer_sync(t); - break; - case CPU_DOWN_FAILED: - smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); - 
mce_start_timer(cpu, t); - break; + ret = mce_threshold_create_device(cpu); + if (ret) { + mce_device_remove(cpu); + return ret; } - - return NOTIFY_OK; + mce_reenable_cpu(); + mce_start_timer(cpu, t); + return 0; } -static struct notifier_block mce_cpu_notifier = { - .notifier_call = mce_cpu_callback, -}; +static int mce_cpu_pre_down(unsigned int cpu) +{ + struct timer_list *t = &per_cpu(mce_timer, cpu); + + mce_disable_cpu(); + del_timer_sync(t); + mce_threshold_remove_device(cpu); + mce_device_remove(cpu); + return 0; +} static __init void mce_init_banks(void) { @@ -2559,8 +2612,8 @@ static __init void mce_init_banks(void) static __init int mcheck_init_device(void) { + enum cpuhp_state hp_online; int err; - int i = 0; if (!mce_available(&boot_cpu_data)) { err = -EIO; @@ -2578,23 +2631,16 @@ static __init int mcheck_init_device(void) if (err) goto err_out_mem; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = mce_device_create(i); - if (err) { - /* - * Register notifier anyway (and do not unreg it) so - * that we don't leave undeleted timers, see notifier - * callback above. - */ - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); - goto err_device_create; - } - } + err = cpuhp_setup_state(CPUHP_X86_MCE_DEAD, "x86/mce:dead", NULL, + mce_cpu_dead); + if (err) + goto err_out_mem; - __register_hotcpu_notifier(&mce_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/mce:online", + mce_cpu_online, mce_cpu_pre_down); + if (err < 0) + goto err_out_online; + hp_online = err; register_syscore_ops(&mce_syscore_ops); @@ -2607,16 +2653,10 @@ static __init int mcheck_init_device(void) err_register: unregister_syscore_ops(&mce_syscore_ops); + cpuhp_remove_state(hp_online); -err_device_create: - /* - * We didn't keep track of which devices were created above, but - * even if we had, the set of online cpus might have changed. 
- * Play safe and remove for every possible cpu, since - * mce_device_remove() will do the right thing. - */ - for_each_possible_cpu(i) - mce_device_remove(i); +err_out_online: + cpuhp_remove_state(CPUHP_X86_MCE_DEAD); err_out_mem: free_cpumask_var(mce_device_initialized); diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 9b5403462936..ffacfdcacb85 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -24,7 +24,6 @@ #include <asm/amd_nb.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -55,6 +54,8 @@ /* Threshold LVT offset is at MSR0xC0000410[15:12] */ #define SMCA_THR_LVT_OFF 0xF000 +static bool thresholding_en; + static const char * const th_names[] = { "load_store", "insn_fetch", @@ -69,7 +70,12 @@ static const char * const smca_umc_block_names[] = { "misc_umc" }; -struct smca_bank_name smca_bank_names[] = { +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +static struct smca_bank_name smca_names[] = { [SMCA_LS] = { "load_store", "Load Store Unit" }, [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, @@ -84,9 +90,25 @@ struct smca_bank_name smca_bank_names[] = { [SMCA_PSP] = { "psp", "Platform Security Processor" }, [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(smca_bank_names); -static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { +const char *smca_get_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].name; +} + +const char *smca_get_long_name(enum smca_bank_types t) +{ + if (t >= N_SMCA_BANK_TYPES) + return NULL; + + return smca_names[t].long_name; +} +EXPORT_SYMBOL_GPL(smca_get_long_name); + +static struct smca_hwid smca_hwid_mcatypes[] = { /* { bank_type, hwid_mcatype, 
xec_bitmap } */ /* ZN Core (HWID=0xB0) MCA types */ @@ -116,7 +138,7 @@ static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -struct smca_bank_info smca_banks[MAX_NR_BANKS]; +struct smca_bank smca_banks[MAX_NR_BANKS]; EXPORT_SYMBOL_GPL(smca_banks); /* @@ -142,35 +164,34 @@ static void default_deferred_error_interrupt(void) } void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; -/* - * CPU Initialization - */ - static void get_smca_bank_info(unsigned int bank) { unsigned int i, hwid_mcatype, cpu = smp_processor_id(); - struct smca_hwid_mcatype *type; - u32 high, instanceId; - u16 hwid, mcatype; + struct smca_hwid *s_hwid; + u32 high, instance_id; /* Collect bank_info using CPU 0 for now. */ if (cpu) return; - if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instance_id, &high)) { pr_warn("Failed to read MCA_IPID for bank %d\n", bank); return; } - hwid = high & MCI_IPID_HWID; - mcatype = (high & MCI_IPID_MCATYPE) >> 16; - hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + hwid_mcatype = HWID_MCATYPE(high & MCI_IPID_HWID, + (high & MCI_IPID_MCATYPE) >> 16); for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { - type = &smca_hwid_mcatypes[i]; - if (hwid_mcatype == type->hwid_mcatype) { - smca_banks[bank].type = type; - smca_banks[bank].type_instance = instanceId; + s_hwid = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == s_hwid->hwid_mcatype) { + + WARN(smca_banks[bank].hwid, + "Bank %s already initialized!\n", + smca_get_name(s_hwid->bank_type)); + + smca_banks[bank].hwid = s_hwid; + smca_banks[bank].id = instance_id; break; } } @@ -533,6 +554,206 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) deferred_error_interrupt_enable(c); } +int umc_normaddr_to_sysaddr(u64 norm_addr, u16 nid, u8 umc, u64 *sys_addr) +{ + u64 dram_base_addr, dram_limit_addr, dram_hole_base; + /* We start from the normalized address 
*/ + u64 ret_addr = norm_addr; + + u32 tmp; + + u8 die_id_shift, die_id_mask, socket_id_shift, socket_id_mask; + u8 intlv_num_dies, intlv_num_chan, intlv_num_sockets; + u8 intlv_addr_sel, intlv_addr_bit; + u8 num_intlv_bits, hashed_bit; + u8 lgcy_mmio_hole_en, base = 0; + u8 cs_mask, cs_id = 0; + bool hash_enabled = false; + + /* Read D18F0x1B4 (DramOffset), check if base 1 is used. */ + if (amd_df_indirect_read(nid, 0, 0x1B4, umc, &tmp)) + goto out_err; + + /* Remove HiAddrOffset from normalized address, if enabled: */ + if (tmp & BIT(0)) { + u64 hi_addr_offset = (tmp & GENMASK_ULL(31, 20)) << 8; + + if (norm_addr >= hi_addr_offset) { + ret_addr -= hi_addr_offset; + base = 1; + } + } + + /* Read D18F0x110 (DramBaseAddress). */ + if (amd_df_indirect_read(nid, 0, 0x110 + (8 * base), umc, &tmp)) + goto out_err; + + /* Check if address range is valid. */ + if (!(tmp & BIT(0))) { + pr_err("%s: Invalid DramBaseAddress range: 0x%x.\n", + __func__, tmp); + goto out_err; + } + + lgcy_mmio_hole_en = tmp & BIT(1); + intlv_num_chan = (tmp >> 4) & 0xF; + intlv_addr_sel = (tmp >> 8) & 0x7; + dram_base_addr = (tmp & GENMASK_ULL(31, 12)) << 16; + + /* {0, 1, 2, 3} map to address bits {8, 9, 10, 11} respectively */ + if (intlv_addr_sel > 3) { + pr_err("%s: Invalid interleave address select %d.\n", + __func__, intlv_addr_sel); + goto out_err; + } + + /* Read D18F0x114 (DramLimitAddress). 
*/ + if (amd_df_indirect_read(nid, 0, 0x114 + (8 * base), umc, &tmp)) + goto out_err; + + intlv_num_sockets = (tmp >> 8) & 0x1; + intlv_num_dies = (tmp >> 10) & 0x3; + dram_limit_addr = ((tmp & GENMASK_ULL(31, 12)) << 16) | GENMASK_ULL(27, 0); + + intlv_addr_bit = intlv_addr_sel + 8; + + /* Re-use intlv_num_chan by setting it equal to log2(#channels) */ + switch (intlv_num_chan) { + case 0: intlv_num_chan = 0; break; + case 1: intlv_num_chan = 1; break; + case 3: intlv_num_chan = 2; break; + case 5: intlv_num_chan = 3; break; + case 7: intlv_num_chan = 4; break; + + case 8: intlv_num_chan = 1; + hash_enabled = true; + break; + default: + pr_err("%s: Invalid number of interleaved channels %d.\n", + __func__, intlv_num_chan); + goto out_err; + } + + num_intlv_bits = intlv_num_chan; + + if (intlv_num_dies > 2) { + pr_err("%s: Invalid number of interleaved nodes/dies %d.\n", + __func__, intlv_num_dies); + goto out_err; + } + + num_intlv_bits += intlv_num_dies; + + /* Add a bit if sockets are interleaved. */ + num_intlv_bits += intlv_num_sockets; + + /* Assert num_intlv_bits <= 4 */ + if (num_intlv_bits > 4) { + pr_err("%s: Invalid interleave bits %d.\n", + __func__, num_intlv_bits); + goto out_err; + } + + if (num_intlv_bits > 0) { + u64 temp_addr_x, temp_addr_i, temp_addr_y; + u8 die_id_bit, sock_id_bit, cs_fabric_id; + + /* + * Read FabricBlockInstanceInformation3_CS[BlockFabricID]. + * This is the fabric id for this coherent slave. Use + * umc/channel# as instance id of the coherent slave + * for FICAA. + */ + if (amd_df_indirect_read(nid, 0, 0x50, umc, &tmp)) + goto out_err; + + cs_fabric_id = (tmp >> 8) & 0xFF; + die_id_bit = 0; + + /* If interleaved over more than 1 channel: */ + if (intlv_num_chan) { + die_id_bit = intlv_num_chan; + cs_mask = (1 << die_id_bit) - 1; + cs_id = cs_fabric_id & cs_mask; + } + + sock_id_bit = die_id_bit; + + /* Read D18F1x208 (SystemFabricIdMask). 
*/ + if (intlv_num_dies || intlv_num_sockets) + if (amd_df_indirect_read(nid, 1, 0x208, umc, &tmp)) + goto out_err; + + /* If interleaved over more than 1 die. */ + if (intlv_num_dies) { + sock_id_bit = die_id_bit + intlv_num_dies; + die_id_shift = (tmp >> 24) & 0xF; + die_id_mask = (tmp >> 8) & 0xFF; + + cs_id |= ((cs_fabric_id & die_id_mask) >> die_id_shift) << die_id_bit; + } + + /* If interleaved over more than 1 socket. */ + if (intlv_num_sockets) { + socket_id_shift = (tmp >> 28) & 0xF; + socket_id_mask = (tmp >> 16) & 0xFF; + + cs_id |= ((cs_fabric_id & socket_id_mask) >> socket_id_shift) << sock_id_bit; + } + + /* + * The pre-interleaved address consists of XXXXXXIIIYYYYY + * where III is the ID for this CS, and XXXXXXYYYYY are the + * address bits from the post-interleaved address. + * "num_intlv_bits" has been calculated to tell us how many "I" + * bits there are. "intlv_addr_bit" tells us how many "Y" bits + * there are (where "I" starts). + */ + temp_addr_y = ret_addr & GENMASK_ULL(intlv_addr_bit-1, 0); + temp_addr_i = (cs_id << intlv_addr_bit); + temp_addr_x = (ret_addr & GENMASK_ULL(63, intlv_addr_bit)) << num_intlv_bits; + ret_addr = temp_addr_x | temp_addr_i | temp_addr_y; + } + + /* Add dram base address */ + ret_addr += dram_base_addr; + + /* If legacy MMIO hole enabled */ + if (lgcy_mmio_hole_en) { + if (amd_df_indirect_read(nid, 0, 0x104, umc, &tmp)) + goto out_err; + + dram_hole_base = tmp & GENMASK(31, 24); + if (ret_addr >= dram_hole_base) + ret_addr += (BIT_ULL(32) - dram_hole_base); + } + + if (hash_enabled) { + /* Save some parentheses and grab ls-bit at the end. */ + hashed_bit = (ret_addr >> 12) ^ + (ret_addr >> 18) ^ + (ret_addr >> 21) ^ + (ret_addr >> 30) ^ + cs_id; + + hashed_bit &= BIT(0); + + if (hashed_bit != ((ret_addr >> intlv_addr_bit) & BIT(0))) + ret_addr ^= BIT(intlv_addr_bit); + } + + /* Is calculated system address is above DRAM limit address? 
*/ + if (ret_addr > dram_limit_addr) + goto out_err; + + *sys_addr = ret_addr; + return 0; + +out_err: + return -EINVAL; +} +EXPORT_SYMBOL_GPL(umc_normaddr_to_sysaddr); + static void __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) { @@ -645,6 +866,7 @@ static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; unsigned int bank, block, cpu = smp_processor_id(); + struct thresh_restart tr; /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { @@ -681,6 +903,11 @@ static void amd_threshold_interrupt(void) log: __log_error(bank, false, true, ((u64)high << 32) | low); + + /* Reset threshold block after logging error. */ + memset(&tr, 0, sizeof(tr)); + tr.b = &per_cpu(threshold_banks, cpu)[bank]->blocks[block]; + threshold_restart_bank(&tr); } /* @@ -826,10 +1053,10 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) return th_names[bank]; } - if (!smca_banks[bank].type) + if (!smca_banks[bank].hwid) return NULL; - bank_type = smca_banks[bank].type->bank_type; + bank_type = smca_banks[bank].hwid->bank_type; if (b && bank_type == SMCA_UMC) { if (b->block < ARRAY_SIZE(smca_umc_block_names)) @@ -838,8 +1065,8 @@ static const char *get_name(unsigned int bank, struct threshold_block *b) } snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, - "%s_%x", smca_bank_names[bank_type].name, - smca_banks[bank].type_instance); + "%s_%x", smca_get_name(bank_type), + smca_banks[bank].id); return buf_mcatype; } @@ -1010,31 +1237,6 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) return err; } -/* create dir/files for all valid threshold banks */ -static int threshold_create_device(unsigned int cpu) -{ - unsigned int bank; - struct threshold_bank **bp; - int err = 0; - - bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, - GFP_KERNEL); - if (!bp) - return -ENOMEM; - - per_cpu(threshold_banks, cpu) = bp; - - for (bank = 0; bank < mca_cfg.banks; ++bank) { - if 
(!(per_cpu(bank_map, cpu) & (1 << bank))) - continue; - err = threshold_create_bank(cpu, bank); - if (err) - return err; - } - - return err; -} - static void deallocate_threshold_block(unsigned int cpu, unsigned int bank) { @@ -1102,48 +1304,71 @@ free_out: per_cpu(threshold_banks, cpu)[bank] = NULL; } -static void threshold_remove_device(unsigned int cpu) +int mce_threshold_remove_device(unsigned int cpu) { unsigned int bank; + if (!thresholding_en) + return 0; + for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; threshold_remove_bank(cpu, bank); } kfree(per_cpu(threshold_banks, cpu)); + per_cpu(threshold_banks, cpu) = NULL; + return 0; } -/* get notified when a cpu comes on/off */ -static void -amd_64_threshold_cpu_callback(unsigned long action, unsigned int cpu) +/* create dir/files for all valid threshold banks */ +int mce_threshold_create_device(unsigned int cpu) { - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - threshold_create_device(cpu); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - threshold_remove_device(cpu); - break; - default: - break; + unsigned int bank; + struct threshold_bank **bp; + int err = 0; + + if (!thresholding_en) + return 0; + + bp = per_cpu(threshold_banks, cpu); + if (bp) + return 0; + + bp = kzalloc(sizeof(struct threshold_bank *) * mca_cfg.banks, + GFP_KERNEL); + if (!bp) + return -ENOMEM; + + per_cpu(threshold_banks, cpu) = bp; + + for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (!(per_cpu(bank_map, cpu) & (1 << bank))) + continue; + err = threshold_create_bank(cpu, bank); + if (err) + goto err; } + return err; +err: + mce_threshold_remove_device(cpu); + return err; } static __init int threshold_init_device(void) { unsigned lcpu = 0; + if (mce_threshold_vector == amd_threshold_interrupt) + thresholding_en = true; + /* to hit CPUs online before the notifier is up */ for_each_online_cpu(lcpu) { - int err = threshold_create_device(lcpu); + int err = 
mce_threshold_create_device(lcpu); if (err) return err; } - threshold_cpu_callback = amd_64_threshold_cpu_callback; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 1defb8ea882c..190b3e6cef4d 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -11,6 +11,8 @@ #include <linux/sched.h> #include <linux/cpumask.h> #include <asm/apic.h> +#include <asm/cpufeature.h> +#include <asm/intel-family.h> #include <asm/processor.h> #include <asm/msr.h> #include <asm/mce.h> @@ -130,7 +132,7 @@ bool mce_intel_cmci_poll(void) * Reset the counter if we've logged an error in the last poll * during the storm. */ - if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) + if (machine_check_poll(0, this_cpu_ptr(&mce_banks_owned))) this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); else this_cpu_dec(cmci_backoff_cnt); @@ -342,7 +344,7 @@ void cmci_recheck(void) return; local_irq_save(flags); - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + machine_check_poll(0, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); } @@ -464,11 +466,46 @@ static void intel_clear_lmce(void) wrmsrl(MSR_IA32_MCG_EXT_CTL, val); } +static void intel_ppin_init(struct cpuinfo_x86 *c) +{ + unsigned long long val; + + /* + * Even if testing the presence of the MSR would be enough, we don't + * want to risk the situation where other models reuse this MSR for + * other purposes. 
+ */ + switch (c->x86_model) { + case INTEL_FAM6_IVYBRIDGE_X: + case INTEL_FAM6_HASWELL_X: + case INTEL_FAM6_BROADWELL_XEON_D: + case INTEL_FAM6_BROADWELL_X: + case INTEL_FAM6_SKYLAKE_X: + if (rdmsrl_safe(MSR_PPIN_CTL, &val)) + return; + + if ((val & 3UL) == 1UL) { + /* PPIN available but disabled: */ + return; + } + + /* If PPIN is disabled, but not locked, try to enable: */ + if (!(val & 3UL)) { + wrmsrl_safe(MSR_PPIN_CTL, val | 2UL); + rdmsrl_safe(MSR_PPIN_CTL, &val); + } + + if ((val & 3UL) == 2UL) + set_cpu_cap(c, X86_FEATURE_INTEL_PPIN); + } +} + void mce_intel_feature_init(struct cpuinfo_x86 *c) { intel_init_thermal(c); intel_init_cmci(); intel_init_lmce(); + intel_ppin_init(c); } void mce_intel_feature_clear(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 6b9dc4d18ccc..465aca8be009 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -26,7 +26,6 @@ #include <asm/processor.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/msr.h> #include <asm/trace/irq_vectors.h> @@ -271,58 +270,32 @@ static void thermal_throttle_remove_dev(struct device *dev) } /* Get notified when a cpu comes on/off. Be hotplug friendly. 
*/ -static int -thermal_throttle_cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int thermal_throttle_online(unsigned int cpu) { - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - int err = 0; - - dev = get_cpu_device(cpu); - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - err = thermal_throttle_add_dev(dev, cpu); - WARN_ON(err); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - thermal_throttle_remove_dev(dev); - break; - } - return notifier_from_errno(err); + struct device *dev = get_cpu_device(cpu); + + return thermal_throttle_add_dev(dev, cpu); } -static struct notifier_block thermal_throttle_cpu_notifier = +static int thermal_throttle_offline(unsigned int cpu) { - .notifier_call = thermal_throttle_cpu_callback, -}; + struct device *dev = get_cpu_device(cpu); + + thermal_throttle_remove_dev(dev); + return 0; +} static __init int thermal_throttle_init_device(void) { - unsigned int cpu = 0; - int err; + int ret; if (!atomic_read(&therm_throt_en)) return 0; - cpu_notifier_register_begin(); - - /* connect live CPUs to sysfs */ - for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); - WARN_ON(err); - } - - __register_hotcpu_notifier(&thermal_throttle_cpu_notifier); - cpu_notifier_register_done(); - - return 0; + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/therm:online", + thermal_throttle_online, + thermal_throttle_offline); + return ret < 0 ? 
ret : 0; } device_initcall(thermal_throttle_init_device); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index fcf9ae9384f4..9beb092d68a5 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -6,7 +6,6 @@ #include <asm/irq_vectors.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/trace/irq_vectors.h> diff --git a/arch/x86/kernel/cpu/microcode/Makefile b/arch/x86/kernel/cpu/microcode/Makefile index 220b1a508513..ba12e8aa4a45 100644 --- a/arch/x86/kernel/cpu/microcode/Makefile +++ b/arch/x86/kernel/cpu/microcode/Makefile @@ -1,4 +1,4 @@ microcode-y := core.o obj-$(CONFIG_MICROCODE) += microcode.o -microcode-$(CONFIG_MICROCODE_INTEL) += intel.o intel_lib.o +microcode-$(CONFIG_MICROCODE_INTEL) += intel.o microcode-$(CONFIG_MICROCODE_AMD) += amd.o diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 017bda12caae..6f353bdb3a25 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -5,6 +5,7 @@ * CPUs and later. * * Copyright (C) 2008-2011 Advanced Micro Devices Inc. + * 2013-2016 Borislav Petkov <bp@alien8.de> * * Author: Peter Oruba <peter.oruba@amd.com> * @@ -39,64 +40,25 @@ static struct equiv_cpu_entry *equiv_cpu_table; -struct ucode_patch { - struct list_head plist; - void *data; - u32 patch_id; - u16 equiv_cpu; -}; - -static LIST_HEAD(pcache); - /* * This points to the current valid container of microcode patches which we will - * save from the initrd before jettisoning its contents. + * save from the initrd/builtin before jettisoning its contents. 
*/ -static u8 *container; -static size_t container_size; -static bool ucode_builtin; +struct container { + u8 *data; + size_t size; +} cont; static u32 ucode_new_rev; static u8 amd_ucode_patch[PATCH_MAX_SIZE]; static u16 this_equiv_id; -static struct cpio_data ucode_cpio; - -static struct cpio_data __init find_ucode_in_initrd(void) -{ -#ifdef CONFIG_BLK_DEV_INITRD - char *path; - void *start; - size_t size; - - /* - * Microcode patch container file is prepended to the initrd in cpio - * format. See Documentation/x86/early-microcode.txt - */ - static __initdata char ucode_path[] = "kernel/x86/microcode/AuthenticAMD.bin"; - -#ifdef CONFIG_X86_32 - struct boot_params *p; - - /* - * On 32-bit, early load occurs before paging is turned on so we need - * to use physical addresses. - */ - p = (struct boot_params *)__pa_nodebug(&boot_params); - path = (char *)__pa_nodebug(ucode_path); - start = (void *)p->hdr.ramdisk_image; - size = p->hdr.ramdisk_size; -#else - path = ucode_path; - start = (void *)(boot_params.hdr.ramdisk_image + PAGE_OFFSET); - size = boot_params.hdr.ramdisk_size; -#endif /* !CONFIG_X86_32 */ - - return find_cpio_data(path, start, size, NULL); -#else - return (struct cpio_data){ NULL, 0, "" }; -#endif -} +/* + * Microcode patch container file is prepended to the initrd in cpio + * format. See Documentation/x86/early-microcode.txt + */ +static const char +ucode_path[] __maybe_unused = "kernel/x86/microcode/AuthenticAMD.bin"; static size_t compute_container_size(u8 *data, u32 total_size) { @@ -135,48 +97,48 @@ static size_t compute_container_size(u8 *data, u32 total_size) return size; } +static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, + unsigned int sig) +{ + int i = 0; + + if (!equiv_cpu_table) + return 0; + + while (equiv_cpu_table[i].installed_cpu != 0) { + if (sig == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + + i++; + } + return 0; +} + /* - * Early load occurs before we can vmalloc(). 
So we look for the microcode - * patch container file in initrd, traverse equivalent cpu table, look for a - * matching microcode patch, and update, all in initrd memory in place. - * When vmalloc() is available for use later -- on 64-bit during first AP load, - * and on 32-bit during save_microcode_in_initrd_amd() -- we can call - * load_microcode_amd() to save equivalent cpu table and microcode patches in - * kernel heap memory. + * This scans the ucode blob for the proper container as we can have multiple + * containers glued together. */ -static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) +static struct container +find_proper_container(u8 *ucode, size_t size, u16 *ret_id) { + struct container ret = { NULL, 0 }; + u32 eax, ebx, ecx, edx; struct equiv_cpu_entry *eq; - size_t *cont_sz; - u32 *header; - u8 *data, **cont; - u8 (*patch)[PATCH_MAX_SIZE]; - u16 eq_id = 0; int offset, left; - u32 rev, eax, ebx, ecx, edx; - u32 *new_rev; - -#ifdef CONFIG_X86_32 - new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); - cont_sz = (size_t *)__pa_nodebug(&container_size); - cont = (u8 **)__pa_nodebug(&container); - patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); -#else - new_rev = &ucode_new_rev; - cont_sz = &container_size; - cont = &container; - patch = &amd_ucode_patch; -#endif + u16 eq_id = 0; + u32 *header; + u8 *data; data = ucode; left = size; header = (u32 *)data; + /* find equiv cpu table */ if (header[0] != UCODE_MAGIC || header[1] != UCODE_EQUIV_CPU_TABLE_TYPE || /* type */ header[2] == 0) /* size */ - return; + return ret; eax = 0x00000001; ecx = 0; @@ -185,7 +147,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) while (left > 0) { eq = (struct equiv_cpu_entry *)(data + CONTAINER_HDR_SZ); - *cont = data; + ret.data = data; /* Advance past the container header */ offset = header[2] + CONTAINER_HDR_SZ; @@ -194,15 +156,15 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) 
eq_id = find_equiv_id(eq, eax); if (eq_id) { - this_equiv_id = eq_id; - *cont_sz = compute_container_size(*cont, left + offset); + ret.size = compute_container_size(ret.data, left + offset); /* * truncate how much we need to iterate over in the * ucode update loop below */ - left = *cont_sz - offset; - break; + left = ret.size - offset; + *ret_id = eq_id; + return ret; } /* @@ -212,6 +174,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) */ while (left > 0) { header = (u32 *)data; + if (header[0] == UCODE_MAGIC && header[1] == UCODE_EQUIV_CPU_TABLE_TYPE) break; @@ -226,14 +189,64 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) ucode = data; } - if (!eq_id) { - *cont = NULL; - *cont_sz = 0; - return; - } + return ret; +} + +static int __apply_microcode_amd(struct microcode_amd *mc_amd) +{ + u32 rev, dummy; + + native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); + + /* verify patch application was successful */ + native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); + if (rev != mc_amd->hdr.patch_id) + return -1; + + return 0; +} + +/* + * Early load occurs before we can vmalloc(). So we look for the microcode + * patch container file in initrd, traverse equivalent cpu table, look for a + * matching microcode patch, and update, all in initrd memory in place. + * When vmalloc() is available for use later -- on 64-bit during first AP load, + * and on 32-bit during save_microcode_in_initrd_amd() -- we can call + * load_microcode_amd() to save equivalent cpu table and microcode patches in + * kernel heap memory. 
+ */ +static struct container +apply_microcode_early_amd(void *ucode, size_t size, bool save_patch) +{ + struct container ret = { NULL, 0 }; + u8 (*patch)[PATCH_MAX_SIZE]; + int offset, left; + u32 rev, *header; + u8 *data; + u16 eq_id = 0; + u32 *new_rev; + +#ifdef CONFIG_X86_32 + new_rev = (u32 *)__pa_nodebug(&ucode_new_rev); + patch = (u8 (*)[PATCH_MAX_SIZE])__pa_nodebug(&amd_ucode_patch); +#else + new_rev = &ucode_new_rev; + patch = &amd_ucode_patch; +#endif if (check_current_patch_level(&rev, true)) - return; + return (struct container){ NULL, 0 }; + + ret = find_proper_container(ucode, size, &eq_id); + if (!eq_id) + return (struct container){ NULL, 0 }; + + this_equiv_id = eq_id; + header = (u32 *)ret.data; + + /* We're pointing to an equiv table, skip over it. */ + data = ret.data + header[2] + CONTAINER_HDR_SZ; + left = ret.size - (header[2] + CONTAINER_HDR_SZ); while (left > 0) { struct microcode_amd *mc; @@ -252,8 +265,7 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) *new_rev = rev; if (save_patch) - memcpy(patch, mc, - min_t(u32, header[1], PATCH_MAX_SIZE)); + memcpy(patch, mc, min_t(u32, header[1], PATCH_MAX_SIZE)); } } @@ -261,10 +273,10 @@ static void apply_ucode_in_initrd(void *ucode, size_t size, bool save_patch) data += offset; left -= offset; } + return ret; } -static bool __init load_builtin_amd_microcode(struct cpio_data *cp, - unsigned int family) +static bool get_builtin_microcode(struct cpio_data *cp, unsigned int family) { #ifdef CONFIG_X86_64 char fw_name[36] = "amd-ucode/microcode_amd.bin"; @@ -281,47 +293,45 @@ static bool __init load_builtin_amd_microcode(struct cpio_data *cp, void __init load_ucode_amd_bsp(unsigned int family) { + struct ucode_cpu_info *uci; struct cpio_data cp; - bool *builtin; - void **data; - size_t *size; + const char *path; + bool use_pa; -#ifdef CONFIG_X86_32 - data = (void **)__pa_nodebug(&ucode_cpio.data); - size = (size_t *)__pa_nodebug(&ucode_cpio.size); - builtin = (bool 
*)__pa_nodebug(&ucode_builtin); -#else - data = &ucode_cpio.data; - size = &ucode_cpio.size; - builtin = &ucode_builtin; -#endif + if (IS_ENABLED(CONFIG_X86_32)) { + uci = (struct ucode_cpu_info *)__pa_nodebug(ucode_cpu_info); + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + uci = ucode_cpu_info; + path = ucode_path; + use_pa = false; + } - *builtin = load_builtin_amd_microcode(&cp, family); - if (!*builtin) - cp = find_ucode_in_initrd(); + if (!get_builtin_microcode(&cp, family)) + cp = find_microcode_in_initrd(path, use_pa); if (!(cp.data && cp.size)) return; - *data = cp.data; - *size = cp.size; + /* Get BSP's CPUID.EAX(1), needed in load_microcode_amd() */ + uci->cpu_sig.sig = cpuid_eax(1); - apply_ucode_in_initrd(cp.data, cp.size, true); + apply_microcode_early_amd(cp.data, cp.size, true); } #ifdef CONFIG_X86_32 /* * On 32-bit, since AP's early load occurs before paging is turned on, we - * cannot traverse cpu_equiv_table and pcache in kernel heap memory. So during - * cold boot, AP will apply_ucode_in_initrd() just like the BSP. During - * save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, + * cannot traverse cpu_equiv_table and microcode_cache in kernel heap memory. + * So during cold boot, AP will apply_ucode_in_initrd() just like the BSP. + * In save_microcode_in_initrd_amd() BSP's patch is copied to amd_ucode_patch, * which is used upon resume from suspend. 
*/ -void load_ucode_amd_ap(void) +void load_ucode_amd_ap(unsigned int family) { struct microcode_amd *mc; - size_t *usize; - void **ucode; + struct cpio_data cp; mc = (struct microcode_amd *)__pa_nodebug(amd_ucode_patch); if (mc->hdr.patch_id && mc->hdr.processor_rev_id) { @@ -329,60 +339,63 @@ void load_ucode_amd_ap(void) return; } - ucode = (void *)__pa_nodebug(&container); - usize = (size_t *)__pa_nodebug(&container_size); + if (!get_builtin_microcode(&cp, family)) + cp = find_microcode_in_initrd((const char *)__pa_nodebug(ucode_path), true); - if (!*ucode || !*usize) + if (!(cp.data && cp.size)) return; - apply_ucode_in_initrd(*ucode, *usize, false); -} - -static void __init collect_cpu_sig_on_bsp(void *arg) -{ - unsigned int cpu = smp_processor_id(); - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - uci->cpu_sig.sig = cpuid_eax(0x00000001); -} - -static void __init get_bsp_sig(void) -{ - unsigned int bsp = boot_cpu_data.cpu_index; - struct ucode_cpu_info *uci = ucode_cpu_info + bsp; - - if (!uci->cpu_sig.sig) - smp_call_function_single(bsp, collect_cpu_sig_on_bsp, NULL, 1); + /* + * This would set amd_ucode_patch above so that the following APs can + * use it directly instead of going down this path again. + */ + apply_microcode_early_amd(cp.data, cp.size, true); } #else -void load_ucode_amd_ap(void) +void load_ucode_amd_ap(unsigned int family) { - unsigned int cpu = smp_processor_id(); struct equiv_cpu_entry *eq; struct microcode_amd *mc; - u8 *cont = container; u32 rev, eax; u16 eq_id; - /* Exit if called on the BSP. */ - if (!cpu) + /* 64-bit runs with paging enabled, thus early==false. */ + if (check_current_patch_level(&rev, false)) return; - if (!container) - return; + /* First AP hasn't cached it yet, go through the blob. */ + if (!cont.data) { + struct cpio_data cp = { NULL, 0, "" }; - /* - * 64-bit runs with paging enabled, thus early==false. 
- */ - if (check_current_patch_level(&rev, false)) - return; + if (cont.size == -1) + return; - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - cont += PAGE_OFFSET - __PAGE_OFFSET_BASE; +reget: + if (!get_builtin_microcode(&cp, family)) { +#ifdef CONFIG_BLK_DEV_INITRD + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); +#endif + if (!(cp.data && cp.size)) { + /* + * Mark it so that other APs do not scan again + * for no real reason and slow down boot + * needlessly. + */ + cont.size = -1; + return; + } + } + + cont = apply_microcode_early_amd(cp.data, cp.size, false); + if (!(cont.data && cont.size)) { + cont.size = -1; + return; + } + } eax = cpuid_eax(0x00000001); - eq = (struct equiv_cpu_entry *)(cont + CONTAINER_HDR_SZ); + eq = (struct equiv_cpu_entry *)(cont.data + CONTAINER_HDR_SZ); eq_id = find_equiv_id(eq, eax); if (!eq_id) @@ -397,61 +410,50 @@ void load_ucode_amd_ap(void) } } else { - if (!ucode_cpio.data) - return; /* * AP has a different equivalence ID than BSP, looks like * mixed-steppings silicon so go through the ucode blob anew. */ - apply_ucode_in_initrd(ucode_cpio.data, ucode_cpio.size, false); + goto reget; } } -#endif +#endif /* CONFIG_X86_32 */ + +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); -int __init save_microcode_in_initrd_amd(void) +int __init save_microcode_in_initrd_amd(unsigned int fam) { - unsigned long cont; - int retval = 0; enum ucode_state ret; - u8 *cont_va; - u32 eax; + int retval = 0; + u16 eq_id; - if (!container) - return -EINVAL; + if (!cont.data) { + if (IS_ENABLED(CONFIG_X86_32) && (cont.size != -1)) { + struct cpio_data cp = { NULL, 0, "" }; -#ifdef CONFIG_X86_32 - get_bsp_sig(); - cont = (unsigned long)container; - cont_va = __va(container); -#else - /* - * We need the physical address of the container for both bitness since - * boot_params.hdr.ramdisk_image is a physical address. 
- */ - cont = __pa_nodebug(container); - cont_va = container; +#ifdef CONFIG_BLK_DEV_INITRD + cp = find_cpio_data(ucode_path, (void *)initrd_start, + initrd_end - initrd_start, NULL); #endif - /* - * Take into account the fact that the ramdisk might get relocated and - * therefore we need to recompute the container's position in virtual - * memory space. - */ - if (relocated_ramdisk) - container = (u8 *)(__va(relocated_ramdisk) + - (cont - boot_params.hdr.ramdisk_image)); - else - container = cont_va; + if (!(cp.data && cp.size)) { + cont.size = -1; + return -EINVAL; + } - /* Add CONFIG_RANDOMIZE_MEMORY offset. */ - if (!ucode_builtin) - container += PAGE_OFFSET - __PAGE_OFFSET_BASE; + cont = find_proper_container(cp.data, cp.size, &eq_id); + if (!eq_id) { + cont.size = -1; + return -EINVAL; + } - eax = cpuid_eax(0x00000001); - eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); + } else + return -EINVAL; + } - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + ret = load_microcode_amd(smp_processor_id(), fam, cont.data, cont.size); if (ret != UCODE_OK) retval = -EINVAL; @@ -459,8 +461,8 @@ int __init save_microcode_in_initrd_amd(void) * This will be freed any msec now, stash patches for the current * family and switch to patch cache for cpu hotplug, etc later. 
*/ - container = NULL; - container_size = 0; + cont.data = NULL; + cont.size = 0; return retval; } @@ -478,8 +480,10 @@ void reload_ucode_amd(void) return; mc = (struct microcode_amd *)amd_ucode_patch; + if (!mc) + return; - if (mc && rev < mc->hdr.patch_id) { + if (rev < mc->hdr.patch_id) { if (!__apply_microcode_amd(mc)) { ucode_new_rev = mc->hdr.patch_id; pr_info("reload patch_level=0x%08x\n", ucode_new_rev); @@ -513,7 +517,7 @@ static struct ucode_patch *cache_find_patch(u16 equiv_cpu) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) + list_for_each_entry(p, &microcode_cache, plist) if (p->equiv_cpu == equiv_cpu) return p; return NULL; @@ -523,7 +527,7 @@ static void update_cache(struct ucode_patch *new_patch) { struct ucode_patch *p; - list_for_each_entry(p, &pcache, plist) { + list_for_each_entry(p, &microcode_cache, plist) { if (p->equiv_cpu == new_patch->equiv_cpu) { if (p->patch_id >= new_patch->patch_id) /* we already have the latest patch */ @@ -536,14 +540,14 @@ static void update_cache(struct ucode_patch *new_patch) } } /* no patch found, add it */ - list_add_tail(&new_patch->plist, &pcache); + list_add_tail(&new_patch->plist, &microcode_cache); } static void free_cache(void) { struct ucode_patch *p, *tmp; - list_for_each_entry_safe(p, tmp, &pcache, plist) { + list_for_each_entry_safe(p, tmp, &microcode_cache, plist) { __list_del(p->plist.prev, p->plist.next); kfree(p->data); kfree(p); @@ -663,21 +667,7 @@ bool check_current_patch_level(u32 *rev, bool early) return ret; } -int __apply_microcode_amd(struct microcode_amd *mc_amd) -{ - u32 rev, dummy; - - native_wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); - - /* verify patch application was successful */ - native_rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - if (rev != mc_amd->hdr.patch_id) - return -1; - - return 0; -} - -int apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); struct microcode_amd *mc_amd; @@ -860,7 +850,8 @@
static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +static enum ucode_state +load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) { enum ucode_state ret; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 5ce5155f0695..6996413c78c3 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2000-2006 Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * 2006 Shaohua Li <shaohua.li@intel.com> - * 2013-2015 Borislav Petkov <bp@alien8.de> + * 2013-2016 Borislav Petkov <bp@alien8.de> * * X86 CPU microcode early update for Linux: * @@ -39,12 +39,15 @@ #include <asm/microcode.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/setup.h> -#define MICROCODE_VERSION "2.01" +#define DRIVER_VERSION "2.2" static struct microcode_ops *microcode_ops; static bool dis_ucode_ldr; +LIST_HEAD(microcode_cache); + /* * Synchronization. * @@ -167,7 +170,7 @@ void load_ucode_ap(void) break; case X86_VENDOR_AMD: if (family >= 0x10) - load_ucode_amd_ap(); + load_ucode_amd_ap(family); break; default: break; @@ -185,7 +188,7 @@ static int __init save_microcode_in_initrd(void) break; case X86_VENDOR_AMD: if (c->x86 >= 0x10) - return save_microcode_in_initrd_amd(); + return save_microcode_in_initrd_amd(c->x86); break; default: break; @@ -194,6 +197,58 @@ static int __init save_microcode_in_initrd(void) return -EINVAL; } +struct cpio_data find_microcode_in_initrd(const char *path, bool use_pa) +{ +#ifdef CONFIG_BLK_DEV_INITRD + unsigned long start = 0; + size_t size; + +#ifdef CONFIG_X86_32 + struct boot_params *params; + + if (use_pa) + params = (struct boot_params *)__pa_nodebug(&boot_params); + else + params = &boot_params; + + size = params->hdr.ramdisk_size; + + /* + * Set start only if we have an initrd image. 
We cannot use initrd_start + * because it is not set that early yet. + */ + if (size) + start = params->hdr.ramdisk_image; + +# else /* CONFIG_X86_64 */ + size = (unsigned long)boot_params.ext_ramdisk_size << 32; + size |= boot_params.hdr.ramdisk_size; + + if (size) { + start = (unsigned long)boot_params.ext_ramdisk_image << 32; + start |= boot_params.hdr.ramdisk_image; + + start += PAGE_OFFSET; + } +# endif + + /* + * Did we relocate the ramdisk? + * + * So we possibly relocate the ramdisk *after* applying microcode on the + * BSP so we rely on use_pa (use physical addresses) - even if it is not + * absolutely correct - to determine whether we've done the ramdisk + * relocation already. + */ + if (!use_pa && relocated_ramdisk) + start = initrd_start; + + return find_cpio_data(path, (void *)start, size, NULL); +#else /* !CONFIG_BLK_DEV_INITRD */ + return (struct cpio_data){ NULL, 0, "" }; +#endif +} + void reload_early_microcode(void) { int vendor, family; @@ -453,16 +508,17 @@ static struct attribute_group mc_attr_group = { static void microcode_fini_cpu(int cpu) { - microcode_ops->microcode_fini_cpu(cpu); + if (microcode_ops->microcode_fini_cpu) + microcode_ops->microcode_fini_cpu(cpu); } static enum ucode_state microcode_resume_cpu(int cpu) { - pr_debug("CPU%d updated upon resume\n", cpu); - if (apply_microcode_on_target(cpu)) return UCODE_ERROR; + pr_debug("CPU%d updated upon resume\n", cpu); + return UCODE_OK; } @@ -496,6 +552,9 @@ static enum ucode_state microcode_update_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + /* Refresh CPU microcode revision after resume. 
*/ + collect_cpu_info(cpu); + if (uci->valid) return microcode_resume_cpu(cpu); @@ -579,12 +638,7 @@ static int mc_cpu_down_prep(unsigned int cpu) /* Suspend is in progress, only remove the interface */ sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); - /* - * When a CPU goes offline, don't free up or invalidate the copy of - * the microcode in kernel memory, so that we can reuse it when the - * CPU comes back online without unnecessarily requesting the userspace - * for it again. - */ + return 0; } @@ -649,8 +703,7 @@ int __init microcode_init(void) cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/microcode:online", mc_cpu_online, mc_cpu_down_prep); - pr_info("Microcode Update Driver: v" MICROCODE_VERSION - " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n"); + pr_info("Microcode Update Driver: v%s.", DRIVER_VERSION); return 0; diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index cdc0deab00c9..54d50c3694d8 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,125 +39,83 @@ #include <asm/setup.h> #include <asm/msr.h> -/* - * Temporary microcode blobs pointers storage. We note here during early load - * the pointers to microcode blobs we've got from whatever storage (detached - * initrd, builtin). Later on, we put those into final storage - * mc_saved_data.mc_saved. - * - * Important: those are offsets from the beginning of initrd or absolute - * addresses within the kernel image when built-in. - */ -static unsigned long mc_tmp_ptrs[MAX_UCODE_COUNT]; - -static struct mc_saved_data { - unsigned int num_saved; - struct microcode_intel **mc_saved; -} mc_saved_data; +static const char ucode_path[] = "kernel/x86/microcode/GenuineIntel.bin"; -/* Microcode blobs within the initrd. 0 if builtin. 
*/ -static struct ucode_blobs { - unsigned long start; - bool valid; -} blobs; +/* Current microcode patch used in early patching */ +struct microcode_intel *intel_ucode_patch; -/* Go through saved patches and find the one suitable for the current CPU. */ -static enum ucode_state -find_microcode_patch(struct microcode_intel **saved, - unsigned int num_saved, struct ucode_cpu_info *uci) +static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, + unsigned int s2, unsigned int p2) { - struct microcode_intel *ucode_ptr, *new_mc = NULL; - struct microcode_header_intel *mc_hdr; - int new_rev, ret, i; - - new_rev = uci->cpu_sig.rev; - - for (i = 0; i < num_saved; i++) { - ucode_ptr = saved[i]; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; - - ret = has_newer_microcode(ucode_ptr, - uci->cpu_sig.sig, - uci->cpu_sig.pf, - new_rev); - if (!ret) - continue; - - new_rev = mc_hdr->rev; - new_mc = ucode_ptr; - } + if (s1 != s2) + return false; - if (!new_mc) - return UCODE_NFOUND; + /* Processor flags are either both 0 ... */ + if (!p1 && !p2) + return true; - uci->mc = (struct microcode_intel *)new_mc; - return UCODE_OK; + /* ... or they intersect. */ + return p1 & p2; } -static inline void -copy_ptrs(struct microcode_intel **mc_saved, unsigned long *mc_ptrs, - unsigned long off, int num_saved) +/* + * Returns 1 if update has been found, 0 otherwise. 
+ */ +static int find_matching_signature(void *mc, unsigned int csig, int cpf) { + struct microcode_header_intel *mc_hdr = mc; + struct extended_sigtable *ext_hdr; + struct extended_signature *ext_sig; int i; - for (i = 0; i < num_saved; i++) - mc_saved[i] = (struct microcode_intel *)(mc_ptrs[i] + off); -} - -#ifdef CONFIG_X86_32 -static void -microcode_phys(struct microcode_intel **mc_saved_tmp, struct mc_saved_data *mcs) -{ - int i; - struct microcode_intel ***mc_saved; + if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) + return 1; - mc_saved = (struct microcode_intel ***)__pa_nodebug(&mcs->mc_saved); + /* Look for ext. headers: */ + if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) + return 0; - for (i = 0; i < mcs->num_saved; i++) { - struct microcode_intel *p; + ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; + ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - p = *(struct microcode_intel **)__pa_nodebug(mcs->mc_saved + i); - mc_saved_tmp[i] = (struct microcode_intel *)__pa_nodebug(p); + for (i = 0; i < ext_hdr->count; i++) { + if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) + return 1; + ext_sig++; } + return 0; } -#endif -static enum ucode_state -load_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - unsigned long offset, struct ucode_cpu_info *uci) +/* + * Returns 1 if update has been found, 0 otherwise. 
+ */ +static int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int count = mcs->num_saved; + struct microcode_header_intel *mc_hdr = mc; - if (!mcs->mc_saved) { - copy_ptrs(mc_saved_tmp, mc_ptrs, offset, count); + if (mc_hdr->rev <= new_rev) + return 0; - return find_microcode_patch(mc_saved_tmp, count, uci); - } else { -#ifdef CONFIG_X86_32 - microcode_phys(mc_saved_tmp, mcs); - return find_microcode_patch(mc_saved_tmp, count, uci); -#else - return find_microcode_patch(mcs->mc_saved, count, uci); -#endif - } + return find_matching_signature(mc, csig, cpf); } /* * Given CPU signature and a microcode patch, this function finds if the * microcode patch has matching family and model with the CPU. + * + * %true - if there's a match + * %false - otherwise */ -static enum ucode_state -matching_model_microcode(struct microcode_header_intel *mc_header, - unsigned long sig) +static bool microcode_matches(struct microcode_header_intel *mc_header, + unsigned long sig) { - unsigned int fam, model; - unsigned int fam_ucode, model_ucode; - struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); unsigned long data_size = get_datasize(mc_header); - int ext_sigcount, i; + struct extended_sigtable *ext_header; + unsigned int fam_ucode, model_ucode; struct extended_signature *ext_sig; + unsigned int fam, model; + int ext_sigcount, i; fam = x86_family(sig); model = x86_model(sig); @@ -166,11 +124,11 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(mc_header->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; /* Look for ext. 
headers: */ if (total_size <= data_size + MC_HEADER_SIZE) - return UCODE_NFOUND; + return false; ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -181,192 +139,242 @@ matching_model_microcode(struct microcode_header_intel *mc_header, model_ucode = x86_model(ext_sig->sig); if (fam == fam_ucode && model == model_ucode) - return UCODE_OK; + return true; ext_sig++; } - return UCODE_NFOUND; + return false; } -static int -save_microcode(struct mc_saved_data *mcs, - struct microcode_intel **mc_saved_src, - unsigned int num_saved) +static struct ucode_patch *__alloc_microcode_buf(void *data, unsigned int size) { - int i, j; - struct microcode_intel **saved_ptr; - int ret; + struct ucode_patch *p; - if (!num_saved) - return -EINVAL; + p = kzalloc(size, GFP_KERNEL); + if (!p) + return ERR_PTR(-ENOMEM); - /* - * Copy new microcode data. - */ - saved_ptr = kcalloc(num_saved, sizeof(struct microcode_intel *), GFP_KERNEL); - if (!saved_ptr) - return -ENOMEM; - - for (i = 0; i < num_saved; i++) { - struct microcode_header_intel *mc_hdr; - struct microcode_intel *mc; - unsigned long size; - - if (!mc_saved_src[i]) { - ret = -EINVAL; - goto err; - } + p->data = kmemdup(data, size, GFP_KERNEL); + if (!p->data) { + kfree(p); + return ERR_PTR(-ENOMEM); + } - mc = mc_saved_src[i]; - mc_hdr = &mc->hdr; - size = get_totalsize(mc_hdr); + return p; +} - saved_ptr[i] = kmemdup(mc, size, GFP_KERNEL); - if (!saved_ptr[i]) { - ret = -ENOMEM; - goto err; +static void save_microcode_patch(void *data, unsigned int size) +{ + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + struct ucode_patch *iter, *tmp, *p; + bool prev_found = false; + unsigned int sig, pf; + + mc_hdr = (struct microcode_header_intel *)data; + + list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) { + mc_saved_hdr = (struct microcode_header_intel *)iter->data; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + + if (find_matching_signature(data, sig,
pf)) { + prev_found = true; + + if (mc_hdr->rev <= mc_saved_hdr->rev) + continue; + + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer %p\n", data); + else + list_replace(&iter->plist, &p->plist); } } /* - * Point to newly saved microcode. + * There weren't any previous patches found in the list cache; save the + * newly found. */ - mcs->mc_saved = saved_ptr; - mcs->num_saved = num_saved; - - return 0; - -err: - for (j = 0; j <= i; j++) - kfree(saved_ptr[j]); - kfree(saved_ptr); - - return ret; + if (!prev_found) { + p = __alloc_microcode_buf(data, size); + if (IS_ERR(p)) + pr_err("Error allocating buffer for %p\n", data); + else + list_add_tail(&p->plist, &microcode_cache); + } } -/* - * A microcode patch in ucode_ptr is saved into mc_saved - * - if it has matching signature and newer revision compared to an existing - * patch mc_saved. - * - or if it is a newly discovered microcode patch. - * - * The microcode patch should have matching model with CPU. - * - * Returns: The updated number @num_saved of saved microcode patches.
- */ -static unsigned int _save_mc(struct microcode_intel **mc_saved, - u8 *ucode_ptr, unsigned int num_saved) +static int microcode_sanity_check(void *mc, int print_err) { - struct microcode_header_intel *mc_hdr, *mc_saved_hdr; - unsigned int sig, pf; - int found = 0, i; + unsigned long total_size, data_size, ext_table_size; + struct microcode_header_intel *mc_header = mc; + struct extended_sigtable *ext_header = NULL; + u32 sum, orig_sum, ext_sigcount = 0, i; + struct extended_signature *ext_sig; - mc_hdr = (struct microcode_header_intel *)ucode_ptr; + total_size = get_totalsize(mc_header); + data_size = get_datasize(mc_header); - for (i = 0; i < num_saved; i++) { - mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_hdr->sig; - pf = mc_saved_hdr->pf; + if (data_size + MC_HEADER_SIZE > total_size) { + if (print_err) + pr_err("Error: bad microcode data file size.\n"); + return -EINVAL; + } - if (!find_matching_signature(ucode_ptr, sig, pf)) - continue; + if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { + if (print_err) + pr_err("Error: invalid/unknown microcode update format.\n"); + return -EINVAL; + } - found = 1; + ext_table_size = total_size - (MC_HEADER_SIZE + data_size); + if (ext_table_size) { + u32 ext_table_sum = 0; + u32 *ext_tablep; - if (mc_hdr->rev <= mc_saved_hdr->rev) - continue; + if ((ext_table_size < EXT_HEADER_SIZE) + || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { + if (print_err) + pr_err("Error: truncated extended signature table.\n"); + return -EINVAL; + } + + ext_header = mc + MC_HEADER_SIZE + data_size; + if (ext_table_size != exttable_size(ext_header)) { + if (print_err) + pr_err("Error: extended signature table size mismatch.\n"); + return -EFAULT; + } + + ext_sigcount = ext_header->count; /* - * Found an older ucode saved earlier. Replace it with - * this newer one. + * Check extended table checksum: the sum of all dwords that + * comprise a valid table must be 0. 
*/ - mc_saved[i] = (struct microcode_intel *)ucode_ptr; - break; + ext_tablep = (u32 *)ext_header; + + i = ext_table_size / sizeof(u32); + while (i--) + ext_table_sum += ext_tablep[i]; + + if (ext_table_sum) { + if (print_err) + pr_warn("Bad extended signature table checksum, aborting.\n"); + return -EINVAL; + } + } + + /* + * Calculate the checksum of update data and header. The checksum of + * valid update data and header including the extended signature table + * must be 0. + */ + orig_sum = 0; + i = (MC_HEADER_SIZE + data_size) / sizeof(u32); + while (i--) + orig_sum += ((u32 *)mc)[i]; + + if (orig_sum) { + if (print_err) + pr_err("Bad microcode data checksum, aborting.\n"); + return -EINVAL; } - /* Newly detected microcode, save it to memory. */ - if (i >= num_saved && !found) - mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; + if (!ext_table_size) + return 0; - return num_saved; + /* + * Check extended signature checksum: 0 => valid. + */ + for (i = 0; i < ext_sigcount; i++) { + ext_sig = (void *)ext_header + EXT_HEADER_SIZE + + EXT_SIGNATURE_SIZE * i; + + sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); + if (sum) { + if (print_err) + pr_err("Bad extended signature checksum, aborting.\n"); + return -EINVAL; + } + } + return 0; } /* * Get microcode matching with BSP's model. Only CPUs with the same model as * BSP can stay in the platform. 
*/ -static enum ucode_state __init -get_matching_model_microcode(unsigned long start, void *data, size_t size, - struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci) +static struct microcode_intel * +scan_microcode(void *data, size_t size, struct ucode_cpu_info *uci, bool save) { - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; struct microcode_header_intel *mc_header; - unsigned int num_saved = mcs->num_saved; - enum ucode_state state = UCODE_OK; - unsigned int leftover = size; - u8 *ucode_ptr = data; + struct microcode_intel *patch = NULL; unsigned int mc_size; - int i; - - while (leftover && num_saved < ARRAY_SIZE(mc_saved_tmp)) { - if (leftover < sizeof(mc_header)) + while (size) { + if (size < sizeof(struct microcode_header_intel)) break; - mc_header = (struct microcode_header_intel *)ucode_ptr; + mc_header = (struct microcode_header_intel *)data; mc_size = get_totalsize(mc_header); - if (!mc_size || mc_size > leftover || - microcode_sanity_check(ucode_ptr, 0) < 0) + if (!mc_size || + mc_size > size || + microcode_sanity_check(data, 0) < 0) break; - leftover -= mc_size; + size -= mc_size; - /* - * Since APs with same family and model as the BSP may boot in - * the platform, we need to find and save microcode patches - * with the same family and model as the BSP. 
- */ - if (matching_model_microcode(mc_header, uci->cpu_sig.sig) != UCODE_OK) { - ucode_ptr += mc_size; + if (!microcode_matches(mc_header, uci->cpu_sig.sig)) { + data += mc_size; continue; } - num_saved = _save_mc(mc_saved_tmp, ucode_ptr, num_saved); + if (save) { + save_microcode_patch(data, mc_size); + goto next; + } - ucode_ptr += mc_size; - } - if (leftover) { - state = UCODE_ERROR; - return state; - } + if (!patch) { + if (!has_newer_microcode(data, + uci->cpu_sig.sig, + uci->cpu_sig.pf, + uci->cpu_sig.rev)) + goto next; - if (!num_saved) { - state = UCODE_NFOUND; - return state; - } + } else { + struct microcode_header_intel *phdr = &patch->hdr; - for (i = 0; i < num_saved; i++) - mc_ptrs[i] = (unsigned long)mc_saved_tmp[i] - start; + if (!has_newer_microcode(data, + phdr->sig, + phdr->pf, + phdr->rev)) + goto next; + } + + /* We have a newer patch, save it. */ + patch = data; - mcs->num_saved = num_saved; +next: + data += mc_size; + } - return state; + if (size) + return NULL; + + return patch; } static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; unsigned int family, model; - struct cpu_signature csig; + struct cpu_signature csig = { 0 }; unsigned int eax, ebx, ecx, edx; - csig.sig = 0; - csig.pf = 0; - csig.rev = 0; - memset(uci, 0, sizeof(*uci)); eax = 0x00000001; @@ -374,8 +382,8 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - family = x86_family(csig.sig); - model = x86_model(csig.sig); + family = x86_family(eax); + model = x86_model(eax); if ((model >= 5) || (family > 6)) { /* get processor flags from MSR 0x17 */ @@ -401,40 +409,41 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) static void show_saved_mc(void) { #ifdef DEBUG - int i, j; + int i = 0, j; unsigned int sig, pf, rev, total_size, data_size, date; struct ucode_cpu_info uci; + struct ucode_patch *p; - if (!mc_saved_data.num_saved) { + if (list_empty(&microcode_cache)) { 
pr_debug("no microcode data saved.\n"); return; } - pr_debug("Total microcode saved: %d\n", mc_saved_data.num_saved); collect_cpu_info_early(&uci); - sig = uci.cpu_sig.sig; - pf = uci.cpu_sig.pf; - rev = uci.cpu_sig.rev; + sig = uci.cpu_sig.sig; + pf = uci.cpu_sig.pf; + rev = uci.cpu_sig.rev; pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); - for (i = 0; i < mc_saved_data.num_saved; i++) { + list_for_each_entry(p, &microcode_cache, plist) { struct microcode_header_intel *mc_saved_header; struct extended_sigtable *ext_header; - int ext_sigcount; struct extended_signature *ext_sig; + int ext_sigcount; + + mc_saved_header = (struct microcode_header_intel *)p->data; - mc_saved_header = (struct microcode_header_intel *) - mc_saved_data.mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - rev = mc_saved_header->rev; - total_size = get_totalsize(mc_saved_header); - data_size = get_datasize(mc_saved_header); - date = mc_saved_header->date; + sig = mc_saved_header->sig; + pf = mc_saved_header->pf; + rev = mc_saved_header->rev; + date = mc_saved_header->date; + + total_size = get_totalsize(mc_saved_header); + data_size = get_datasize(mc_saved_header); pr_debug("mc_saved[%d]: sig=0x%x, pf=0x%x, rev=0x%x, total size=0x%x, date = %04x-%02x-%02x\n", - i, sig, pf, rev, total_size, + i++, sig, pf, rev, total_size, date & 0xffff, date >> 24, (date >> 16) & 0xff); @@ -443,7 +452,7 @@ static void show_saved_mc(void) if (total_size <= data_size + MC_HEADER_SIZE) continue; - ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; + ext_header = (void *)mc_saved_header + data_size + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -456,85 +465,43 @@ static void show_saved_mc(void) ext_sig++; } - } #endif } /* - * Save this mc into mc_saved_data. So it will be loaded early when a CPU is - * hot added or resumes. 
- * - * Please make sure this mc should be a valid microcode patch before calling - * this function. + * Save this microcode patch. It will be loaded early when a CPU is + * hot-added or resumes. */ -static void save_mc_for_early(u8 *mc) +static void save_mc_for_early(u8 *mc, unsigned int size) { #ifdef CONFIG_HOTPLUG_CPU /* Synchronization during CPU hotplug. */ static DEFINE_MUTEX(x86_cpu_microcode_mutex); - struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; - unsigned int mc_saved_count_init; - unsigned int num_saved; - struct microcode_intel **mc_saved; - int ret, i; - mutex_lock(&x86_cpu_microcode_mutex); - mc_saved_count_init = mc_saved_data.num_saved; - num_saved = mc_saved_data.num_saved; - mc_saved = mc_saved_data.mc_saved; - - if (mc_saved && num_saved) - memcpy(mc_saved_tmp, mc_saved, - num_saved * sizeof(struct microcode_intel *)); - /* - * Save the microcode patch mc in mc_save_tmp structure if it's a newer - * version. - */ - num_saved = _save_mc(mc_saved_tmp, mc, num_saved); - - /* - * Save the mc_save_tmp in global mc_saved_data. - */ - ret = save_microcode(&mc_saved_data, mc_saved_tmp, num_saved); - if (ret) { - pr_err("Cannot save microcode patch.\n"); - goto out; - } - + save_microcode_patch(mc, size); show_saved_mc(); - /* - * Free old saved microcode data. 
- */ - if (mc_saved) { - for (i = 0; i < mc_saved_count_init; i++) - kfree(mc_saved[i]); - kfree(mc_saved); - } - -out: mutex_unlock(&x86_cpu_microcode_mutex); #endif } -static bool __init load_builtin_intel_microcode(struct cpio_data *cp) +static bool load_builtin_intel_microcode(struct cpio_data *cp) { -#ifdef CONFIG_X86_64 - unsigned int eax = 0x00000001, ebx, ecx = 0, edx; + unsigned int eax = 1, ebx, ecx = 0, edx; char name[30]; + if (IS_ENABLED(CONFIG_X86_32)) + return false; + native_cpuid(&eax, &ebx, &ecx, &edx); sprintf(name, "intel-ucode/%02x-%02x-%02x", x86_family(eax), x86_model(eax), x86_stepping(eax)); return get_builtin_firmware(cp, name); -#else - return false; -#endif } /* @@ -570,8 +537,7 @@ void show_ucode_info_early(void) } /* - * At this point, we can not call printk() yet. Keep microcode patch number in - * mc_saved_data.mc_saved and delay printing microcode info in + * At this point, we can not call printk() yet. Delay printing microcode info in * show_ucode_info_early() until printk() works. */ static void print_ucode(struct ucode_cpu_info *uci) @@ -648,206 +614,140 @@ static int apply_microcode_early(struct ucode_cpu_info *uci, bool early) return 0; } -/* - * This function converts microcode patch offsets previously stored in - * mc_tmp_ptrs to pointers and stores the pointers in mc_saved_data. - */ int __init save_microcode_in_initrd_intel(void) { - struct microcode_intel *mc_saved[MAX_UCODE_COUNT]; - unsigned int count = mc_saved_data.num_saved; - unsigned long offset = 0; - int ret; - - if (!count) - return 0; + struct ucode_cpu_info uci; + struct cpio_data cp; /* - * We have found a valid initrd but it might've been relocated in the - * meantime so get its updated address. + * AP loading didn't find any microcode patch, no need to save anything. 
*/ - if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && blobs.valid) - offset = initrd_start; - - copy_ptrs(mc_saved, mc_tmp_ptrs, offset, count); + if (!intel_ucode_patch || IS_ERR(intel_ucode_patch)) + return 0; - ret = save_microcode(&mc_saved_data, mc_saved, count); - if (ret) - pr_err("Cannot save microcode patches from initrd.\n"); - else - show_saved_mc(); + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(ucode_path, false); - return ret; -} + if (!(cp.data && cp.size)) + return 0; -static __init enum ucode_state -__scan_microcode_initrd(struct cpio_data *cd, struct ucode_blobs *blbp) -{ -#ifdef CONFIG_BLK_DEV_INITRD - static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; - char *p = IS_ENABLED(CONFIG_X86_32) ? (char *)__pa_nodebug(ucode_name) - : ucode_name; -# ifdef CONFIG_X86_32 - unsigned long start = 0, size; - struct boot_params *params; + collect_cpu_info_early(&uci); - params = (struct boot_params *)__pa_nodebug(&boot_params); - size = params->hdr.ramdisk_size; + scan_microcode(cp.data, cp.size, &uci, true); - /* - * Set start only if we have an initrd image. We cannot use initrd_start - * because it is not set that early yet. - */ - start = (size ? params->hdr.ramdisk_image : 0); + show_saved_mc(); -# else /* CONFIG_X86_64 */ - unsigned long start = 0, size; + return 0; +} - size = (u64)boot_params.ext_ramdisk_size << 32; - size |= boot_params.hdr.ramdisk_size; - if (size) { - start = (u64)boot_params.ext_ramdisk_image << 32; - start |= boot_params.hdr.ramdisk_image; +/* + * @res_patch, output: a pointer to the patch we found. 
+ */ +static struct microcode_intel *__load_ucode_intel(struct ucode_cpu_info *uci) +{ + static const char *path; + struct cpio_data cp; + bool use_pa; - start += PAGE_OFFSET; + if (IS_ENABLED(CONFIG_X86_32)) { + path = (const char *)__pa_nodebug(ucode_path); + use_pa = true; + } else { + path = ucode_path; + use_pa = false; } -# endif - - *cd = find_cpio_data(p, (void *)start, size, NULL); - if (cd->data) { - blbp->start = start; - blbp->valid = true; - return UCODE_OK; - } else -#endif /* CONFIG_BLK_DEV_INITRD */ - return UCODE_ERROR; -} + /* try built-in microcode first */ + if (!load_builtin_intel_microcode(&cp)) + cp = find_microcode_in_initrd(path, use_pa); -static __init enum ucode_state -scan_microcode(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_cpu_info *uci, struct ucode_blobs *blbp) -{ - struct cpio_data cd = { NULL, 0, "" }; - enum ucode_state ret; + if (!(cp.data && cp.size)) + return NULL; - /* try built-in microcode first */ - if (load_builtin_intel_microcode(&cd)) - /* - * Invalidate blobs as we might've gotten an initrd too, - * supplied by the boot loader, by mistake or simply forgotten - * there. That's fine, we ignore it since we've found builtin - * microcode already. 
- */ - blbp->valid = false; - else { - ret = __scan_microcode_initrd(&cd, blbp); - if (ret != UCODE_OK) - return ret; - } + collect_cpu_info_early(uci); - return get_matching_model_microcode(blbp->start, cd.data, cd.size, - mcs, mc_ptrs, uci); + return scan_microcode(cp.data, cp.size, uci, false); } -static void __init -_load_ucode_intel_bsp(struct mc_saved_data *mcs, unsigned long *mc_ptrs, - struct ucode_blobs *blbp) +void __init load_ucode_intel_bsp(void) { + struct microcode_intel *patch; struct ucode_cpu_info uci; - enum ucode_state ret; - - collect_cpu_info_early(&uci); - ret = scan_microcode(mcs, mc_ptrs, &uci, blbp); - if (ret != UCODE_OK) + patch = __load_ucode_intel(&uci); + if (!patch) return; - ret = load_microcode(mcs, mc_ptrs, blbp->start, &uci); - if (ret != UCODE_OK) - return; + uci.mc = patch; apply_microcode_early(&uci, true); } -void __init load_ucode_intel_bsp(void) +void load_ucode_intel_ap(void) { - struct ucode_blobs *blobs_p; - struct mc_saved_data *mcs; - unsigned long *ptrs; + struct microcode_intel *patch, **iup; + struct ucode_cpu_info uci; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(&mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif + if (IS_ENABLED(CONFIG_X86_32)) + iup = (struct microcode_intel **) __pa_nodebug(&intel_ucode_patch); + else + iup = &intel_ucode_patch; + +reget: + if (!*iup) { + patch = __load_ucode_intel(&uci); + if (!patch) + return; + + *iup = patch; + } - _load_ucode_intel_bsp(mcs, ptrs, blobs_p); + uci.mc = *iup; + + if (apply_microcode_early(&uci, true)) { + /* Mixed-silicon system? 
Try to refetch the proper patch: */ + *iup = NULL; + + goto reget; + } } -void load_ucode_intel_ap(void) +static struct microcode_intel *find_patch(struct ucode_cpu_info *uci) { - struct ucode_blobs *blobs_p; - unsigned long *ptrs, start = 0; - struct mc_saved_data *mcs; - struct ucode_cpu_info uci; - enum ucode_state ret; + struct microcode_header_intel *phdr; + struct ucode_patch *iter, *tmp; -#ifdef CONFIG_X86_32 - mcs = (struct mc_saved_data *)__pa_nodebug(&mc_saved_data); - ptrs = (unsigned long *)__pa_nodebug(mc_tmp_ptrs); - blobs_p = (struct ucode_blobs *)__pa_nodebug(&blobs); -#else - mcs = &mc_saved_data; - ptrs = mc_tmp_ptrs; - blobs_p = &blobs; -#endif - - /* - * If there is no valid ucode previously saved in memory, no need to - * update ucode on this AP. - */ - if (!mcs->num_saved) - return; + list_for_each_entry_safe(iter, tmp, &microcode_cache, plist) { - if (blobs_p->valid) { - start = blobs_p->start; + phdr = (struct microcode_header_intel *)iter->data; - /* - * Pay attention to CONFIG_RANDOMIZE_MEMORY=y as it shuffles - * physmem mapping too and there we have the initrd. 
- */ - start += PAGE_OFFSET - __PAGE_OFFSET_BASE; - } + if (phdr->rev <= uci->cpu_sig.rev) + continue; - collect_cpu_info_early(&uci); - ret = load_microcode(mcs, ptrs, start, &uci); - if (ret != UCODE_OK) - return; + if (!find_matching_signature(phdr, + uci->cpu_sig.sig, + uci->cpu_sig.pf)) + continue; - apply_microcode_early(&uci, true); + return iter->data; + } + return NULL; } void reload_ucode_intel(void) { + struct microcode_intel *p; struct ucode_cpu_info uci; - enum ucode_state ret; - - if (!mc_saved_data.num_saved) - return; collect_cpu_info_early(&uci); - ret = find_microcode_patch(mc_saved_data.mc_saved, - mc_saved_data.num_saved, &uci); - if (ret != UCODE_OK) + p = find_patch(&uci); + if (!p) return; + uci.mc = p; + apply_microcode_early(&uci, false); } @@ -879,24 +779,6 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) return 0; } -/* - * return 0 - no update found - * return 1 - found update - */ -static int get_matching_mc(struct microcode_intel *mc, int cpu) -{ - struct cpu_signature cpu_sig; - unsigned int csig, cpf, crev; - - collect_cpu_info(cpu, &cpu_sig); - - csig = cpu_sig.sig; - cpf = cpu_sig.pf; - crev = cpu_sig.rev; - - return has_newer_microcode(mc, csig, cpf, crev); -} - static int apply_microcode_intel(int cpu) { struct microcode_intel *mc; @@ -911,16 +793,12 @@ static int apply_microcode_intel(int cpu) uci = ucode_cpu_info + cpu; mc = uci->mc; - if (!mc) - return 0; - - /* - * Microcode on this CPU could be updated earlier. Only apply the - * microcode patch in mc when it is newer than the one on this - * CPU. 
- */ - if (!get_matching_mc(mc, cpu)) - return 0; + if (!mc) { + /* Look for a newer patch in our cache: */ + mc = find_patch(uci); + if (!mc) + return 0; + } /* write microcode via MSR 0x79 */ wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); @@ -962,7 +840,6 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; - enum ucode_state state = UCODE_OK; unsigned int curr_mc_size = 0; unsigned int csig, cpf; @@ -1015,14 +892,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, if (leftover) { vfree(new_mc); - state = UCODE_ERROR; - goto out; + return UCODE_ERROR; } - if (!new_mc) { - state = UCODE_NFOUND; - goto out; - } + if (!new_mc) + return UCODE_NFOUND; vfree(uci->mc); uci->mc = (struct microcode_intel *)new_mc; @@ -1032,12 +906,12 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, * permanent memory. So it will be loaded early when a CPU is hot added * or resumes. 
*/ - save_mc_for_early(new_mc); + save_mc_for_early(new_mc, curr_mc_size); pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); -out: - return state; + + return UCODE_OK; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -1081,20 +955,11 @@ request_microcode_user(int cpu, const void __user *buf, size_t size) return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } -static void microcode_fini_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - - vfree(uci->mc); - uci->mc = NULL; -} - static struct microcode_ops microcode_intel_ops = { .request_microcode_user = request_microcode_user, .request_microcode_fw = request_microcode_fw, .collect_cpu_info = collect_cpu_info, .apply_microcode = apply_microcode_intel, - .microcode_fini_cpu = microcode_fini_cpu, }; struct microcode_ops * __init init_intel_microcode(void) @@ -1109,4 +974,3 @@ struct microcode_ops * __init init_intel_microcode(void) return &microcode_intel_ops; } - diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c deleted file mode 100644 index 406cb6c0d9dd..000000000000 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Intel CPU Microcode Update Driver for Linux - * - * Copyright (C) 2012 Fenghua Yu <fenghua.yu@intel.com> - * H Peter Anvin" <hpa@zytor.com> - * - * This driver allows to upgrade microcode on Intel processors - * belonging to IA-32 family - PentiumPro, Pentium II, - * Pentium III, Xeon, Pentium 4, etc. - * - * Reference: Section 8.11 of Volume 3a, IA-32 Intel? 
Architecture - * Software Developer's Manual - * Order Number 253668 or free download from: - * - * http://developer.intel.com/Assets/PDF/manual/253668.pdf - * - * For more information, go to http://www.urbanmyth.org/microcode - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - * - */ -#include <linux/firmware.h> -#include <linux/uaccess.h> -#include <linux/kernel.h> - -#include <asm/microcode_intel.h> -#include <asm/processor.h> -#include <asm/msr.h> - -static inline bool cpu_signatures_match(unsigned int s1, unsigned int p1, - unsigned int s2, unsigned int p2) -{ - if (s1 != s2) - return false; - - /* Processor flags are either both 0 ... */ - if (!p1 && !p2) - return true; - - /* ... or they intersect. */ - return p1 & p2; -} - -int microcode_sanity_check(void *mc, int print_err) -{ - unsigned long total_size, data_size, ext_table_size; - struct microcode_header_intel *mc_header = mc; - struct extended_sigtable *ext_header = NULL; - u32 sum, orig_sum, ext_sigcount = 0, i; - struct extended_signature *ext_sig; - - total_size = get_totalsize(mc_header); - data_size = get_datasize(mc_header); - - if (data_size + MC_HEADER_SIZE > total_size) { - if (print_err) - pr_err("Error: bad microcode data file size.\n"); - return -EINVAL; - } - - if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { - if (print_err) - pr_err("Error: invalid/unknown microcode update format.\n"); - return -EINVAL; - } - - ext_table_size = total_size - (MC_HEADER_SIZE + data_size); - if (ext_table_size) { - u32 ext_table_sum = 0; - u32 *ext_tablep; - - if ((ext_table_size < EXT_HEADER_SIZE) - || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { - if (print_err) - pr_err("Error: truncated extended signature table.\n"); - return -EINVAL; - } - - ext_header = mc + MC_HEADER_SIZE 
+ data_size; - if (ext_table_size != exttable_size(ext_header)) { - if (print_err) - pr_err("Error: extended signature table size mismatch.\n"); - return -EFAULT; - } - - ext_sigcount = ext_header->count; - - /* - * Check extended table checksum: the sum of all dwords that - * comprise a valid table must be 0. - */ - ext_tablep = (u32 *)ext_header; - - i = ext_table_size / sizeof(u32); - while (i--) - ext_table_sum += ext_tablep[i]; - - if (ext_table_sum) { - if (print_err) - pr_warn("Bad extended signature table checksum, aborting.\n"); - return -EINVAL; - } - } - - /* - * Calculate the checksum of update data and header. The checksum of - * valid update data and header including the extended signature table - * must be 0. - */ - orig_sum = 0; - i = (MC_HEADER_SIZE + data_size) / sizeof(u32); - while (i--) - orig_sum += ((u32 *)mc)[i]; - - if (orig_sum) { - if (print_err) - pr_err("Bad microcode data checksum, aborting.\n"); - return -EINVAL; - } - - if (!ext_table_size) - return 0; - - /* - * Check extended signature checksum: 0 => valid. - */ - for (i = 0; i < ext_sigcount; i++) { - ext_sig = (void *)ext_header + EXT_HEADER_SIZE + - EXT_SIGNATURE_SIZE * i; - - sum = (mc_header->sig + mc_header->pf + mc_header->cksum) - - (ext_sig->sig + ext_sig->pf + ext_sig->cksum); - if (sum) { - if (print_err) - pr_err("Bad extended signature checksum, aborting.\n"); - return -EINVAL; - } - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int find_matching_signature(void *mc, unsigned int csig, int cpf) -{ - struct microcode_header_intel *mc_hdr = mc; - struct extended_sigtable *ext_hdr; - struct extended_signature *ext_sig; - int i; - - if (cpu_signatures_match(csig, cpf, mc_hdr->sig, mc_hdr->pf)) - return 1; - - /* Look for ext. 
headers: */ - if (get_totalsize(mc_hdr) <= get_datasize(mc_hdr) + MC_HEADER_SIZE) - return 0; - - ext_hdr = mc + get_datasize(mc_hdr) + MC_HEADER_SIZE; - ext_sig = (void *)ext_hdr + EXT_HEADER_SIZE; - - for (i = 0; i < ext_hdr->count; i++) { - if (cpu_signatures_match(csig, cpf, ext_sig->sig, ext_sig->pf)) - return 1; - ext_sig++; - } - return 0; -} - -/* - * Returns 1 if update has been found, 0 otherwise. - */ -int has_newer_microcode(void *mc, unsigned int csig, int cpf, int new_rev) -{ - struct microcode_header_intel *mc_hdr = mc; - - if (mc_hdr->rev <= new_rev) - return 0; - - return find_matching_signature(mc, csig, cpf); -} diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 8f44c5a50ab8..6c044543545e 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -25,7 +25,6 @@ #include <asm/hyperv.h> #include <asm/mshyperv.h> #include <asm/desc.h> -#include <asm/idle.h> #include <asm/irq_regs.h> #include <asm/i8259.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 1db8dc490b66..d1316f9c8329 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -17,11 +17,17 @@ struct cpuid_bit { u32 sub_leaf; }; -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX +/* Please keep the leaf sorted by cpuid_bit.level for faster search. 
*/ +static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, + { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, + { 0, 0, 0, 0, 0 } }; void init_scattered_cpuid_features(struct cpuinfo_x86 *c) @@ -30,18 +36,6 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) u32 regs[4]; const struct cpuid_bit *cb; - static const struct cpuid_bit cpuid_bits[] = { - { X86_FEATURE_INTEL_PT, CR_EBX,25, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4VNNIW, CR_EDX, 2, 0x00000007, 0 }, - { X86_FEATURE_AVX512_4FMAPS, CR_EDX, 3, 0x00000007, 0 }, - { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, - { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, - { X86_FEATURE_HW_PSTATE, CR_EDX, 7, 0x80000007, 0 }, - { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, - { X86_FEATURE_PROC_FEEDBACK, CR_EDX,11, 0x80000007, 0 }, - { 0, 0, 0, 0, 0 } - }; - for (cb = cpuid_bits; cb->feature; cb++) { /* Verify that the level is valid */ @@ -50,10 +44,35 @@ void init_scattered_cpuid_features(struct cpuinfo_x86 *c) max_level > (cb->level | 0xffff)) continue; - cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX], - &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]); + cpuid_count(cb->level, cb->sub_leaf, &regs[CPUID_EAX], + &regs[CPUID_EBX], &regs[CPUID_ECX], + &regs[CPUID_EDX]); if (regs[cb->reg] & (1 << cb->bit)) set_cpu_cap(c, cb->feature); } } + +u32 get_scattered_cpuid_leaf(unsigned int level, unsigned int sub_leaf, + enum cpuid_regs_idx reg) +{ + const struct cpuid_bit *cb; + u32 cpuid_val = 0; + + for (cb = cpuid_bits; cb->feature; cb++) { + + if (level > cb->level) + continue; + + if (level < cb->level) + break; + + if (reg == cb->reg 
&& sub_leaf == cb->sub_leaf) { + if (cpu_has(&boot_cpu_data, cb->feature)) + cpuid_val |= BIT(cb->bit); + } + } + + return cpuid_val; +} +EXPORT_SYMBOL_GPL(get_scattered_cpuid_leaf); diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index 5130985b758b..891f4dad7b2c 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,11 +24,16 @@ #include <linux/dmi.h> #include <linux/init.h> #include <linux/export.h> +#include <linux/clocksource.h> #include <asm/div64.h> #include <asm/x86_init.h> #include <asm/hypervisor.h> #include <asm/timer.h> #include <asm/apic.h> +#include <asm/timer.h> + +#undef pr_fmt +#define pr_fmt(fmt) "vmware: " fmt #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -48,6 +53,8 @@ "2"(VMWARE_HYPERVISOR_PORT), "3"(UINT_MAX) : \ "memory"); +static unsigned long vmware_tsc_khz __ro_after_init; + static inline int __vmware_platform(void) { uint32_t eax, ebx, ecx, edx; @@ -57,35 +64,80 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz, lpj; - uint32_t eax, ebx, ecx, edx; + return vmware_tsc_khz; +} - VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); +#ifdef CONFIG_PARAVIRT +static struct cyc2ns_data vmware_cyc2ns __ro_after_init; +static int vmw_sched_clock __initdata = 1; - tsc_hz = eax | (((uint64_t)ebx) << 32); - do_div(tsc_hz, 1000); - BUG_ON(tsc_hz >> 32); - pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", - (unsigned long) tsc_hz / 1000, - (unsigned long) tsc_hz % 1000); - - if (!preset_lpj) { - lpj = ((u64)tsc_hz * 1000); - do_div(lpj, HZ); - preset_lpj = lpj; - } +static __init int setup_vmw_sched_clock(char *s) +{ + vmw_sched_clock = 0; + return 0; +} +early_param("no-vmw-sched-clock", setup_vmw_sched_clock); + +static unsigned long long vmware_sched_clock(void) +{ + unsigned long long ns; - return tsc_hz; + ns = mul_u64_u32_shr(rdtsc(), vmware_cyc2ns.cyc2ns_mul, + vmware_cyc2ns.cyc2ns_shift); + ns 
-= vmware_cyc2ns.cyc2ns_offset; + return ns; } +static void __init vmware_sched_clock_setup(void) +{ + struct cyc2ns_data *d = &vmware_cyc2ns; + unsigned long long tsc_now = rdtsc(); + + clocks_calc_mult_shift(&d->cyc2ns_mul, &d->cyc2ns_shift, + vmware_tsc_khz, NSEC_PER_MSEC, 0); + d->cyc2ns_offset = mul_u64_u32_shr(tsc_now, d->cyc2ns_mul, + d->cyc2ns_shift); + + pv_time_ops.sched_clock = vmware_sched_clock; + pr_info("using sched offset of %llu ns\n", d->cyc2ns_offset); +} + +static void __init vmware_paravirt_ops_setup(void) +{ + pv_info.name = "VMware hypervisor"; + pv_cpu_ops.io_delay = paravirt_nop; + + if (vmware_tsc_khz && vmw_sched_clock) + vmware_sched_clock_setup(); +} +#else +#define vmware_paravirt_ops_setup() do {} while (0) +#endif + static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; + uint64_t lpj, tsc_khz; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); if (ebx != UINT_MAX) { + lpj = tsc_khz = eax | (((uint64_t)ebx) << 32); + do_div(tsc_khz, 1000); + WARN_ON(tsc_khz >> 32); + pr_info("TSC freq read from hypervisor : %lu.%03lu MHz\n", + (unsigned long) tsc_khz / 1000, + (unsigned long) tsc_khz % 1000); + + if (!preset_lpj) { + do_div(lpj, HZ); + preset_lpj = lpj; + } + + vmware_tsc_khz = tsc_khz; x86_platform.calibrate_tsc = vmware_get_tsc_khz; + x86_platform.calibrate_cpu = vmware_get_tsc_khz; + #ifdef CONFIG_X86_LOCAL_APIC /* Skip lapic calibration since we know the bus frequency. 
*/ lapic_timer_frequency = ecx / HZ; @@ -96,6 +148,8 @@ static void __init vmware_platform_setup(void) pr_warn("Failed to get TSC freq from the hypervisor\n"); } + vmware_paravirt_ops_setup(); + #ifdef CONFIG_X86_IO_APIC no_timer_check = 1; #endif diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 2836de390f95..0931a105ffe1 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -45,10 +45,7 @@ #include <asm/msr.h> static struct class *cpuid_class; - -struct cpuid_regs { - u32 eax, ebx, ecx, edx; -}; +static enum cpuhp_state cpuhp_cpuid_state; static void cpuid_smp_cpuid(void *cmd_block) { @@ -115,7 +112,7 @@ static const struct file_operations cpuid_fops = { .open = cpuid_open, }; -static int cpuid_device_create(int cpu) +static int cpuid_device_create(unsigned int cpu) { struct device *dev; @@ -124,35 +121,12 @@ static int cpuid_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void cpuid_device_destroy(int cpu) +static int cpuid_device_destroy(unsigned int cpu) { device_destroy(cpuid_class, MKDEV(CPUID_MAJOR, cpu)); + return 0; } -static int cpuid_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = cpuid_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - cpuid_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block cpuid_class_cpu_notifier = -{ - .notifier_call = cpuid_class_cpu_callback, -}; - static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); @@ -160,15 +134,13 @@ static char *cpuid_devnode(struct device *dev, umode_t *mode) static int __init cpuid_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid", &cpuid_fops)) { printk(KERN_ERR "cpuid: 
unable to get major %d for cpuid\n", CPUID_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } cpuid_class = class_create(THIS_MODULE, "cpuid"); if (IS_ERR(cpuid_class)) { @@ -177,45 +149,28 @@ static int __init cpuid_init(void) } cpuid_class->devnode = cpuid_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = cpuid_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/cpuid:online", + cpuid_device_create, cpuid_device_destroy); + if (err < 0) + goto out_class; - err = 0; - goto out; + cpuhp_cpuid_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) { - cpuid_device_destroy(i); - } - cpu_notifier_register_done(); class_destroy(cpuid_class); out_chrdev: __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); -out: return err; } +module_init(cpuid_init); static void __exit cpuid_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - cpuid_device_destroy(cpu); + cpuhp_remove_state(cpuhp_cpuid_state); class_destroy(cpuid_class); __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); - __unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(cpuid_init); module_exit(cpuid_exit); MODULE_AUTHOR("H. 
Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 85f854b98a9d..0cfd01d2754c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -22,7 +22,6 @@ int panic_on_unrecovered_nmi; int panic_on_io_nmi; unsigned int code_bytes = 64; -int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; bool in_task_stack(unsigned long *stack, struct task_struct *task, @@ -46,14 +45,7 @@ static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { touch_nmi_watchdog(); - printk("%s [<%p>] %s%pB\n", - log_lvl, (void *)address, reliable ? "" : "? ", - (void *)address); -} - -void printk_address(unsigned long address) -{ - pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); + printk("%s %s%pB\n", log_lvl, reliable ? "" : "? ", (void *)address); } void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -67,6 +59,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); + stack = stack ? : get_stack_pointer(task, regs); /* * Iterate through the stacks, starting with the current stack pointer. 
@@ -82,8 +75,8 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (; stack; stack = stack_info.next_sp) { - const char *str_begin, *str_end; + for (regs = NULL; stack; stack = stack_info.next_sp) { + const char *stack_name; /* * If we overflowed the task stack into a guard page, jump back @@ -95,9 +88,9 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (get_stack_info(stack, task, &stack_info, &visit_mask)) break; - stack_type_str(stack_info.type, &str_begin, &str_end); - if (str_begin) - printk("%s <%s> ", log_lvl, str_begin); + stack_name = stack_type_name(stack_info.type); + if (stack_name) + printk("%s <%s>\n", log_lvl, stack_name); /* * Scan the stack, printing any text addresses we find. At the @@ -119,6 +112,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, if (!__kernel_text_address(addr)) continue; + /* + * Don't print regs->ip again if it was already printed + * by __show_regs() below. + */ + if (regs && stack == &regs->ip) { + unwind_next_frame(&state); + continue; + } + if (stack == ret_addr_p) reliable = 1; @@ -146,10 +148,15 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * of the addresses will just be printed as unreliable. 
*/ unwind_next_frame(&state); + + /* if the frame has entry regs, print them */ + regs = unwind_get_entry_regs(&state); + if (regs) + __show_regs(regs, 0); } - if (str_end) - printk("%s <%s> ", log_lvl, str_end); + if (stack_name) + printk("%s </%s>\n", log_lvl, stack_name); } } @@ -164,12 +171,12 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(task, NULL, sp, ""); + show_trace_log_lvl(task, NULL, sp, KERN_DEFAULT); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, ""); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; @@ -261,14 +268,11 @@ int __die(const char *str, struct pt_regs *regs, long err) sp = kernel_stack_pointer(regs); savesegment(ss, ss); } - printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); - print_symbol("%s", regs->ip); - printk(" SS:ESP %04x:%08lx\n", ss, sp); + printk(KERN_EMERG "EIP: %pS SS:ESP: %04x:%08lx\n", + (void *)regs->ip, ss, sp); #else /* Executive summary in case the oops scrolled away */ - printk(KERN_ALERT "RIP "); - printk_address(regs->ip); - printk(" RSP <%016lx>\n", regs->sp); + printk(KERN_ALERT "RIP: %pS RSP: %016lx\n", (void *)regs->ip, regs->sp); #endif return 0; } @@ -291,22 +295,6 @@ void die(const char *str, struct pt_regs *regs, long err) oops_end(flags, regs, sig); } -static int __init kstack_setup(char *s) -{ - ssize_t ret; - unsigned long val; - - if (!s) - return -EINVAL; - - ret = kstrtoul(s, 0, &val); - if (ret) - return ret; - kstack_depth_to_print = val; - return 0; -} -early_param("kstack", kstack_setup); - static int __init code_bytes_setup(char *s) { ssize_t ret; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 06eb322b5f9f..bb3b5b9a6899 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,18 +16,15 @@ #include <asm/stacktrace.h> -void 
stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { - switch (type) { - case STACK_TYPE_IRQ: - case STACK_TYPE_SOFTIRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + + return NULL; } static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) @@ -109,8 +106,10 @@ recursion_check: * just break out and report an unknown stack type. */ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -121,36 +120,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - sp = sp ? 
: get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - if (kstack_end(stack)) - break; - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %08lx", log_lvl, *stack++); - } else - pr_cont(" %08lx", *stack++); - touch_nmi_watchdog(); - } - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - - void show_regs(struct pt_regs *regs) { int i; @@ -168,8 +137,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_EMERG); + show_trace_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 36cf1a498227..fac189efcc34 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -28,23 +28,17 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -void stack_type_str(enum stack_type type, const char **begin, const char **end) +const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); - switch (type) { - case STACK_TYPE_IRQ: - *begin = "IRQ"; - *end = "EOI"; - break; - case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: - *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; - *end = "EOE"; - break; - default: - *begin = NULL; - *end = NULL; - } + if (type == STACK_TYPE_IRQ) + return "IRQ"; + + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) + return exception_stack_names[type - STACK_TYPE_EXCEPTION]; + + return NULL; } static bool in_exception_stack(unsigned long *stack, struct stack_info *info) @@ -128,8 +122,10 @@ recursion_check: * just break out and report an unknown stack type. 
*/ if (visit_mask) { - if (*visit_mask & (1UL << info->type)) + if (*visit_mask & (1UL << info->type)) { + printk_deferred_once(KERN_WARNING "WARNING: stack recursion on stack type %d\n", info->type); goto unknown; + } *visit_mask |= 1UL << info->type; } @@ -140,56 +136,6 @@ unknown: return -EINVAL; } -void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, char *log_lvl) -{ - unsigned long *irq_stack_end; - unsigned long *irq_stack; - unsigned long *stack; - int i; - - if (!try_get_task_stack(task)) - return; - - irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); - irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - - sp = sp ? : get_stack_pointer(task, regs); - - stack = sp; - for (i = 0; i < kstack_depth_to_print; i++) { - unsigned long word; - - if (stack >= irq_stack && stack <= irq_stack_end) { - if (stack == irq_stack_end) { - stack = (unsigned long *) (irq_stack_end[-1]); - pr_cont(" <EOI> "); - } - } else { - if (kstack_end(stack)) - break; - } - - if (probe_kernel_address(stack, word)) - break; - - if ((i % STACKSLOTS_PER_LINE) == 0) { - if (i != 0) - pr_cont("\n"); - printk("%s %016lx", log_lvl, word); - } else - pr_cont(" %016lx", word); - - stack++; - touch_nmi_watchdog(); - } - - pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, log_lvl); - - put_task_stack(task); -} - void show_regs(struct pt_regs *regs) { int i; @@ -207,8 +153,7 @@ void show_regs(struct pt_regs *regs) unsigned char c; u8 *ip; - printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); + show_trace_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/kernel/fpu/bugs.c b/arch/x86/kernel/fpu/bugs.c index aad34aafc0e0..d913047f832c 100644 --- a/arch/x86/kernel/fpu/bugs.c +++ b/arch/x86/kernel/fpu/bugs.c @@ -23,17 +23,12 @@ static double __initdata y = 3145727.0; */ void __init fpu__init_check_bugs(void) { - u32 cr0_saved; s32 fdiv_bug; /* 
kernel_fpu_begin/end() relies on patched alternative instructions. */ if (!boot_cpu_has(X86_FEATURE_FPU)) return; - /* We might have CR0::TS set already, clear it: */ - cr0_saved = read_cr0(); - write_cr0(cr0_saved & ~X86_CR0_TS); - kernel_fpu_begin(); /* @@ -56,8 +51,6 @@ void __init fpu__init_check_bugs(void) kernel_fpu_end(); - write_cr0(cr0_saved); - if (fdiv_bug) { set_cpu_bug(&boot_cpu_data, X86_BUG_FDIV); pr_warn("Hmm, FPU with FDIV bug\n"); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index ebb4e95fbd74..e4e97a5355ce 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -58,27 +58,9 @@ static bool kernel_fpu_disabled(void) return this_cpu_read(in_kernel_fpu); } -/* - * Were we in an interrupt that interrupted kernel mode? - * - * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that - * pair does nothing at all: the thread must not have fpu (so - * that we don't try to save the FPU state), and TS must - * be set (so that the clts/stts pair does nothing that is - * visible in the interrupted kernel thread). - * - * Except for the eagerfpu case when we return true; in the likely case - * the thread has FPU but we are not going to set/clear TS. 
- */ static bool interrupted_kernel_fpu_idle(void) { - if (kernel_fpu_disabled()) - return false; - - if (use_eager_fpu()) - return true; - - return !current->thread.fpu.fpregs_active && (read_cr0() & X86_CR0_TS); + return !kernel_fpu_disabled(); } /* @@ -125,8 +107,7 @@ void __kernel_fpu_begin(void) */ copy_fpregs_to_fpstate(fpu); } else { - this_cpu_write(fpu_fpregs_owner_ctx, NULL); - __fpregs_activate_hw(); + __cpu_invalidate_fpregs_state(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -137,8 +118,6 @@ void __kernel_fpu_end(void) if (fpu->fpregs_active) copy_kernel_to_fpregs(&fpu->state); - else - __fpregs_deactivate_hw(); kernel_fpu_enable(); } @@ -159,35 +138,6 @@ void kernel_fpu_end(void) EXPORT_SYMBOL_GPL(kernel_fpu_end); /* - * CR0::TS save/restore functions: - */ -int irq_ts_save(void) -{ - /* - * If in process context and not atomic, we can take a spurious DNA fault. - * Otherwise, doing clts() in process context requires disabling preemption - * or some heavy lifting like kernel_fpu_begin() - */ - if (!in_atomic()) - return 0; - - if (read_cr0() & X86_CR0_TS) { - clts(); - return 1; - } - - return 0; -} -EXPORT_SYMBOL_GPL(irq_ts_save); - -void irq_ts_restore(int TS_state) -{ - if (TS_state) - stts(); -} -EXPORT_SYMBOL_GPL(irq_ts_restore); - -/* * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. 
@@ -200,10 +150,7 @@ void fpu__save(struct fpu *fpu) trace_x86_fpu_before_save(fpu); if (fpu->fpregs_active) { if (!copy_fpregs_to_fpstate(fpu)) { - if (use_eager_fpu()) - copy_kernel_to_fpregs(&fpu->state); - else - fpregs_deactivate(fpu); + copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); @@ -247,7 +194,6 @@ EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) { - dst_fpu->counter = 0; dst_fpu->fpregs_active = 0; dst_fpu->last_cpu = -1; @@ -260,8 +206,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ - if (use_eager_fpu()) - memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); + memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * Save current FPU registers directly into the child @@ -283,10 +228,7 @@ int fpu__copy(struct fpu *dst_fpu, struct fpu *src_fpu) memcpy(&src_fpu->state, &dst_fpu->state, fpu_kernel_xstate_size); - if (use_eager_fpu()) - copy_kernel_to_fpregs(&src_fpu->state); - else - fpregs_deactivate(src_fpu); + copy_kernel_to_fpregs(&src_fpu->state); } preempt_enable(); @@ -366,7 +308,7 @@ void fpu__activate_fpstate_write(struct fpu *fpu) if (fpu->fpstate_active) { /* Invalidate any lazy state: */ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } else { fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); @@ -409,7 +351,7 @@ void fpu__current_fpstate_write_begin(void) * ensures we will not be lazy and skip a XRSTOR in the * future. 
*/ - fpu->last_cpu = -1; + __fpu_invalidate_fpregs_state(fpu); } /* @@ -459,7 +401,6 @@ void fpu__restore(struct fpu *fpu) trace_x86_fpu_before_restore(fpu); fpregs_activate(fpu); copy_kernel_to_fpregs(&fpu->state); - fpu->counter++; trace_x86_fpu_after_restore(fpu); kernel_fpu_enable(); } @@ -477,7 +418,6 @@ EXPORT_SYMBOL_GPL(fpu__restore); void fpu__drop(struct fpu *fpu) { preempt_disable(); - fpu->counter = 0; if (fpu->fpregs_active) { /* Ignore delayed exceptions from user space */ diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 2f2b8c7ccb85..60dece392b3a 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -10,18 +10,6 @@ #include <linux/init.h> /* - * Initialize the TS bit in CR0 according to the style of context-switches - * we are using: - */ -static void fpu__init_cpu_ctx_switch(void) -{ - if (!boot_cpu_has(X86_FEATURE_EAGER_FPU)) - stts(); - else - clts(); -} - -/* * Initialize the registers found in all CPUs, CR0 and CR4: */ static void fpu__init_cpu_generic(void) @@ -58,7 +46,6 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); - fpu__init_cpu_ctx_switch(); } /* @@ -233,82 +220,16 @@ static void __init fpu__init_system_xstate_size_legacy(void) } /* - * FPU context switching strategies: - * - * Against popular belief, we don't do lazy FPU saves, due to the - * task migration complications it brings on SMP - we only do - * lazy FPU restores. - * - * 'lazy' is the traditional strategy, which is based on setting - * CR0::TS to 1 during context-switch (instead of doing a full - * restore of the FPU state), which causes the first FPU instruction - * after the context switch (whenever it is executed) to fault - at - * which point we lazily restore the FPU state into FPU registers. - * - * Tasks are of course under no obligation to execute FPU instructions, - * so it can easily happen that another context-switch occurs without - * a single FPU instruction being executed. 
If we eventually switch - * back to the original task (that still owns the FPU) then we have - * not only saved the restores along the way, but we also have the - * FPU ready to be used for the original task. - * - * 'lazy' is deprecated because it's almost never a performance win - * and it's much more complicated than 'eager'. - * - * 'eager' switching is by default on all CPUs, there we switch the FPU - * state during every context switch, regardless of whether the task - * has used FPU instructions in that time slice or not. This is done - * because modern FPU context saving instructions are able to optimize - * state saving and restoration in hardware: they can detect both - * unused and untouched FPU state and optimize accordingly. - * - * [ Note that even in 'lazy' mode we might optimize context switches - * to use 'eager' restores, if we detect that a task is using the FPU - * frequently. See the fpu->counter logic in fpu/internal.h for that. ] - */ -static enum { ENABLE, DISABLE } eagerfpu = ENABLE; - -/* * Find supported xfeatures based on cpu features and command-line input. * This must be called after fpu__init_parse_early_param() is called and * xfeatures_mask is enumerated. */ u64 __init fpu__get_supported_xfeatures_mask(void) { - /* Support all xfeatures known to us */ - if (eagerfpu != DISABLE) - return XCNTXT_MASK; - - /* Warning of xfeatures being disabled for no eagerfpu mode */ - if (xfeatures_mask & XFEATURE_MASK_EAGER) { - pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n", - xfeatures_mask & XFEATURE_MASK_EAGER); - } - - /* Return a mask that masks out all features requiring eagerfpu mode */ - return ~XFEATURE_MASK_EAGER; + return XCNTXT_MASK; } -/* - * Disable features dependent on eagerfpu. 
- */ -static void __init fpu__clear_eager_fpu_features(void) -{ - setup_clear_cpu_cap(X86_FEATURE_MPX); -} - -/* - * Pick the FPU context switching strategy: - * - * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of - * the following is true: - * - * (1) the cpu has xsaveopt, as it has the optimization and doing eager - * FPU switching has a relatively low cost compared to a plain xsave; - * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU - * switching. Should the kernel boot with noxsaveopt, we support MPX - * with eager FPU switching at a higher cost. - */ +/* Legacy code to initialize eager fpu mode. */ static void __init fpu__init_system_ctx_switch(void) { static bool on_boot_cpu __initdata = 1; @@ -317,17 +238,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - - if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) - eagerfpu = ENABLE; - - if (xfeatures_mask & XFEATURE_MASK_EAGER) - eagerfpu = ENABLE; - - if (eagerfpu == ENABLE) - setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); - - printk(KERN_INFO "x86/fpu: Using '%s' FPU context switches.\n", eagerfpu == ENABLE ? "eager" : "lazy"); } /* @@ -336,11 +246,6 @@ static void __init fpu__init_system_ctx_switch(void) */ static void __init fpu__init_parse_early_param(void) { - if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) { - eagerfpu = DISABLE; - fpu__clear_eager_fpu_features(); - } - if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -375,14 +280,6 @@ void __init fpu__init_system(struct cpuinfo_x86 *c) */ fpu__init_cpu(); - /* - * But don't leave CR0::TS set yet, as some of the FPU setup - * methods depend on being able to execute FPU instructions - * that will fault on a set TS, such as the FXSAVE in - * fpu__init_system_mxcsr(). 
- */ - clts(); - fpu__init_system_generic(); fpu__init_system_xstate_size_legacy(); fpu__init_system_xstate(); diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index a184c210efba..83c23c230b4c 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -340,11 +340,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) } fpu->fpstate_active = 1; - if (use_eager_fpu()) { - preempt_disable(); - fpu__restore(fpu); - preempt_enable(); - } + preempt_disable(); + fpu__restore(fpu); + preempt_enable(); return err; } else { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 095ef7ddd6ae..1d7770447b3e 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -65,6 +65,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX); setup_clear_cpu_cap(X86_FEATURE_AVX2); setup_clear_cpu_cap(X86_FEATURE_AVX512F); + setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); setup_clear_cpu_cap(X86_FEATURE_AVX512PF); setup_clear_cpu_cap(X86_FEATURE_AVX512ER); setup_clear_cpu_cap(X86_FEATURE_AVX512CD); @@ -73,6 +74,7 @@ void fpu__xstate_clear_all_cpu_caps(void) setup_clear_cpu_cap(X86_FEATURE_AVX512VL); setup_clear_cpu_cap(X86_FEATURE_MPX); setup_clear_cpu_cap(X86_FEATURE_XGETBV1); + setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); setup_clear_cpu_cap(X86_FEATURE_PKU); setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); @@ -890,15 +892,6 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, */ if (!boot_cpu_has(X86_FEATURE_OSPKE)) return -EINVAL; - /* - * For most XSAVE components, this would be an arduous task: - * brining fpstate up to date with fpregs, updating fpstate, - * then re-populating fpregs. But, for components that are - * never lazily managed, we can just access the fpregs - * directly. PKRU is never managed lazily, so we can just - * manipulate it directly. 
Make sure it stays that way. - */ - WARN_ON_ONCE(!use_eager_fpu()); /* Set the bits we need in PKRU: */ if (init_val & PKEY_DISABLE_ACCESS) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 2dabea46f039..4e8577d03372 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -63,6 +63,8 @@ #define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) #endif +#define SIZEOF_PTREGS 17*4 + /* * Number of possible pages in the lowmem region. * @@ -248,19 +250,19 @@ page_pde_offset = (__PAGE_OFFSET >> 20); #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) - jb default_entry + jb .Ldefault_entry /* Paravirt-compatible boot parameters. Look to see what architecture we're booting under. */ movl pa(boot_params + BP_hardware_subarch), %eax cmpl $num_subarch_entries, %eax - jae bad_subarch + jae .Lbad_subarch movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax jmp *%eax -bad_subarch: +.Lbad_subarch: WEAK(lguest_entry) WEAK(xen_entry) /* Unknown implementation; there's really @@ -270,14 +272,14 @@ WEAK(xen_entry) __INITDATA subarch_entries: - .long default_entry /* normal x86/PC */ + .long .Ldefault_entry /* normal x86/PC */ .long lguest_entry /* lguest hypervisor */ .long xen_entry /* Xen hypervisor */ - .long default_entry /* Moorestown MID */ + .long .Ldefault_entry /* Moorestown MID */ num_subarch_entries = (. - subarch_entries) / 4 .previous #else - jmp default_entry + jmp .Ldefault_entry #endif /* CONFIG_PARAVIRT */ #ifdef CONFIG_HOTPLUG_CPU @@ -289,7 +291,8 @@ num_subarch_entries = (. 
- subarch_entries) / 4 ENTRY(start_cpu0) movl initial_stack, %ecx movl %ecx, %esp - jmp *(initial_code) + call *(initial_code) +1: jmp 1b ENDPROC(start_cpu0) #endif @@ -317,7 +320,7 @@ ENTRY(startup_32_smp) call load_ucode_ap #endif -default_entry: +.Ldefault_entry: #define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ X86_CR0_PG) @@ -347,7 +350,7 @@ default_entry: pushfl popl %eax # get EFLAGS testl $X86_EFLAGS_ID,%eax # did EFLAGS.ID remained set? - jz enable_paging # hw disallowed setting of ID bit + jz .Lenable_paging # hw disallowed setting of ID bit # which means no CPUID and no CR4 xorl %eax,%eax @@ -357,13 +360,13 @@ default_entry: movl $1,%eax cpuid andl $~1,%edx # Ignore CPUID.FPU - jz enable_paging # No flags or only CPUID.FPU = no CR4 + jz .Lenable_paging # No flags or only CPUID.FPU = no CR4 movl pa(mmu_cr4_features),%eax movl %eax,%cr4 testb $X86_CR4_PAE, %al # check if PAE is enabled - jz enable_paging + jz .Lenable_paging /* Check if extended functions are implemented */ movl $0x80000000, %eax @@ -371,7 +374,7 @@ default_entry: /* Value must be in the range 0x80000001 to 0x8000ffff */ subl $0x80000001, %eax cmpl $(0x8000ffff-0x80000001), %eax - ja enable_paging + ja .Lenable_paging /* Clear bogus XD_DISABLE bits */ call verify_cpu @@ -380,7 +383,7 @@ default_entry: cpuid /* Execute Disable bit supported? */ btl $(X86_FEATURE_NX & 31), %edx - jnc enable_paging + jnc .Lenable_paging /* Setup EFER (Extended Feature Enable Register) */ movl $MSR_EFER, %ecx @@ -390,7 +393,7 @@ default_entry: /* Make changes effective */ wrmsr -enable_paging: +.Lenable_paging: /* * Enable paging @@ -419,7 +422,7 @@ enable_paging: */ movb $4,X86 # at least 486 cmpl $-1,X86_CPUID - je is486 + je .Lis486 /* get vendor info */ xorl %eax,%eax # call CPUID with 0 -> return vendor ID @@ -430,7 +433,7 @@ enable_paging: movl %ecx,X86_VENDOR_ID+8 # last 4 chars orl %eax,%eax # do we have processor info as well? 
- je is486 + je .Lis486 movl $1,%eax # Use the CPUID instruction to get CPU type cpuid @@ -444,7 +447,7 @@ enable_paging: movb %cl,X86_MASK movl %edx,X86_CAPABILITY -is486: +.Lis486: movl $0x50022,%ecx # set AM, WP, NE and MP movl %cr0,%eax andl $0x80000011,%eax # Save PG,PE,ET @@ -470,8 +473,9 @@ is486: xorl %eax,%eax # Clear LDT lldt %ax - pushl $0 # fake return address for unwinder - jmp *(initial_code) + call *(initial_code) +1: jmp 1b +ENDPROC(startup_32_smp) #include "verify_cpu.S" @@ -709,7 +713,12 @@ ENTRY(initial_page_table) .data .balign 4 ENTRY(initial_stack) - .long init_thread_union+THREAD_SIZE + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. + */ + .long init_thread_union + THREAD_SIZE - SIZEOF_PTREGS - \ + TOP_OF_KERNEL_STACK_PADDING; __INITRODATA int_msg: diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index b4421cc191b0..90de28841242 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,13 +66,8 @@ startup_64: * tables and then reload them. */ - /* - * Setup stack for verify_cpu(). "-8" because initial_stack is defined - * this way, see below. Our best guess is a NULL ptr for stack - * termination heuristics and we don't want to break anything which - * might depend on it (kgdb, ...). 
- */ - leaq (__end_init_task - 8)(%rip), %rsp + /* Set up the stack for verify_cpu(), similar to initial_stack below */ + leaq (__end_init_task - SIZEOF_PTREGS)(%rip), %rsp /* Sanitize CPU configuration */ call verify_cpu @@ -117,20 +112,20 @@ startup_64: movq %rdi, %rax shrq $PGDIR_SHIFT, %rax - leaq (4096 + _KERNPG_TABLE)(%rbx), %rdx + leaq (PAGE_SIZE + _KERNPG_TABLE)(%rbx), %rdx movq %rdx, 0(%rbx,%rax,8) movq %rdx, 8(%rbx,%rax,8) - addq $4096, %rdx + addq $PAGE_SIZE, %rdx movq %rdi, %rax shrq $PUD_SHIFT, %rax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) incl %eax andl $(PTRS_PER_PUD-1), %eax - movq %rdx, 4096(%rbx,%rax,8) + movq %rdx, PAGE_SIZE(%rbx,%rax,8) - addq $8192, %rbx + addq $PAGE_SIZE * 2, %rbx movq %rdi, %rax shrq $PMD_SHIFT, %rdi addq $(__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL), %rax @@ -147,6 +142,9 @@ startup_64: decl %ecx jnz 1b + test %rbp, %rbp + jz .Lskip_fixup + /* * Fixup the kernel text+data virtual addresses. Note that * we might write invalid pmds, when the kernel is relocated @@ -154,9 +152,9 @@ startup_64: * beyond _end. */ leaq level2_kernel_pgt(%rip), %rdi - leaq 4096(%rdi), %r8 + leaq PAGE_SIZE(%rdi), %r8 /* See if it is a valid page table entry */ -1: testb $1, 0(%rdi) +1: testb $_PAGE_PRESENT, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ @@ -167,6 +165,7 @@ startup_64: /* Fixup phys_base */ addq %rbp, phys_base(%rip) +.Lskip_fixup: movq $(early_level4_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) @@ -265,13 +264,17 @@ ENTRY(secondary_startup_64) movl $MSR_GS_BASE,%ecx movl initial_gs(%rip),%eax movl initial_gs+4(%rip),%edx - wrmsr + wrmsr /* rsi is pointer to real mode structure with interesting info. pass it to C */ movq %rsi, %rdi - - /* Finally jump to run C code and to be on real kernel address + jmp start_cpu +ENDPROC(secondary_startup_64) + +ENTRY(start_cpu) + /* + * Jump to run C code and to be on a real kernel address. 
* Since we are running on identity-mapped space we have to jump * to the full 64bit address, this is only possible as indirect * jump. In addition we need to ensure %cs is set so we make this @@ -295,12 +298,13 @@ ENTRY(secondary_startup_64) * REX.W + FF /5 JMP m16:64 Jump far, absolute indirect, * address given in m16:64. */ - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder + call 1f # put return address on stack for unwinder +1: xorq %rbp, %rbp # clear frame pointer + movq initial_code(%rip), %rax pushq $__KERNEL_CS # set correct cs pushq %rax # target address in negative space lretq -ENDPROC(secondary_startup_64) +ENDPROC(start_cpu) #include "verify_cpu.S" @@ -308,15 +312,11 @@ ENDPROC(secondary_startup_64) /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set * up already except stack. We just set up stack here. Then call - * start_secondary(). + * start_secondary() via start_cpu(). */ ENTRY(start_cpu0) - movq initial_stack(%rip),%rsp - movq initial_code(%rip),%rax - pushq $0 # fake return address to stop unwinder - pushq $__KERNEL_CS # set correct cs - pushq %rax # target address in negative space - lretq + movq initial_stack(%rip), %rsp + jmp start_cpu ENDPROC(start_cpu0) #endif @@ -328,7 +328,11 @@ ENDPROC(start_cpu0) GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) - .quad init_thread_union+THREAD_SIZE-8 + /* + * The SIZEOF_PTREGS gap is a convention which helps the in-kernel + * unwinder reliably detect the end of the stack. 
+ */ + .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA bad_address: diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9f669fdd2010..7c6e9ffe4424 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -14,7 +14,6 @@ #include <asm/apic.h> #include <asm/io_apic.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/mce.h> #include <asm/hw_irq.h> #include <asm/desc.h> diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 9ebd0b0e73d9..6b0678a541e2 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -16,7 +16,6 @@ #include <linux/uaccess.h> #include <linux/smp.h> #include <asm/io_apic.h> -#include <asm/idle.h> #include <asm/apic.h> int sysctl_panic_on_stackoverflow; diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c new file mode 100644 index 000000000000..cb9c1ed1d391 --- /dev/null +++ b/arch/x86/kernel/itmt.c @@ -0,0 +1,215 @@ +/* + * itmt.c: Support Intel Turbo Boost Max Technology 3.0 + * + * (C) Copyright 2016 Intel Corporation + * Author: Tim Chen <tim.c.chen@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + * + * On platforms supporting Intel Turbo Boost Max Technology 3.0, (ITMT), + * the maximum turbo frequencies of some cores in a CPU package may be + * higher than for the other cores in the same package. In that case, + * better performance can be achieved by making the scheduler prefer + * to run tasks on the CPUs with higher max turbo frequencies. + * + * This file provides functions and data structures for enabling the + * scheduler to favor scheduling on cores can be boosted to a higher + * frequency under ITMT. 
+ */ + +#include <linux/sched.h> +#include <linux/cpumask.h> +#include <linux/cpuset.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/sysctl.h> +#include <linux/nodemask.h> + +static DEFINE_MUTEX(itmt_update_mutex); +DEFINE_PER_CPU_READ_MOSTLY(int, sched_core_priority); + +/* Boolean to track if system has ITMT capabilities */ +static bool __read_mostly sched_itmt_capable; + +/* + * Boolean to control whether we want to move processes to cpu capable + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. + * + * It can be set via /proc/sys/kernel/sched_itmt_enabled + */ +unsigned int __read_mostly sysctl_sched_itmt_enabled; + +static int sched_itmt_update_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + unsigned int old_sysctl; + int ret; + + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return -EINVAL; + } + + old_sysctl = sysctl_sched_itmt_enabled; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return ret; +} + +static unsigned int zero; +static unsigned int one = 1; +static struct ctl_table itmt_kern_table[] = { + { + .procname = "sched_itmt_enabled", + .data = &sysctl_sched_itmt_enabled, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_itmt_update_handler, + .extra1 = &zero, + .extra2 = &one, + }, + {} +}; + +static struct ctl_table itmt_root_table[] = { + { + .procname = "kernel", + .mode = 0555, + .child = itmt_kern_table, + }, + {} +}; + +static struct ctl_table_header *itmt_sysctl_header; + +/** + * sched_set_itmt_support() - Indicate platform supports ITMT + * + * This function is used by the OS to indicate to scheduler that the platform + * is capable of supporting the ITMT feature. 
+ * + * The current scheme has the pstate driver detects if the system + * is ITMT capable and call sched_set_itmt_support. + * + * This must be done only after sched_set_itmt_core_prio + * has been called to set the cpus' priorities. + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. + * + * Return: 0 on success + */ +int sched_set_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return 0; + } + + itmt_sysctl_header = register_sysctl_table(itmt_root_table); + if (!itmt_sysctl_header) { + mutex_unlock(&itmt_update_mutex); + return -ENOMEM; + } + + sched_itmt_capable = true; + + sysctl_sched_itmt_enabled = 1; + + if (sysctl_sched_itmt_enabled) { + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); + + return 0; +} + +/** + * sched_clear_itmt_support() - Revoke platform's support of ITMT + * + * This function is used by the OS to indicate that it has + * revoked the platform's support of ITMT feature. + * + * It must not be called with cpu hot plug lock + * held as we need to acquire the lock to rebuild sched domains + * later. 
+ */ +void sched_clear_itmt_support(void) +{ + mutex_lock(&itmt_update_mutex); + + if (!sched_itmt_capable) { + mutex_unlock(&itmt_update_mutex); + return; + } + sched_itmt_capable = false; + + if (itmt_sysctl_header) { + unregister_sysctl_table(itmt_sysctl_header); + itmt_sysctl_header = NULL; + } + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ + sysctl_sched_itmt_enabled = 0; + x86_topology_update = true; + rebuild_sched_domains(); + } + + mutex_unlock(&itmt_update_mutex); +} + +int arch_asym_cpu_priority(int cpu) +{ + return per_cpu(sched_core_priority, cpu); +} + +/** + * sched_set_itmt_core_prio() - Set CPU priority based on ITMT + * @prio: Priority of cpu core + * @core_cpu: The cpu number associated with the core + * + * The pstate driver will find out the max boost frequency + * and call this function to set a priority proportional + * to the max boost frequency. CPU with higher boost + * frequency will receive higher priority. + * + * No need to rebuild sched domain after updating + * the CPU priorities. The sched domains have no + * dependency on CPU priorities. + */ +void sched_set_itmt_core_prio(int prio, int core_cpu) +{ + int cpu, i = 1; + + for_each_cpu(cpu, topology_sibling_cpumask(core_cpu)) { + int smt_prio; + + /* + * Ensure that the siblings are moved to the end + * of the priority chain and only used when + * all other high priority cpus are out of capacity. 
+ */ + smt_prio = prio * smp_num_siblings / i; + per_cpu(sched_core_priority, cpu) = smt_prio; + i++; + } +} diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index edbbfc854e39..36bc66416021 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -42,7 +42,6 @@ #include <asm/traps.h> #include <asm/desc.h> #include <asm/tlbflush.h> -#include <asm/idle.h> #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> @@ -267,13 +266,11 @@ do_async_page_fault(struct pt_regs *regs, unsigned long error_code) case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - exit_idle(); kvm_async_pf_task_wait((u32)read_cr2()); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: rcu_irq_enter(); - exit_idle(); kvm_async_pf_task_wake((u32)read_cr2()); rcu_irq_exit(); break; @@ -308,7 +305,7 @@ static void kvm_register_steal_time(void) static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; -static void kvm_guest_apic_eoi_write(u32 reg, u32 val) +static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val) { /** * This relies on __test_and_clear_bit to modify the memory @@ -319,7 +316,7 @@ static void kvm_guest_apic_eoi_write(u32 reg, u32 val) */ if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi))) return; - apic_write(APIC_EOI, APIC_EOI_ACK); + apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK); } static void kvm_guest_cpu_init(void) @@ -592,6 +589,14 @@ out: local_irq_restore(flags); } +__visible bool __kvm_vcpu_is_preempted(int cpu) +{ + struct kvm_steal_time *src = &per_cpu(steal_time, cpu); + + return !!src->preempted; +} +PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted); + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. 
*/ @@ -608,6 +613,11 @@ void __init kvm_spinlock_init(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; + + if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) { + pv_lock_ops.vcpu_is_preempted = + PV_CALLEE_SAVE(__kvm_vcpu_is_preempted); + } } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 6707039b9032..d4a15831ac58 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -34,10 +34,10 @@ static void flush_ldt(void *current_mm) } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ -static struct ldt_struct *alloc_ldt_struct(int size) +static struct ldt_struct *alloc_ldt_struct(unsigned int size) { struct ldt_struct *new_ldt; - int alloc_size; + unsigned int alloc_size; if (size > LDT_ENTRIES) return NULL; @@ -93,7 +93,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) paravirt_free_ldt(ldt->entries, ldt->size); if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); + vfree_atomic(ldt->entries); else free_page((unsigned long)ldt->entries); kfree(ldt); @@ -207,11 +207,11 @@ static int read_default_ldt(void __user *ptr, unsigned long bytecount) static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) { struct mm_struct *mm = current->mm; + struct ldt_struct *new_ldt, *old_ldt; + unsigned int oldsize, newsize; + struct user_desc ldt_info; struct desc_struct ldt; int error; - struct user_desc ldt_info; - int oldsize, newsize; - struct ldt_struct *new_ldt, *old_ldt; error = -EINVAL; if (bytecount != sizeof(ldt_info)) @@ -249,7 +249,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) old_ldt = mm->context.ldt; oldsize = old_ldt ? 
old_ldt->size : 0; - newsize = max((int)(ldt_info.entry_number + 1), oldsize); + newsize = max(ldt_info.entry_number + 1, oldsize); error = -ENOMEM; new_ldt = alloc_ldt_struct(newsize); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 7f3550acde1b..f5e3ff835cc8 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -44,6 +44,7 @@ #include <asm/msr.h> static struct class *msr_class; +static enum cpuhp_state cpuhp_msr_state; static ssize_t msr_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -180,7 +181,7 @@ static const struct file_operations msr_fops = { .compat_ioctl = msr_ioctl, }; -static int msr_device_create(int cpu) +static int msr_device_create(unsigned int cpu) { struct device *dev; @@ -189,34 +190,12 @@ static int msr_device_create(int cpu) return PTR_ERR_OR_ZERO(dev); } -static void msr_device_destroy(int cpu) +static int msr_device_destroy(unsigned int cpu) { device_destroy(msr_class, MKDEV(MSR_MAJOR, cpu)); + return 0; } -static int msr_class_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - int err = 0; - - switch (action) { - case CPU_UP_PREPARE: - err = msr_device_create(cpu); - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - msr_device_destroy(cpu); - break; - } - return notifier_from_errno(err); -} - -static struct notifier_block __refdata msr_class_cpu_notifier = { - .notifier_call = msr_class_cpu_callback, -}; - static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); @@ -224,13 +203,11 @@ static char *msr_devnode(struct device *dev, umode_t *mode) static int __init msr_init(void) { - int i, err = 0; - i = 0; + int err; if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) { pr_err("unable to get major %d for msr\n", MSR_MAJOR); - err = -EBUSY; - goto out; + return -EBUSY; } msr_class = class_create(THIS_MODULE, 
"msr"); if (IS_ERR(msr_class)) { @@ -239,44 +216,28 @@ static int __init msr_init(void) } msr_class->devnode = msr_devnode; - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - err = msr_device_create(i); - if (err != 0) - goto out_class; - } - __register_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); - - err = 0; - goto out; + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/msr:online", + msr_device_create, msr_device_destroy); + if (err < 0) + goto out_class; + cpuhp_msr_state = err; + return 0; out_class: - i = 0; - for_each_online_cpu(i) - msr_device_destroy(i); - cpu_notifier_register_done(); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); out_chrdev: __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); -out: return err; } +module_init(msr_init); static void __exit msr_exit(void) { - int cpu = 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - msr_device_destroy(cpu); + cpuhp_remove_state(cpuhp_msr_state); class_destroy(msr_class); __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); - __unregister_hotcpu_notifier(&msr_class_cpu_notifier); - cpu_notifier_register_done(); } - -module_init(msr_init); module_exit(msr_exit) MODULE_AUTHOR("H. 
Peter Anvin <hpa@zytor.com>"); diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 2c55a003b793..6d4bf812af45 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -12,7 +12,6 @@ __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); } - PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); bool pv_is_native_spin_unlock(void) @@ -21,12 +20,25 @@ bool pv_is_native_spin_unlock(void) __raw_callee_save___native_queued_spin_unlock; } +__visible bool __native_vcpu_is_preempted(int cpu) +{ + return false; +} +PV_CALLEE_SAVE_REGS_THUNK(__native_vcpu_is_preempted); + +bool pv_is_native_vcpu_is_preempted(void) +{ + return pv_lock_ops.vcpu_is_preempted.func == + __raw_callee_save___native_vcpu_is_preempted; +} + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, + .vcpu_is_preempted = PV_CALLEE_SAVE(__native_vcpu_is_preempted), #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bbf3d5933eaa..a1bfba0f7234 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -328,7 +328,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .cpuid = native_cpuid, .get_debugreg = native_get_debugreg, .set_debugreg = native_set_debugreg, - .clts = native_clts, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 920c6ae08592..d33ef165b1f8 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -8,10 +8,10 @@ DEF_NATIVE(pv_cpu_ops, iret, "iret"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, 
%cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %eax, %eax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -27,6 +27,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -48,7 +49,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { @@ -56,9 +56,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index bb3840cedb4f..f4fcf26c9fce 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); -DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, 
usergs_sysret64, "swapgs; sysretq"); @@ -21,6 +20,7 @@ DEF_NATIVE(, mov64, "mov %rdi, %rax"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +DEF_NATIVE(pv_lock_ops, vcpu_is_preempted, "xor %rax, %rax"); #endif unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) @@ -36,6 +36,7 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) } extern bool pv_is_native_spin_unlock(void); +extern bool pv_is_native_vcpu_is_preempted(void); unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) @@ -58,7 +59,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) @@ -68,9 +68,19 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, end = end_pv_lock_ops_queued_spin_unlock; goto patch_site; } + goto patch_default; + + case PARAVIRT_PATCH(pv_lock_ops.vcpu_is_preempted): + if (pv_is_native_vcpu_is_preempted()) { + start = start_pv_lock_ops_vcpu_is_preempted; + end = end_pv_lock_ops_vcpu_is_preempted; + goto patch_site; + } + goto patch_default; #endif default: +patch_default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0888a879120f..43c36d8a6ae2 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -23,7 +23,6 @@ #include <asm/cpu.h> #include <asm/apic.h> #include <asm/syscalls.h> -#include <asm/idle.h> #include <asm/uaccess.h> #include <asm/mwait.h> #include <asm/fpu/internal.h> @@ -65,23 +64,6 @@ __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { }; EXPORT_PER_CPU_SYMBOL(cpu_tss); -#ifdef CONFIG_X86_64 -static DEFINE_PER_CPU(unsigned char, is_idle); -static 
ATOMIC_NOTIFIER_HEAD(idle_notifier); - -void idle_notifier_register(struct notifier_block *n) -{ - atomic_notifier_chain_register(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_register); - -void idle_notifier_unregister(struct notifier_block *n) -{ - atomic_notifier_chain_unregister(&idle_notifier, n); -} -EXPORT_SYMBOL_GPL(idle_notifier_unregister); -#endif - /* * this gets called so that we can store lazy state into memory and copy the * current task into the new thread. @@ -251,39 +233,9 @@ static inline void play_dead(void) } #endif -#ifdef CONFIG_X86_64 -void enter_idle(void) -{ - this_cpu_write(is_idle, 1); - atomic_notifier_call_chain(&idle_notifier, IDLE_START, NULL); -} - -static void __exit_idle(void) -{ - if (x86_test_and_clear_bit_percpu(0, is_idle) == 0) - return; - atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); -} - -/* Called from interrupts to signify idle end */ -void exit_idle(void) -{ - /* idle loop has pid 0 */ - if (current->pid) - return; - __exit_idle(); -} -#endif - void arch_cpu_idle_enter(void) { local_touch_nmi(); - enter_idle(); -} - -void arch_cpu_idle_exit(void) -{ - __exit_idle(); } void arch_cpu_idle_dead(void) @@ -336,59 +288,33 @@ void stop_this_cpu(void *dummy) halt(); } -bool amd_e400_c1e_detected; -EXPORT_SYMBOL(amd_e400_c1e_detected); - -static cpumask_var_t amd_e400_c1e_mask; - -void amd_e400_remove_cpu(int cpu) -{ - if (amd_e400_c1e_mask != NULL) - cpumask_clear_cpu(cpu, amd_e400_c1e_mask); -} - /* - * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt - * pending message MSR. If we detect C1E, then we handle it the same - * way as C3 power states (local apic timer and TSC stop) + * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power + * states (local apic timer and TSC stop). 
*/ static void amd_e400_idle(void) { - if (!amd_e400_c1e_detected) { - u32 lo, hi; - - rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); - - if (lo & K8_INTP_C1E_ACTIVE_MASK) { - amd_e400_c1e_detected = true; - if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) - mark_tsc_unstable("TSC halt in AMD C1E"); - pr_info("System has AMD C1E enabled\n"); - } + /* + * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E + * gets set after static_cpu_has() places have been converted via + * alternatives. + */ + if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + default_idle(); + return; } - if (amd_e400_c1e_detected) { - int cpu = smp_processor_id(); + tick_broadcast_enter(); - if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { - cpumask_set_cpu(cpu, amd_e400_c1e_mask); - /* Force broadcast so ACPI can not interfere. */ - tick_broadcast_force(); - pr_info("Switch to broadcast mode on CPU%d\n", cpu); - } - tick_broadcast_enter(); - - default_idle(); + default_idle(); - /* - * The switch back from broadcast mode needs to be - * called with interrupts disabled. - */ - local_irq_disable(); - tick_broadcast_exit(); - local_irq_enable(); - } else - default_idle(); + /* + * The switch back from broadcast mode needs to be called with + * interrupts disabled. 
+ */ + local_irq_disable(); + tick_broadcast_exit(); + local_irq_enable(); } /* @@ -448,8 +374,7 @@ void select_idle_routine(const struct cpuinfo_x86 *c) if (x86_idle || boot_option_idle_override == IDLE_POLL) return; - if (cpu_has_bug(c, X86_BUG_AMD_APIC_C1E)) { - /* E400: APIC timer interrupt does not wake up CPU from C1e */ + if (boot_cpu_has_bug(X86_BUG_AMD_E400)) { pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; } else if (prefer_mwait_c1_over_halt(c)) { @@ -459,11 +384,37 @@ void select_idle_routine(const struct cpuinfo_x86 *c) x86_idle = default_idle; } -void __init init_amd_e400_c1e_mask(void) +void amd_e400_c1e_apic_setup(void) +{ + if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) { + pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id()); + local_irq_disable(); + tick_broadcast_force(); + local_irq_enable(); + } +} + +void __init arch_post_acpi_subsys_init(void) { - /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */ - if (x86_idle == amd_e400_idle) - zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL); + u32 lo, hi; + + if (!boot_cpu_has_bug(X86_BUG_AMD_E400)) + return; + + /* + * AMD E400 detection needs to happen after ACPI has been enabled. If + * the machine is affected K8_INTP_C1E_ACTIVE_MASK bits are set in + * MSR_K8_INT_PENDING_MSG. 
+ */ + rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (!(lo & K8_INTP_C1E_ACTIVE_MASK)) + return; + + boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E); + + if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) + mark_tsc_unstable("TSC halt in AMD C1E"); + pr_info("System has AMD C1E enabled\n"); } static int __init idle_setup(char *str) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bd7be8efdc4c..d0d744108594 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -49,7 +49,6 @@ #include <asm/tlbflush.h> #include <asm/cpu.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> @@ -72,10 +71,9 @@ void __show_regs(struct pt_regs *regs, int all) savesegment(gs, gs); } - printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n", - (u16)regs->cs, regs->ip, regs->flags, - smp_processor_id()); - print_symbol("EIP is at %s\n", regs->ip); + printk(KERN_DEFAULT "EIP: %pS\n", (void *)regs->ip); + printk(KERN_DEFAULT "EFLAGS: %08lx CPU: %d\n", regs->flags, + smp_processor_id()); printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", regs->ax, regs->bx, regs->cx, regs->dx); @@ -232,11 +230,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - fpu_switch_t fpu_switch; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* * Save away %gs. 
No need to save %fs, as it was saved on the @@ -295,7 +292,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) if (prev->gs | next->gs) lazy_load_gs(next->gs); - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); this_cpu_write(current_task, next_p); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b3760b3c1ca0..a76b65e3e615 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -44,7 +44,6 @@ #include <asm/desc.h> #include <asm/proto.h> #include <asm/ia32.h> -#include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> #include <asm/switch_to.h> @@ -61,10 +60,15 @@ void __show_regs(struct pt_regs *regs, int all) unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip); - printk_address(regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, - regs->sp, regs->flags); + printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, + (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, + regs->sp, regs->flags); + if (regs->orig_ax != -1) + pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); + else + pr_cont("\n"); + printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n", regs->ax, regs->bx, regs->cx); printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n", @@ -265,9 +269,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) int cpu = smp_processor_id(); struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned prev_fsindex, prev_gsindex; - fpu_switch_t fpu_switch; - fpu_switch = switch_fpu_prepare(prev_fpu, next_fpu, cpu); + switch_fpu_prepare(prev_fpu, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). 
@@ -417,7 +420,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) prev->gsbase = 0; prev->gsindex = prev_gsindex; - switch_fpu_finish(next_fpu, fpu_switch); + switch_fpu_finish(next_fpu, cpu); /* * Switch the PDA and FPU contexts. diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 79c6311cd912..5b21cb7d84d6 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -64,6 +64,15 @@ void mach_get_cmos_time(struct timespec *now) unsigned int status, year, mon, day, hour, min, sec, century = 0; unsigned long flags; + /* + * If pm_trace abused the RTC as storage, set the timespec to 0, + * which tells the caller that this RTC value is unusable. + */ + if (!pm_trace_rtc_valid()) { + now->tv_sec = now->tv_nsec = 0; + return; + } + spin_lock_irqsave(&rtc_lock, flags); /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0e8ba7..4cfba947d774 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. 
+ */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + x86_report_nx(); /* after early param, so could get panic from serial */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 2bbd27f89802..9820d6d977c6 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -1,7 +1,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/bootmem.h> #include <linux/percpu.h> @@ -12,6 +12,7 @@ #include <linux/pfn.h> #include <asm/sections.h> #include <asm/processor.h> +#include <asm/desc.h> #include <asm/setup.h> #include <asm/mpspec.h> #include <asm/apicdef.h> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index c00cb64bc0a1..68f8cc222f25 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -261,10 +261,8 @@ static inline void __smp_reschedule_interrupt(void) __visible void smp_reschedule_interrupt(struct pt_regs *regs) { - irq_enter(); ack_APIC_irq(); __smp_reschedule_interrupt(); - irq_exit(); /* * KVM uses this interrupt to force a cpu out of guest mode */ diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 42f5eb7b4f6c..0c37d4fd01b2 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -58,7 +58,6 @@ #include <asm/desc.h> #include <asm/nmi.h> #include <asm/irq.h> -#include <asm/idle.h> #include <asm/realmode.h> #include <asm/cpu.h> #include <asm/numa.h> @@ -109,6 +108,17 @@ static bool logical_packages_frozen __read_mostly; /* Maximum number of SMT threads on any online core */ int __max_smt_threads __read_mostly; +/* Flag to indicate if a complete sched domain rebuild is required */ +bool x86_topology_update; + +int arch_update_cpu_topology(void) +{ + int retval = x86_topology_update; + + x86_topology_update = false; + return retval; +} + static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) { 
unsigned long flags; @@ -471,22 +481,42 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } +#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) +static inline int x86_sched_itmt_flags(void) +{ + return sysctl_sched_itmt_enabled ? SD_ASYM_PACKING : 0; +} + +#ifdef CONFIG_SCHED_MC +static int x86_core_flags(void) +{ + return cpu_core_flags() | x86_sched_itmt_flags(); +} +#endif +#ifdef CONFIG_SCHED_SMT +static int x86_smt_flags(void) +{ + return cpu_smt_flags() | x86_sched_itmt_flags(); +} +#endif +#endif + static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { NULL, }, }; static struct sched_domain_topology_level x86_topology[] = { #ifdef CONFIG_SCHED_SMT - { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, + { cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) }, #endif #ifdef CONFIG_SCHED_MC - { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, + { cpu_coregroup_mask, x86_core_flags, SD_INIT_NAME(MC) }, #endif { cpu_cpu_mask, SD_INIT_NAME(DIE) }, { NULL, }, @@ -821,14 +851,6 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip) return (send_status | accept_status); } -void smp_announce(void) -{ - int num_nodes = num_online_nodes(); - - printk(KERN_INFO "x86: Booted up %d node%s, %d CPUs\n", - num_nodes, (num_nodes > 1 ? 
"s" : ""), num_online_cpus()); -} - /* reduce the number of lines printed when booting a large cpu count system */ static void announce_cpu(int cpu, int apicid) { @@ -964,9 +986,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - idle->thread.sp = (unsigned long) (((struct pt_regs *) - (THREAD_SIZE + task_stack_page(idle))) - 1); - + idle->thread.sp = (unsigned long)task_pt_regs(idle); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; initial_stack = idle->thread.sp; @@ -1111,7 +1131,7 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) return err; /* the FPU context is blank, nobody can own it */ - __cpu_disable_lazy_restore(cpu); + per_cpu(fpu_fpregs_owner_ctx, cpu) = NULL; common_cpu_up(cpu, tidle); @@ -1331,7 +1351,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) default_setup_apic_routing(); cpu0_logical_apicid = apic_bsp_setup(false); - pr_info("CPU%d: ", 0); + pr_info("CPU0: "); print_cpu_info(&cpu_data(0)); if (is_uv_system()) @@ -1575,7 +1595,6 @@ void play_dead_common(void) { idle_task_exit(); reset_lazy_tlbstate(); - amd_e400_remove_cpu(raw_smp_processor_id()); /* Ack it */ (void)cpu_report_death(); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index bd4e3d4d3625..bf0c6d049080 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -853,6 +853,8 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { + unsigned long cr0; + RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_MATH_EMULATION @@ -866,10 +868,20 @@ do_device_not_available(struct pt_regs *regs, long error_code) return; } #endif - fpu__restore(&current->thread.fpu); /* interrupts still off */ -#ifdef CONFIG_X86_32 - cond_local_irq_enable(regs); -#endif + + /* This should not
happen. */ + cr0 = read_cr0(); + if (WARN(cr0 & X86_CR0_TS, "CR0.TS was set")) { + /* Try to fix it up and carry on. */ + write_cr0(cr0 & ~X86_CR0_TS); + } else { + /* + * Something terrible happened, and we're better off trying + * to kill the task than getting stuck in a never-ending + * loop of #NM faults. + */ + die("unexpected #NM exception", regs, error_code); + } } NOKPROBE_SYMBOL(do_device_not_available); diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index a2456d4d286a..ea7b7f9a3b9e 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -14,13 +14,55 @@ unsigned long unwind_get_return_address(struct unwind_state *state) if (unwind_done(state)) return 0; + if (state->regs && user_mode(state->regs)) + return 0; + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, addr_p); - return __kernel_text_address(addr) ? addr : 0; + if (!__kernel_text_address(addr)) { + printk_deferred_once(KERN_WARNING + "WARNING: unrecognized kernel stack return address %p at %p in %s:%d\n", + (void *)addr, addr_p, state->task->comm, + state->task->pid); + return 0; + } + + return addr; } EXPORT_SYMBOL_GPL(unwind_get_return_address); +static size_t regs_size(struct pt_regs *regs) +{ + /* x86_32 regs from kernel mode are two words shorter: */ + if (IS_ENABLED(CONFIG_X86_32) && !user_mode(regs)) + return sizeof(*regs) - 2*sizeof(long); + + return sizeof(*regs); +} + +static bool is_last_task_frame(struct unwind_state *state) +{ + unsigned long bp = (unsigned long)state->bp; + unsigned long regs = (unsigned long)task_pt_regs(state->task); + + return bp == regs - FRAME_HEADER_SIZE; +} + +/* + * This determines if the frame pointer actually contains an encoded pointer to + * pt_regs on the stack. See ENCODE_FRAME_POINTER. 
+ */ +static struct pt_regs *decode_frame_pointer(unsigned long *bp) +{ + unsigned long regs = (unsigned long)bp; + + if (!(regs & 0x1)) + return NULL; + + return (struct pt_regs *)(regs & ~0x1); +} + static bool update_stack_state(struct unwind_state *state, void *addr, size_t len) { @@ -43,26 +85,117 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { - unsigned long *next_bp; + struct pt_regs *regs; + unsigned long *next_bp, *next_frame; + size_t next_len; + enum stack_type prev_type = state->stack_info.type; if (unwind_done(state)) return false; - next_bp = (unsigned long *)*state->bp; + /* have we reached the end? */ + if (state->regs && user_mode(state->regs)) + goto the_end; + + if (is_last_task_frame(state)) { + regs = task_pt_regs(state->task); + + /* + * kthreads (other than the boot CPU's idle thread) have some + * partial regs at the end of their stack which were placed + * there by copy_thread_tls(). But the regs don't have any + * useful information, so we can skip them. + * + * This user_mode() check is slightly broader than a PF_KTHREAD + * check because it also catches the awkward situation where a + * newly forked kthread transitions into a user task by calling + * do_execve(), which eventually clears PF_KTHREAD. + */ + if (!user_mode(regs)) + goto the_end; + + /* + * We're almost at the end, but not quite: there's still the + * syscall regs frame. Entry code doesn't encode the regs + * pointer for syscalls, so we have to set it manually. + */ + state->regs = regs; + state->bp = NULL; + return true; + } + + /* get the next frame pointer */ + if (state->regs) + next_bp = (unsigned long *)state->regs->bp; + else + next_bp = (unsigned long *)*state->bp; + + /* is the next frame pointer an encoded pointer to pt_regs? 
*/ + regs = decode_frame_pointer(next_bp); + if (regs) { + next_frame = (unsigned long *)regs; + next_len = sizeof(*regs); + } else { + next_frame = next_bp; + next_len = FRAME_HEADER_SIZE; + } /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) - return false; + if (!update_stack_state(state, next_frame, next_len)) { + /* + * Don't warn on bad regs->bp. An interrupt in entry code + * might cause a false positive warning. + */ + if (state->regs) + goto the_end; + + goto bad_address; + } + + /* Make sure it only unwinds up and doesn't overlap the last frame: */ + if (state->stack_info.type == prev_type) { + if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) + goto bad_address; + + if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) + goto bad_address; + } /* move to the next frame */ - state->bp = next_bp; + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + return true; + +bad_address: + if (state->regs) { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", + state->regs, state->task->comm, + state->task->pid, next_frame); + } else { + printk_deferred_once(KERN_WARNING + "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", + state->bp, state->task->comm, + state->task->pid, next_frame); + } +the_end: + state->stack_info.type = STACK_TYPE_UNKNOWN; + return false; } EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { + unsigned long *bp, *frame; + size_t len; + memset(state, 0, sizeof(*state)); state->task = task; @@ -73,12 +206,22 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, } /* set up the starting stack frame */ - state->bp = get_frame_pointer(task, regs); + bp = 
get_frame_pointer(task, regs); + regs = decode_frame_pointer(bp); + if (regs) { + state->regs = regs; + frame = (unsigned long *)regs; + len = sizeof(*regs); + } else { + state->bp = bp; + frame = bp; + len = FRAME_HEADER_SIZE; + } /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(state->bp, state->task, &state->stack_info, + get_stack_info(frame, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + update_stack_state(state, frame, len); /* * The caller can provide the address of the first frame directly diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index b80e8bf43cc6..22881ddcbb9f 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -7,11 +7,13 @@ unsigned long unwind_get_return_address(struct unwind_state *state) { - unsigned long addr = READ_ONCE_NOCHECK(*state->sp); + unsigned long addr; if (unwind_done(state)) return 0; + addr = READ_ONCE_NOCHECK(*state->sp); + return ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, state->sp); } @@ -25,11 +27,12 @@ bool unwind_next_frame(struct unwind_state *state) return false; do { - unsigned long addr = READ_ONCE_NOCHECK(*state->sp); + for (state->sp++; state->sp < info->end; state->sp++) { + unsigned long addr = READ_ONCE_NOCHECK(*state->sp); - for (state->sp++; state->sp < info->end; state->sp++) if (__kernel_text_address(addr)) return true; + } state->sp = info->next_sp; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index dbf67f64d5ec..e79f15f108a8 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -91,10 +91,10 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + _stext = .; /* bootstrapping code */ HEAD_TEXT . 
= ALIGN(8); - _stext = .; TEXT_TEXT SCHED_TEXT CPUIDLE_TEXT diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index afa7bbb596cd..b2d3cf1ef54a 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -16,7 +16,7 @@ #include <linux/export.h> #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/fpu/internal.h> /* For use_eager_fpu. Ugh! */ +#include <asm/processor.h> #include <asm/user.h> #include <asm/fpu/xstate.h> #include "cpuid.h" @@ -65,6 +65,11 @@ u64 kvm_supported_xcr0(void) #define F(x) bit(X86_FEATURE_##x) +/* These are scattered features in cpufeatures.h. */ +#define KVM_CPUID_BIT_AVX512_4VNNIW 2 +#define KVM_CPUID_BIT_AVX512_4FMAPS 3 +#define KF(x) bit(KVM_CPUID_BIT_##x) + int kvm_update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -81,6 +86,10 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) best->ecx |= F(OSXSAVE); } + best->edx &= ~F(APIC); + if (vcpu->arch.apic_base & MSR_IA32_APICBASE_ENABLE) + best->edx |= F(APIC); + if (apic) { if (best->ecx & F(TSC_DEADLINE_TIMER)) apic->lapic_timer.timer_mode_mask = 3 << 17; @@ -114,8 +123,7 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC)))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - if (use_eager_fpu()) - kvm_x86_ops->fpu_activate(vcpu); + kvm_x86_ops->fpu_activate(vcpu); /* * The existing code assumes virtual address is 48-bit in the canonical @@ -376,6 +384,10 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 7.0.ecx*/ const u32 kvm_cpuid_7_0_ecx_x86_features = F(PKU) | 0 /*OSPKE*/; + /* cpuid 7.0.edx*/ + const u32 kvm_cpuid_7_0_edx_x86_features = + KF(AVX512_4VNNIW) | KF(AVX512_4FMAPS); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -458,12 +470,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* PKU is not yet implemented for shadow paging. 
*/ if (!tdp_enabled) entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + entry->edx &= get_scattered_cpuid_leaf(7, 0, CPUID_EDX); } else { entry->ebx = 0; entry->ecx = 0; + entry->edx = 0; } entry->eax = 0; - entry->edx = 0; break; } case 9: @@ -863,17 +877,17 @@ void kvm_cpuid(struct kvm_vcpu *vcpu, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx) } EXPORT_SYMBOL_GPL(kvm_cpuid); -void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) +int kvm_emulate_cpuid(struct kvm_vcpu *vcpu) { - u32 function, eax, ebx, ecx, edx; + u32 eax, ebx, ecx, edx; - function = eax = kvm_register_read(vcpu, VCPU_REGS_RAX); + eax = kvm_register_read(vcpu, VCPU_REGS_RAX); ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); kvm_cpuid(vcpu, &eax, &ebx, &ecx, &edx); kvm_register_write(vcpu, VCPU_REGS_RAX, eax); kvm_register_write(vcpu, VCPU_REGS_RBX, ebx); kvm_register_write(vcpu, VCPU_REGS_RCX, ecx); kvm_register_write(vcpu, VCPU_REGS_RDX, edx); - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_cpuid); diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index a3ce9d260d68..56628a44668b 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -158,9 +158,11 @@ #define Src2GS (OpGS << Src2Shift) #define Src2Mask (OpMask << Src2Shift) #define Mmx ((u64)1 << 40) /* MMX Vector instruction */ +#define AlignMask ((u64)7 << 41) #define Aligned ((u64)1 << 41) /* Explicitly aligned (e.g. MOVDQA) */ -#define Unaligned ((u64)1 << 42) /* Explicitly unaligned (e.g. MOVDQU) */ -#define Avx ((u64)1 << 43) /* Advanced Vector Extensions */ +#define Unaligned ((u64)2 << 41) /* Explicitly unaligned (e.g. MOVDQU) */ +#define Avx ((u64)3 << 41) /* Advanced Vector Extensions */ +#define Aligned16 ((u64)4 << 41) /* Aligned to 16 byte boundary (e.g. 
FXSAVE) */ #define Fastop ((u64)1 << 44) /* Use opcode::u.fastop */ #define NoWrite ((u64)1 << 45) /* No writeback */ #define SrcWrite ((u64)1 << 46) /* Write back src operand */ @@ -446,6 +448,26 @@ FOP_END; FOP_START(salc) "pushf; sbb %al, %al; popf \n\t" FOP_RET FOP_END; +/* + * XXX: inoutclob user must know where the argument is being expanded. + * Relying on CC_HAVE_ASM_GOTO would allow us to remove _fault. + */ +#define asm_safe(insn, inoutclob...) \ +({ \ + int _fault = 0; \ + \ + asm volatile("1:" insn "\n" \ + "2:\n" \ + ".pushsection .fixup, \"ax\"\n" \ + "3: movl $1, %[_fault]\n" \ + " jmp 2b\n" \ + ".popsection\n" \ + _ASM_EXTABLE(1b, 3b) \ + : [_fault] "+qm"(_fault) inoutclob ); \ + \ + _fault ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE; \ +}) + static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, enum x86_intercept intercept, enum x86_intercept_stage stage) @@ -632,21 +654,26 @@ static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, * depending on whether they're AVX encoded or not. * * Also included is CMPXCHG16B which is not a vector instruction, yet it is - * subject to the same check. + * subject to the same check. FXSAVE and FXRSTOR are checked here too as their + * 512 bytes of data must be aligned to a 16 byte boundary. 
*/ -static bool insn_aligned(struct x86_emulate_ctxt *ctxt, unsigned size) +static unsigned insn_alignment(struct x86_emulate_ctxt *ctxt, unsigned size) { - if (likely(size < 16)) - return false; + u64 alignment = ctxt->d & AlignMask; - if (ctxt->d & Aligned) - return true; - else if (ctxt->d & Unaligned) - return false; - else if (ctxt->d & Avx) - return false; - else - return true; + if (likely(size < 16)) + return 1; + + switch (alignment) { + case Unaligned: + case Avx: + return 1; + case Aligned16: + return 16; + case Aligned: + default: + return size; + } } static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, @@ -704,7 +731,7 @@ static __always_inline int __linearize(struct x86_emulate_ctxt *ctxt, } break; } - if (insn_aligned(ctxt, size) && ((la & (size - 1)) != 0)) + if (la & (insn_alignment(ctxt, size) - 1)) return emulate_gp(ctxt, 0); return X86EMUL_CONTINUE; bad: @@ -3842,6 +3869,131 @@ static int em_movsxd(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int check_fxsr(struct x86_emulate_ctxt *ctxt) +{ + u32 eax = 1, ebx, ecx = 0, edx; + + ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx); + if (!(edx & FFL(FXSR))) + return emulate_ud(ctxt); + + if (ctxt->ops->get_cr(ctxt, 0) & (X86_CR0_TS | X86_CR0_EM)) + return emulate_nm(ctxt); + + /* + * Don't emulate a case that should never be hit, instead of working + * around a lack of fxsave64/fxrstor64 on old compilers. + */ + if (ctxt->mode >= X86EMUL_MODE_PROT64) + return X86EMUL_UNHANDLEABLE; + + return X86EMUL_CONTINUE; +} + +/* + * FXSAVE and FXRSTOR have 4 different formats depending on execution mode, + * 1) 16 bit mode + * 2) 32 bit mode + * - like (1), but FIP and FDP (foo) are only 16 bit. At least Intel CPUs + * preserve whole 32 bit values, though, so (1) and (2) are the same wrt. 
+ * save and restore + * 3) 64-bit mode with REX.W prefix + * - like (2), but XMM 8-15 are being saved and restored + * 4) 64-bit mode without REX.W prefix + * - like (3), but FIP and FDP are 64 bit + * + * Emulation uses (3) for (1) and (2) and preserves XMM 8-15 to reach the + * desired result. (4) is not emulated. + * + * Note: Guest and host CPUID.(EAX=07H,ECX=0H):EBX[bit 13] (deprecate FPU CS + * and FPU DS) should match. + */ +static int em_fxsave(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + size_t size; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + ctxt->ops->get_fpu(ctxt); + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + if (rc != X86EMUL_CONTINUE) + return rc; + + if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR) + size = offsetof(struct fxregs_state, xmm_space[8 * 16/4]); + else + size = offsetof(struct fxregs_state, xmm_space[0]); + + return segmented_write(ctxt, ctxt->memop.addr.mem, &fx_state, size); +} + +static int fxrstor_fixup(struct x86_emulate_ctxt *ctxt, + struct fxregs_state *new) +{ + int rc = X86EMUL_CONTINUE; + struct fxregs_state old; + + rc = asm_safe("fxsave %[fx]", , [fx] "+m"(old)); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* + * 64 bit host will restore XMM 8-15, which is not correct on non-64 + * bit guests. Load the current values in order to preserve 64 bit + * XMMs after fxrstor. + */ +#ifdef CONFIG_X86_64 + /* XXX: accessing XMM 8-15 very awkwardly */ + memcpy(&new->xmm_space[8 * 16/4], &old.xmm_space[8 * 16/4], 8 * 16); +#endif + + /* + * Hardware doesn't save and restore XMM 0-7 without CR4.OSFXSR, but + * does save and restore MXCSR. 
+ */ + if (!(ctxt->ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR)) + memcpy(new->xmm_space, old.xmm_space, 8 * 16); + + return rc; +} + +static int em_fxrstor(struct x86_emulate_ctxt *ctxt) +{ + struct fxregs_state fx_state; + int rc; + + rc = check_fxsr(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + rc = segmented_read(ctxt, ctxt->memop.addr.mem, &fx_state, 512); + if (rc != X86EMUL_CONTINUE) + return rc; + + if (fx_state.mxcsr >> 16) + return emulate_gp(ctxt, 0); + + ctxt->ops->get_fpu(ctxt); + + if (ctxt->mode < X86EMUL_MODE_PROT64) + rc = fxrstor_fixup(ctxt, &fx_state); + + if (rc == X86EMUL_CONTINUE) + rc = asm_safe("fxrstor %[fx]", : [fx] "m"(fx_state)); + + ctxt->ops->put_fpu(ctxt); + + return rc; +} + static bool valid_cr(int nr) { switch (nr) { @@ -4194,7 +4346,9 @@ static const struct gprefix pfx_0f_ae_7 = { }; static const struct group_dual group15 = { { - N, N, N, N, N, N, N, GP(0, &pfx_0f_ae_7), + I(ModRM | Aligned16, em_fxsave), + I(ModRM | Aligned16, em_fxrstor), + N, N, N, N, N, GP(0, &pfx_0f_ae_7), }, { N, N, N, N, N, N, N, N, } }; @@ -5066,21 +5220,13 @@ static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) static int flush_pending_x87_faults(struct x86_emulate_ctxt *ctxt) { - bool fault = false; + int rc; ctxt->ops->get_fpu(ctxt); - asm volatile("1: fwait \n\t" - "2: \n\t" - ".pushsection .fixup,\"ax\" \n\t" - "3: \n\t" - "movb $1, %[fault] \n\t" - "jmp 2b \n\t" - ".popsection \n\t" - _ASM_EXTABLE(1b, 3b) - : [fault]"+qm"(fault)); + rc = asm_safe("fwait"); ctxt->ops->put_fpu(ctxt); - if (unlikely(fault)) + if (unlikely(rc != X86EMUL_CONTINUE)) return emulate_exception(ctxt, MF_VECTOR, 0, false); return X86EMUL_CONTINUE; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 42b1c83741c8..99cde5220e07 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -291,7 +291,7 @@ static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata) return ret; } -int synic_set_irq(struct kvm_vcpu_hv_synic *synic, 
u32 sint) +static int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint) { struct kvm_vcpu *vcpu = synic_to_vcpu(synic); struct kvm_lapic_irq irq; diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index 16a7134eedac..a78b445ce411 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -212,7 +212,7 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) */ smp_mb(); if (atomic_dec_if_positive(&ps->pending) > 0) - kthread_queue_work(&pit->worker, &pit->expired); + kthread_queue_work(pit->worker, &pit->expired); } void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) @@ -272,7 +272,7 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data) if (atomic_read(&ps->reinject)) atomic_inc(&ps->pending); - kthread_queue_work(&pt->worker, &pt->expired); + kthread_queue_work(pt->worker, &pt->expired); if (ps->is_periodic) { hrtimer_add_expires_ns(&ps->timer, ps->period); @@ -667,10 +667,8 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) pid_nr = pid_vnr(pid); put_pid(pid); - kthread_init_worker(&pit->worker); - pit->worker_task = kthread_run(kthread_worker_fn, &pit->worker, - "kvm-pit/%d", pid_nr); - if (IS_ERR(pit->worker_task)) + pit->worker = kthread_create_worker(0, "kvm-pit/%d", pid_nr); + if (IS_ERR(pit->worker)) goto fail_kthread; kthread_init_work(&pit->expired, pit_do_work); @@ -713,7 +711,7 @@ fail_register_speaker: fail_register_pit: mutex_unlock(&kvm->slots_lock); kvm_pit_set_reinject(pit, false); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); fail_kthread: kvm_free_irq_source_id(kvm, pit->irq_source_id); fail_request: @@ -730,8 +728,7 @@ void kvm_free_pit(struct kvm *kvm) kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->speaker_dev); kvm_pit_set_reinject(pit, false); hrtimer_cancel(&pit->pit_state.timer); - kthread_flush_work(&pit->expired); - kthread_stop(pit->worker_task); + kthread_destroy_worker(pit->worker); kvm_free_irq_source_id(kvm, pit->irq_source_id); kfree(pit); } diff --git 
a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 2f5af0798326..600bee9dcbbd 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -44,8 +44,7 @@ struct kvm_pit { struct kvm_kpit_state pit_state; int irq_source_id; struct kvm_irq_mask_notifier mask_notifier; - struct kthread_worker worker; - struct task_struct *worker_task; + struct kthread_worker *worker; struct kthread_work expired; }; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 6f69340f9fa3..34a66b2d47e6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -342,9 +342,11 @@ void __kvm_apic_update_irr(u32 *pir, void *regs) u32 i, pir_val; for (i = 0; i <= 7; i++) { - pir_val = xchg(&pir[i], 0); - if (pir_val) + pir_val = READ_ONCE(pir[i]); + if (pir_val) { + pir_val = xchg(&pir[i], 0); *((u32 *)(regs + APIC_IRR + i * 0x10)) |= pir_val; + } } } EXPORT_SYMBOL_GPL(__kvm_apic_update_irr); @@ -1090,7 +1092,7 @@ static void apic_send_ipi(struct kvm_lapic *apic) static u32 apic_get_tmcct(struct kvm_lapic *apic) { - ktime_t remaining; + ktime_t remaining, now; s64 ns; u32 tmcct; @@ -1101,7 +1103,8 @@ static u32 apic_get_tmcct(struct kvm_lapic *apic) apic->lapic_timer.period == 0) return 0; - remaining = hrtimer_get_remaining(&apic->lapic_timer.timer); + now = ktime_get(); + remaining = ktime_sub(apic->lapic_timer.target_expiration, now); if (ktime_to_ns(remaining) < 0) remaining = ktime_set(0, 0); @@ -1332,7 +1335,7 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_save(flags); - now = apic->lapic_timer.timer.base->get_time(); + now = ktime_get(); guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; @@ -1347,6 +1350,79 @@ static void start_sw_tscdeadline(struct kvm_lapic *apic) local_irq_restore(flags); } +static void start_sw_period(struct kvm_lapic *apic) +{ + if (!apic->lapic_timer.period) + return; + + if (apic_lvtt_oneshot(apic) && + ktime_after(ktime_get(), + 
apic->lapic_timer.target_expiration)) { + apic_timer_expired(apic); + return; + } + + hrtimer_start(&apic->lapic_timer.timer, + apic->lapic_timer.target_expiration, + HRTIMER_MODE_ABS_PINNED); +} + +static bool set_target_expiration(struct kvm_lapic *apic) +{ + ktime_t now; + u64 tscl = rdtsc(); + + now = ktime_get(); + apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return false; + + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } + + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + PRIx64 ", " + "timer initial count 0x%x, period %lldns, " + "expire @ 0x%016" PRIx64 ".\n", __func__, + APIC_BUS_CYCLE_NS, ktime_to_ns(now), + kvm_lapic_get_reg(apic, APIC_TMICT), + apic->lapic_timer.period, + ktime_to_ns(ktime_add_ns(now, + apic->lapic_timer.period))); + + apic->lapic_timer.tscdeadline = kvm_read_l1_tsc(apic->vcpu, tscl) + + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = ktime_add_ns(now, apic->lapic_timer.period); + + return true; +} + +static void advance_periodic_target_expiration(struct kvm_lapic *apic) +{ + apic->lapic_timer.tscdeadline += + nsec_to_cycles(apic->vcpu, apic->lapic_timer.period); + apic->lapic_timer.target_expiration = + ktime_add_ns(apic->lapic_timer.target_expiration, + apic->lapic_timer.period); +} + bool kvm_lapic_hv_timer_in_use(struct kvm_vcpu *vcpu) { if (!lapic_in_kernel(vcpu)) @@ -1356,52 +1432,59 @@ bool kvm_lapic_hv_timer_in_use(struct 
kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_lapic_hv_timer_in_use); -static void cancel_hv_tscdeadline(struct kvm_lapic *apic) +static void cancel_hv_timer(struct kvm_lapic *apic) { kvm_x86_ops->cancel_hv_timer(apic->vcpu); apic->lapic_timer.hv_timer_in_use = false; } -void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) -{ - struct kvm_lapic *apic = vcpu->arch.apic; - - WARN_ON(!apic->lapic_timer.hv_timer_in_use); - WARN_ON(swait_active(&vcpu->wq)); - cancel_hv_tscdeadline(apic); - apic_timer_expired(apic); -} -EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); - -static bool start_hv_tscdeadline(struct kvm_lapic *apic) +static bool start_hv_timer(struct kvm_lapic *apic) { u64 tscdeadline = apic->lapic_timer.tscdeadline; - if (atomic_read(&apic->lapic_timer.pending) || + if ((atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) || kvm_x86_ops->set_hv_timer(apic->vcpu, tscdeadline)) { if (apic->lapic_timer.hv_timer_in_use) - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); } else { apic->lapic_timer.hv_timer_in_use = true; hrtimer_cancel(&apic->lapic_timer.timer); /* In case the sw timer triggered in the window */ - if (atomic_read(&apic->lapic_timer.pending)) - cancel_hv_tscdeadline(apic); + if (atomic_read(&apic->lapic_timer.pending) && + !apic_lvtt_period(apic)) + cancel_hv_timer(apic); } trace_kvm_hv_timer_state(apic->vcpu->vcpu_id, apic->lapic_timer.hv_timer_in_use); return apic->lapic_timer.hv_timer_in_use; } +void kvm_lapic_expired_hv_timer(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + WARN_ON(!apic->lapic_timer.hv_timer_in_use); + WARN_ON(swait_active(&vcpu->wq)); + cancel_hv_timer(apic); + apic_timer_expired(apic); + + if (apic_lvtt_period(apic) && apic->lapic_timer.period) { + advance_periodic_target_expiration(apic); + if (!start_hv_timer(apic)) + start_sw_period(apic); + } +} +EXPORT_SYMBOL_GPL(kvm_lapic_expired_hv_timer); + void kvm_lapic_switch_to_hv_timer(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = 
vcpu->arch.apic; WARN_ON(apic->lapic_timer.hv_timer_in_use); - if (apic_lvtt_tscdeadline(apic)) - start_hv_tscdeadline(apic); + start_hv_timer(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_hv_timer); @@ -1413,62 +1496,28 @@ void kvm_lapic_switch_to_sw_timer(struct kvm_vcpu *vcpu) if (!apic->lapic_timer.hv_timer_in_use) return; - cancel_hv_tscdeadline(apic); + cancel_hv_timer(apic); if (atomic_read(&apic->lapic_timer.pending)) return; - start_sw_tscdeadline(apic); + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) + start_sw_period(apic); + else if (apic_lvtt_tscdeadline(apic)) + start_sw_tscdeadline(apic); } EXPORT_SYMBOL_GPL(kvm_lapic_switch_to_sw_timer); static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now; - atomic_set(&apic->lapic_timer.pending, 0); if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { - /* lapic timer in oneshot or periodic mode */ - now = apic->lapic_timer.timer.base->get_time(); - apic->lapic_timer.period = (u64)kvm_lapic_get_reg(apic, APIC_TMICT) - * APIC_BUS_CYCLE_NS * apic->divide_count; - - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. 
- */ - if (apic_lvtt_period(apic)) { - s64 min_period = min_timer_period_us * 1000LL; - - if (apic->lapic_timer.period < min_period) { - pr_info_ratelimited( - "kvm: vcpu %i: requested %lld ns " - "lapic timer period limited to %lld ns\n", - apic->vcpu->vcpu_id, - apic->lapic_timer.period, min_period); - apic->lapic_timer.period = min_period; - } - } - - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS_PINNED); - - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" - PRIx64 ", " - "timer initial count 0x%x, period %lldns, " - "expire @ 0x%016" PRIx64 ".\n", __func__, - APIC_BUS_CYCLE_NS, ktime_to_ns(now), - kvm_lapic_get_reg(apic, APIC_TMICT), - apic->lapic_timer.period, - ktime_to_ns(ktime_add_ns(now, - apic->lapic_timer.period))); + if (set_target_expiration(apic) && + !(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) + start_sw_period(apic); } else if (apic_lvtt_tscdeadline(apic)) { - if (!(kvm_x86_ops->set_hv_timer && start_hv_tscdeadline(apic))) + if (!(kvm_x86_ops->set_hv_timer && start_hv_timer(apic))) start_sw_tscdeadline(apic); } } @@ -1701,13 +1750,22 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) * LAPIC interface *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (!lapic_in_kernel(vcpu)) + return 0; + + return apic->lapic_timer.tscdeadline; +} u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (!lapic_in_kernel(vcpu) || apic_lvtt_oneshot(apic) || - apic_lvtt_period(apic)) + if (!lapic_in_kernel(vcpu) || + !apic_lvtt_tscdeadline(apic)) return 0; return apic->lapic_timer.tscdeadline; @@ -1748,14 +1806,17 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) { + if (!apic) value |= 
MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; - return; - } vcpu->arch.apic_base = value; + if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) + kvm_update_cpuid(vcpu); + + if (!apic) + return; + /* update jump label if enable bit changes */ if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) { if (value & MSR_IA32_APICBASE_ENABLE) { @@ -1909,6 +1970,7 @@ static enum hrtimer_restart apic_timer_fn(struct hrtimer *data) apic_timer_expired(apic); if (lapic_is_periodic(apic)) { + advance_periodic_target_expiration(apic); hrtimer_add_expires_ns(&ktimer->timer, ktimer->period); return HRTIMER_RESTART; } else @@ -1993,6 +2055,10 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu) kvm_apic_local_deliver(apic, APIC_LVTT); if (apic_lvtt_tscdeadline(apic)) apic->lapic_timer.tscdeadline = 0; + if (apic_lvtt_oneshot(apic)) { + apic->lapic_timer.tscdeadline = 0; + apic->lapic_timer.target_expiration = ktime_set(0, 0); + } atomic_set(&apic->lapic_timer.pending, 0); } } diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index f60d01c29d51..e0c80233b3e1 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -15,6 +15,7 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + ktime_t target_expiration; u32 timer_mode; u32 timer_mode_mask; u64 tscdeadline; @@ -85,6 +86,7 @@ int kvm_apic_get_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_target_expiration_tsc(struct kvm_vcpu *vcpu); u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index d9c7e986b4e4..7012de4a1fed 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1660,17 +1660,9 @@ int kvm_age_hva(struct kvm *kvm, unsigned long start, unsigned long end) * This has some overhead, but not as much 
as the cost of swapping * out actively used pages or breaking up actively used hugepages. */ - if (!shadow_accessed_mask) { - /* - * We are holding the kvm->mmu_lock, and we are blowing up - * shadow PTEs. MMU notifier consumers need to be kept at bay. - * This is correct as long as we don't decouple the mmu_lock - * protected regions (like invalidate_range_start|end does). - */ - kvm->mmu_notifier_seq++; + if (!shadow_accessed_mask) return kvm_handle_hva_range(kvm, start, end, 0, kvm_unmap_rmapp); - } return kvm_handle_hva_range(kvm, start, end, 0, kvm_age_rmapp); } @@ -4405,7 +4397,8 @@ static u64 *get_written_sptes(struct kvm_mmu_page *sp, gpa_t gpa, int *nspte) } static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, - const u8 *new, int bytes) + const u8 *new, int bytes, + struct kvm_page_track_notifier_node *node) { gfn_t gfn = gpa >> PAGE_SHIFT; struct kvm_mmu_page *sp; @@ -4508,7 +4501,7 @@ static void make_mmu_pages_available(struct kvm_vcpu *vcpu) kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); } -int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, +int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, void *insn, int insn_len) { int r, emulation_type = EMULTYPE_RETRY; @@ -4527,12 +4520,28 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code, return r; } - r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false); + r = vcpu->arch.mmu.page_fault(vcpu, cr2, lower_32_bits(error_code), + false); if (r < 0) return r; if (!r) return 1; + /* + * Before emulating the instruction, check if the error code + * was due to a RO violation while translating the guest page. + * This can occur when using nested virtualization with nested + * paging in both guests. If true, we simply unprotect the page + * and resume the guest. 
+ * + * Note: AMD only (since it supports the PFERR_GUEST_PAGE_MASK used + * in PFERR_NEXT_GUEST_PAGE) + */ + if (error_code == PFERR_NESTED_GUEST_PAGE) { + kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(cr2)); + return 1; + } + if (mmio_info_in_cache(vcpu, cr2, direct)) emulation_type = 0; emulate: @@ -4617,11 +4626,19 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) init_kvm_mmu(vcpu); } +static void kvm_mmu_invalidate_zap_pages_in_memslot(struct kvm *kvm, + struct kvm_memory_slot *slot, + struct kvm_page_track_notifier_node *node) +{ + kvm_mmu_invalidate_zap_all_pages(kvm); +} + void kvm_mmu_init_vm(struct kvm *kvm) { struct kvm_page_track_notifier_node *node = &kvm->arch.mmu_sp_tracker; node->track_write = kvm_mmu_pte_write; + node->track_flush_slot = kvm_mmu_invalidate_zap_pages_in_memslot; kvm_page_track_register_notifier(kvm, node); } @@ -4958,7 +4975,7 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots) * zap all shadow pages. */ if (unlikely((slots->generation & MMIO_GEN_MASK) == 0)) { - printk_ratelimited(KERN_DEBUG "kvm: zapping shadow pages for mmio generation wraparound\n"); + kvm_debug_ratelimited("kvm: zapping shadow pages for mmio generation wraparound\n"); kvm_mmu_invalidate_zap_all_pages(kvm); } } diff --git a/arch/x86/kvm/page_track.c b/arch/x86/kvm/page_track.c index b431539c3714..4a1c13eaa518 100644 --- a/arch/x86/kvm/page_track.c +++ b/arch/x86/kvm/page_track.c @@ -106,6 +106,7 @@ void kvm_slot_page_track_add_page(struct kvm *kvm, if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn)) kvm_flush_remote_tlbs(kvm); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_add_page); /* * remove the guest page from the tracking pool which stops the interception @@ -135,6 +136,7 @@ void kvm_slot_page_track_remove_page(struct kvm *kvm, */ kvm_mmu_gfn_allow_lpage(slot, gfn); } +EXPORT_SYMBOL_GPL(kvm_slot_page_track_remove_page); /* * check if the corresponding access on the specified guest page is tracked. 
@@ -181,6 +183,7 @@ kvm_page_track_register_notifier(struct kvm *kvm, hlist_add_head_rcu(&n->node, &head->track_notifier_list); spin_unlock(&kvm->mmu_lock); } +EXPORT_SYMBOL_GPL(kvm_page_track_register_notifier); /* * stop receiving the event interception. It is the opposed operation of @@ -199,6 +202,7 @@ kvm_page_track_unregister_notifier(struct kvm *kvm, spin_unlock(&kvm->mmu_lock); synchronize_srcu(&head->track_srcu); } +EXPORT_SYMBOL_GPL(kvm_page_track_unregister_notifier); /* * Notify the node that write access is intercepted and write emulation is @@ -222,6 +226,31 @@ void kvm_page_track_write(struct kvm_vcpu *vcpu, gpa_t gpa, const u8 *new, idx = srcu_read_lock(&head->track_srcu); hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) if (n->track_write) - n->track_write(vcpu, gpa, new, bytes); + n->track_write(vcpu, gpa, new, bytes, n); + srcu_read_unlock(&head->track_srcu, idx); +} + +/* + * Notify the node that memory slot is being removed or moved so that it can + * drop write-protection for the pages in the memory slot. + * + * The node should figure out it has any write-protected pages in this slot + * by itself. 
+ */ +void kvm_page_track_flush_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +{ + struct kvm_page_track_notifier_head *head; + struct kvm_page_track_notifier_node *n; + int idx; + + head = &kvm->arch.track_notifier_head; + + if (hlist_empty(&head->track_notifier_list)) + return; + + idx = srcu_read_lock(&head->track_srcu); + hlist_for_each_entry_rcu(n, &head->track_notifier_list, node) + if (n->track_flush_slot) + n->track_flush_slot(kvm, slot, n); srcu_read_unlock(&head->track_srcu, idx); } diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 8ca1eca5038d..08a4d3ab3455 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2074,7 +2074,7 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) static int pf_interception(struct vcpu_svm *svm) { u64 fault_address = svm->vmcb->control.exit_info_2; - u32 error_code; + u64 error_code; int r = 1; switch (svm->apf_reason) { @@ -2270,7 +2270,7 @@ static int io_interception(struct vcpu_svm *svm) ++svm->vcpu.stat.io_exits; string = (io_info & SVM_IOIO_STR_MASK) != 0; in = (io_info & SVM_IOIO_TYPE_MASK) != 0; - if (string || in) + if (string) return emulate_instruction(vcpu, 0) == EMULATE_DONE; port = io_info >> 16; @@ -2278,7 +2278,8 @@ static int io_interception(struct vcpu_svm *svm) svm->next_rip = svm->vmcb->control.exit_info_2; skip_emulated_instruction(&svm->vcpu); - return kvm_fast_pio_out(vcpu, size, port); + return in ? 
kvm_fast_pio_in(vcpu, size, port) + : kvm_fast_pio_out(vcpu, size, port); } static int nmi_interception(struct vcpu_svm *svm) @@ -3150,8 +3151,7 @@ static int skinit_interception(struct vcpu_svm *svm) static int wbinvd_interception(struct vcpu_svm *svm) { - kvm_emulate_wbinvd(&svm->vcpu); - return 1; + return kvm_emulate_wbinvd(&svm->vcpu); } static int xsetbv_interception(struct vcpu_svm *svm) @@ -3238,8 +3238,7 @@ static int task_switch_interception(struct vcpu_svm *svm) static int cpuid_interception(struct vcpu_svm *svm) { svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; - kvm_emulate_cpuid(&svm->vcpu); - return 1; + return kvm_emulate_cpuid(&svm->vcpu); } static int iret_interception(struct vcpu_svm *svm) @@ -3275,9 +3274,7 @@ static int rdpmc_interception(struct vcpu_svm *svm) return emulate_on_interception(svm); err = kvm_rdpmc(&svm->vcpu); - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static bool check_selective_cr0_intercepted(struct vcpu_svm *svm, @@ -3374,9 +3371,7 @@ static int cr_interception(struct vcpu_svm *svm) } kvm_register_write(&svm->vcpu, reg, val); } - kvm_complete_insn_gp(&svm->vcpu, err); - - return 1; + return kvm_complete_insn_gp(&svm->vcpu, err); } static int dr_interception(struct vcpu_svm *svm) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5382b82462fc..aae43c6f2472 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -132,6 +132,22 @@ module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO); #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + +/* + * Hyper-V requires all of these, so mark them as supported even though + * they are just treated the same as all-context. 
+ */ +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -446,23 +462,31 @@ struct nested_vmx { u16 vpid02; u16 last_vpid; + /* + * We only store the "true" versions of the VMX capability MSRs. We + * generate the "non-true" versions by setting the must-be-1 bits + * according to the SDM. + */ u32 nested_vmx_procbased_ctls_low; u32 nested_vmx_procbased_ctls_high; - u32 nested_vmx_true_procbased_ctls_low; u32 nested_vmx_secondary_ctls_low; u32 nested_vmx_secondary_ctls_high; u32 nested_vmx_pinbased_ctls_low; u32 nested_vmx_pinbased_ctls_high; u32 nested_vmx_exit_ctls_low; u32 nested_vmx_exit_ctls_high; - u32 nested_vmx_true_exit_ctls_low; u32 nested_vmx_entry_ctls_low; u32 nested_vmx_entry_ctls_high; - u32 nested_vmx_true_entry_ctls_low; u32 nested_vmx_misc_low; u32 nested_vmx_misc_high; u32 nested_vmx_ept_caps; u32 nested_vmx_vpid_caps; + u64 nested_vmx_basic; + u64 nested_vmx_cr0_fixed0; + u64 nested_vmx_cr0_fixed1; + u64 nested_vmx_cr4_fixed0; + u64 nested_vmx_cr4_fixed1; + u64 nested_vmx_vmcs_enum; }; #define POSTED_INTR_ON 0 @@ -520,6 +544,12 @@ static inline void pi_set_sn(struct pi_desc *pi_desc) (unsigned long *)&pi_desc->control); } +static inline void pi_clear_on(struct pi_desc *pi_desc) +{ + clear_bit(POSTED_INTR_ON, + (unsigned long *)&pi_desc->control); +} + static inline int pi_test_on(struct pi_desc *pi_desc) { return test_bit(POSTED_INTR_ON, @@ -920,16 +950,32 @@ static DEFINE_PER_CPU(struct desc_ptr, host_gdt); static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu); static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock); -static unsigned long *vmx_io_bitmap_a; -static unsigned long *vmx_io_bitmap_b; -static unsigned long 
*vmx_msr_bitmap_legacy; -static unsigned long *vmx_msr_bitmap_longmode; -static unsigned long *vmx_msr_bitmap_legacy_x2apic; -static unsigned long *vmx_msr_bitmap_longmode_x2apic; -static unsigned long *vmx_msr_bitmap_legacy_x2apic_apicv_inactive; -static unsigned long *vmx_msr_bitmap_longmode_x2apic_apicv_inactive; -static unsigned long *vmx_vmread_bitmap; -static unsigned long *vmx_vmwrite_bitmap; +enum { + VMX_IO_BITMAP_A, + VMX_IO_BITMAP_B, + VMX_MSR_BITMAP_LEGACY, + VMX_MSR_BITMAP_LONGMODE, + VMX_MSR_BITMAP_LEGACY_X2APIC_APICV, + VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV, + VMX_MSR_BITMAP_LEGACY_X2APIC, + VMX_MSR_BITMAP_LONGMODE_X2APIC, + VMX_VMREAD_BITMAP, + VMX_VMWRITE_BITMAP, + VMX_BITMAP_NR +}; + +static unsigned long *vmx_bitmap[VMX_BITMAP_NR]; + +#define vmx_io_bitmap_a (vmx_bitmap[VMX_IO_BITMAP_A]) +#define vmx_io_bitmap_b (vmx_bitmap[VMX_IO_BITMAP_B]) +#define vmx_msr_bitmap_legacy (vmx_bitmap[VMX_MSR_BITMAP_LEGACY]) +#define vmx_msr_bitmap_longmode (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE]) +#define vmx_msr_bitmap_legacy_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC_APICV]) +#define vmx_msr_bitmap_longmode_x2apic_apicv (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC_APICV]) +#define vmx_msr_bitmap_legacy_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LEGACY_X2APIC]) +#define vmx_msr_bitmap_longmode_x2apic (vmx_bitmap[VMX_MSR_BITMAP_LONGMODE_X2APIC]) +#define vmx_vmread_bitmap (vmx_bitmap[VMX_VMREAD_BITMAP]) +#define vmx_vmwrite_bitmap (vmx_bitmap[VMX_VMWRITE_BITMAP]) static bool cpu_has_load_ia32_efer; static bool cpu_has_load_perf_global_ctrl; @@ -2145,12 +2191,6 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx) #endif if (vmx->host_state.msr_host_bndcfgs) wrmsrl(MSR_IA32_BNDCFGS, vmx->host_state.msr_host_bndcfgs); - /* - * If the FPU is not active (through the host task or - * the guest vcpu), then restore the cr0.TS bit. 
- */ - if (!fpregs_active() && !vmx->vcpu.guest_fpu_loaded) - stts(); load_gdt(this_cpu_ptr(&host_gdt)); } @@ -2529,14 +2569,14 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv; } else { if (is_long_mode(vcpu)) - msr_bitmap = vmx_msr_bitmap_longmode_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_longmode_x2apic; else - msr_bitmap = vmx_msr_bitmap_legacy_x2apic_apicv_inactive; + msr_bitmap = vmx_msr_bitmap_legacy_x2apic; } } else { if (is_long_mode(vcpu)) @@ -2712,9 +2752,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_exit_ctls_high |= VM_EXIT_CLEAR_BNDCFGS; /* We support free control of debug control saving. */ - vmx->nested.nested_vmx_true_exit_ctls_low = - vmx->nested.nested_vmx_exit_ctls_low & - ~VM_EXIT_SAVE_DEBUG_CONTROLS; + vmx->nested.nested_vmx_exit_ctls_low &= ~VM_EXIT_SAVE_DEBUG_CONTROLS; /* entry controls */ rdmsr(MSR_IA32_VMX_ENTRY_CTLS, @@ -2733,9 +2771,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_entry_ctls_high |= VM_ENTRY_LOAD_BNDCFGS; /* We support free control of debug control loading. */ - vmx->nested.nested_vmx_true_entry_ctls_low = - vmx->nested.nested_vmx_entry_ctls_low & - ~VM_ENTRY_LOAD_DEBUG_CONTROLS; + vmx->nested.nested_vmx_entry_ctls_low &= ~VM_ENTRY_LOAD_DEBUG_CONTROLS; /* cpu-based controls */ rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, @@ -2768,8 +2804,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) CPU_BASED_USE_MSR_BITMAPS; /* We support free control of CR3 access interception. 
*/ - vmx->nested.nested_vmx_true_procbased_ctls_low = - vmx->nested.nested_vmx_procbased_ctls_low & + vmx->nested.nested_vmx_procbased_ctls_low &= ~(CPU_BASED_CR3_LOAD_EXITING | CPU_BASED_CR3_STORE_EXITING); /* secondary cpu-based controls */ @@ -2780,6 +2815,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) vmx->nested.nested_vmx_secondary_ctls_high &= SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_RDTSCP | + SECONDARY_EXEC_DESC | SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_APIC_REGISTER_VIRT | @@ -2811,8 +2847,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + VMX_VPID_EXTENT_SUPPORTED_MASK; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -2829,14 +2864,52 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE | VMX_MISC_ACTIVITY_HLT; vmx->nested.nested_vmx_misc_high = 0; + + /* + * This MSR reports some information about VMX support. We + * should return information about the VMX we emulate for the + * guest, and the VMCS structure we give it - not about the + * VMX support of the underlying hardware. + */ + vmx->nested.nested_vmx_basic = + VMCS12_REVISION | + VMX_BASIC_TRUE_CTLS | + ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | + (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); + + if (cpu_has_vmx_basic_inout()) + vmx->nested.nested_vmx_basic |= VMX_BASIC_INOUT; + + /* + * These MSRs specify bits which the guest must keep fixed on + * while L1 is in VMXON mode (in L1's root mode, or running an L2). + * We picked the standard core2 setting. 
+ */ +#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) +#define VMXON_CR4_ALWAYSON X86_CR4_VMXE + vmx->nested.nested_vmx_cr0_fixed0 = VMXON_CR0_ALWAYSON; + vmx->nested.nested_vmx_cr4_fixed0 = VMXON_CR4_ALWAYSON; + + /* These MSRs specify bits which the guest must keep fixed off. */ + rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx->nested.nested_vmx_cr0_fixed1); + rdmsrl(MSR_IA32_VMX_CR4_FIXED1, vmx->nested.nested_vmx_cr4_fixed1); + + /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + vmx->nested.nested_vmx_vmcs_enum = 0x2e; +} + +/* + * if fixed0[i] == 1: val[i] must be 1 + * if fixed1[i] == 0: val[i] must be 0 + */ +static inline bool fixed_bits_valid(u64 val, u64 fixed0, u64 fixed1) +{ + return ((val & fixed1) | fixed0) == val; } static inline bool vmx_control_verify(u32 control, u32 low, u32 high) { - /* - * Bits 0 in high must be 0, and bits 1 in low must be 1. - */ - return ((control & high) | low) == control; + return fixed_bits_valid(control, low, high); } static inline u64 vmx_control_msr(u32 low, u32 high) @@ -2844,87 +2917,285 @@ static inline u64 vmx_control_msr(u32 low, u32 high) return low | ((u64)high << 32); } -/* Returns 0 on success, non-0 otherwise. */ -static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask) +{ + superset &= mask; + subset &= mask; + + return (superset | subset) == superset; +} + +static int vmx_restore_vmx_basic(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved = + /* feature (except bit 48; see below) */ + BIT_ULL(49) | BIT_ULL(54) | BIT_ULL(55) | + /* reserved */ + BIT_ULL(31) | GENMASK_ULL(47, 45) | GENMASK_ULL(63, 56); + u64 vmx_basic = vmx->nested.nested_vmx_basic; + + if (!is_bitwise_subset(vmx_basic, data, feature_and_reserved)) + return -EINVAL; + + /* + * KVM does not emulate a version of VMX that constrains physical + * addresses of VMX structures (e.g. VMCS) to 32-bits. 
+ */ + if (data & BIT_ULL(48)) + return -EINVAL; + + if (vmx_basic_vmcs_revision_id(vmx_basic) != + vmx_basic_vmcs_revision_id(data)) + return -EINVAL; + + if (vmx_basic_vmcs_size(vmx_basic) > vmx_basic_vmcs_size(data)) + return -EINVAL; + + vmx->nested.nested_vmx_basic = data; + return 0; +} + +static int +vmx_restore_control_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 supported; + u32 *lowp, *highp; + + switch (msr_index) { + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + lowp = &vmx->nested.nested_vmx_pinbased_ctls_low; + highp = &vmx->nested.nested_vmx_pinbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + lowp = &vmx->nested.nested_vmx_procbased_ctls_low; + highp = &vmx->nested.nested_vmx_procbased_ctls_high; + break; + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + lowp = &vmx->nested.nested_vmx_exit_ctls_low; + highp = &vmx->nested.nested_vmx_exit_ctls_high; + break; + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + lowp = &vmx->nested.nested_vmx_entry_ctls_low; + highp = &vmx->nested.nested_vmx_entry_ctls_high; + break; + case MSR_IA32_VMX_PROCBASED_CTLS2: + lowp = &vmx->nested.nested_vmx_secondary_ctls_low; + highp = &vmx->nested.nested_vmx_secondary_ctls_high; + break; + default: + BUG(); + } + + supported = vmx_control_msr(*lowp, *highp); + + /* Check must-be-1 bits are still 1. */ + if (!is_bitwise_subset(data, supported, GENMASK_ULL(31, 0))) + return -EINVAL; + + /* Check must-be-0 bits are still 0. 
*/ + if (!is_bitwise_subset(supported, data, GENMASK_ULL(63, 32))) + return -EINVAL; + + *lowp = data; + *highp = data >> 32; + return 0; +} + +static int vmx_restore_vmx_misc(struct vcpu_vmx *vmx, u64 data) +{ + const u64 feature_and_reserved_bits = + /* feature */ + BIT_ULL(5) | GENMASK_ULL(8, 6) | BIT_ULL(14) | BIT_ULL(15) | + BIT_ULL(28) | BIT_ULL(29) | BIT_ULL(30) | + /* reserved */ + GENMASK_ULL(13, 9) | BIT_ULL(31); + u64 vmx_misc; + + vmx_misc = vmx_control_msr(vmx->nested.nested_vmx_misc_low, + vmx->nested.nested_vmx_misc_high); + + if (!is_bitwise_subset(vmx_misc, data, feature_and_reserved_bits)) + return -EINVAL; + + if ((vmx->nested.nested_vmx_pinbased_ctls_high & + PIN_BASED_VMX_PREEMPTION_TIMER) && + vmx_misc_preemption_timer_rate(data) != + vmx_misc_preemption_timer_rate(vmx_misc)) + return -EINVAL; + + if (vmx_misc_cr3_count(data) > vmx_misc_cr3_count(vmx_misc)) + return -EINVAL; + + if (vmx_misc_max_msr(data) > vmx_misc_max_msr(vmx_misc)) + return -EINVAL; + + if (vmx_misc_mseg_revid(data) != vmx_misc_mseg_revid(vmx_misc)) + return -EINVAL; + + vmx->nested.nested_vmx_misc_low = data; + vmx->nested.nested_vmx_misc_high = data >> 32; + return 0; +} + +static int vmx_restore_vmx_ept_vpid_cap(struct vcpu_vmx *vmx, u64 data) +{ + u64 vmx_ept_vpid_cap; + + vmx_ept_vpid_cap = vmx_control_msr(vmx->nested.nested_vmx_ept_caps, + vmx->nested.nested_vmx_vpid_caps); + + /* Every bit is either reserved or a feature bit. 
*/ + if (!is_bitwise_subset(vmx_ept_vpid_cap, data, -1ULL)) + return -EINVAL; + + vmx->nested.nested_vmx_ept_caps = data; + vmx->nested.nested_vmx_vpid_caps = data >> 32; + return 0; +} + +static int vmx_restore_fixed0_msr(struct vcpu_vmx *vmx, u32 msr_index, u64 data) +{ + u64 *msr; + + switch (msr_index) { + case MSR_IA32_VMX_CR0_FIXED0: + msr = &vmx->nested.nested_vmx_cr0_fixed0; + break; + case MSR_IA32_VMX_CR4_FIXED0: + msr = &vmx->nested.nested_vmx_cr4_fixed0; + break; + default: + BUG(); + } + + /* + * 1 bits (which indicates bits which "must-be-1" during VMX operation) + * must be 1 in the restored value. + */ + if (!is_bitwise_subset(data, *msr, -1ULL)) + return -EINVAL; + + *msr = data; + return 0; +} + +/* + * Called when userspace is restoring VMX MSRs. + * + * Returns 0 on success, non-0 otherwise. + */ +static int vmx_set_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) { struct vcpu_vmx *vmx = to_vmx(vcpu); switch (msr_index) { case MSR_IA32_VMX_BASIC: + return vmx_restore_vmx_basic(vmx, data); + case MSR_IA32_VMX_PINBASED_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS: + case MSR_IA32_VMX_EXIT_CTLS: + case MSR_IA32_VMX_ENTRY_CTLS: + /* + * The "non-true" VMX capability MSRs are generated from the + * "true" MSRs, so we do not support restoring them directly. + * + * If userspace wants to emulate VMX_BASIC[55]=0, userspace + * should restore the "true" MSRs with the must-be-1 bits + * set according to the SDM Vol 3. A.2 "RESERVED CONTROLS AND + * DEFAULT SETTINGS". 
+ */ + return -EINVAL; + case MSR_IA32_VMX_TRUE_PINBASED_CTLS: + case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: + case MSR_IA32_VMX_TRUE_EXIT_CTLS: + case MSR_IA32_VMX_TRUE_ENTRY_CTLS: + case MSR_IA32_VMX_PROCBASED_CTLS2: + return vmx_restore_control_msr(vmx, msr_index, data); + case MSR_IA32_VMX_MISC: + return vmx_restore_vmx_misc(vmx, data); + case MSR_IA32_VMX_CR0_FIXED0: + case MSR_IA32_VMX_CR4_FIXED0: + return vmx_restore_fixed0_msr(vmx, msr_index, data); + case MSR_IA32_VMX_CR0_FIXED1: + case MSR_IA32_VMX_CR4_FIXED1: + /* + * These MSRs are generated based on the vCPU's CPUID, so we + * do not support restoring them directly. + */ + return -EINVAL; + case MSR_IA32_VMX_EPT_VPID_CAP: + return vmx_restore_vmx_ept_vpid_cap(vmx, data); + case MSR_IA32_VMX_VMCS_ENUM: + vmx->nested.nested_vmx_vmcs_enum = data; + return 0; + default: /* - * This MSR reports some information about VMX support. We - * should return information about the VMX we emulate for the - * guest, and the VMCS structure we give it - not about the - * VMX support of the underlying hardware. + * The rest of the VMX capability MSRs do not support restore. */ - *pdata = VMCS12_REVISION | VMX_BASIC_TRUE_CTLS | - ((u64)VMCS12_SIZE << VMX_BASIC_VMCS_SIZE_SHIFT) | - (VMX_BASIC_MEM_TYPE_WB << VMX_BASIC_MEM_TYPE_SHIFT); - if (cpu_has_vmx_basic_inout()) - *pdata |= VMX_BASIC_INOUT; + return -EINVAL; + } +} + +/* Returns 0 on success, non-0 otherwise. 
*/ +static int vmx_get_vmx_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + + switch (msr_index) { + case MSR_IA32_VMX_BASIC: + *pdata = vmx->nested.nested_vmx_basic; break; case MSR_IA32_VMX_TRUE_PINBASED_CTLS: case MSR_IA32_VMX_PINBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PINBASED_CTLS) + *pdata |= PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_PROCBASED_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_procbased_ctls_low, - vmx->nested.nested_vmx_procbased_ctls_high); - break; case MSR_IA32_VMX_PROCBASED_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_procbased_ctls_low, vmx->nested.nested_vmx_procbased_ctls_high); + if (msr_index == MSR_IA32_VMX_PROCBASED_CTLS) + *pdata |= CPU_BASED_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_EXIT_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_exit_ctls_low, - vmx->nested.nested_vmx_exit_ctls_high); - break; case MSR_IA32_VMX_EXIT_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high); + if (msr_index == MSR_IA32_VMX_EXIT_CTLS) + *pdata |= VM_EXIT_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_TRUE_ENTRY_CTLS: - *pdata = vmx_control_msr( - vmx->nested.nested_vmx_true_entry_ctls_low, - vmx->nested.nested_vmx_entry_ctls_high); - break; case MSR_IA32_VMX_ENTRY_CTLS: *pdata = vmx_control_msr( vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high); + if (msr_index == MSR_IA32_VMX_ENTRY_CTLS) + *pdata |= VM_ENTRY_ALWAYSON_WITHOUT_TRUE_MSR; break; case MSR_IA32_VMX_MISC: *pdata = vmx_control_msr( vmx->nested.nested_vmx_misc_low, vmx->nested.nested_vmx_misc_high); break; - /* - * These MSRs specify bits which the guest must keep fixed (on or off) - * while L1 is in VMXON mode (in L1's root mode, or running an L2). 
- * We picked the standard core2 setting. - */ -#define VMXON_CR0_ALWAYSON (X86_CR0_PE | X86_CR0_PG | X86_CR0_NE) -#define VMXON_CR4_ALWAYSON X86_CR4_VMXE case MSR_IA32_VMX_CR0_FIXED0: - *pdata = VMXON_CR0_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr0_fixed0; break; case MSR_IA32_VMX_CR0_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr0_fixed1; break; case MSR_IA32_VMX_CR4_FIXED0: - *pdata = VMXON_CR4_ALWAYSON; + *pdata = vmx->nested.nested_vmx_cr4_fixed0; break; case MSR_IA32_VMX_CR4_FIXED1: - *pdata = -1ULL; + *pdata = vmx->nested.nested_vmx_cr4_fixed1; break; case MSR_IA32_VMX_VMCS_ENUM: - *pdata = 0x2e; /* highest index: VMX_PREEMPTION_TIMER_VALUE */ + *pdata = vmx->nested.nested_vmx_vmcs_enum; break; case MSR_IA32_VMX_PROCBASED_CTLS2: *pdata = vmx_control_msr( @@ -3107,7 +3378,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) vmx_leave_nested(vcpu); break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: - return 1; /* they are read-only */ + if (!msr_info->host_initiated) + return 1; /* they are read-only */ + if (!nested_vmx_allowed(vcpu)) + return 1; + return vmx_set_vmx_msr(vcpu, msr_index, data); case MSR_IA32_XSS: if (!vmx_xsaves_supported()) return 1; @@ -3869,6 +4144,40 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu) (unsigned long *)&vcpu->arch.regs_dirty); } +static bool nested_guest_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + + if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & + SECONDARY_EXEC_UNRESTRICTED_GUEST && + nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) + fixed0 &= ~(X86_CR0_PE | X86_CR0_PG); + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_host_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr0_fixed0; + u64 fixed1 = 
to_vmx(vcpu)->nested.nested_vmx_cr0_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +static bool nested_cr4_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + u64 fixed0 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed0; + u64 fixed1 = to_vmx(vcpu)->nested.nested_vmx_cr4_fixed1; + + return fixed_bits_valid(val, fixed0, fixed1); +} + +/* No difference in the restrictions on guest and host CR4 in VMX operation. */ +#define nested_guest_cr4_valid nested_cr4_valid +#define nested_host_cr4_valid nested_cr4_valid + static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, @@ -3997,8 +4306,8 @@ static int vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) if (!nested_vmx_allowed(vcpu)) return 1; } - if (to_vmx(vcpu)->nested.vmxon && - ((cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) + + if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, cr4)) return 1; vcpu->arch.cr4 = cr4; @@ -4575,41 +4884,6 @@ static void __vmx_disable_intercept_for_msr(unsigned long *msr_bitmap, } } -static void __vmx_enable_intercept_for_msr(unsigned long *msr_bitmap, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) - return; - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. 
- */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R) - /* read-low */ - __set_bit(msr, msr_bitmap + 0x000 / f); - - if (type & MSR_TYPE_W) - /* write-low */ - __set_bit(msr, msr_bitmap + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R) - /* read-high */ - __set_bit(msr, msr_bitmap + 0x400 / f); - - if (type & MSR_TYPE_W) - /* write-high */ - __set_bit(msr, msr_bitmap + 0xc00 / f); - - } -} - /* * If a msr is allowed by L0, we should check whether it is allowed by L1. * The corresponding bit will be cleared unless both of L0 and L1 allow it. @@ -4665,48 +4939,18 @@ static void vmx_disable_intercept_for_msr(u32 msr, bool longmode_only) msr, MSR_TYPE_R | MSR_TYPE_W); } -static void vmx_enable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); - } else { - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_enable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void vmx_disable_intercept_msr_read_x2apic(u32 msr, bool apicv_active) +static void vmx_disable_intercept_msr_x2apic(u32 msr, int type, bool apicv_active) { if (apicv_active) { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_R); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv, + msr, type); + __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv, + msr, type); } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_R); - } -} - -static void 
vmx_disable_intercept_msr_write_x2apic(u32 msr, bool apicv_active) -{ - if (apicv_active) { __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic, - msr, MSR_TYPE_W); + msr, type); __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic, - msr, MSR_TYPE_W); - } else { - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, - msr, MSR_TYPE_W); - __vmx_disable_intercept_for_msr(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, - msr, MSR_TYPE_W); + msr, type); } } @@ -4828,9 +5072,15 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); - if (!pi_test_and_clear_on(&vmx->pi_desc)) + if (!pi_test_on(&vmx->pi_desc)) return; + pi_clear_on(&vmx->pi_desc); + /* + * IOMMU can write to PIR.ON, so the barrier matters even on UP. + * But on x86 this is just a compiler barrier anyway. + */ + smp_mb__after_atomic(); kvm_apic_update_irr(vcpu, vmx->pi_desc.pir); } @@ -4845,9 +5095,11 @@ static void vmx_set_constant_host_state(struct vcpu_vmx *vmx) u32 low32, high32; unsigned long tmpl; struct desc_ptr dt; - unsigned long cr4; + unsigned long cr0, cr4; - vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS); /* 22.2.3 */ + cr0 = read_cr0(); + WARN_ON(cr0 & X86_CR0_TS); + vmcs_writel(HOST_CR0, cr0); /* 22.2.3 */ vmcs_writel(HOST_CR3, read_cr3()); /* 22.2.3 FIXME: shadow tables */ /* Save the most likely value for this task's CR4 in the VMCS. 
*/ @@ -5587,7 +5839,7 @@ static int handle_triple_fault(struct kvm_vcpu *vcpu) static int handle_io(struct kvm_vcpu *vcpu) { unsigned long exit_qualification; - int size, in, string; + int size, in, string, ret; unsigned port; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -5601,9 +5853,14 @@ static int handle_io(struct kvm_vcpu *vcpu) port = exit_qualification >> 16; size = (exit_qualification & 7) + 1; - skip_emulated_instruction(vcpu); - return kvm_fast_pio_out(vcpu, size, port); + ret = kvm_skip_emulated_instruction(vcpu); + + /* + * TODO: we might be squashing a KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. + */ + return kvm_fast_pio_out(vcpu, size, port) && ret; } static void @@ -5617,18 +5874,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) hypercall[2] = 0xc1; } -static bool nested_cr0_valid(struct kvm_vcpu *vcpu, unsigned long val) -{ - unsigned long always_on = VMXON_CR0_ALWAYSON; - struct vmcs12 *vmcs12 = get_vmcs12(vcpu); - - if (to_vmx(vcpu)->nested.nested_vmx_secondary_ctls_high & - SECONDARY_EXEC_UNRESTRICTED_GUEST && - nested_cpu_has2(vmcs12, SECONDARY_EXEC_UNRESTRICTED_GUEST)) - always_on &= ~(X86_CR0_PE | X86_CR0_PG); - return (val & always_on) == always_on; -} - /* called to set cr0 as appropriate for a mov-to-cr0 exit. 
*/ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) { @@ -5647,7 +5892,7 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) val = (val & ~vmcs12->cr0_guest_host_mask) | (vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask); - if (!nested_cr0_valid(vcpu, val)) + if (!nested_guest_cr0_valid(vcpu, val)) return 1; if (kvm_set_cr0(vcpu, val)) @@ -5656,8 +5901,9 @@ static int handle_set_cr0(struct kvm_vcpu *vcpu, unsigned long val) return 0; } else { if (to_vmx(vcpu)->nested.vmxon && - ((val & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON)) + !nested_host_cr0_valid(vcpu, val)) return 1; + return kvm_set_cr0(vcpu, val); } } @@ -5701,6 +5947,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) int cr; int reg; int err; + int ret; exit_qualification = vmcs_readl(EXIT_QUALIFICATION); cr = exit_qualification & 15; @@ -5712,25 +5959,27 @@ static int handle_cr(struct kvm_vcpu *vcpu) switch (cr) { case 0: err = handle_set_cr0(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 3: err = kvm_set_cr3(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 4: err = handle_set_cr4(vcpu, val); - kvm_complete_insn_gp(vcpu, err); - return 1; + return kvm_complete_insn_gp(vcpu, err); case 8: { u8 cr8_prev = kvm_get_cr8(vcpu); u8 cr8 = (u8)val; err = kvm_set_cr8(vcpu, cr8); - kvm_complete_insn_gp(vcpu, err); + ret = kvm_complete_insn_gp(vcpu, err); if (lapic_in_kernel(vcpu)) - return 1; + return ret; if (cr8_prev <= cr8) - return 1; + return ret; + /* + * TODO: we might be squashing a + * KVM_GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. 
+ */ vcpu->run->exit_reason = KVM_EXIT_SET_TPR; return 0; } @@ -5739,23 +5988,20 @@ static int handle_cr(struct kvm_vcpu *vcpu) case 2: /* clts */ handle_clts(vcpu); trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - skip_emulated_instruction(vcpu); vmx_fpu_activate(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 1: /*mov from cr*/ switch (cr) { case 3: val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); case 8: val = kvm_get_cr8(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; case 3: /* lmsw */ @@ -5763,8 +6009,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val); kvm_lmsw(vcpu, val); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); default: break; } @@ -5835,8 +6080,7 @@ static int handle_dr(struct kvm_vcpu *vcpu) if (kvm_set_dr(vcpu, dr, kvm_register_readl(vcpu, reg))) return 1; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static u64 vmx_get_dr6(struct kvm_vcpu *vcpu) @@ -5868,8 +6112,7 @@ static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) static int handle_cpuid(struct kvm_vcpu *vcpu) { - kvm_emulate_cpuid(vcpu); - return 1; + return kvm_emulate_cpuid(vcpu); } static int handle_rdmsr(struct kvm_vcpu *vcpu) @@ -5890,8 +6133,7 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu) /* FIXME: handling of bits 32:63 of rax, rdx */ vcpu->arch.regs[VCPU_REGS_RAX] = msr_info.data & -1u; vcpu->arch.regs[VCPU_REGS_RDX] = (msr_info.data >> 32) & -1u; - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_wrmsr(struct kvm_vcpu *vcpu) @@ -5911,8 +6153,7 @@ static int handle_wrmsr(struct 
kvm_vcpu *vcpu) } trace_kvm_msr_write(ecx, data); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) @@ -5956,8 +6197,7 @@ static int handle_invlpg(struct kvm_vcpu *vcpu) unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); kvm_mmu_invlpg(vcpu, exit_qualification); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_rdpmc(struct kvm_vcpu *vcpu) @@ -5965,15 +6205,12 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu) int err; err = kvm_rdpmc(vcpu); - kvm_complete_insn_gp(vcpu, err); - - return 1; + return kvm_complete_insn_gp(vcpu, err); } static int handle_wbinvd(struct kvm_vcpu *vcpu) { - kvm_emulate_wbinvd(vcpu); - return 1; + return kvm_emulate_wbinvd(vcpu); } static int handle_xsetbv(struct kvm_vcpu *vcpu) @@ -5982,20 +6219,20 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) u32 index = kvm_register_read(vcpu, VCPU_REGS_RCX); if (kvm_set_xcr(vcpu, index, new_bv) == 0) - skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); return 1; } static int handle_xsaves(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } static int handle_xrstors(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); + kvm_skip_emulated_instruction(vcpu); WARN(1, "this should never happen\n"); return 1; } @@ -6016,8 +6253,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu) if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && (offset == APIC_EOI)) { kvm_lapic_set_eoi(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } } return emulate_instruction(vcpu, 0) == EMULATE_DONE; @@ -6165,9 +6401,8 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); if (!kvm_io_bus_write(vcpu, 
KVM_FAST_MMIO_BUS, gpa, 0, NULL)) { - skip_emulated_instruction(vcpu); trace_kvm_fast_mmio(gpa); - return 1; + return kvm_skip_emulated_instruction(vcpu); } ret = handle_mmio_page_fault(vcpu, gpa, true); @@ -6352,50 +6587,13 @@ static __init int hardware_setup(void) for (i = 0; i < ARRAY_SIZE(vmx_msr_index); ++i) kvm_define_shared_msr(i, vmx_msr_index[i]); - vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_a) - return r; + for (i = 0; i < VMX_BITMAP_NR; i++) { + vmx_bitmap[i] = (unsigned long *)__get_free_page(GFP_KERNEL); + if (!vmx_bitmap[i]) + goto out; + } vmx_io_bitmap_b = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_io_bitmap_b) - goto out; - - vmx_msr_bitmap_legacy = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy) - goto out1; - - vmx_msr_bitmap_legacy_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic) - goto out2; - - vmx_msr_bitmap_legacy_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_legacy_x2apic_apicv_inactive) - goto out3; - - vmx_msr_bitmap_longmode = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode) - goto out4; - - vmx_msr_bitmap_longmode_x2apic = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic) - goto out5; - - vmx_msr_bitmap_longmode_x2apic_apicv_inactive = - (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_msr_bitmap_longmode_x2apic_apicv_inactive) - goto out6; - - vmx_vmread_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmread_bitmap) - goto out7; - - vmx_vmwrite_bitmap = (unsigned long *)__get_free_page(GFP_KERNEL); - if (!vmx_vmwrite_bitmap) - goto out8; - memset(vmx_vmread_bitmap, 0xff, PAGE_SIZE); memset(vmx_vmwrite_bitmap, 0xff, PAGE_SIZE); @@ -6413,7 +6611,7 @@ static __init int hardware_setup(void) if (setup_vmcs_config(&vmcs_config) < 0) { r = -EIO; - goto out9; + goto out; } if 
(boot_cpu_has(X86_FEATURE_NX)) @@ -6476,39 +6674,34 @@ static __init int hardware_setup(void) vmx_disable_intercept_for_msr(MSR_IA32_SYSENTER_EIP, false); vmx_disable_intercept_for_msr(MSR_IA32_BNDCFGS, true); - memcpy(vmx_msr_bitmap_legacy_x2apic, + memcpy(vmx_msr_bitmap_legacy_x2apic_apicv, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic, + memcpy(vmx_msr_bitmap_longmode_x2apic_apicv, vmx_msr_bitmap_longmode, PAGE_SIZE); - memcpy(vmx_msr_bitmap_legacy_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_legacy_x2apic, vmx_msr_bitmap_legacy, PAGE_SIZE); - memcpy(vmx_msr_bitmap_longmode_x2apic_apicv_inactive, + memcpy(vmx_msr_bitmap_longmode_x2apic, vmx_msr_bitmap_longmode, PAGE_SIZE); set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */ + for (msr = 0x800; msr <= 0x8ff; msr++) { + if (msr == 0x839 /* TMCCT */) + continue; + vmx_disable_intercept_msr_x2apic(msr, MSR_TYPE_R, true); + } + /* - * enable_apicv && kvm_vcpu_apicv_active() + * TPR reads and writes can be virtualized even if virtual interrupt + * delivery is not in use. 
*/ - for (msr = 0x800; msr <= 0x8ff; msr++) - vmx_disable_intercept_msr_read_x2apic(msr, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_W, true); + vmx_disable_intercept_msr_x2apic(0x808, MSR_TYPE_R | MSR_TYPE_W, false); - /* TMCCT */ - vmx_enable_intercept_msr_read_x2apic(0x839, true); - /* TPR */ - vmx_disable_intercept_msr_write_x2apic(0x808, true); /* EOI */ - vmx_disable_intercept_msr_write_x2apic(0x80b, true); + vmx_disable_intercept_msr_x2apic(0x80b, MSR_TYPE_W, true); /* SELF-IPI */ - vmx_disable_intercept_msr_write_x2apic(0x83f, true); - - /* - * (enable_apicv && !kvm_vcpu_apicv_active()) || - * !enable_apicv - */ - /* TPR */ - vmx_disable_intercept_msr_read_x2apic(0x808, false); - vmx_disable_intercept_msr_write_x2apic(0x808, false); + vmx_disable_intercept_msr_x2apic(0x83f, MSR_TYPE_W, true); if (enable_ept) { kvm_mmu_set_mask_ptes(VMX_EPT_READABLE_MASK, @@ -6555,42 +6748,19 @@ static __init int hardware_setup(void) return alloc_kvm_area(); -out9: - free_page((unsigned long)vmx_vmwrite_bitmap); -out8: - free_page((unsigned long)vmx_vmread_bitmap); -out7: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); -out6: - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); -out5: - free_page((unsigned long)vmx_msr_bitmap_longmode); -out4: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); -out3: - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); -out2: - free_page((unsigned long)vmx_msr_bitmap_legacy); -out1: - free_page((unsigned long)vmx_io_bitmap_b); out: - free_page((unsigned long)vmx_io_bitmap_a); + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); return r; } static __exit void hardware_unsetup(void) { - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic); - free_page((unsigned long)vmx_msr_bitmap_legacy_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_longmode_x2apic); - free_page((unsigned 
long)vmx_msr_bitmap_longmode_x2apic_apicv_inactive); - free_page((unsigned long)vmx_msr_bitmap_legacy); - free_page((unsigned long)vmx_msr_bitmap_longmode); - free_page((unsigned long)vmx_io_bitmap_b); - free_page((unsigned long)vmx_io_bitmap_a); - free_page((unsigned long)vmx_vmwrite_bitmap); - free_page((unsigned long)vmx_vmread_bitmap); + int i; + + for (i = 0; i < VMX_BITMAP_NR; i++) + free_page((unsigned long)vmx_bitmap[i]); free_kvm_area(); } @@ -6604,16 +6774,13 @@ static int handle_pause(struct kvm_vcpu *vcpu) if (ple_gap) grow_ple_window(vcpu); - skip_emulated_instruction(vcpu); kvm_vcpu_on_spin(vcpu); - - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_nop(struct kvm_vcpu *vcpu) { - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_mwait(struct kvm_vcpu *vcpu) @@ -6920,8 +7087,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, */ if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } page = nested_get_page(vcpu, vmptr); @@ -6929,8 +7095,7 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, *(u32 *)kmap(page) != VMCS12_REVISION) { nested_vmx_failInvalid(vcpu); kunmap(page); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } kunmap(page); vmx->nested.vmxon_ptr = vmptr; @@ -6939,30 +7104,26 @@ static int nested_vmx_check_vmptr(struct kvm_vcpu *vcpu, int exit_reason, if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); 
} break; case EXIT_REASON_VMPTRLD: if (!PAGE_ALIGNED(vmptr) || (vmptr >> maxphyaddr)) { nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INVALID_ADDRESS); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmptr == vmx->nested.vmxon_ptr) { nested_vmx_failValid(vcpu, VMXERR_VMCLEAR_VMXON_POINTER); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } break; default: @@ -7018,8 +7179,7 @@ static int handle_vmon(struct kvm_vcpu *vcpu) if (vmx->nested.vmxon) { nested_vmx_failValid(vcpu, VMXERR_VMXON_IN_VMX_ROOT_OPERATION); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if ((vmx->msr_ia32_feature_control & VMXON_NEEDED_FEATURES) @@ -7059,9 +7219,8 @@ static int handle_vmon(struct kvm_vcpu *vcpu) vmx->nested.vmxon = true; - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); out_shadow_vmcs: kfree(vmx->nested.cached_vmcs12); @@ -7180,9 +7339,8 @@ static int handle_vmoff(struct kvm_vcpu *vcpu) if (!nested_vmx_check_permission(vcpu)) return 1; free_nested(to_vmx(vcpu)); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMCLEAR instruction */ @@ -7221,9 +7379,8 @@ static int handle_vmclear(struct kvm_vcpu *vcpu) nested_free_vmcs02(vmx, vmptr); - skip_emulated_instruction(vcpu); nested_vmx_succeed(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch); @@ -7421,7 +7578,6 @@ static int nested_vmx_check_vmcs12(struct kvm_vcpu *vcpu) struct vcpu_vmx *vmx = to_vmx(vcpu); if (vmx->nested.current_vmptr == -1ull) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); return 0; } return 1; @@ -7435,17 +7591,18 @@ static int handle_vmread(struct kvm_vcpu *vcpu) u32 vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); 
gva_t gva = 0; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + /* Decode instruction info and find the field to read */ field = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 28) & 0xf)); /* Read the field, zero-extended to a u64 field_value */ if (vmcs12_read_any(vcpu, field, &field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* * Now copy part of this value to register or memory, as requested. @@ -7465,8 +7622,7 @@ static int handle_vmread(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } @@ -7485,10 +7641,12 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) u64 field_value = 0; struct x86_exception e; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; + if (!nested_vmx_check_vmcs12(vcpu)) + return kvm_skip_emulated_instruction(vcpu); + if (vmx_instruction_info & (1u << 10)) field_value = kvm_register_readl(vcpu, (((vmx_instruction_info) >> 3) & 0xf)); @@ -7508,19 +7666,16 @@ static int handle_vmwrite(struct kvm_vcpu *vcpu) if (vmcs_field_readonly(field)) { nested_vmx_failValid(vcpu, VMXERR_VMWRITE_READ_ONLY_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } if (vmcs12_write_any(vcpu, field, field_value) < 0) { nested_vmx_failValid(vcpu, VMXERR_UNSUPPORTED_VMCS_COMPONENT); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRLD instruction */ @@ -7541,8 
+7696,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) page = nested_get_page(vcpu, vmptr); if (page == NULL) { nested_vmx_failInvalid(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } new_vmcs12 = kmap(page); if (new_vmcs12->revision_id != VMCS12_REVISION) { @@ -7550,8 +7704,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) nested_release_page_clean(page); nested_vmx_failValid(vcpu, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } nested_release_vmcs12(vmx); @@ -7575,8 +7728,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu) } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the VMPTRST instruction */ @@ -7601,8 +7753,7 @@ static int handle_vmptrst(struct kvm_vcpu *vcpu) return 1; } nested_vmx_succeed(vcpu); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* Emulate the INVEPT instruction */ @@ -7640,8 +7791,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* According to the Intel VMX instruction reference, the memory @@ -7672,8 +7822,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) break; } - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } static int handle_invvpid(struct kvm_vcpu *vcpu) @@ -7698,13 +7847,13 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + types = (vmx->nested.nested_vmx_vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if 
(type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); - skip_emulated_instruction(vcpu); - return 1; + return kvm_skip_emulated_instruction(vcpu); } /* according to the intel vmx instruction reference, the memory @@ -7720,23 +7869,26 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: case VMX_VPID_EXTENT_SINGLE_CONTEXT: - /* - * Old versions of KVM use the single-context version so we - * have to support it; just treat it the same as all-context. - */ + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!vpid) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + return kvm_skip_emulated_instruction(vcpu); + } + break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); - nested_vmx_succeed(vcpu); break; default: - /* Trap individual address invalidation invvpid calls */ - BUG_ON(1); - break; + WARN_ON_ONCE(1); + return kvm_skip_emulated_instruction(vcpu); } - skip_emulated_instruction(vcpu); - return 1; + __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + nested_vmx_succeed(vcpu); + + return kvm_skip_emulated_instruction(vcpu); } static int handle_pml_full(struct kvm_vcpu *vcpu) @@ -8075,6 +8227,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return nested_cpu_has(vmcs12, CPU_BASED_MOV_DR_EXITING); case EXIT_REASON_IO_INSTRUCTION: return nested_vmx_exit_handled_io(vcpu, vmcs12); + case EXIT_REASON_GDTR_IDTR: case EXIT_REASON_LDTR_TR: + return nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC); case EXIT_REASON_MSR_READ: case EXIT_REASON_MSR_WRITE: return nested_vmx_exit_handled_msr(vcpu, vmcs12, exit_reason); @@ -8624,11 +8778,6 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) u32 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); register void *__sp asm(_ASM_SP); - /* - * If external interrupt exists, IF bit is set in rflags/eflags on the - * interrupt stack frame, and interrupt 
will be enabled on a return - * from interrupt handler. - */ if ((exit_intr_info & (INTR_INFO_VALID_MASK | INTR_INFO_INTR_TYPE_MASK)) == (INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR)) { unsigned int vector; @@ -8813,7 +8962,7 @@ static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx) msrs[i].host); } -void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) +static void vmx_arm_hv_timer(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); u64 tscl; @@ -9283,6 +9432,50 @@ static void vmcs_set_secondary_exec_control(u32 new_ctl) (new_ctl & ~mask) | (cur_ctl & mask)); } +/* + * Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits + * (indicating "allowed-1") if they are supported in the guest's CPUID. + */ +static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu) +{ + struct vcpu_vmx *vmx = to_vmx(vcpu); + struct kvm_cpuid_entry2 *entry; + + vmx->nested.nested_vmx_cr0_fixed1 = 0xffffffff; + vmx->nested.nested_vmx_cr4_fixed1 = X86_CR4_PCE; + +#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \ + if (entry && (entry->_reg & (_cpuid_mask))) \ + vmx->nested.nested_vmx_cr4_fixed1 |= (_cr4_mask); \ +} while (0) + + entry = kvm_find_cpuid_entry(vcpu, 0x1, 0); + cr4_fixed1_update(X86_CR4_VME, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_PVI, edx, bit(X86_FEATURE_VME)); + cr4_fixed1_update(X86_CR4_TSD, edx, bit(X86_FEATURE_TSC)); + cr4_fixed1_update(X86_CR4_DE, edx, bit(X86_FEATURE_DE)); + cr4_fixed1_update(X86_CR4_PSE, edx, bit(X86_FEATURE_PSE)); + cr4_fixed1_update(X86_CR4_PAE, edx, bit(X86_FEATURE_PAE)); + cr4_fixed1_update(X86_CR4_MCE, edx, bit(X86_FEATURE_MCE)); + cr4_fixed1_update(X86_CR4_PGE, edx, bit(X86_FEATURE_PGE)); + cr4_fixed1_update(X86_CR4_OSFXSR, edx, bit(X86_FEATURE_FXSR)); + cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, bit(X86_FEATURE_XMM)); + cr4_fixed1_update(X86_CR4_VMXE, ecx, bit(X86_FEATURE_VMX)); + cr4_fixed1_update(X86_CR4_SMXE, ecx, bit(X86_FEATURE_SMX)); + cr4_fixed1_update(X86_CR4_PCIDE, ecx, 
bit(X86_FEATURE_PCID)); + cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, bit(X86_FEATURE_XSAVE)); + + entry = kvm_find_cpuid_entry(vcpu, 0x7, 0); + cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, bit(X86_FEATURE_FSGSBASE)); + cr4_fixed1_update(X86_CR4_SMEP, ebx, bit(X86_FEATURE_SMEP)); + cr4_fixed1_update(X86_CR4_SMAP, ebx, bit(X86_FEATURE_SMAP)); + cr4_fixed1_update(X86_CR4_PKE, ecx, bit(X86_FEATURE_PKU)); + /* TODO: Use X86_CR4_UMIP and X86_FEATURE_UMIP macros */ + cr4_fixed1_update(bit(11), ecx, bit(2)); + +#undef cr4_fixed1_update +} + static void vmx_cpuid_update(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; @@ -9324,6 +9517,9 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu) else to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &= ~FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX; + + if (nested_vmx_allowed(vcpu)) + nested_vmx_cr_fixed1_bits_update(vcpu); } static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) @@ -9778,6 +9974,49 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) return 0; } +static bool nested_cr3_valid(struct kvm_vcpu *vcpu, unsigned long val) +{ + unsigned long invalid_mask; + + invalid_mask = (~0ULL) << cpuid_maxphyaddr(vcpu); + return (val & invalid_mask) == 0; +} + +/* + * Load guest's/host's cr3 at nested entry/exit. nested_ept is true if we are + * emulating VM entry into a guest with EPT enabled. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. + */ +static int nested_vmx_load_cr3(struct kvm_vcpu *vcpu, unsigned long cr3, bool nested_ept, + unsigned long *entry_failure_code) +{ + if (cr3 != kvm_read_cr3(vcpu) || (!nested_ept && pdptrs_changed(vcpu))) { + if (!nested_cr3_valid(vcpu, cr3)) { + *entry_failure_code = ENTRY_FAIL_DEFAULT; + return 1; + } + + /* + * If PAE paging and EPT are both on, CR3 is not used by the CPU and + * must not be dereferenced. 
+ */ + if (!is_long_mode(vcpu) && is_pae(vcpu) && is_paging(vcpu) && + !nested_ept) { + if (!load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3)) { + *entry_failure_code = ENTRY_FAIL_PDPTE; + return 1; + } + } + + vcpu->arch.cr3 = cr3; + __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail); + } + + kvm_mmu_reset_context(vcpu); + return 0; +} + /* * prepare_vmcs02 is called when the L1 guest hypervisor runs its nested * L2 guest. L1 has a vmcs for L2 (vmcs12), and this function "merges" it @@ -9786,11 +10025,15 @@ static int nested_vmx_store_msr(struct kvm_vcpu *vcpu, u64 gpa, u32 count) * needs. In addition to modifying the active vmcs (which is vmcs02), this * function also has additional necessary side-effects, like setting various * vcpu->arch fields. + * Returns 0 on success, 1 on failure. Invalid state exit qualification code + * is assigned to entry_failure_code on failure. */ -static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) +static int prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, + unsigned long *entry_failure_code) { struct vcpu_vmx *vmx = to_vmx(vcpu); u32 exec_control; + bool nested_ept_enabled = false; vmcs_write16(GUEST_ES_SELECTOR, vmcs12->guest_es_selector); vmcs_write16(GUEST_CS_SELECTOR, vmcs12->guest_cs_selector); @@ -9955,6 +10198,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs12->guest_intr_status); } + nested_ept_enabled = (exec_control & SECONDARY_EXEC_ENABLE_EPT) != 0; vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); } @@ -9968,6 +10212,15 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_constant_host_state(vmx); /* + * Set the MSR load/store lists to match L0's settings. 
+ */ + vmcs_write32(VM_EXIT_MSR_STORE_COUNT, 0); + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host)); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write64(VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest)); + + /* * HOST_RSP is normally set correctly in vmx_vcpu_run() just before * entry, but only if the current (host) sp changed from the value * we wrote last (vmx->host_rsp). This cache is no longer relevant @@ -10073,15 +10326,6 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) nested_ept_init_mmu_context(vcpu); } - if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) - vcpu->arch.efer = vmcs12->guest_ia32_efer; - else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) - vcpu->arch.efer |= (EFER_LMA | EFER_LME); - else - vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); - /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ - vmx_set_efer(vcpu, vcpu->arch.efer); - /* * This sets GUEST_CR0 to vmcs12->guest_cr0, with possibly a modified * TS bit (for lazy fpu) and bits which we consider mandatory enabled. @@ -10096,8 +10340,20 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmx_set_cr4(vcpu, vmcs12->guest_cr4); vmcs_writel(CR4_READ_SHADOW, nested_read_cr4(vmcs12)); - /* shadow page tables on either EPT or shadow page tables */ - kvm_set_cr3(vcpu, vmcs12->guest_cr3); + if (vmcs12->vm_entry_controls & VM_ENTRY_LOAD_IA32_EFER) + vcpu->arch.efer = vmcs12->guest_ia32_efer; + else if (vmcs12->vm_entry_controls & VM_ENTRY_IA32E_MODE) + vcpu->arch.efer |= (EFER_LMA | EFER_LME); + else + vcpu->arch.efer &= ~(EFER_LMA | EFER_LME); + /* Note: modifies VM_ENTRY/EXIT_CONTROLS and GUEST/HOST_IA32_EFER */ + vmx_set_efer(vcpu, vcpu->arch.efer); + + /* Shadow page tables on either EPT or shadow page tables. 
*/ + if (nested_vmx_load_cr3(vcpu, vmcs12->guest_cr3, nested_ept_enabled, + entry_failure_code)) + return 1; + kvm_mmu_reset_context(vcpu); if (!enable_ept) @@ -10115,6 +10371,7 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) kvm_register_write(vcpu, VCPU_REGS_RSP, vmcs12->guest_rsp); kvm_register_write(vcpu, VCPU_REGS_RIP, vmcs12->guest_rip); + return 0; } /* @@ -10129,12 +10386,14 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) struct loaded_vmcs *vmcs02; bool ia32e; u32 msr_entry_idx; + unsigned long exit_qualification; - if (!nested_vmx_check_permission(vcpu) || - !nested_vmx_check_vmcs12(vcpu)) + if (!nested_vmx_check_permission(vcpu)) return 1; - skip_emulated_instruction(vcpu); + if (!nested_vmx_check_vmcs12(vcpu)) + goto out; + vmcs12 = get_vmcs12(vcpu); if (enable_shadow_vmcs) @@ -10154,37 +10413,37 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) nested_vmx_failValid(vcpu, launch ? VMXERR_VMLAUNCH_NONCLEAR_VMCS : VMXERR_VMRESUME_NONLAUNCHED_VMCS); - return 1; + goto out; } if (vmcs12->guest_activity_state != GUEST_ACTIVITY_ACTIVE && vmcs12->guest_activity_state != GUEST_ACTIVITY_HLT) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!nested_get_vmcs12_pages(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_bitmap_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_apicv_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (nested_vmx_check_msr_switch_controls(vcpu, vmcs12)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } if (!vmx_control_verify(vmcs12->cpu_based_vm_exec_control, - vmx->nested.nested_vmx_true_procbased_ctls_low, + vmx->nested.nested_vmx_procbased_ctls_low, 
vmx->nested.nested_vmx_procbased_ctls_high) || !vmx_control_verify(vmcs12->secondary_vm_exec_control, vmx->nested.nested_vmx_secondary_ctls_low, @@ -10193,33 +10452,34 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx->nested.nested_vmx_pinbased_ctls_low, vmx->nested.nested_vmx_pinbased_ctls_high) || !vmx_control_verify(vmcs12->vm_exit_controls, - vmx->nested.nested_vmx_true_exit_ctls_low, + vmx->nested.nested_vmx_exit_ctls_low, vmx->nested.nested_vmx_exit_ctls_high) || !vmx_control_verify(vmcs12->vm_entry_controls, - vmx->nested.nested_vmx_true_entry_ctls_low, + vmx->nested.nested_vmx_entry_ctls_low, vmx->nested.nested_vmx_entry_ctls_high)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); - return 1; + goto out; } - if (((vmcs12->host_cr0 & VMXON_CR0_ALWAYSON) != VMXON_CR0_ALWAYSON) || - ((vmcs12->host_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_host_cr0_valid(vcpu, vmcs12->host_cr0) || + !nested_host_cr4_valid(vcpu, vmcs12->host_cr4) || + !nested_cr3_valid(vcpu, vmcs12->host_cr3)) { nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_HOST_STATE_FIELD); - return 1; + goto out; } - if (!nested_cr0_valid(vcpu, vmcs12->guest_cr0) || - ((vmcs12->guest_cr4 & VMXON_CR4_ALWAYSON) != VMXON_CR4_ALWAYSON)) { + if (!nested_guest_cr0_valid(vcpu, vmcs12->guest_cr0) || + !nested_guest_cr4_valid(vcpu, vmcs12->guest_cr4)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } if (vmcs12->vmcs_link_pointer != -1ull) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_VMCS_LINK_PTR); - return 1; + goto out; } /* @@ -10239,7 +10499,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) ia32e != !!(vmcs12->guest_ia32_efer & EFER_LME))) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } } @@ -10257,7 +10517,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool 
launch) ia32e != !!(vmcs12->host_ia32_efer & EFER_LME)) { nested_vmx_entry_failure(vcpu, vmcs12, EXIT_REASON_INVALID_STATE, ENTRY_FAIL_DEFAULT); - return 1; + goto out; } } @@ -10270,6 +10530,12 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (!vmcs02) return -ENOMEM; + /* + * After this point, the trap flag no longer triggers a singlestep trap + * on the vm entry instructions. Don't call + * kvm_skip_emulated_instruction. + */ + skip_emulated_instruction(vcpu); enter_guest_mode(vcpu); if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS)) @@ -10284,7 +10550,13 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) vmx_segment_cache_clear(vmx); - prepare_vmcs02(vcpu, vmcs12); + if (prepare_vmcs02(vcpu, vmcs12, &exit_qualification)) { + leave_guest_mode(vcpu); + vmx_load_vmcs01(vcpu); + nested_vmx_entry_failure(vcpu, vmcs12, + EXIT_REASON_INVALID_STATE, exit_qualification); + return 1; + } msr_entry_idx = nested_vmx_load_msr(vcpu, vmcs12->vm_entry_msr_load_addr, @@ -10311,6 +10583,9 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) * the success flag) when L2 exits (see nested_vmx_vmexit()). */ return 1; + +out: + return kvm_skip_emulated_instruction(vcpu); } /* @@ -10616,6 +10891,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { struct kvm_segment seg; + unsigned long entry_failure_code; if (vmcs12->vm_exit_controls & VM_EXIT_LOAD_IA32_EFER) vcpu->arch.efer = vmcs12->host_ia32_efer; @@ -10653,8 +10929,12 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, nested_ept_uninit_mmu_context(vcpu); - kvm_set_cr3(vcpu, vmcs12->host_cr3); - kvm_mmu_reset_context(vcpu); + /* + * Only PDPTE load can fail as the value of cr3 was checked on entry and + * couldn't have changed. 
+ */ + if (nested_vmx_load_cr3(vcpu, vmcs12->host_cr3, false, &entry_failure_code)) + nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_PDPTE_FAIL); if (!enable_ept) vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; @@ -10755,6 +11035,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, { struct vcpu_vmx *vmx = to_vmx(vcpu); struct vmcs12 *vmcs12 = get_vmcs12(vcpu); + u32 vm_inst_error = 0; /* trying to cancel vmlaunch/vmresume is a bug */ WARN_ON_ONCE(vmx->nested.nested_run_pending); @@ -10767,6 +11048,9 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, vmcs12->vm_exit_msr_store_count)) nested_vmx_abort(vcpu, VMX_ABORT_SAVE_GUEST_MSR_FAIL); + if (unlikely(vmx->fail)) + vm_inst_error = vmcs_read32(VM_INSTRUCTION_ERROR); + vmx_load_vmcs01(vcpu); if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT) @@ -10795,6 +11079,8 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, load_vmcs12_host_state(vcpu, vmcs12); /* Update any VMCS fields that might have changed while L2 ran */ + vmcs_write32(VM_EXIT_MSR_LOAD_COUNT, vmx->msr_autoload.nr); + vmcs_write32(VM_ENTRY_MSR_LOAD_COUNT, vmx->msr_autoload.nr); vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset); if (vmx->hv_deadline_tsc == -1) vmcs_clear_bits(PIN_BASED_VM_EXEC_CONTROL, @@ -10843,7 +11129,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason, */ if (unlikely(vmx->fail)) { vmx->fail = 0; - nested_vmx_failValid(vcpu, vmcs_read32(VM_INSTRUCTION_ERROR)); + nested_vmx_failValid(vcpu, vm_inst_error); } else nested_vmx_succeed(vcpu); if (enable_shadow_vmcs) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 04c5d96b1d67..1f0d2383f5ee 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -434,12 +434,14 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr) } EXPORT_SYMBOL_GPL(kvm_requeue_exception); -void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err) +int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int 
err) { if (err) kvm_inject_gp(vcpu, 0); else - kvm_x86_ops->skip_emulated_instruction(vcpu); + return kvm_skip_emulated_instruction(vcpu); + + return 1; } EXPORT_SYMBOL_GPL(kvm_complete_insn_gp); @@ -573,7 +575,7 @@ out: } EXPORT_SYMBOL_GPL(load_pdptrs); -static bool pdptrs_changed(struct kvm_vcpu *vcpu) +bool pdptrs_changed(struct kvm_vcpu *vcpu) { u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)]; bool changed = true; @@ -599,6 +601,7 @@ out: return changed; } +EXPORT_SYMBOL_GPL(pdptrs_changed); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { @@ -2071,6 +2074,8 @@ static void record_steal_time(struct kvm_vcpu *vcpu) &vcpu->arch.st.steal, sizeof(struct kvm_steal_time)))) return; + vcpu->arch.st.steal.preempted = 0; + if (vcpu->arch.st.steal.version & 1) vcpu->arch.st.steal.version += 1; /* first time write, random junk */ @@ -2176,7 +2181,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_KVM_SYSTEM_TIME_NEW: case MSR_KVM_SYSTEM_TIME: { - u64 gpa_offset; struct kvm_arch *ka = &vcpu->kvm->arch; kvmclock_reset(vcpu); @@ -2198,8 +2202,6 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!(data & 1)) break; - gpa_offset = data & ~(PAGE_MASK | 1); - if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.pv_time, data & ~1ULL, sizeof(struct pvclock_vcpu_time_info))) @@ -2294,7 +2296,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr)) return kvm_pmu_set_msr(vcpu, msr_info); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", + vcpu_debug_ratelimited(vcpu, "unhandled wrmsr: 0x%x data 0x%llx\n", msr, data); return 1; } else { @@ -2506,7 +2508,8 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (kvm_pmu_is_valid_msr(vcpu, msr_info->index)) return kvm_pmu_get_msr(vcpu, msr_info->index, &msr_info->data); if (!ignore_msrs) { - vcpu_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", 
msr_info->index); + vcpu_debug_ratelimited(vcpu, "unhandled rdmsr: 0x%x\n", + msr_info->index); return 1; } else { vcpu_unimpl(vcpu, "ignored rdmsr: 0x%x\n", msr_info->index); @@ -2810,7 +2813,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } if (kvm_lapic_hv_timer_in_use(vcpu) && kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_get_lapic_target_expiration_tsc(vcpu))) kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update @@ -2826,8 +2829,22 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); } +static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) +{ + if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) + return; + + vcpu->arch.st.steal.preempted = 1; + + kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.st.stime, + &vcpu->arch.st.steal.preempted, + offsetof(struct kvm_steal_time, preempted), + sizeof(vcpu->arch.st.steal.preempted)); +} + void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { + kvm_steal_time_set_preempted(vcpu); kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); @@ -4816,7 +4833,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } -int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) +static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) { if (!need_emulate_wbinvd(vcpu)) return X86EMUL_CONTINUE; @@ -4836,8 +4853,8 @@ int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_emulate_wbinvd_noskip(vcpu); + kvm_emulate_wbinvd_noskip(vcpu); + return kvm_skip_emulated_instruction(vcpu); } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); @@ -5081,11 +5098,6 @@ static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) { preempt_disable(); kvm_load_guest_fpu(emul_to_vcpu(ctxt)); - /* - * CR0.TS may 
reference the host fpu state, not the guest fpu state, - * so it may be clear at this point. - */ - clts(); } static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) @@ -5440,7 +5452,6 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag kvm_run->exit_reason = KVM_EXIT_DEBUG; *r = EMULATE_USER_EXIT; } else { - vcpu->arch.emulate_ctxt.eflags &= ~X86_EFLAGS_TF; /* * "Certain debug exceptions may clear bit 0-3. The * remaining contents of the DR6 register are never @@ -5453,6 +5464,17 @@ static void kvm_vcpu_check_singlestep(struct kvm_vcpu *vcpu, unsigned long rflag } } +int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) +{ + unsigned long rflags = kvm_x86_ops->get_rflags(vcpu); + int r = EMULATE_DONE; + + kvm_x86_ops->skip_emulated_instruction(vcpu); + kvm_vcpu_check_singlestep(vcpu, rflags, &r); + return r == EMULATE_DONE; +} +EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); + static bool kvm_vcpu_check_breakpoint(struct kvm_vcpu *vcpu, int *r) { if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && @@ -5638,6 +5660,49 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) } EXPORT_SYMBOL_GPL(kvm_fast_pio_out); +static int complete_fast_pio_in(struct kvm_vcpu *vcpu) +{ + unsigned long val; + + /* We should only ever be called with arch.pio.count equal to 1 */ + BUG_ON(vcpu->arch.pio.count != 1); + + /* For size less than 4 we merge, else we zero extend */ + val = (vcpu->arch.pio.size < 4) ? 
kvm_register_read(vcpu, VCPU_REGS_RAX) + : 0; + + /* + * Since vcpu->arch.pio.count == 1 let emulator_pio_in_emulated perform + * the copy and tracing + */ + emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, vcpu->arch.pio.size, + vcpu->arch.pio.port, &val, 1); + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + + return 1; +} + +int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port) +{ + unsigned long val; + int ret; + + /* For size less than 4 we merge, else we zero extend */ + val = (size < 4) ? kvm_register_read(vcpu, VCPU_REGS_RAX) : 0; + + ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size, port, + &val, 1); + if (ret) { + kvm_register_write(vcpu, VCPU_REGS_RAX, val); + return ret; + } + + vcpu->arch.complete_userspace_io = complete_fast_pio_in; + + return 0; +} +EXPORT_SYMBOL_GPL(kvm_fast_pio_in); + static int kvmclock_cpu_down_prep(unsigned int cpu) { __this_cpu_write(cpu_tsc_khz, 0); @@ -5987,8 +6052,12 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_halt); int kvm_emulate_halt(struct kvm_vcpu *vcpu) { - kvm_x86_ops->skip_emulated_instruction(vcpu); - return kvm_vcpu_halt(vcpu); + int ret = kvm_skip_emulated_instruction(vcpu); + /* + * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered + * KVM_EXIT_DEBUG here. 
+ */ + return kvm_vcpu_halt(vcpu) && ret; } EXPORT_SYMBOL_GPL(kvm_emulate_halt); @@ -6019,9 +6088,9 @@ void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu) int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) { unsigned long nr, a0, a1, a2, a3, ret; - int op_64_bit, r = 1; + int op_64_bit, r; - kvm_x86_ops->skip_emulated_instruction(vcpu); + r = kvm_skip_emulated_instruction(vcpu); if (kvm_hv_hypercall_enabled(vcpu->kvm)) return kvm_hv_hypercall(vcpu); @@ -7407,25 +7476,13 @@ void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) { - if (!vcpu->guest_fpu_loaded) { - vcpu->fpu_counter = 0; + if (!vcpu->guest_fpu_loaded) return; - } vcpu->guest_fpu_loaded = 0; copy_fpregs_to_fpstate(&vcpu->arch.guest_fpu); __kernel_fpu_end(); ++vcpu->stat.fpu_reload; - /* - * If using eager FPU mode, or if the guest is a frequent user - * of the FPU, just leave the FPU active for next time. - * Every 255 times fpu_counter rolls over to 0; a guest that uses - * the FPU in bursts will revert to loading it on demand. - */ - if (!use_eager_fpu()) { - if (++vcpu->fpu_counter < 5) - kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu); - } trace_kvm_fpu(0); } @@ -8176,7 +8233,7 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm) void kvm_arch_flush_shadow_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) { - kvm_mmu_invalidate_zap_all_pages(kvm); + kvm_page_track_flush_slot(kvm, slot); } static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 25da5bc8d83d..4ca0d78adcf0 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -497,38 +497,24 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * a whole series of functions like read_cr0() and write_cr0(). * * We start with cr0. cr0 allows you to turn on and off all kinds of basic - * features, but Linux only really cares about one: the horrifically-named Task - * Switched (TS) bit at bit 3 (ie. 
8) + * features, but the only cr0 bit that Linux ever used at runtime was the + * horrifically-named Task Switched (TS) bit at bit 3 (ie. 8) * * What does the TS bit do? Well, it causes the CPU to trap (interrupt 7) if * the floating point unit is used. Which allows us to restore FPU state - * lazily after a task switch, and Linux uses that gratefully, but wouldn't a - * name like "FPUTRAP bit" be a little less cryptic? + * lazily after a task switch if we wanted to, but wouldn't a name like + * "FPUTRAP bit" be a little less cryptic? * - * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. + * Fortunately, Linux keeps it simple and doesn't use TS, so we can ignore + * cr0. */ -static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { - lazy_hcall1(LHCALL_TS, val & X86_CR0_TS); - current_cr0 = val; } static unsigned long lguest_read_cr0(void) { - return current_cr0; -} - -/* - * Intel provided a special instruction to clear the TS bit for people too cool - * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. 
- */ -static void lguest_clts(void) -{ - lazy_hcall1(LHCALL_TS, 0); - current_cr0 &= ~X86_CR0_TS; + return 0; } /* @@ -1432,7 +1418,6 @@ __init void lguest_init(void) pv_cpu_ops.load_tls = lguest_load_tls; pv_cpu_ops.get_debugreg = lguest_get_debugreg; pv_cpu_ops.set_debugreg = lguest_set_debugreg; - pv_cpu_ops.clts = lguest_clts; pv_cpu_ops.read_cr0 = lguest_read_cr0; pv_cpu_ops.write_cr0 = lguest_write_cr0; pv_cpu_ops.read_cr4 = lguest_read_cr4; diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index d376e4b48f88..c5959576c315 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,53 +16,6 @@ #include <asm/smap.h> #include <asm/export.h> -/* Standard copy_to_user with segment limit checking */ -ENTRY(_copy_to_user) - mov PER_CPU_VAR(current_task), %rax - movq %rdi,%rcx - addq %rdx,%rcx - jc bad_to_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_to_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_to_user) -EXPORT_SYMBOL(_copy_to_user) - -/* Standard copy_from_user with segment limit checking */ -ENTRY(_copy_from_user) - mov PER_CPU_VAR(current_task), %rax - movq %rsi,%rcx - addq %rdx,%rcx - jc bad_from_user - cmpq TASK_addr_limit(%rax),%rcx - ja bad_from_user - ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ - "jmp copy_user_generic_string", \ - X86_FEATURE_REP_GOOD, \ - "jmp copy_user_enhanced_fast_string", \ - X86_FEATURE_ERMS -ENDPROC(_copy_from_user) -EXPORT_SYMBOL(_copy_from_user) - - - .section .fixup,"ax" - /* must zero dest */ -ENTRY(bad_from_user) -bad_from_user: - movl %edx,%ecx - xorl %eax,%eax - rep - stosb -bad_to_user: - movl %edx,%eax - ret -ENDPROC(bad_from_user) - .previous - /* * copy_user_generic_unrolled - memory copy with exception handling. 
* This version is for CPUs like P4 that don't have efficient micro diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c index d1dee753b949..07764255b611 100644 --- a/arch/x86/lib/msr.c +++ b/arch/x86/lib/msr.c @@ -113,14 +113,14 @@ int msr_clear_bit(u32 msr, u8 bit) } #ifdef CONFIG_TRACEPOINTS -void do_trace_write_msr(unsigned msr, u64 val, int failed) +void do_trace_write_msr(unsigned int msr, u64 val, int failed) { trace_write_msr(msr, val, failed); } EXPORT_SYMBOL(do_trace_write_msr); EXPORT_TRACEPOINT_SYMBOL(write_msr); -void do_trace_read_msr(unsigned msr, u64 val, int failed) +void do_trace_read_msr(unsigned int msr, u64 val, int failed) { trace_read_msr(msr, val, failed); } diff --git a/arch/x86/lib/usercopy.c b/arch/x86/lib/usercopy.c index b4908789484e..c074799bddae 100644 --- a/arch/x86/lib/usercopy.c +++ b/arch/x86/lib/usercopy.c @@ -34,3 +34,52 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n) return ret; } EXPORT_SYMBOL_GPL(copy_from_user_nmi); + +/** + * copy_to_user: - Copy a block of data into user space. + * @to: Destination address, in user space. + * @from: Source address, in kernel space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from kernel space to user space. + * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + */ +unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) +{ + if (access_ok(VERIFY_WRITE, to, n)) + n = __copy_to_user(to, from, n); + return n; +} +EXPORT_SYMBOL(_copy_to_user); + +/** + * copy_from_user: - Copy a block of data from user space. + * @to: Destination address, in kernel space. + * @from: Source address, in user space. + * @n: Number of bytes to copy. + * + * Context: User context only. This function may sleep if pagefaults are + * enabled. + * + * Copy data from user space to kernel space. 
+ * + * Returns number of bytes that could not be copied. + * On success, this will be zero. + * + * If some data could not be copied, this function will pad the copied + * data to the requested size using zero bytes. + */ +unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) +{ + if (access_ok(VERIFY_READ, from, n)) + n = __copy_from_user(to, from, n); + else + memset(to, 0, n); + return n; +} +EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 3bc7baf2a711..0b281217c890 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -640,52 +640,3 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr return n; } EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); - -/** - * copy_to_user: - Copy a block of data into user space. - * @to: Destination address, in user space. - * @from: Source address, in kernel space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from kernel space to user space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - */ -unsigned long _copy_to_user(void __user *to, const void *from, unsigned n) -{ - if (access_ok(VERIFY_WRITE, to, n)) - n = __copy_to_user(to, from, n); - return n; -} -EXPORT_SYMBOL(_copy_to_user); - -/** - * copy_from_user: - Copy a block of data from user space. - * @to: Destination address, in kernel space. - * @from: Source address, in user space. - * @n: Number of bytes to copy. - * - * Context: User context only. This function may sleep if pagefaults are - * enabled. - * - * Copy data from user space to kernel space. - * - * Returns number of bytes that could not be copied. - * On success, this will be zero. - * - * If some data could not be copied, this function will pad the copied - * data to the requested size using zero bytes. 
- */ -unsigned long _copy_from_user(void *to, const void __user *from, unsigned n) -{ - if (access_ok(VERIFY_READ, from, n)) - n = __copy_from_user(to, from, n); - else - memset(to, 0, n); - return n; -} -EXPORT_SYMBOL(_copy_from_user); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9f72ca3b2669..17c55a536fdd 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -679,8 +679,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(KERN_CONT "paging request"); printk(KERN_CONT " at %p\n", (void *) address); - printk(KERN_ALERT "IP:"); - printk_address(regs->ip); + printk(KERN_ALERT "IP: %pS\n", (void *)regs->ip); dump_pagetable(address); } diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 83e701f160a9..efc32bc6862b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -986,20 +986,17 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, return 0; } -int track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, - pfn_t pfn) +void track_pfn_insert(struct vm_area_struct *vma, pgprot_t *prot, pfn_t pfn) { enum page_cache_mode pcm; if (!pat_enabled()) - return 0; + return; /* Set prot based on lookup */ pcm = lookup_memtype(pfn_t_to_phys(pfn)); *prot = __pgprot((pgprot_val(*prot) & (~_PAGE_CACHE_MASK)) | cachemode2protval(pcm)); - - return 0; } /* diff --git a/arch/x86/mm/pkeys.c b/arch/x86/mm/pkeys.c index f88ce0e5efd9..2dab69a706ec 100644 --- a/arch/x86/mm/pkeys.c +++ b/arch/x86/mm/pkeys.c @@ -141,8 +141,7 @@ u32 init_pkru_value = PKRU_AD_KEY( 1) | PKRU_AD_KEY( 2) | PKRU_AD_KEY( 3) | * Called from the FPU code when creating a fresh set of FPU * registers. This is called from a very specific context where * we know the FPU regstiers are safe for use and we can use PKRU - * directly. The fact that PKRU is only available when we are - * using eagerfpu mode makes this possible. + * directly. 
*/ void copy_init_pkru_to_fpregs(void) { diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index fe04a04dab8e..e76d1af60f7a 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -853,7 +853,7 @@ xadd: if (is_imm8(insn->off)) func = (u8 *) __bpf_call_base + imm32; jmp_offset = func - (image + addrs[i]); if (seen_ld_abs) { - reload_skb_data = bpf_helper_changes_skb_data(func); + reload_skb_data = bpf_helper_changes_pkt_data(func); if (reload_skb_data) { EMIT1(0x57); /* push %rdi */ jmp_offset += 22; /* pop, mov, sub, mov */ diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 28c04123b6dd..ffdbc4836b4f 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -339,10 +339,11 @@ fail: return 0; } -static void nmi_cpu_setup(void *dummy) +static void nmi_cpu_setup(void) { int cpu = smp_processor_id(); struct op_msrs *msrs = &per_cpu(cpu_msrs, cpu); + nmi_cpu_save_registers(msrs); raw_spin_lock(&oprofilefs_lock); model->setup_ctrs(model, msrs); @@ -369,7 +370,7 @@ static void nmi_cpu_restore_registers(struct op_msrs *msrs) } } -static void nmi_cpu_shutdown(void *dummy) +static void nmi_cpu_shutdown(void) { unsigned int v; int cpu = smp_processor_id(); @@ -387,20 +388,26 @@ static void nmi_cpu_shutdown(void *dummy) nmi_cpu_restore_registers(msrs); } -static void nmi_cpu_up(void *dummy) +static int nmi_cpu_online(unsigned int cpu) { + local_irq_disable(); if (nmi_enabled) - nmi_cpu_setup(dummy); + nmi_cpu_setup(); if (ctr_running) - nmi_cpu_start(dummy); + nmi_cpu_start(NULL); + local_irq_enable(); + return 0; } -static void nmi_cpu_down(void *dummy) +static int nmi_cpu_down_prep(unsigned int cpu) { + local_irq_disable(); if (ctr_running) - nmi_cpu_stop(dummy); + nmi_cpu_stop(NULL); if (nmi_enabled) - nmi_cpu_shutdown(dummy); + nmi_cpu_shutdown(); + local_irq_enable(); + return 0; } static int nmi_create_files(struct dentry *root) @@ -433,26 +440,7 @@ static int nmi_create_files(struct 
dentry *root) return 0; } -static int oprofile_cpu_notifier(struct notifier_block *b, unsigned long action, - void *data) -{ - int cpu = (unsigned long)data; - - switch (action & ~CPU_TASKS_FROZEN) { - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smp_call_function_single(cpu, nmi_cpu_up, NULL, 0); - break; - case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, nmi_cpu_down, NULL, 1); - break; - } - return NOTIFY_DONE; -} - -static struct notifier_block oprofile_cpu_nb = { - .notifier_call = oprofile_cpu_notifier -}; +static enum cpuhp_state cpuhp_nmi_online; static int nmi_setup(void) { @@ -495,20 +483,17 @@ static int nmi_setup(void) if (err) goto fail; - cpu_notifier_register_begin(); - - /* Use get/put_online_cpus() to protect 'nmi_enabled' */ - get_online_cpus(); nmi_enabled = 1; /* make nmi_enabled visible to the nmi handler: */ smp_mb(); - on_each_cpu(nmi_cpu_setup, NULL, 1); - __register_cpu_notifier(&oprofile_cpu_nb); - put_online_cpus(); - - cpu_notifier_register_done(); - + err = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/oprofile:online", + nmi_cpu_online, nmi_cpu_down_prep); + if (err < 0) + goto fail_nmi; + cpuhp_nmi_online = err; return 0; +fail_nmi: + unregister_nmi_handler(NMI_LOCAL, "oprofile"); fail: free_msrs(); return err; @@ -518,17 +503,9 @@ static void nmi_shutdown(void) { struct op_msrs *msrs; - cpu_notifier_register_begin(); - - /* Use get/put_online_cpus() to protect 'nmi_enabled' & 'ctr_running' */ - get_online_cpus(); - on_each_cpu(nmi_cpu_shutdown, NULL, 1); + cpuhp_remove_state(cpuhp_nmi_online); nmi_enabled = 0; ctr_running = 0; - __unregister_cpu_notifier(&oprofile_cpu_nb); - put_online_cpus(); - - cpu_notifier_register_done(); /* make variables visible to the nmi handler: */ smp_mb(); diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c index c20d2cc7ef64..ae387e5ee6f7 100644 --- a/arch/x86/pci/amd_bus.c +++ b/arch/x86/pci/amd_bus.c @@ -327,35 +327,18 @@ static int __init early_root_info_init(void) #define 
ENABLE_CF8_EXT_CFG (1ULL << 46) -static void enable_pci_io_ecs(void *unused) +static int amd_bus_cpu_online(unsigned int cpu) { u64 reg; + rdmsrl(MSR_AMD64_NB_CFG, reg); if (!(reg & ENABLE_CF8_EXT_CFG)) { reg |= ENABLE_CF8_EXT_CFG; wrmsrl(MSR_AMD64_NB_CFG, reg); } + return 0; } -static int amd_cpu_notify(struct notifier_block *self, unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - smp_call_function_single(cpu, enable_pci_io_ecs, NULL, 0); - break; - default: - break; - } - return NOTIFY_OK; -} - -static struct notifier_block amd_cpu_notifier = { - .notifier_call = amd_cpu_notify, -}; - static void __init pci_enable_pci_io_ecs(void) { #ifdef CONFIG_AMD_NB @@ -385,7 +368,7 @@ static void __init pci_enable_pci_io_ecs(void) static int __init pci_io_ecs_init(void) { - int cpu; + int ret; /* assume all cpus from fam10h have IO ECS */ if (boot_cpu_data.x86 < 0x10) @@ -395,12 +378,9 @@ static int __init pci_io_ecs_init(void) if (early_pci_allowed()) pci_enable_pci_io_ecs(); - cpu_notifier_register_begin(); - for_each_online_cpu(cpu) - amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, - (void *)(long)cpu); - __register_cpu_notifier(&amd_cpu_notifier); - cpu_notifier_register_done(); + ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "pci/amd_bus:online", + amd_bus_cpu_online, NULL); + WARN_ON(ret < 0); pci_probe |= PCI_HAS_IO_ECS; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index bedfab98077a..e1fb269c87af 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -264,8 +264,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) return 0; error: - dev_err(&dev->dev, - "Xen PCI frontend has not registered MSI/MSI-X support!\n"); + dev_err(&dev->dev, "Failed to create MSI%s! ret=%d!\n", + type == PCI_CAP_ID_MSI ? 
"" : "-X", irq); return irq; } diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index b27bccd4390f..821cb41f00e6 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -89,7 +89,7 @@ static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value) } static void ce4100_serial_fixup(int port, struct uart_port *up, - unsigned short *capabilites) + u32 *capabilites) { #ifdef CONFIG_EARLY_PRINTK /* diff --git a/arch/x86/platform/intel-mid/pwr.c b/arch/x86/platform/intel-mid/pwr.c index 67375dda451c..ef03852ea6e8 100644 --- a/arch/x86/platform/intel-mid/pwr.c +++ b/arch/x86/platform/intel-mid/pwr.c @@ -270,7 +270,6 @@ int intel_mid_pci_set_power_state(struct pci_dev *pdev, pci_power_t state) return 0; } -EXPORT_SYMBOL_GPL(intel_mid_pci_set_power_state); pci_power_t intel_mid_pci_get_power_state(struct pci_dev *pdev) { diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 55130846ac87..c0533fbc39e3 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -196,6 +196,7 @@ static int xo15_sci_remove(struct acpi_device *device) return 0; } +#ifdef CONFIG_PM_SLEEP static int xo15_sci_resume(struct device *dev) { /* Enable all EC events */ @@ -207,6 +208,7 @@ static int xo15_sci_resume(struct device *dev) return 0; } +#endif static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume); diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index 9e42842e924a..766d4d3529a1 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -19,7 +19,6 @@ #include <asm/uv/uv_hub.h> #include <asm/uv/uv_bau.h> #include <asm/apic.h> -#include <asm/idle.h> #include <asm/tsc.h> #include <asm/irq_vectors.h> #include <asm/timer.h> diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index cd5173a2733f..8410e7d0a5b5 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ 
b/arch/x86/platform/uv/uv_nmi.c @@ -387,8 +387,8 @@ static void uv_nmi_dump_cpu_ip_hdr(void) /* Dump Instruction Pointer info */ static void uv_nmi_dump_cpu_ip(int cpu, struct pt_regs *regs) { - pr_info("UV: %4d %6d %-32.32s ", cpu, current->pid, current->comm); - printk_address(regs->ip); + pr_info("UV: %4d %6d %-32.32s %pS", + cpu, current->pid, current->comm, (void *)regs->ip); } /* diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c index 9634557a5444..ded2e8272382 100644 --- a/arch/x86/power/hibernate_64.c +++ b/arch/x86/power/hibernate_64.c @@ -11,6 +11,10 @@ #include <linux/gfp.h> #include <linux/smp.h> #include <linux/suspend.h> +#include <linux/scatterlist.h> +#include <linux/kdebug.h> + +#include <crypto/hash.h> #include <asm/init.h> #include <asm/proto.h> @@ -177,14 +181,86 @@ int pfn_is_nosave(unsigned long pfn) return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } +#define MD5_DIGEST_SIZE 16 + struct restore_data_record { unsigned long jump_address; unsigned long jump_address_phys; unsigned long cr3; unsigned long magic; + u8 e820_digest[MD5_DIGEST_SIZE]; }; -#define RESTORE_MAGIC 0x123456789ABCDEF0UL +#define RESTORE_MAGIC 0x23456789ABCDEF01UL + +#if IS_BUILTIN(CONFIG_CRYPTO_MD5) +/** + * get_e820_md5 - calculate md5 according to given e820 map + * + * @map: the e820 map to be calculated + * @buf: the md5 result to be stored to + */ +static int get_e820_md5(struct e820map *map, void *buf) +{ + struct scatterlist sg; + struct crypto_ahash *tfm; + int size; + int ret = 0; + + tfm = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); + if (IS_ERR(tfm)) + return -ENOMEM; + + { + AHASH_REQUEST_ON_STACK(req, tfm); + size = offsetof(struct e820map, map) + + sizeof(struct e820entry) * map->nr_map; + ahash_request_set_tfm(req, tfm); + sg_init_one(&sg, (u8 *)map, size); + ahash_request_set_callback(req, 0, NULL, NULL); + ahash_request_set_crypt(req, &sg, buf, size); + + if (crypto_ahash_digest(req)) + ret = -EINVAL; + 
ahash_request_zero(req); + } + crypto_free_ahash(tfm); + + return ret; +} + +static void hibernation_e820_save(void *buf) +{ + get_e820_md5(e820_saved, buf); +} + +static bool hibernation_e820_mismatch(void *buf) +{ + int ret; + u8 result[MD5_DIGEST_SIZE]; + + memset(result, 0, MD5_DIGEST_SIZE); + /* If there is no digest in suspend kernel, let it go. */ + if (!memcmp(result, buf, MD5_DIGEST_SIZE)) + return false; + + ret = get_e820_md5(e820_saved, result); + if (ret) + return true; + + return memcmp(result, buf, MD5_DIGEST_SIZE) ? true : false; +} +#else +static void hibernation_e820_save(void *buf) +{ +} + +static bool hibernation_e820_mismatch(void *buf) +{ + /* If md5 is not builtin for restore kernel, let it go. */ + return false; +} +#endif /** * arch_hibernation_header_save - populate the architecture specific part @@ -201,6 +277,9 @@ int arch_hibernation_header_save(void *addr, unsigned int max_size) rdr->jump_address_phys = __pa_symbol(&restore_registers); rdr->cr3 = restore_cr3; rdr->magic = RESTORE_MAGIC; + + hibernation_e820_save(rdr->e820_digest); + return 0; } @@ -216,5 +295,16 @@ int arch_hibernation_header_restore(void *addr) restore_jump_address = rdr->jump_address; jump_address_phys = rdr->jump_address_phys; restore_cr3 = rdr->cr3; - return (rdr->magic == RESTORE_MAGIC) ? 
0 : -EINVAL; + + if (rdr->magic != RESTORE_MAGIC) { + pr_crit("Unrecognized hibernate image header format!\n"); + return -EINVAL; + } + + if (hibernation_e820_mismatch(rdr->e820_digest)) { + pr_crit("Hibernate inconsistent memory map detected!\n"); + return -ENODEV; + } + + return 0; } diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 1ac76479c266..8730c2882fff 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -275,6 +275,8 @@ static void do_inject(void) unsigned int cpu = i_mce.extcpu; u8 b = i_mce.bank; + rdtscll(i_mce.tsc); + if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 25012abc3409..4463fa72db94 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -69,7 +69,7 @@ $(obj)/realmode.relocs: $(obj)/realmode.elf FORCE # --------------------------------------------------------------------------- -KBUILD_CFLAGS := $(LINUXINCLUDE) $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ +KBUILD_CFLAGS := $(REALMODE_CFLAGS) -D_SETUP -D_WAKEUP \ -I$(srctree)/arch/x86/boot KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ GCOV_PROFILE := n diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c index ba70ff232917..1972565ab106 100644 --- a/arch/x86/tools/insn_sanity.c +++ b/arch/x86/tools/insn_sanity.c @@ -269,7 +269,8 @@ int main(int argc, char **argv) insns++; } - fprintf(stdout, "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", + fprintf((errors) ? stderr : stdout, + "%s: %s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", prog, (errors) ? "Failure" : "Success", insns, diff --git a/arch/x86/tools/relocs.h b/arch/x86/tools/relocs.h index f59590645b68..1d23bf953a4a 100644 --- a/arch/x86/tools/relocs.h +++ b/arch/x86/tools/relocs.h @@ -16,7 +16,7 @@ #include <regex.h> #include <tools/le_byteshift.h> -void die(char *fmt, ...); +void die(char *fmt, ...) 
__attribute__((noreturn)); #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index 56f04db0c9c0..ecf31e0358c8 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -167,7 +167,7 @@ int main(int argc, char **argv) fprintf(stderr, "Warning: decoded and checked %d" " instructions with %d warnings\n", insns, warnings); else - fprintf(stderr, "Succeed: decoded and checked %d" + fprintf(stdout, "Success: decoded and checked %d" " instructions\n", insns); return 0; } diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 233ee09c1ce8..c77db2288982 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -26,7 +26,6 @@ static inline void rep_nop(void) } #define cpu_relax() rep_nop() -#define cpu_relax_lowlatency() cpu_relax() #define task_pt_regs(t) (&(t)->thread.regs) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index bdd855685403..ced7027b3fbc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -980,17 +980,6 @@ static void xen_io_delay(void) { } -static void xen_clts(void) -{ - struct multicall_space mcs; - - mcs = xen_mc_entry(0); - - MULTI_fpu_taskswitch(mcs.mc, 0); - - xen_mc_issue(PARAVIRT_LAZY_CPU); -} - static DEFINE_PER_CPU(unsigned long, xen_cr0_value); static unsigned long xen_read_cr0(void) @@ -1233,8 +1222,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .set_debugreg = xen_set_debugreg, .get_debugreg = xen_get_debugreg, - .clts = xen_clts, - .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index 0e98e5d241d0..a9fafb5c8738 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -19,7 +19,6 @@ int xen_swiotlb __read_mostly; static struct dma_map_ops xen_swiotlb_dma_ops = { - .mapping_error = xen_swiotlb_dma_mapping_error, .alloc = 
xen_swiotlb_alloc_coherent, .free = xen_swiotlb_free_coherent, .sync_single_for_cpu = xen_swiotlb_sync_single_for_cpu, diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index f8960fca0827..8c394e30e5fe 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -41,7 +41,7 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata; unsigned long xen_released_pages; /* E820 map used during setting up memory. */ -static struct e820entry xen_e820_map[E820MAX] __initdata; +static struct e820entry xen_e820_map[E820_X_MAX] __initdata; static u32 xen_e820_map_entries __initdata; /* @@ -750,7 +750,7 @@ char * __init xen_memory_setup(void) max_pfn = min(max_pfn, xen_start_info->nr_pages); mem_end = PFN_PHYS(max_pfn); - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); op = xen_initial_domain() ? @@ -923,7 +923,7 @@ char * __init xen_auto_xlated_memory_setup(void) int i; int rc; - memmap.nr_entries = E820MAX; + memmap.nr_entries = ARRAY_SIZE(xen_e820_map); set_xen_guest_handle(memmap.buffer, xen_e820_map); rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 3d6e0064cbfc..e8a9ea7d7a21 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -114,6 +114,7 @@ void xen_uninit_lock_cpu(int cpu) per_cpu(irq_name, cpu) = NULL; } +PV_CALLEE_SAVE_REGS_THUNK(xen_vcpu_stolen); /* * Our init of PV spinlocks is split in two init functions due to us @@ -137,6 +138,7 @@ void __init xen_init_spinlocks(void) pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = xen_qlock_wait; pv_lock_ops.kick = xen_qlock_kick; + pv_lock_ops.vcpu_is_preempted = PV_CALLEE_SAVE(xen_vcpu_stolen); } /* diff --git a/arch/xtensa/include/asm/mutex.h b/arch/xtensa/include/asm/mutex.h deleted file mode 100644 index 458c1f7fbc18..000000000000 --- a/arch/xtensa/include/asm/mutex.h +++ 
/dev/null @@ -1,9 +0,0 @@ -/* - * Pull in the generic implementation for the mutex fastpath. - * - * TODO: implement optimized primitives instead, or leave the generic - * implementation in place, or pick the atomic_xchg() based generic - * implementation. (see asm-generic/mutex-xchg.h for details) - */ - -#include <asm-generic/mutex-dec.h> diff --git a/arch/xtensa/include/asm/processor.h b/arch/xtensa/include/asm/processor.h index b42d68bfe3cf..86ffcd68e496 100644 --- a/arch/xtensa/include/asm/processor.h +++ b/arch/xtensa/include/asm/processor.h @@ -206,7 +206,6 @@ extern unsigned long get_wchan(struct task_struct *p); #define KSTK_ESP(tsk) (task_pt_regs(tsk)->areg[1]) #define cpu_relax() barrier() -#define cpu_relax_lowlatency() cpu_relax() /* Special register access. */ diff --git a/arch/xtensa/include/uapi/asm/socket.h b/arch/xtensa/include/uapi/asm/socket.h index 81435d995e11..9fdbe1fe0473 100644 --- a/arch/xtensa/include/uapi/asm/socket.h +++ b/arch/xtensa/include/uapi/asm/socket.h @@ -101,4 +101,6 @@ #define SO_CNX_ADVICE 53 +#define SCM_TIMESTAMPING_OPT_STATS 54 + #endif /* _XTENSA_SOCKET_H */ |