144 files changed, 4502 insertions(+), 1832 deletions(-)
diff --git a/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt b/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt index c41b2187eaa8..dc9bb3182525 100644 --- a/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt +++ b/Documentation/devicetree/bindings/powerpc/fsl/l2cache.txt @@ -5,8 +5,46 @@ The cache bindings explained below are ePAPR compliant Required Properties: -- compatible : Should include "fsl,chip-l2-cache-controller" and "cache" - where chip is the processor (bsc9132, npc8572 etc.) +- compatible : Should include one of the following: + "fsl,8540-l2-cache-controller" + "fsl,8541-l2-cache-controller" + "fsl,8544-l2-cache-controller" + "fsl,8548-l2-cache-controller" + "fsl,8555-l2-cache-controller" + "fsl,8568-l2-cache-controller" + "fsl,b4420-l2-cache-controller" + "fsl,b4860-l2-cache-controller" + "fsl,bsc9131-l2-cache-controller" + "fsl,bsc9132-l2-cache-controller" + "fsl,c293-l2-cache-controller" + "fsl,mpc8536-l2-cache-controller" + "fsl,mpc8540-l2-cache-controller" + "fsl,mpc8541-l2-cache-controller" + "fsl,mpc8544-l2-cache-controller" + "fsl,mpc8548-l2-cache-controller" + "fsl,mpc8555-l2-cache-controller" + "fsl,mpc8560-l2-cache-controller" + "fsl,mpc8568-l2-cache-controller" + "fsl,mpc8569-l2-cache-controller" + "fsl,mpc8572-l2-cache-controller" + "fsl,p1010-l2-cache-controller" + "fsl,p1011-l2-cache-controller" + "fsl,p1012-l2-cache-controller" + "fsl,p1013-l2-cache-controller" + "fsl,p1014-l2-cache-controller" + "fsl,p1015-l2-cache-controller" + "fsl,p1016-l2-cache-controller" + "fsl,p1020-l2-cache-controller" + "fsl,p1021-l2-cache-controller" + "fsl,p1022-l2-cache-controller" + "fsl,p1023-l2-cache-controller" + "fsl,p1024-l2-cache-controller" + "fsl,p1025-l2-cache-controller" + "fsl,p2010-l2-cache-controller" + "fsl,p2020-l2-cache-controller" + "fsl,t2080-l2-cache-controller" + "fsl,t4240-l2-cache-controller" + and "cache". - reg : Address and size of L2 cache controller registers - cache-size : Size of the entire L2 cache - interrupts : Error interrupt of L2 controller diff --git a/Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt b/Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt new file mode 100644 index 000000000000..9d619e955576 --- /dev/null +++ b/Documentation/devicetree/bindings/powerpc/opal/power-mgt.txt @@ -0,0 +1,118 @@ +IBM Power-Management Bindings +============================= + +Linux running on bare-metal POWER machines has access to the processor +idle states. The description of these idle states is exposed via the +node @power-mgt in the device-tree by the firmware. + +Definitions: +---------------- +Typically each idle state has the following associated properties: + +- name: The name of the idle state as defined by the firmware. + +- flags: Indicates some aspects of this idle state, such as the + extent of state-loss and whether the timebase is stopped in + this idle state. The flag bits are listed under + "ibm,cpu-idle-state-flags" below. + +- exit-latency: The latency involved in transitioning the state of the + CPU from idle to running. + +- target-residency: The minimum time that the CPU needs to reside in + this idle state in order to accrue a power-savings + benefit. + +Properties +---------------- +The following properties provide details about the idle states. These +properties are exposed as arrays. Each entry in the property array +provides the value of that property for the idle state associated with +the array index of that entry.
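For illustration, a minimal sketch of how a consumer could walk these parallel arrays with the generic Linux OF helpers is shown below. It is not the kernel's actual idle-state setup code; the node path and the property subset read here are assumptions made for the example. The properties themselves are defined next.

	/*
	 * Hedged sketch: read two of the parallel idle-state arrays.
	 * Assumes the node lives at /ibm,opal/power-mgt (an assumption,
	 * not part of the binding).
	 */
	#include <linux/init.h>
	#include <linux/of.h>
	#include <linux/printk.h>
	#include <linux/slab.h>

	static int __init read_idle_states(void)
	{
		struct device_node *np;
		const char **names = NULL;
		u32 *flags = NULL;
		int i, n, rc = -ENODEV;

		np = of_find_node_by_path("/ibm,opal/power-mgt");
		if (!np)
			return -ENODEV;

		/* Every array has one entry per idle state; flags sizes them all. */
		n = of_property_count_u32_elems(np, "ibm,cpu-idle-state-flags");
		if (n <= 0)
			goto out;

		rc = -ENOMEM;
		names = kcalloc(n, sizeof(*names), GFP_KERNEL);
		flags = kcalloc(n, sizeof(*flags), GFP_KERNEL);
		if (!names || !flags)
			goto out_free;

		rc = -EINVAL;
		if (of_property_read_string_array(np, "ibm,cpu-idle-state-names",
						  names, n) < 0 ||
		    of_property_read_u32_array(np, "ibm,cpu-idle-state-flags",
					       flags, n))
			goto out_free;

		/* Entry i of every array describes idle state i. */
		for (i = 0; i < n; i++)
			pr_info("idle state %s, flags 0x%x\n", names[i], flags[i]);
		rc = 0;
	out_free:
		kfree(names);
		kfree(flags);
	out:
		of_node_put(np);
		return rc;
	}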
+ +If idle-states are defined, then the properties +"ibm,cpu-idle-state-names" and "ibm,cpu-idle-state-flags" are +required. The other properties are required unless mentioned +otherwise. All the property arrays must have the same length. + +- ibm,cpu-idle-state-names: + Array of strings containing the names of the idle states. + +- ibm,cpu-idle-state-flags: + Array of unsigned 32-bit values containing the values of the + flags associated with the aforementioned idle-states. The + flag bits are as follows: + 0x00000001 /* Decrementer would stop */ + 0x00000002 /* Needs timebase restore */ + 0x00001000 /* Restore GPRs like nap */ + 0x00002000 /* Restore hypervisor resource from PACA pointer */ + 0x00004000 /* Program PORE to restore PACA pointer */ + 0x00010000 /* This is a nap state (POWER7,POWER8) */ + 0x00020000 /* This is a fast-sleep state (POWER8) */ + 0x00040000 /* This is a winkle state (POWER8) */ + 0x00080000 /* This is a fast-sleep state which requires a */ + /* software workaround for restoring the */ + /* timebase (POWER8) */ + 0x00100000 /* This is a fast stop state (POWER9) */ + 0x00200000 /* This is a deep-stop state (POWER9) */ + 0x00800000 /* This state uses the PMICR SPR (POWER8) */ + +- ibm,cpu-idle-state-latencies-ns: + Array of unsigned 32-bit values containing the values of the + exit-latencies (in ns) for the idle states in + ibm,cpu-idle-state-names. + +- ibm,cpu-idle-state-residency-ns: + Array of unsigned 32-bit values containing the values of the + target-residency (in ns) for the idle states in + ibm,cpu-idle-state-names. On POWER8 this is an optional + property. If the property is absent, the kernel defines the + target residencies of "Nap" and "FastSleep" as 10000 and + 300000000 respectively. On POWER9 this property is required. + +- ibm,cpu-idle-state-psscr: + Array of unsigned 64-bit values containing the values for the + PSSCR for each of the idle states in ibm,cpu-idle-state-names. + This property is required on POWER9 and absent on POWER8. + +- ibm,cpu-idle-state-psscr-mask: + Array of unsigned 64-bit values containing the masks + indicating which PSSCR fields are set in the corresponding + entries of ibm,cpu-idle-state-psscr. This property is + required on POWER9 and absent on POWER8. + + Whenever the firmware sets an entry in + ibm,cpu-idle-state-psscr-mask to 0xf, it implies that + only the Requested Level (RL) field of the corresponding entry + in ibm,cpu-idle-state-psscr should be considered by the + kernel. For such idle states, the kernel sets the + remaining fields of the PSSCR to the following sane default + values: + + - ESL and EC bits are set to 1, so wakeup from any stop + state will be at vector 0x100. + + - MTL and PSLL are set to the maximum allowed value as + per the ISA, i.e. 15. + + - The Transition Rate (TR) is set to the maximum value, + 3. + + For all other values of the entry in + ibm,cpu-idle-state-psscr-mask, the kernel expects all the + PSSCR fields of the corresponding entry in + ibm,cpu-idle-state-psscr to be correctly set by the firmware. + +- ibm,cpu-idle-state-pmicr: + Array of unsigned 64-bit values containing the PMICR values + for the idle states in ibm,cpu-idle-state-names. This 64-bit + value is written to the PMICR for the corresponding state if + the flags indicate that the PMICR SPR should be set. This + is an optional property on POWER8 and is absent on + POWER9.
+ +- ibm,cpu-idle-state-pmicr-mask: + Array of unsigned 64-bit values containing the masks indicating + which fields of the PMICR are set in the corresponding + entries in ibm,cpu-idle-state-pmicr. This is an optional + property on POWER8 and is absent on POWER9. diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 03145b7cafaa..4470671b0c26 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -3201,6 +3201,71 @@ struct kvm_reinject_control { pit_reinject = 0 (!reinject mode) is recommended, unless running an old operating system that uses the PIT for timing (e.g. Linux 2.4.x). +4.99 KVM_PPC_CONFIGURE_V3_MMU + +Capability: KVM_CAP_PPC_RADIX_MMU or KVM_CAP_PPC_HASH_MMU_V3 +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_mmuv3_cfg (in) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_mmuv3_cfg cannot be read, + -EINVAL if the configuration is invalid + +This ioctl controls whether the guest will use radix or HPT (hashed +page table) translation, and sets the pointer to the process table for +the guest. + +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; +}; + +There are two bits that can be set in flags: KVM_PPC_MMUV3_RADIX and +KVM_PPC_MMUV3_GTSE. KVM_PPC_MMUV3_RADIX, if set, configures the guest +to use radix tree translation, and if clear, to use HPT translation. +KVM_PPC_MMUV3_GTSE, if set and if KVM permits it, configures the guest +to be able to use the global TLB and SLB invalidation instructions; +if clear, the guest may not use these instructions. + +The process_table field specifies the address and size of the guest +process table, which is in the guest's address space. This field is +formatted as the second doubleword of the partition table entry, as +defined in the Power ISA V3.00, Book III section 5.7.6.1. + +4.100 KVM_PPC_GET_RMMU_INFO + +Capability: KVM_CAP_PPC_RADIX_MMU +Architectures: ppc +Type: vm ioctl +Parameters: struct kvm_ppc_rmmu_info (out) +Returns: 0 on success, + -EFAULT if struct kvm_ppc_rmmu_info cannot be written, + -EINVAL if no useful information can be returned + +This ioctl returns a structure containing two things: (a) a list of +supported radix tree geometries, and (b) a list of page sizes and the +values to put in the "AP" (actual page size) field of the tlbie +(TLB invalidate entry) instruction. + +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + +The geometries[] field gives up to 8 supported geometries for the +radix page table, in terms of the log base 2 of the smallest page +size, and the number of bits indexed at each level of the tree, from +the PTE level up to the PGD level in that order. Any unused entries +will have 0 in the page_shift field. + +The ap_encodings field gives the supported page sizes and their AP +field encodings, encoded with the AP value in the top 3 bits and the +log base 2 of the page size in the bottom 6 bits. + 5. The kvm_run structure ------------------------ @@ -3942,3 +4007,21 @@ In order to use SynIC, it has to be activated by setting this capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this will disable the use of APIC hardware virtualization even if supported by the CPU, as it's incompatible with SynIC auto-EOI behavior.
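The two PPC MMU ioctls above (4.99 and 4.100) are issued on a VM file descriptor. As a usage illustration only, and assuming a <linux/kvm.h> that carries the structures and flag bits quoted above, a userspace VMM might configure a radix guest and then query the supported geometries as follows; process_table is whatever partition-table doubleword the VMM has built, formatted as described for 4.99.

	/* Hedged userspace sketch, not part of the KVM documentation. */
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/kvm.h>

	static int configure_radix(int vm_fd, __u64 process_table)
	{
		struct kvm_ppc_mmuv3_cfg cfg = {
			/* Radix translation plus global TLB/SLB invalidates. */
			.flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
			.process_table = process_table,
		};
		struct kvm_ppc_rmmu_info info;
		int i;

		if (ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg) < 0)
			return -1;
		if (ioctl(vm_fd, KVM_PPC_GET_RMMU_INFO, &info) < 0)
			return -1;

		/* Unused geometry entries have page_shift == 0. */
		for (i = 0; i < 8; i++)
			if (info.geometries[i].page_shift)
				printf("radix geometry: smallest page shift %u\n",
				       info.geometries[i].page_shift);
		return 0;
	}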
+ +8.3 KVM_CAP_PPC_RADIX_MMU + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +radix MMU defined in Power ISA V3.00 (as implemented in the POWER9 +processor). + +8.4 KVM_CAP_PPC_HASH_MMU_V3 + +Architectures: ppc + +This capability, if KVM_CHECK_EXTENSION indicates that it is +available, means that the kernel can support guests using the +hashed page table MMU defined in Power ISA V3.00 (as implemented in +the POWER9 processor), including in-memory segment tables. diff --git a/arch/m68k/include/asm/macintosh.h b/arch/m68k/include/asm/macintosh.h index 42235e7fbeed..5b81ab188aa5 100644 --- a/arch/m68k/include/asm/macintosh.h +++ b/arch/m68k/include/asm/macintosh.h @@ -38,7 +38,7 @@ struct mac_model #define MAC_ADB_NONE 0 #define MAC_ADB_II 1 -#define MAC_ADB_IISI 2 +#define MAC_ADB_EGRET 2 #define MAC_ADB_CUDA 3 #define MAC_ADB_PB1 4 #define MAC_ADB_PB2 5 diff --git a/arch/m68k/mac/config.c b/arch/m68k/mac/config.c index e46895316eb0..9dc65a4c28d2 100644 --- a/arch/m68k/mac/config.c +++ b/arch/m68k/mac/config.c @@ -286,7 +286,7 @@ static struct mac_model mac_data_table[] = { }, { .ident = MAC_MODEL_IISI, .name = "IIsi", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_OLD, .scc_type = MAC_SCC_II, @@ -295,7 +295,7 @@ static struct mac_model mac_data_table[] = { }, { .ident = MAC_MODEL_IIVI, .name = "IIvi", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -304,7 +304,7 @@ static struct mac_model mac_data_table[] = { }, { .ident = MAC_MODEL_IIVX, .name = "IIvx", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -319,7 +319,7 @@ static struct mac_model mac_data_table[] = { { .ident = MAC_MODEL_CLII, .name = "Classic II", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -352,7 +352,7 @@ static struct mac_model mac_data_table[] = { { .ident = MAC_MODEL_LC, .name = "LC", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -361,7 +361,7 @@ static struct mac_model mac_data_table[] = { }, { .ident = MAC_MODEL_LCII, .name = "LC II", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -370,7 +370,7 @@ static struct mac_model mac_data_table[] = { }, { .ident = MAC_MODEL_LCIII, .name = "LC III", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -498,7 +498,7 @@ static struct mac_model mac_data_table[] = { { .ident = MAC_MODEL_P460, .name = "Performa 460", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, @@ -575,7 +575,7 @@ static struct mac_model mac_data_table[] = { }, { .ident = MAC_MODEL_P600, .name = "Performa 600", - .adb_type = MAC_ADB_IISI, + .adb_type = MAC_ADB_EGRET, .via_type = MAC_VIA_IICI, .scsi_type = MAC_SCSI_LC, .scc_type = MAC_SCC_II, diff --git a/arch/m68k/mac/misc.c b/arch/m68k/mac/misc.c index 3b1f7a6159f8..5b01704c85eb 100644 --- a/arch/m68k/mac/misc.c +++ b/arch/m68k/mac/misc.c @@ -141,54 +141,6 @@ static void pmu_write_pram(int
offset, __u8 data) #define pmu_write_pram NULL #endif -#if 0 /* def CONFIG_ADB_MACIISI */ -extern int maciisi_request(struct adb_request *req, - void (*done)(struct adb_request *), int nbytes, ...); - -static long maciisi_read_time(void) -{ - struct adb_request req; - long time; - - if (maciisi_request(&req, NULL, 2, CUDA_PACKET, CUDA_GET_TIME)) - return 0; - - time = (req.reply[3] << 24) | (req.reply[4] << 16) - | (req.reply[5] << 8) | req.reply[6]; - return time - RTC_OFFSET; -} - -static void maciisi_write_time(long data) -{ - struct adb_request req; - data += RTC_OFFSET; - maciisi_request(&req, NULL, 6, CUDA_PACKET, CUDA_SET_TIME, - (data >> 24) & 0xFF, (data >> 16) & 0xFF, - (data >> 8) & 0xFF, data & 0xFF); -} - -static __u8 maciisi_read_pram(int offset) -{ - struct adb_request req; - if (maciisi_request(&req, NULL, 4, CUDA_PACKET, CUDA_GET_PRAM, - (offset >> 8) & 0xFF, offset & 0xFF)) - return 0; - return req.reply[3]; -} - -static void maciisi_write_pram(int offset, __u8 data) -{ - struct adb_request req; - maciisi_request(&req, NULL, 5, CUDA_PACKET, CUDA_SET_PRAM, - (offset >> 8) & 0xFF, offset & 0xFF, data); -} -#else -#define maciisi_read_time() 0 -#define maciisi_write_time(n) -#define maciisi_read_pram NULL -#define maciisi_write_pram NULL -#endif - /* * VIA PRAM/RTC access routines * @@ -457,11 +409,10 @@ void mac_pram_read(int offset, __u8 *buffer, int len) int i; switch(macintosh_config->adb_type) { - case MAC_ADB_IISI: - func = maciisi_read_pram; break; case MAC_ADB_PB1: case MAC_ADB_PB2: func = pmu_read_pram; break; + case MAC_ADB_EGRET: case MAC_ADB_CUDA: func = cuda_read_pram; break; default: @@ -480,11 +431,10 @@ void mac_pram_write(int offset, __u8 *buffer, int len) int i; switch(macintosh_config->adb_type) { - case MAC_ADB_IISI: - func = maciisi_write_pram; break; case MAC_ADB_PB1: case MAC_ADB_PB2: func = pmu_write_pram; break; + case MAC_ADB_EGRET: case MAC_ADB_CUDA: func = cuda_write_pram; break; default: @@ -499,17 +449,13 @@ void mac_pram_write(int offset, __u8 *buffer, int len) void mac_poweroff(void) { - /* - * MAC_ADB_IISI may need to be moved up here if it doesn't actually - * work using the ADB packet method. 
--David Kilzer - */ - if (oss_present) { oss_shutdown(); } else if (macintosh_config->adb_type == MAC_ADB_II) { via_shutdown(); #ifdef CONFIG_ADB_CUDA - } else if (macintosh_config->adb_type == MAC_ADB_CUDA) { + } else if (macintosh_config->adb_type == MAC_ADB_EGRET || + macintosh_config->adb_type == MAC_ADB_CUDA) { cuda_shutdown(); #endif #ifdef CONFIG_ADB_PMU68K @@ -549,7 +495,8 @@ void mac_reset(void) local_irq_restore(flags); } #ifdef CONFIG_ADB_CUDA - } else if (macintosh_config->adb_type == MAC_ADB_CUDA) { + } else if (macintosh_config->adb_type == MAC_ADB_EGRET || + macintosh_config->adb_type == MAC_ADB_CUDA) { cuda_restart(); #endif #ifdef CONFIG_ADB_PMU68K @@ -698,13 +645,11 @@ int mac_hwclk(int op, struct rtc_time *t) case MAC_ADB_IOP: now = via_read_time(); break; - case MAC_ADB_IISI: - now = maciisi_read_time(); - break; case MAC_ADB_PB1: case MAC_ADB_PB2: now = pmu_read_time(); break; + case MAC_ADB_EGRET: case MAC_ADB_CUDA: now = cuda_read_time(); break; @@ -736,6 +681,7 @@ int mac_hwclk(int op, struct rtc_time *t) case MAC_ADB_IOP: via_write_time(now); break; + case MAC_ADB_EGRET: case MAC_ADB_CUDA: cuda_write_time(now); break; @@ -743,8 +689,6 @@ int mac_hwclk(int op, struct rtc_time *t) case MAC_ADB_PB2: pmu_write_time(now); break; - case MAC_ADB_IISI: - maciisi_write_time(now); } } return 0; diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 281f4f1fcd1f..8582121d7a45 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -93,12 +93,14 @@ config PPC select HAVE_DYNAMIC_FTRACE_WITH_REGS if MPROFILE_KERNEL select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER + select HAVE_GCC_PLUGINS select SYSCTL_EXCEPTION_TRACE select VIRT_TO_BUS if !PPC64 select HAVE_IDE select HAVE_IOREMAP_PROT select HAVE_EFFICIENT_UNALIGNED_ACCESS if !(CPU_LITTLE_ENDIAN && POWER7_CPU) select HAVE_KPROBES + select HAVE_OPTPROBES if PPC64 select HAVE_ARCH_KGDB select HAVE_KRETPROBES select HAVE_ARCH_TRACEHOOK @@ -164,9 +166,10 @@ config PPC select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select HAVE_ARCH_HARDENED_USERCOPY select HAVE_KERNEL_GZIP + select HAVE_CONTEXT_TRACKING if PPC64 config GENERIC_CSUM - def_bool CPU_LITTLE_ENDIAN + def_bool n config EARLY_PRINTK bool @@ -390,8 +393,8 @@ config DISABLE_MPROFILE_KERNEL be disabled also. If you have a toolchain which supports mprofile-kernel, then you can - enable this. Otherwise leave it disabled. If you're not sure, say - "N". + disable this. Otherwise leave it enabled. If you're not sure, say + "Y". config MPROFILE_KERNEL depends on PPC64 && CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/Kconfig.debug b/arch/powerpc/Kconfig.debug index 949258d412d0..c86df246339e 100644 --- a/arch/powerpc/Kconfig.debug +++ b/arch/powerpc/Kconfig.debug @@ -356,8 +356,7 @@ config FAIL_IOMMU config PPC_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" - depends on DEBUG_KERNEL - select DEBUG_FS + depends on DEBUG_KERNEL && DEBUG_FS help This option exports the state of the kernel pagetables to a debugfs file. 
This is only useful for kernel developers who are diff --git a/arch/powerpc/boot/.gitignore b/arch/powerpc/boot/.gitignore index d61c03525777..84774ccba1c2 100644 --- a/arch/powerpc/boot/.gitignore +++ b/arch/powerpc/boot/.gitignore @@ -1,4 +1,5 @@ addnote +decompress_inflate.c empty.c hack-coff inffast.c @@ -13,11 +14,13 @@ infutil.h kernel-vmlinux.strip.c kernel-vmlinux.strip.gz mktree +otheros.bld uImage cuImage.* dtbImage.* *.dtb treeImage.* +vmlinux.strip zImage zImage.initrd zImage.bin.* @@ -26,6 +29,7 @@ zImage.coff zImage.epapr zImage.holly zImage.*lds +zImage.maple zImage.miboot zImage.pmac zImage.pseries diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index e4d53fe5976a..ac8b8332ed82 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -26,9 +26,11 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_BPF=y CONFIG_CGROUP_PERF=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y +CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=y @@ -79,6 +81,11 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_ADVANCED is not set CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m +CONFIG_NET_SCHED=y +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_BPF=m +CONFIG_BPF_JIT=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -213,10 +220,11 @@ CONFIG_HID_SUNPLUS=y CONFIG_USB_HIDDEV=y CONFIG_USB=y CONFIG_USB_MON=m +CONFIG_USB_XHCI_HCD=y CONFIG_USB_EHCI_HCD=y # CONFIG_USB_EHCI_HCD_PPC_OF is not set CONFIG_USB_OHCI_HCD=y -CONFIG_USB_STORAGE=m +CONFIG_USB_STORAGE=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=m CONFIG_LEDS_POWERNV=m @@ -289,6 +297,7 @@ CONFIG_LOCKUP_DETECTOR=y CONFIG_LATENCYTOP=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_UPROBE_EVENT=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig index 0396126ba6a8..4f1288b04303 100644 --- a/arch/powerpc/configs/ppc64_defconfig +++ b/arch/powerpc/configs/ppc64_defconfig @@ -14,7 +14,9 @@ CONFIG_LOG_BUF_SHIFT=18 CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_CGROUPS=y CONFIG_CPUSETS=y +CONFIG_CGROUP_BPF=y CONFIG_BLK_DEV_INITRD=y +CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=y @@ -76,6 +78,10 @@ CONFIG_INET_IPCOMP=m CONFIG_NETFILTER=y # CONFIG_NETFILTER_ADVANCED is not set CONFIG_BRIDGE=m +CONFIG_NET_SCHED=y +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_BPF=m CONFIG_BPF_JIT=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y @@ -324,6 +330,7 @@ CONFIG_DEBUG_MUTEXES=y CONFIG_LATENCYTOP=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_UPROBE_EVENT=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index 5a06bdde1674..6d0eb02fefa4 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -24,12 +24,14 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y +CONFIG_CGROUP_BPF=y CONFIG_MEMCG=y CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_PERF=y CONFIG_CGROUP_SCHED=y CONFIG_USER_NS=y CONFIG_BLK_DEV_INITRD=y +CONFIG_BPF_SYSCALL=y # CONFIG_COMPAT_BRK is not set CONFIG_PROFILING=y CONFIG_OPROFILE=y @@ -82,6 +84,11 @@ CONFIG_NETFILTER=y # CONFIG_NETFILTER_ADVANCED is not set 
CONFIG_BRIDGE=m CONFIG_VLAN_8021Q=m +CONFIG_NET_SCHED=y +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_BPF=m +CONFIG_BPF_JIT=y CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y @@ -289,6 +296,7 @@ CONFIG_LOCKUP_DETECTOR=y CONFIG_LATENCYTOP=y CONFIG_SCHED_TRACER=y CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_UPROBE_EVENT=y CONFIG_CODE_PATCHING_SELFTEST=y CONFIG_FTR_FIXUP_SELFTEST=y CONFIG_MSI_BITMAP_SELFTEST=y diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index ba47c70712f9..f6c5264287e5 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -120,4 +120,6 @@ extern s64 __ashrdi3(s64, int); extern int __cmpdi2(s64, s64); extern int __ucmpdi2(u64, u64); +void _mcount(void); + #endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */ diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 4c935f7504f7..f7b721bbf918 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -33,9 +33,9 @@ H_PUD_INDEX_SIZE + H_PGD_INDEX_SIZE + PAGE_SHIFT) #define H_PGTABLE_RANGE (ASM_CONST(1) << H_PGTABLE_EADDR_SIZE) -#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_PPC_64K_PAGES) /* - * only with hash we need to use the second half of pmd page table + * only with hash 64k we need to use the second half of pmd page table * to store pointer to deposited pgtable_t */ #define H_PMD_CACHE_INDEX (H_PMD_INDEX_SIZE + 1) diff --git a/arch/powerpc/include/asm/book3s/64/mmu-hash.h b/arch/powerpc/include/asm/book3s/64/mmu-hash.h index 2e6a823fa502..52d8d1e4b772 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ b/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -157,6 +157,7 @@ struct mmu_hash_ops { unsigned long addr, unsigned char *hpte_slot_array, int psize, int ssize, int local); + int (*resize_hpt)(unsigned long shift); /* * Special for kexec. * To be called in real mode with interrupts disabled. No locks are @@ -525,6 +526,9 @@ extern void slb_set_size(u16 size); #define ESID_BITS 18 #define ESID_BITS_1T 6 +#define ESID_BITS_MASK ((1 << ESID_BITS) - 1) +#define ESID_BITS_1T_MASK ((1 << ESID_BITS_1T) - 1) + /* * 256MB segment * The proto-VSID space has 2^(CONTEX_BITS + ESID_BITS) - 1 segments @@ -660,9 +664,9 @@ static inline unsigned long get_vsid(unsigned long context, unsigned long ea, if (ssize == MMU_SEGSIZE_256M) return vsid_scramble((context << ESID_BITS) - | (ea >> SID_SHIFT), 256M); + | ((ea >> SID_SHIFT) & ESID_BITS_MASK), 256M); return vsid_scramble((context << ESID_BITS_1T) - | (ea >> SID_SHIFT_1T), 1T); + | ((ea >> SID_SHIFT_1T) & ESID_BITS_1T_MASK), 1T); } /* diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h b/arch/powerpc/include/asm/book3s/64/mmu.h index 8afb0e00f7d9..d73e9dfa5237 100644 --- a/arch/powerpc/include/asm/book3s/64/mmu.h +++ b/arch/powerpc/include/asm/book3s/64/mmu.h @@ -44,10 +44,20 @@ struct patb_entry { }; extern struct patb_entry *partition_tb; +/* Bits in patb0 field */ #define PATB_HR (1UL << 63) -#define PATB_GR (1UL << 63) #define RPDB_MASK 0x0ffffffffffff00fUL #define RPDB_SHIFT (1UL << 8) +#define RTS1_SHIFT 61 /* top 2 bits of radix tree size */ +#define RTS1_MASK (3UL << RTS1_SHIFT) +#define RTS2_SHIFT 5 /* bottom 3 bits of radix tree size */ +#define RTS2_MASK (7UL << RTS2_SHIFT) +#define RPDS_MASK 0x1f /* root page dir. 
size field */ + +/* Bits in patb1 field */ +#define PATB_GR (1UL << 63) /* guest uses radix; must match HR */ +#define PRTS_MASK 0x1f /* process table size field */ + /* * Limit process table to PAGE_SIZE table. This * also limit the max pid we can support. @@ -138,5 +148,11 @@ static inline void setup_initial_memory_limit(phys_addr_t first_memblock_base, extern int (*register_process_table)(unsigned long base, unsigned long page_size, unsigned long tbl_size); +#ifdef CONFIG_PPC_PSERIES +extern void radix_init_pseries(void); +#else +static inline void radix_init_pseries(void) { }; +#endif + #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_MMU_H_ */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h index 9db83b4e017d..8708a0239a56 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-4k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-4k.h @@ -47,7 +47,12 @@ static inline int hugepd_ok(hugepd_t hpd) return hash__hugepd_ok(hpd); } #define is_hugepd(hpd) (hugepd_ok(hpd)) + +#else /* !CONFIG_HUGETLB_PAGE */ +static inline int pmd_huge(pmd_t pmd) { return 0; } +static inline int pud_huge(pud_t pud) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ + #endif /* __ASSEMBLY__ */ #endif /*_ASM_POWERPC_BOOK3S_64_PGTABLE_4K_H */ diff --git a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h index 0d2845b44763..2ce4209399ed 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable-64k.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable-64k.h @@ -35,10 +35,6 @@ static inline int pgd_huge(pgd_t pgd) } #define pgd_huge pgd_huge -#ifdef CONFIG_DEBUG_VM -extern int hugepd_ok(hugepd_t hpd); -#define is_hugepd(hpd) (hugepd_ok(hpd)) -#else /* * With 64k page size, we have hugepage ptes in the pgd and pmd entries. We don't * need to setup hugepage directory for them. Our pte and page directory format @@ -49,8 +45,10 @@ static inline int hugepd_ok(hugepd_t hpd) return 0; } #define is_hugepd(pdep) 0 -#endif /* CONFIG_DEBUG_VM */ +#else /* !CONFIG_HUGETLB_PAGE */ +static inline int pmd_huge(pmd_t pmd) { return 0; } +static inline int pud_huge(pud_t pud) { return 0; } #endif /* CONFIG_HUGETLB_PAGE */ static inline int remap_4k_pfn(struct vm_area_struct *vma, unsigned long addr, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 5905f0ff57d1..fef738229a68 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -371,6 +371,23 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, return __pte(old); } +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, int full) +{ + if (full && radix_enabled()) { + /* + * Let's skip the DD1 style pte update here. We know that + * this is a full mm pte clear and hence can be sure there is + * no parallel set_pte. 
+ */ + return radix__ptep_get_and_clear_full(mm, addr, ptep, full); + } + return ptep_get_and_clear(mm, addr, ptep); +} + + static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t * ptep) { diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index b4d1302387a3..9e0bb7cd6e22 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -139,30 +139,43 @@ static inline unsigned long radix__pte_update(struct mm_struct *mm, unsigned long new_pte; - old_pte = __radix_pte_update(ptep, ~0, 0); + old_pte = __radix_pte_update(ptep, ~0ul, 0); /* * new value of pte */ new_pte = (old_pte | set) & ~clr; - /* - * If we are trying to clear the pte, we can skip - * the below sequence and batch the tlb flush. The - * tlb flush batching is done by mmu gather code - */ - if (new_pte) { - asm volatile("ptesync" : : : "memory"); - radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr); + radix__flush_tlb_pte_p9_dd1(old_pte, mm, addr); + if (new_pte) __radix_pte_update(ptep, 0, new_pte); - } } else old_pte = __radix_pte_update(ptep, clr, set); - asm volatile("ptesync" : : : "memory"); if (!huge) assert_pte_locked(mm, addr); return old_pte; } +static inline pte_t radix__ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long addr, + pte_t *ptep, int full) +{ + unsigned long old_pte; + + if (full) { + /* + * If we are trying to clear the pte, we can skip + * the DD1 pte update sequence and batch the tlb flush. The + * tlb flush batching is done by mmu gather code. We + * still keep the cmp_xchg update to make sure we get + * correct R/C bit which might be updated via Nest MMU. + */ + old_pte = __radix_pte_update(ptep, ~0ul, 0); + } else + old_pte = radix__pte_update(mm, addr, ptep, ~0ul, 0, 0); + + return __pte(old_pte); +} + /* * Set the dirty and/or accessed bits atomically in a linux PTE, this * function doesn't need to invalidate tlb. 
@@ -180,7 +193,6 @@ static inline void radix__ptep_set_access_flags(struct mm_struct *mm, unsigned long old_pte, new_pte; old_pte = __radix_pte_update(ptep, ~0, 0); - asm volatile("ptesync" : : : "memory"); /* * new value of pte */ @@ -291,5 +303,10 @@ static inline unsigned long radix__get_tree_size(void) } return rts_field; } + +#ifdef CONFIG_MEMORY_HOTPLUG +int radix__create_section_mapping(unsigned long start, unsigned long end); +int radix__remove_section_mapping(unsigned long start, unsigned long end); +#endif /* CONFIG_MEMORY_HOTPLUG */ #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/cache.h b/arch/powerpc/include/asm/cache.h index 7657aa897a38..5a90292afbad 100644 --- a/arch/powerpc/include/asm/cache.h +++ b/arch/powerpc/include/asm/cache.h @@ -30,15 +30,22 @@ #define IFETCH_ALIGN_BYTES (1 << IFETCH_ALIGN_SHIFT) #if defined(__powerpc64__) && !defined(__ASSEMBLY__) + +struct ppc_cache_info { + u32 size; + u32 line_size; + u32 block_size; /* L1 only */ + u32 log_block_size; + u32 blocks_per_page; + u32 sets; + u32 assoc; +}; + struct ppc64_caches { - u32 dsize; /* L1 d-cache size */ - u32 dline_size; /* L1 d-cache line size */ - u32 log_dline_size; - u32 dlines_per_page; - u32 isize; /* L1 i-cache size */ - u32 iline_size; /* L1 i-cache line size */ - u32 log_iline_size; - u32 ilines_per_page; + struct ppc_cache_info l1d; + struct ppc_cache_info l1i; + struct ppc_cache_info l2; + struct ppc_cache_info l3; }; extern struct ppc64_caches ppc64_caches; diff --git a/arch/powerpc/include/asm/checksum.h b/arch/powerpc/include/asm/checksum.h index 1e8fceb308a5..4e63787dc3be 100644 --- a/arch/powerpc/include/asm/checksum.h +++ b/arch/powerpc/include/asm/checksum.h @@ -53,17 +53,29 @@ static inline __sum16 csum_fold(__wsum sum) return (__force __sum16)(~((__force u32)sum + tmp) >> 16); } +static inline u32 from64to32(u64 x) +{ + /* add up 32-bit and 32-bit for 32+c bit */ + x = (x & 0xffffffff) + (x >> 32); + /* add up carry.. 
*/ + x = (x & 0xffffffff) + (x >> 32); + return (u32)x; +} + static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { #ifdef __powerpc64__ - unsigned long s = (__force u32)sum; + u64 s = (__force u32)sum; s += (__force u32)saddr; s += (__force u32)daddr; +#ifdef __BIG_ENDIAN__ s += proto + len; - s += (s >> 32); - return (__force __wsum) s; +#else + s += (proto + len) << 8; +#endif + return (__force __wsum) from64to32(s); #else __asm__("\n\ addc %0,%0,%1 \n\ @@ -123,8 +135,7 @@ static inline __wsum ip_fast_csum_nofold(const void *iph, unsigned int ihl) for (i = 0; i < ihl - 1; i++, ptr++) s += *ptr; - s += (s >> 32); - return (__force __wsum)s; + return (__force __wsum)from64to32(s); #else __wsum sum, tmp; diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 2015b072422c..8ab937771068 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -22,6 +22,7 @@ #define BRANCH_SET_LINK 0x1 #define BRANCH_ABSOLUTE 0x2 +bool is_offset_in_branch_range(long offset); unsigned int create_branch(const unsigned int *addr, unsigned long target, int flags); unsigned int create_cond_branch(const unsigned int *addr, @@ -34,6 +35,7 @@ int instr_is_branch_to_addr(const unsigned int *instr, unsigned long addr); unsigned long branch_target(const unsigned int *instr); unsigned int translate_branch(const unsigned int *dest, const unsigned int *src); +extern bool is_conditional_branch(unsigned int instr); #ifdef CONFIG_PPC_BOOK3E_64 void __patch_exception(int exc, unsigned long addr); #define patch_exception(exc, name) do { \ diff --git a/arch/powerpc/include/asm/cpuidle.h b/arch/powerpc/include/asm/cpuidle.h index 3919332965af..fd321eb423cb 100644 --- a/arch/powerpc/include/asm/cpuidle.h +++ b/arch/powerpc/include/asm/cpuidle.h @@ -10,18 +10,62 @@ #define PNV_CORE_IDLE_LOCK_BIT 0x100 #define PNV_CORE_IDLE_THREAD_BITS 0x0FF +/* + * ============================ NOTE ================================= + * The older firmware populates only the RL field in the psscr_val and + * sets the psscr_mask to 0xf. On such a firmware, the kernel sets the + * remaining PSSCR fields to default values as follows: + * + * - ESL and EC bits are to 1. So wakeup from any stop state will be + * at vector 0x100. + * + * - MTL and PSLL are set to the maximum allowed value as per the ISA, + * i.e. 15. + * + * - The Transition Rate, TR is set to the Maximum value 3. 
+ */ +#define PSSCR_HV_DEFAULT_VAL (PSSCR_ESL | PSSCR_EC | \ + PSSCR_PSLL_MASK | PSSCR_TR_MASK | \ + PSSCR_MTL_MASK) + +#define PSSCR_HV_DEFAULT_MASK (PSSCR_ESL | PSSCR_EC | \ + PSSCR_PSLL_MASK | PSSCR_TR_MASK | \ + PSSCR_MTL_MASK | PSSCR_RL_MASK) +#define PSSCR_EC_SHIFT 20 +#define PSSCR_ESL_SHIFT 21 +#define GET_PSSCR_EC(x) (((x) & PSSCR_EC) >> PSSCR_EC_SHIFT) +#define GET_PSSCR_ESL(x) (((x) & PSSCR_ESL) >> PSSCR_ESL_SHIFT) +#define GET_PSSCR_RL(x) ((x) & PSSCR_RL_MASK) + +#define ERR_EC_ESL_MISMATCH -1 +#define ERR_DEEP_STATE_ESL_MISMATCH -2 + #ifndef __ASSEMBLY__ extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; extern u64 pnv_first_deep_stop_state; + +int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); +static inline void report_invalid_psscr_val(u64 psscr_val, int err) +{ + switch (err) { + case ERR_EC_ESL_MISMATCH: + pr_warn("Invalid psscr 0x%016llx : ESL,EC bits unequal", + psscr_val); + break; + case ERR_DEEP_STATE_ESL_MISMATCH: + pr_warn("Invalid psscr 0x%016llx : ESL cleared for deep stop-state", + psscr_val); + } +} #endif #endif /* Idle state entry routines */ #ifdef CONFIG_PPC_P7_NAP -#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ +#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \ /* Magic NAP/SLEEP/WINKLE mode enter sequence */ \ std r0,0(r1); \ ptesync; \ @@ -29,6 +73,9 @@ extern u64 pnv_first_deep_stop_state; 1: cmpd cr0,r0,r0; \ bne 1b; \ IDLE_INST; \ + +#define IDLE_STATE_ENTER_SEQ_NORET(IDLE_INST) \ + IDLE_STATE_ENTER_SEQ(IDLE_INST) \ b . #endif /* CONFIG_PPC_P7_NAP */ diff --git a/arch/powerpc/include/asm/elf.h b/arch/powerpc/include/asm/elf.h index ee46ffef608e..93b9b84568e8 100644 --- a/arch/powerpc/include/asm/elf.h +++ b/arch/powerpc/include/asm/elf.h @@ -136,4 +136,46 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, #endif /* CONFIG_SPU_BASE */ +#ifdef CONFIG_PPC64 + +#define get_cache_geometry(level) \ + (ppc64_caches.level.assoc << 16 | ppc64_caches.level.line_size) + +#define ARCH_DLINFO_CACHE_GEOMETRY \ + NEW_AUX_ENT(AT_L1I_CACHESIZE, ppc64_caches.l1i.size); \ + NEW_AUX_ENT(AT_L1I_CACHEGEOMETRY, get_cache_geometry(l1i)); \ + NEW_AUX_ENT(AT_L1D_CACHESIZE, ppc64_caches.l1d.size); \ + NEW_AUX_ENT(AT_L1D_CACHEGEOMETRY, get_cache_geometry(l1d)); \ + NEW_AUX_ENT(AT_L2_CACHESIZE, ppc64_caches.l2.size); \ + NEW_AUX_ENT(AT_L2_CACHEGEOMETRY, get_cache_geometry(l2)); \ + NEW_AUX_ENT(AT_L3_CACHESIZE, ppc64_caches.l3.size); \ + NEW_AUX_ENT(AT_L3_CACHEGEOMETRY, get_cache_geometry(l3)) + +#else +#define ARCH_DLINFO_CACHE_GEOMETRY +#endif + +/* + * The requirements here are: + * - keep the final alignment of sp (sp & 0xf) + * - make sure the 32-bit value at the first 16 byte aligned position of + * AUXV is greater than 16 for glibc compatibility. + * AT_IGNOREPPC is used for that. + * - for compatibility with glibc ARCH_DLINFO must always be defined on PPC, + * even if DLINFO_ARCH_ITEMS goes to zero or is undefined. + * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes + */ +#define ARCH_DLINFO \ +do { \ + /* Handle glibc compatibility.
*/ \ + NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC); \ + NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC); \ + /* Cache size items */ \ + NEW_AUX_ENT(AT_DCACHEBSIZE, dcache_bsize); \ + NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize); \ + NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize); \ + VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base); \ + ARCH_DLINFO_CACHE_GEOMETRY; \ +} while (0) + #endif /* _ASM_POWERPC_ELF_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 9a3eee661297..14752eee3d0c 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -97,6 +97,15 @@ ld reg,PACAKBASE(r13); \ ori reg,reg,(ABS_ADDR(label))@l; +/* + * Branches from unrelocated code (e.g., interrupts) to labels outside + * head-y require >64K offsets. + */ +#define __LOAD_FAR_HANDLER(reg, label) \ + ld reg,PACAKBASE(r13); \ + ori reg,reg,(ABS_ADDR(label))@l; \ + addis reg,reg,(ABS_ADDR(label))@h; + /* Exception register prefixes */ #define EXC_HV H #define EXC_STD @@ -227,13 +236,49 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtctr reg; \ bctr +#define BRANCH_LINK_TO_FAR(reg, label) \ + __LOAD_FAR_HANDLER(reg, label); \ + mtctr reg; \ + bctrl + +/* + * KVM requires __LOAD_FAR_HANDLER. + * + * __BRANCH_TO_KVM_EXIT branches are also a special case because they + * explicitly use r9 then reload it from PACA before branching. Hence + * the double-underscore. + */ +#define __BRANCH_TO_KVM_EXIT(area, label) \ + mfctr r9; \ + std r9,HSTATE_SCRATCH1(r13); \ + __LOAD_FAR_HANDLER(r9, label); \ + mtctr r9; \ + ld r9,area+EX_R9(r13); \ + bctr + +#define BRANCH_TO_KVM(reg, label) \ + __LOAD_FAR_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #else #define BRANCH_TO_COMMON(reg, label) \ b label +#define BRANCH_LINK_TO_FAR(reg, label) \ + bl label + +#define BRANCH_TO_KVM(reg, label) \ + b label + +#define __BRANCH_TO_KVM_EXIT(area, label) \ + ld r9,area+EX_R9(r13); \ + b label + #endif -#define __KVM_HANDLER_PROLOG(area, n) \ + +#define __KVM_HANDLER(area, h, n) \ BEGIN_FTR_SECTION_NESTED(947) \ ld r10,area+EX_CFAR(r13); \ std r10,HSTATE_CFAR(r13); \ @@ -243,30 +288,28 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ ld r10,area+EX_R10(r13); \ - stw r9,HSTATE_SCRATCH1(r13); \ - ld r9,area+EX_R9(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - -#define __KVM_HANDLER(area, h, n) \ - __KVM_HANDLER_PROLOG(area, n) \ - li r12,n; \ - b kvmppc_interrupt + sldi r12,r9,32; \ + ori r12,r12,(n); \ + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt) #define __KVM_HANDLER_SKIP(area, h, n) \ cmpwi r10,KVM_GUEST_MODE_SKIP; \ - ld r10,area+EX_R10(r13); \ beq 89f; \ - stw r9,HSTATE_SCRATCH1(r13); \ BEGIN_FTR_SECTION_NESTED(948) \ - ld r9,area+EX_PPR(r13); \ - std r9,HSTATE_PPR(r13); \ + ld r10,area+EX_PPR(r13); \ + std r10,HSTATE_PPR(r13); \ END_FTR_SECTION_NESTED(CPU_FTR_HAS_PPR,CPU_FTR_HAS_PPR,948); \ - ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ std r12,HSTATE_SCRATCH0(r13); \ - li r12,n; \ - b kvmppc_interrupt; \ + sldi r12,r9,32; \ + ori r12,r12,(n); \ + /* This reloads r9 before branching to kvmppc_interrupt */ \ + __BRANCH_TO_KVM_EXIT(area, kvmppc_interrupt); \ 89: mtocrf 0x80,r9; \ ld r9,area+EX_R9(r13); \ + ld r10,area+EX_R10(r13); \ b kvmppc_skip_##h##interrupt #ifdef CONFIG_KVM_BOOK3S_64_HANDLER @@ -393,12 +436,12 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_STD) #define 
STD_RELON_EXCEPTION_HV(loc, vec, label) \ - /* No guest interrupts come through here */ \ SET_SCRATCH0(r13); /* save r13 */ \ - EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, EXC_HV, NOTEST, vec); + EXCEPTION_RELON_PROLOG_PSERIES(PACA_EXGEN, label, \ + EXC_HV, KVMTEST_HV, vec); #define STD_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, vec); \ EXCEPTION_RELON_PROLOG_PSERIES_1(label, EXC_HV) /* This associate vector numbers with bits in paca->irq_happened */ @@ -475,10 +518,10 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define MASKABLE_RELON_EXCEPTION_HV(loc, vec, label) \ _MASKABLE_RELON_EXCEPTION_PSERIES(vec, label, \ - EXC_HV, SOFTEN_NOTEST_HV) + EXC_HV, SOFTEN_TEST_HV) #define MASKABLE_RELON_EXCEPTION_HV_OOL(vec, label) \ - EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_NOTEST_HV, vec); \ + EXCEPTION_PROLOG_1(PACA_EXGEN, SOFTEN_TEST_HV, vec); \ EXCEPTION_PROLOG_PSERIES_1(label, EXC_HV) /* diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 1e0b5a5d660a..8645897472b1 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -42,7 +42,7 @@ #define FW_FEATURE_SPLPAR ASM_CONST(0x0000000000100000) #define FW_FEATURE_LPAR ASM_CONST(0x0000000000400000) #define FW_FEATURE_PS3_LV1 ASM_CONST(0x0000000000800000) -/* Free ASM_CONST(0x0000000001000000) */ +#define FW_FEATURE_HPT_RESIZE ASM_CONST(0x0000000001000000) #define FW_FEATURE_CMO ASM_CONST(0x0000000002000000) #define FW_FEATURE_VPHN ASM_CONST(0x0000000004000000) #define FW_FEATURE_XCMO ASM_CONST(0x0000000008000000) @@ -66,7 +66,8 @@ enum { FW_FEATURE_MULTITCE | FW_FEATURE_SPLPAR | FW_FEATURE_LPAR | FW_FEATURE_CMO | FW_FEATURE_VPHN | FW_FEATURE_XCMO | FW_FEATURE_SET_MODE | FW_FEATURE_BEST_ENERGY | - FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN, + FW_FEATURE_TYPE1_AFFINITY | FW_FEATURE_PRRN | + FW_FEATURE_HPT_RESIZE, FW_FEATURE_PSERIES_ALWAYS = 0, FW_FEATURE_POWERNV_POSSIBLE = FW_FEATURE_OPAL, FW_FEATURE_POWERNV_ALWAYS = 0, diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index fca7033839a9..5067048daad4 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -38,8 +38,8 @@ * li r10,128 * mv r11,r10 - * FIXED_SECTION_ENTRY_BEGIN_LOCATION(section_name, label2, start_address) - * FIXED_SECTION_ENTRY_END_LOCATION(section_name, label2, end_address) + * FIXED_SECTION_ENTRY_BEGIN_LOCATION(section_name, label2, start_address, size) + * FIXED_SECTION_ENTRY_END_LOCATION(section_name, label2, start_address, size) * CLOSE_FIXED_SECTION(section_name) * * ZERO_FIXED_SECTION can be used to emit zeroed data. 
@@ -102,9 +102,15 @@ name: #define FIXED_SECTION_ENTRY_BEGIN(sname, name) \ __FIXED_SECTION_ENTRY_BEGIN(sname, name, IFETCH_ALIGN_BYTES) -#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start) \ +#define FIXED_SECTION_ENTRY_BEGIN_LOCATION(sname, name, start, size) \ USE_FIXED_SECTION(sname); \ name##_start = (start); \ + .if ((start) % (size) != 0); \ + .error "Fixed section exception vector misalignment"; \ + .endif; \ + .if ((size) != 0x20) && ((size) != 0x80) && ((size) != 0x100); \ + .error "Fixed section exception vector bad size"; \ + .endif; \ .if (start) < sname##_start; \ .error "Fixed section underflow"; \ .abort; \ @@ -113,16 +119,16 @@ name: .global name; \ name: -#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, end) \ - .if (end) > sname##_end; \ +#define FIXED_SECTION_ENTRY_END_LOCATION(sname, name, start, size) \ + .if (start) + (size) > sname##_end; \ .error "Fixed section overflow"; \ .abort; \ .endif; \ - .if (. - name > end - name##_start); \ + .if (. - name > (start) + (size) - name##_start); \ .error "Fixed entry overflow"; \ .abort; \ .endif; \ - . = ((end) - sname##_start); \ + . = ((start) + (size) - sname##_start); \ /* @@ -147,12 +153,12 @@ name: * Following are the BOOK3S exception handler helper macros. * Handlers come in a number of types, and each type has a number of varieties. * - * EXC_REAL_* - real, unrelocated exception vectors - * EXC_VIRT_* - virt (AIL), unrelocated exception vectors + * EXC_REAL_* - real, unrelocated exception vectors + * EXC_VIRT_* - virt (AIL), unrelocated exception vectors * TRAMP_REAL_* - real, unrelocated helpers (virt can call these) - * TRAMP_VIRT_* - virt, unreloc helpers (in practice, real can use) - * TRAMP_KVM - KVM handlers that get put into real, unrelocated - * EXC_COMMON_* - virt, relocated common handlers + * TRAMP_VIRT_* - virt, unreloc helpers (in practice, real can use) + * TRAMP_KVM - KVM handlers that get put into real, unrelocated + * EXC_COMMON_* - virt, relocated common handlers * * The EXC handlers are given a name, and branch to name_common, or the * appropriate KVM or masking function. Vector handler verieties are as @@ -191,23 +197,23 @@ name: * and OOL handlers are implemented as types of TRAMP and TRAMP_VIRT handlers. 
*/ -#define EXC_REAL_BEGIN(name, start, end) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start) +#define EXC_REAL_BEGIN(name, start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##name, start, size) -#define EXC_REAL_END(name, start, end) \ - FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, end) +#define EXC_REAL_END(name, start, size) \ + FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##name, start, size) -#define EXC_VIRT_BEGIN(name, start, end) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start) +#define EXC_VIRT_BEGIN(name, start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) -#define EXC_VIRT_END(name, start, end) \ - FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, end) +#define EXC_VIRT_END(name, start, size) \ + FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##name, start, size) -#define EXC_COMMON_BEGIN(name) \ - USE_TEXT_SECTION(); \ - .balign IFETCH_ALIGN_BYTES; \ - .global name; \ - DEFINE_FIXED_SYMBOL(name); \ +#define EXC_COMMON_BEGIN(name) \ + USE_TEXT_SECTION(); \ + .balign IFETCH_ALIGN_BYTES; \ + .global name; \ + DEFINE_FIXED_SYMBOL(name); \ name: #define TRAMP_REAL_BEGIN(name) \ @@ -217,147 +223,147 @@ name: FIXED_SECTION_ENTRY_BEGIN(virt_trampolines, name) #ifdef CONFIG_KVM_BOOK3S_64_HANDLER -#define TRAMP_KVM_BEGIN(name) \ - TRAMP_REAL_BEGIN(name) +#define TRAMP_KVM_BEGIN(name) \ + TRAMP_VIRT_BEGIN(name) #else #define TRAMP_KVM_BEGIN(name) #endif -#define EXC_REAL_NONE(start, end) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start); \ - FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, end) +#define EXC_REAL_NONE(start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(real_vectors, exc_real_##start##_##unused, start, size); \ + FIXED_SECTION_ENTRY_END_LOCATION(real_vectors, exc_real_##start##_##unused, start, size) -#define EXC_VIRT_NONE(start, end) \ - FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start); \ - FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, end); +#define EXC_VIRT_NONE(start, size) \ + FIXED_SECTION_ENTRY_BEGIN_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); \ + FIXED_SECTION_ENTRY_END_LOCATION(virt_vectors, exc_virt_##start##_##unused, start, size); -#define EXC_REAL(name, start, end) \ - EXC_REAL_BEGIN(name, start, end); \ +#define EXC_REAL(name, start, size) \ + EXC_REAL_BEGIN(name, start, size); \ STD_EXCEPTION_PSERIES(start, name##_common); \ - EXC_REAL_END(name, start, end); + EXC_REAL_END(name, start, size); -#define EXC_VIRT(name, start, end, realvec) \ - EXC_VIRT_BEGIN(name, start, end); \ +#define EXC_VIRT(name, start, size, realvec) \ + EXC_VIRT_BEGIN(name, start, size); \ STD_RELON_EXCEPTION_PSERIES(start, realvec, name##_common); \ - EXC_VIRT_END(name, start, end); + EXC_VIRT_END(name, start, size); -#define EXC_REAL_MASKABLE(name, start, end) \ - EXC_REAL_BEGIN(name, start, end); \ +#define EXC_REAL_MASKABLE(name, start, size) \ + EXC_REAL_BEGIN(name, start, size); \ MASKABLE_EXCEPTION_PSERIES(start, start, name##_common); \ - EXC_REAL_END(name, start, end); + EXC_REAL_END(name, start, size); -#define EXC_VIRT_MASKABLE(name, start, end, realvec) \ - EXC_VIRT_BEGIN(name, start, end); \ +#define EXC_VIRT_MASKABLE(name, start, size, realvec) \ + EXC_VIRT_BEGIN(name, 
start, size); \ MASKABLE_RELON_EXCEPTION_PSERIES(start, realvec, name##_common); \ - EXC_VIRT_END(name, start, end); + EXC_VIRT_END(name, start, size); -#define EXC_REAL_HV(name, start, end) \ - EXC_REAL_BEGIN(name, start, end); \ +#define EXC_REAL_HV(name, start, size) \ + EXC_REAL_BEGIN(name, start, size); \ STD_EXCEPTION_HV(start, start, name##_common); \ - EXC_REAL_END(name, start, end); + EXC_REAL_END(name, start, size); -#define EXC_VIRT_HV(name, start, end, realvec) \ - EXC_VIRT_BEGIN(name, start, end); \ +#define EXC_VIRT_HV(name, start, size, realvec) \ + EXC_VIRT_BEGIN(name, start, size); \ STD_RELON_EXCEPTION_HV(start, realvec, name##_common); \ - EXC_VIRT_END(name, start, end); + EXC_VIRT_END(name, start, size); -#define __EXC_REAL_OOL(name, start, end) \ - EXC_REAL_BEGIN(name, start, end); \ +#define __EXC_REAL_OOL(name, start, size) \ + EXC_REAL_BEGIN(name, start, size); \ __OOL_EXCEPTION(start, label, tramp_real_##name); \ - EXC_REAL_END(name, start, end); + EXC_REAL_END(name, start, size); -#define __TRAMP_REAL_REAL_OOL(name, vec) \ +#define __TRAMP_REAL_OOL(name, vec) \ TRAMP_REAL_BEGIN(tramp_real_##name); \ STD_EXCEPTION_PSERIES_OOL(vec, name##_common); \ -#define EXC_REAL_OOL(name, start, end) \ - __EXC_REAL_OOL(name, start, end); \ - __TRAMP_REAL_REAL_OOL(name, start); +#define EXC_REAL_OOL(name, start, size) \ + __EXC_REAL_OOL(name, start, size); \ + __TRAMP_REAL_OOL(name, start); -#define __EXC_REAL_OOL_MASKABLE(name, start, end) \ - __EXC_REAL_OOL(name, start, end); +#define __EXC_REAL_OOL_MASKABLE(name, start, size) \ + __EXC_REAL_OOL(name, start, size); -#define __TRAMP_REAL_REAL_OOL_MASKABLE(name, vec) \ +#define __TRAMP_REAL_OOL_MASKABLE(name, vec) \ TRAMP_REAL_BEGIN(tramp_real_##name); \ MASKABLE_EXCEPTION_PSERIES_OOL(vec, name##_common); \ -#define EXC_REAL_OOL_MASKABLE(name, start, end) \ - __EXC_REAL_OOL_MASKABLE(name, start, end); \ - __TRAMP_REAL_REAL_OOL_MASKABLE(name, start); +#define EXC_REAL_OOL_MASKABLE(name, start, size) \ + __EXC_REAL_OOL_MASKABLE(name, start, size); \ + __TRAMP_REAL_OOL_MASKABLE(name, start); -#define __EXC_REAL_OOL_HV_DIRECT(name, start, end, handler) \ - EXC_REAL_BEGIN(name, start, end); \ +#define __EXC_REAL_OOL_HV_DIRECT(name, start, size, handler) \ + EXC_REAL_BEGIN(name, start, size); \ __OOL_EXCEPTION(start, label, handler); \ - EXC_REAL_END(name, start, end); + EXC_REAL_END(name, start, size); -#define __EXC_REAL_OOL_HV(name, start, end) \ - __EXC_REAL_OOL(name, start, end); +#define __EXC_REAL_OOL_HV(name, start, size) \ + __EXC_REAL_OOL(name, start, size); -#define __TRAMP_REAL_REAL_OOL_HV(name, vec) \ +#define __TRAMP_REAL_OOL_HV(name, vec) \ TRAMP_REAL_BEGIN(tramp_real_##name); \ STD_EXCEPTION_HV_OOL(vec, name##_common); \ -#define EXC_REAL_OOL_HV(name, start, end) \ - __EXC_REAL_OOL_HV(name, start, end); \ - __TRAMP_REAL_REAL_OOL_HV(name, start); +#define EXC_REAL_OOL_HV(name, start, size) \ + __EXC_REAL_OOL_HV(name, start, size); \ + __TRAMP_REAL_OOL_HV(name, start); -#define __EXC_REAL_OOL_MASKABLE_HV(name, start, end) \ - __EXC_REAL_OOL(name, start, end); +#define __EXC_REAL_OOL_MASKABLE_HV(name, start, size) \ + __EXC_REAL_OOL(name, start, size); -#define __TRAMP_REAL_REAL_OOL_MASKABLE_HV(name, vec) \ +#define __TRAMP_REAL_OOL_MASKABLE_HV(name, vec) \ TRAMP_REAL_BEGIN(tramp_real_##name); \ MASKABLE_EXCEPTION_HV_OOL(vec, name##_common); \ -#define EXC_REAL_OOL_MASKABLE_HV(name, start, end) \ - __EXC_REAL_OOL_MASKABLE_HV(name, start, end); \ - __TRAMP_REAL_REAL_OOL_MASKABLE_HV(name, start); +#define 
EXC_REAL_OOL_MASKABLE_HV(name, start, size) \ + __EXC_REAL_OOL_MASKABLE_HV(name, start, size); \ + __TRAMP_REAL_OOL_MASKABLE_HV(name, start); -#define __EXC_VIRT_OOL(name, start, end) \ - EXC_VIRT_BEGIN(name, start, end); \ +#define __EXC_VIRT_OOL(name, start, size) \ + EXC_VIRT_BEGIN(name, start, size); \ __OOL_EXCEPTION(start, label, tramp_virt_##name); \ - EXC_VIRT_END(name, start, end); + EXC_VIRT_END(name, start, size); -#define __TRAMP_REAL_VIRT_OOL(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ +#define __TRAMP_VIRT_OOL(name, realvec) \ + TRAMP_VIRT_BEGIN(tramp_virt_##name); \ STD_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common); \ -#define EXC_VIRT_OOL(name, start, end, realvec) \ - __EXC_VIRT_OOL(name, start, end); \ - __TRAMP_REAL_VIRT_OOL(name, realvec); +#define EXC_VIRT_OOL(name, start, size, realvec) \ + __EXC_VIRT_OOL(name, start, size); \ + __TRAMP_VIRT_OOL(name, realvec); -#define __EXC_VIRT_OOL_MASKABLE(name, start, end) \ - __EXC_VIRT_OOL(name, start, end); +#define __EXC_VIRT_OOL_MASKABLE(name, start, size) \ + __EXC_VIRT_OOL(name, start, size); -#define __TRAMP_REAL_VIRT_OOL_MASKABLE(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ +#define __TRAMP_VIRT_OOL_MASKABLE(name, realvec) \ + TRAMP_VIRT_BEGIN(tramp_virt_##name); \ MASKABLE_RELON_EXCEPTION_PSERIES_OOL(realvec, name##_common); \ -#define EXC_VIRT_OOL_MASKABLE(name, start, end, realvec) \ - __EXC_VIRT_OOL_MASKABLE(name, start, end); \ - __TRAMP_REAL_VIRT_OOL_MASKABLE(name, realvec); +#define EXC_VIRT_OOL_MASKABLE(name, start, size, realvec) \ + __EXC_VIRT_OOL_MASKABLE(name, start, size); \ + __TRAMP_VIRT_OOL_MASKABLE(name, realvec); -#define __EXC_VIRT_OOL_HV(name, start, end) \ - __EXC_VIRT_OOL(name, start, end); +#define __EXC_VIRT_OOL_HV(name, start, size) \ + __EXC_VIRT_OOL(name, start, size); -#define __TRAMP_REAL_VIRT_OOL_HV(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ +#define __TRAMP_VIRT_OOL_HV(name, realvec) \ + TRAMP_VIRT_BEGIN(tramp_virt_##name); \ STD_RELON_EXCEPTION_HV_OOL(realvec, name##_common); \ -#define EXC_VIRT_OOL_HV(name, start, end, realvec) \ - __EXC_VIRT_OOL_HV(name, start, end); \ - __TRAMP_REAL_VIRT_OOL_HV(name, realvec); +#define EXC_VIRT_OOL_HV(name, start, size, realvec) \ + __EXC_VIRT_OOL_HV(name, start, size); \ + __TRAMP_VIRT_OOL_HV(name, realvec); -#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, end) \ - __EXC_VIRT_OOL(name, start, end); +#define __EXC_VIRT_OOL_MASKABLE_HV(name, start, size) \ + __EXC_VIRT_OOL(name, start, size); -#define __TRAMP_REAL_VIRT_OOL_MASKABLE_HV(name, realvec) \ - TRAMP_VIRT_BEGIN(tramp_virt_##name); \ +#define __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec) \ + TRAMP_VIRT_BEGIN(tramp_virt_##name); \ MASKABLE_RELON_EXCEPTION_HV_OOL(realvec, name##_common); \ -#define EXC_VIRT_OOL_MASKABLE_HV(name, start, end, realvec) \ - __EXC_VIRT_OOL_MASKABLE_HV(name, start, end); \ - __TRAMP_REAL_VIRT_OOL_MASKABLE_HV(name, realvec); +#define EXC_VIRT_OOL_MASKABLE_HV(name, start, size, realvec) \ + __EXC_VIRT_OOL_MASKABLE_HV(name, start, size); \ + __TRAMP_VIRT_OOL_MASKABLE_HV(name, realvec); #define TRAMP_KVM(area, n) \ TRAMP_KVM_BEGIN(do_kvm_##n); \ @@ -378,16 +384,16 @@ name: TRAMP_KVM_BEGIN(do_kvm_H##n); \ KVM_HANDLER_SKIP(area, EXC_HV, n + 0x2); \ -#define EXC_COMMON(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ +#define EXC_COMMON(name, realvec, hdlr) \ + EXC_COMMON_BEGIN(name); \ STD_EXCEPTION_COMMON(realvec, name, hdlr); \ -#define EXC_COMMON_ASYNC(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ +#define 
EXC_COMMON_ASYNC(name, realvec, hdlr) \ + EXC_COMMON_BEGIN(name); \ STD_EXCEPTION_COMMON_ASYNC(realvec, name, hdlr); \ #define EXC_COMMON_HV(name, realvec, hdlr) \ - EXC_COMMON_BEGIN(name); \ + EXC_COMMON_BEGIN(name); \ STD_EXCEPTION_COMMON(realvec + 0x2, name, hdlr); \ #endif /* _ASM_POWERPC_HEAD_64_H */ diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 77ff1ba99d1f..3cc12a86ef5d 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -276,6 +276,9 @@ #define H_GET_MPP_X 0x314 #define H_SET_MODE 0x31C #define H_CLEAR_HPT 0x358 +#define H_RESIZE_HPT_PREPARE 0x36C +#define H_RESIZE_HPT_COMMIT 0x370 +#define H_REGISTER_PROC_TBL 0x37C #define H_SIGNAL_SYS_RESET 0x380 #define MAX_HCALL_OPCODE H_SIGNAL_SYS_RESET @@ -313,6 +316,16 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* Flag values used in H_REGISTER_PROC_TBL hcall */ +#define PROC_TABLE_OP_MASK 0x18 +#define PROC_TABLE_DEREG 0x10 +#define PROC_TABLE_NEW 0x18 +#define PROC_TABLE_TYPE_MASK 0x06 +#define PROC_TABLE_HPT_SLB 0x00 +#define PROC_TABLE_HPT_PT 0x02 +#define PROC_TABLE_RADIX 0x04 +#define PROC_TABLE_GTSE 0x01 + #ifndef __ASSEMBLY__ /** diff --git a/arch/powerpc/include/asm/isa-bridge.h b/arch/powerpc/include/asm/isa-bridge.h new file mode 100644 index 000000000000..a3a7c1d63a7c --- /dev/null +++ b/arch/powerpc/include/asm/isa-bridge.h @@ -0,0 +1,29 @@ +#ifndef __ISA_BRIDGE_H +#define __ISA_BRIDGE_H + +#ifdef CONFIG_PPC64 + +extern void isa_bridge_find_early(struct pci_controller *hose); +extern void isa_bridge_init_non_pci(struct device_node *np); + +static inline int isa_vaddr_is_ioport(void __iomem *address) +{ + /* Check if address hits the reserved legacy IO range */ + unsigned long ea = (unsigned long)address; + return ea >= ISA_IO_BASE && ea < ISA_IO_END; +} + +#else + +static inline int isa_vaddr_is_ioport(void __iomem *address) +{ + /* No specific ISA handling on ppc32 at this stage, it + * all goes through PCI + */ + return 0; +} + +#endif + +#endif /* __ISA_BRIDGE_H */ + diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 97b8c1f83453..d821835ade86 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -29,6 +29,7 @@ #include <linux/types.h> #include <linux/ptrace.h> #include <linux/percpu.h> +#include <linux/module.h> #include <asm/probes.h> #include <asm/code-patching.h> @@ -39,7 +40,23 @@ struct pt_regs; struct kprobe; typedef ppc_opcode_t kprobe_opcode_t; -#define MAX_INSN_SIZE 1 + +extern kprobe_opcode_t optinsn_slot; + +/* Optinsn template address */ +extern kprobe_opcode_t optprobe_template_entry[]; +extern kprobe_opcode_t optprobe_template_op_address[]; +extern kprobe_opcode_t optprobe_template_call_handler[]; +extern kprobe_opcode_t optprobe_template_insn[]; +extern kprobe_opcode_t optprobe_template_call_emulate[]; +extern kprobe_opcode_t optprobe_template_ret[]; +extern kprobe_opcode_t optprobe_template_end[]; + +/* Fixed instruction size for powerpc */ +#define MAX_INSN_SIZE 1 +#define MAX_OPTIMIZED_LENGTH sizeof(kprobe_opcode_t) /* 4 bytes */ +#define MAX_OPTINSN_SIZE (optprobe_template_end - optprobe_template_entry) +#define RELATIVEJUMP_SIZE sizeof(kprobe_opcode_t) /* 4 bytes */ #ifdef PPC64_ELF_ABI_v2 /* PPC64 ABIv2 needs local entry point */ @@ -61,7 +78,7 @@ typedef ppc_opcode_t kprobe_opcode_t; #define kprobe_lookup_name(name, addr) \ { \ char dot_name[MODULE_NAME_LEN + 1 + KSYM_NAME_LEN]; \ - char *modsym; \ + const char 
*modsym; \ bool dot_appended = false; \ if ((modsym = strchr(name, ':')) != NULL) { \ modsym++; \ @@ -125,6 +142,12 @@ struct kprobe_ctlblk { struct prev_kprobe prev_kprobe; }; +struct arch_optimized_insn { + kprobe_opcode_t copied_insn[1]; + /* detour buffer */ + kprobe_opcode_t *insn; +}; + extern int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, void *data); extern int kprobe_fault_handler(struct pt_regs *regs, int trapnr); diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 5cf306ae0ac3..2bf35017ffc0 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -170,6 +170,8 @@ extern int kvmppc_book3s_hv_page_fault(struct kvm_run *run, unsigned long status); extern long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v, unsigned long valid); +extern int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store); extern void kvmppc_mmu_hpte_cache_map(struct kvm_vcpu *vcpu, struct hpte_cache *pte); extern struct hpte_cache *kvmppc_mmu_hpte_cache_next(struct kvm_vcpu *vcpu); @@ -182,6 +184,25 @@ extern void kvmppc_mmu_hpte_sysexit(void); extern int kvmppc_mmu_hv_init(void); extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc); +extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run, + struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr); +extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite); +extern int kvmppc_init_vm_radix(struct kvm *kvm); +extern void kvmppc_free_radix(struct kvm *kvm); +extern int kvmppc_radix_init(void); +extern void kvmppc_radix_exit(void); +extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); +extern long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map); +extern int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); + /* XXX remove this export when load_last_inst() is generic */ extern int kvmppc_ld(struct kvm_vcpu *vcpu, ulong *eaddr, int size, void *ptr, bool data); extern void kvmppc_book3s_queue_irqprio(struct kvm_vcpu *vcpu, unsigned int vec); @@ -211,8 +232,11 @@ extern long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, extern long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, unsigned long pte_index, unsigned long avpn, unsigned long *hpret); -extern long kvmppc_hv_get_dirty_log(struct kvm *kvm, +extern long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long *map); +extern void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, + struct kvm_memory_slot *memslot, + unsigned long *map); extern void kvmppc_update_lpcr(struct kvm *kvm, unsigned long lpcr, unsigned long mask); extern void kvmppc_set_fscr(struct kvm_vcpu *vcpu, u64 fscr); diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 848292176908..0db010cc4e65 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -36,6 +36,12 @@ static inline void svcpu_put(struct kvmppc_book3s_shadow_vcpu *svcpu) #endif #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + 
+static inline bool kvm_is_radix(struct kvm *kvm) +{ + return kvm->arch.radix; +} + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index e59b172666cd..b2dbeac3f450 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -263,7 +263,11 @@ struct kvm_arch { unsigned long hpt_mask; atomic_t hpte_mod_interest; cpumask_t need_tlb_flush; + cpumask_t cpu_in_guest; int hpt_cma_alloc; + u8 radix; + pgd_t *pgtable; + u64 process_table; struct dentry *debugfs_dir; struct dentry *htab_dentry; #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */ @@ -603,6 +607,7 @@ struct kvm_vcpu_arch { ulong fault_dar; u32 fault_dsisr; unsigned long intr_msr; + ulong fault_gpa; /* guest real address of page fault (POWER9) */ #endif #ifdef CONFIG_BOOKE @@ -657,6 +662,7 @@ struct kvm_vcpu_arch { int state; int ptid; int thread_cpu; + int prev_cpu; bool timer_running; wait_queue_head_t cpu_run; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2da67bf1f2ec..48c760f89590 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -291,6 +291,8 @@ struct kvmppc_ops { struct irq_bypass_producer *); void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, struct irq_bypass_producer *); + int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg); + int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h index 233a7e8cc8e3..065e762fae85 100644 --- a/arch/powerpc/include/asm/mmu.h +++ b/arch/powerpc/include/asm/mmu.h @@ -136,6 +136,7 @@ enum { MMU_FTR_NO_SLBIE_B | MMU_FTR_16M_PAGE | MMU_FTR_TLBIEL | MMU_FTR_LOCKLESS_TLBIE | MMU_FTR_CI_LARGE_PAGE | MMU_FTR_1T_SEGMENT | MMU_FTR_TLBIE_CROP_VA | + MMU_FTR_KERNEL_RO | #ifdef CONFIG_PPC_RADIX_MMU MMU_FTR_TYPE_RADIX | #endif diff --git a/arch/powerpc/include/asm/opal-api.h b/arch/powerpc/include/asm/opal-api.h index 0e2e57bcab50..a0aa285869b5 100644 --- a/arch/powerpc/include/asm/opal-api.h +++ b/arch/powerpc/include/asm/opal-api.h @@ -167,7 +167,8 @@ #define OPAL_INT_EOI 124 #define OPAL_INT_SET_MFRR 125 #define OPAL_PCI_TCE_KILL 126 -#define OPAL_LAST 126 +#define OPAL_NMMU_SET_PTCR 127 +#define OPAL_LAST 127 /* Device tree flags */ diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index 5c7db0f1a708..1ff03a6da76e 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -67,7 +67,6 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func, int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func, uint64_t offset, uint32_t data); int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority); -int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority); int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority); int64_t opal_register_exception_handler(uint64_t opal_exception, uint64_t handler_address, @@ -220,18 +219,13 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id, int64_t opal_pci_poll2(uint64_t id, uint64_t data); int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll); -int64_t opal_rm_int_get_xirr(__be32 *out_xirr, bool just_poll); int64_t opal_int_set_cppr(uint8_t cppr); int64_t opal_int_eoi(uint32_t xirr); -int64_t opal_rm_int_eoi(uint32_t xirr); int64_t 
opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr); -int64_t opal_rm_int_set_mfrr(uint32_t cpu, uint8_t mfrr); int64_t opal_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, uint32_t pe_num, uint32_t tce_size, uint64_t dma_addr, uint32_t npages); -int64_t opal_rm_pci_tce_kill(uint64_t phb_id, uint32_t kill_type, - uint32_t pe_num, uint32_t tce_size, - uint64_t dma_addr, uint32_t npages); +int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr); /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, diff --git a/arch/powerpc/include/asm/page_64.h b/arch/powerpc/include/asm/page_64.h index dd5f0712afa2..3e83d2a20b6f 100644 --- a/arch/powerpc/include/asm/page_64.h +++ b/arch/powerpc/include/asm/page_64.h @@ -47,14 +47,14 @@ static inline void clear_page(void *addr) unsigned long iterations; unsigned long onex, twox, fourx, eightx; - iterations = ppc64_caches.dlines_per_page / 8; + iterations = ppc64_caches.l1d.blocks_per_page / 8; /* * Some verisions of gcc use multiply instructions to * calculate the offsets so lets give it a hand to * do better. */ - onex = ppc64_caches.dline_size; + onex = ppc64_caches.l1d.block_size; twox = onex << 1; fourx = onex << 2; eightx = onex << 3; diff --git a/arch/powerpc/include/asm/pci-bridge.h b/arch/powerpc/include/asm/pci-bridge.h index c0309c59bed8..56c67d3f0108 100644 --- a/arch/powerpc/include/asm/pci-bridge.h +++ b/arch/powerpc/include/asm/pci-bridge.h @@ -174,14 +174,6 @@ extern int pci_device_from_OF_node(struct device_node *node, u8 *bus, u8 *devfn); extern void pci_create_OF_bus_map(void); -static inline int isa_vaddr_is_ioport(void __iomem *address) -{ - /* No specific ISA handling on ppc32 at this stage, it - * all goes through PCI - */ - return 0; -} - #else /* CONFIG_PPC64 */ /* @@ -269,16 +261,6 @@ extern void pci_hp_remove_devices(struct pci_bus *bus); /** Discover new pci devices under this bus, and add them */ extern void pci_hp_add_devices(struct pci_bus *bus); - -extern void isa_bridge_find_early(struct pci_controller *hose); - -static inline int isa_vaddr_is_ioport(void __iomem *address) -{ - /* Check if address hits the reserved legacy IO range */ - unsigned long ea = (unsigned long)address; - return ea >= ISA_IO_BASE && ea < ISA_IO_END; -} - extern int pcibios_unmap_io_space(struct pci_bus *bus); extern int pcibios_map_io_space(struct pci_bus *bus); diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 0bcc75e295e3..c7b164836bc3 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -210,6 +210,18 @@ static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex, return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn); } +static inline long plpar_resize_hpt_prepare(unsigned long flags, + unsigned long shift) +{ + return plpar_hcall_norets(H_RESIZE_HPT_PREPARE, flags, shift); +} + +static inline long plpar_resize_hpt_commit(unsigned long flags, + unsigned long shift) +{ + return plpar_hcall_norets(H_RESIZE_HPT_COMMIT, flags, shift); +} + static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba, unsigned long *tce_ret) { diff --git a/arch/powerpc/include/asm/powernv.h b/arch/powerpc/include/asm/powernv.h new file mode 100644 index 000000000000..0e9c2402dd20 --- /dev/null +++ b/arch/powerpc/include/asm/powernv.h @@ -0,0 +1,19 @@ +/* + * Copyright 2017 IBM Corp. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _ASM_POWERNV_H +#define _ASM_POWERNV_H + +#ifdef CONFIG_PPC_POWERNV +extern void powernv_set_nmmu_ptcr(unsigned long ptcr); +#else +static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } +#endif + +#endif /* _ASM_POWERNV_H */ diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index c4ced1d01d57..d99bd442aacb 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -306,6 +306,7 @@ #define __PPC_WC(w) (((w) & 0x3) << 21) #define __PPC_WS(w) (((w) & 0x1f) << 11) #define __PPC_SH(s) __PPC_WS(s) +#define __PPC_SH64(s) (__PPC_SH(s) | (((s) & 0x20) >> 4)) #define __PPC_MB(s) (((s) & 0x1f) << 6) #define __PPC_ME(s) (((s) & 0x1f) << 1) #define __PPC_MB64(s) (__PPC_MB(s) | ((s) & 0x20)) diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index 1ba814436c73..21e0b52685b5 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -454,7 +454,8 @@ extern int powersave_nap; /* set if nap mode can be used in idle loop */ extern unsigned long power7_nap(int check_irq); extern unsigned long power7_sleep(void); extern unsigned long power7_winkle(void); -extern unsigned long power9_idle_stop(unsigned long stop_level); +extern unsigned long power9_idle_stop(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask); extern void flush_instruction_cache(void); extern void hard_reset_now(void); diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 5e57705b4759..2c8001cc93b6 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -121,6 +121,8 @@ struct of_drconf_cell { #define OV1_PPC_2_06 0x02 /* set if we support PowerPC 2.06 */ #define OV1_PPC_2_07 0x01 /* set if we support PowerPC 2.07 */ +#define OV1_PPC_3_00 0x80 /* set if we support PowerPC 3.00 */ + /* Option vector 2: Open Firmware options supported */ #define OV2_REAL_MODE 0x20 /* set if we want OF in real mode */ @@ -151,10 +153,18 @@ struct of_drconf_cell { #define OV5_XCMO 0x0440 /* Page Coalescing */ #define OV5_TYPE1_AFFINITY 0x0580 /* Type 1 NUMA affinity */ #define OV5_PRRN 0x0540 /* Platform Resource Reassignment */ -#define OV5_PFO_HW_RNG 0x0E80 /* PFO Random Number Generator */ -#define OV5_PFO_HW_842 0x0E40 /* PFO Compression Accelerator */ -#define OV5_PFO_HW_ENCR 0x0E20 /* PFO Encryption Accelerator */ -#define OV5_SUB_PROCESSORS 0x0F01 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_RESIZE_HPT 0x0601 /* Hash Page Table resizing */ +#define OV5_PFO_HW_RNG 0x1180 /* PFO Random Number Generator */ +#define OV5_PFO_HW_842 0x1140 /* PFO Compression Accelerator */ +#define OV5_PFO_HW_ENCR 0x1120 /* PFO Encryption Accelerator */ +#define OV5_SUB_PROCESSORS 0x1501 /* 1,2,or 4 Sub-Processors supported */ +#define OV5_XIVE_EXPLOIT 0x1701 /* XIVE exploitation supported */ +#define OV5_MMU_RADIX_300 0x1880 /* ISA v3.00 radix MMU supported */ +#define OV5_MMU_HASH_300 0x1840 /* ISA v3.00 hash MMU supported */ +#define OV5_MMU_SEGM_RADIX 0x1820 /* radix mode (no segmentation) */ +#define OV5_MMU_PROC_TBL 0x1810 /* hcall selects SLB or proc table */ +#define OV5_MMU_SLB 0x1800 /* always use SLB */ +#define OV5_MMU_GTSE 0x1808 /* Guest translation 
shootdown */ /* Option Vector 6: IBM PAPR hints */ #define OV6_LINUX 0x02 /* Linux is our OS */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index dff79798903d..cb02d32db147 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -274,10 +274,14 @@ #define SPRN_DSISR 0x012 /* Data Storage Interrupt Status Register */ #define DSISR_NOHPTE 0x40000000 /* no translation found */ #define DSISR_PROTFAULT 0x08000000 /* protection fault */ +#define DSISR_BADACCESS 0x04000000 /* bad access to CI or G */ #define DSISR_ISSTORE 0x02000000 /* access was a store */ #define DSISR_DABRMATCH 0x00400000 /* hit data breakpoint */ #define DSISR_NOSEGMENT 0x00200000 /* SLB miss */ #define DSISR_KEYFAULT 0x00200000 /* Key fault */ +#define DSISR_UNSUPP_MMU 0x00080000 /* Unsupported MMU config */ +#define DSISR_SET_RC 0x00040000 /* Failed setting of R/C bits */ +#define DSISR_PGDIRFAULT 0x00020000 /* Fault on page directory */ #define SPRN_TBRL 0x10C /* Time Base Read Lower Register (user, R/O) */ #define SPRN_TBRU 0x10D /* Time Base Read Upper Register (user, R/O) */ #define SPRN_CIR 0x11B /* Chip Information Register (hyper, R/0) */ @@ -338,7 +342,7 @@ #define LPCR_DPFD_SH 52 #define LPCR_DPFD (ASM_CONST(7) << LPCR_DPFD_SH) #define LPCR_VRMASD_SH 47 -#define LPCR_VRMASD (ASM_CONST(1) << LPCR_VRMASD_SH) +#define LPCR_VRMASD (ASM_CONST(0x1f) << LPCR_VRMASD_SH) #define LPCR_VRMA_L ASM_CONST(0x0008000000000000) #define LPCR_VRMA_LP0 ASM_CONST(0x0001000000000000) #define LPCR_VRMA_LP1 ASM_CONST(0x0000800000000000) diff --git a/arch/powerpc/include/asm/rtas.h b/arch/powerpc/include/asm/rtas.h index 9c23baa10b81..076b89247ab5 100644 --- a/arch/powerpc/include/asm/rtas.h +++ b/arch/powerpc/include/asm/rtas.h @@ -318,6 +318,7 @@ struct pseries_hp_errorlog { #define PSERIES_HP_ELOG_ACTION_ADD 1 #define PSERIES_HP_ELOG_ACTION_REMOVE 2 +#define PSERIES_HP_ELOG_ACTION_READD 3 #define PSERIES_HP_ELOG_ID_DRC_NAME 1 #define PSERIES_HP_ELOG_ID_DRC_INDEX 2 diff --git a/arch/powerpc/include/asm/sparsemem.h b/arch/powerpc/include/asm/sparsemem.h index f6fc0ee813d7..c88930c9db7f 100644 --- a/arch/powerpc/include/asm/sparsemem.h +++ b/arch/powerpc/include/asm/sparsemem.h @@ -18,6 +18,13 @@ #ifdef CONFIG_MEMORY_HOTPLUG extern int create_section_mapping(unsigned long start, unsigned long end); extern int remove_section_mapping(unsigned long start, unsigned long end); + +#ifdef CONFIG_PPC_BOOK3S_64 +extern void resize_hpt_for_hotplug(unsigned long new_mem_size); +#else +static inline void resize_hpt_for_hotplug(unsigned long new_mem_size) { } +#endif + #ifdef CONFIG_NUMA extern int hot_add_scn_to_nid(unsigned long scn_addr); #else diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a15d84d59356..0e6add3187bc 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -261,7 +261,7 @@ do { \ ({ \ long __gu_err; \ unsigned long __gu_val; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ if (!is_kernel_addr((unsigned long)__gu_addr)) \ might_fault(); \ @@ -274,7 +274,7 @@ do { \ ({ \ long __gu_err = -EFAULT; \ unsigned long __gu_val = 0; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ might_fault(); \ if (access_ok(VERIFY_READ, __gu_addr, (size))) \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ @@ -286,7 +286,7 @@ do { \ ({ \ long __gu_err; \ unsigned long 
__gu_val; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ + const __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __chk_user_ptr(ptr); \ __get_user_size(__gu_val, __gu_addr, (size), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ diff --git a/arch/powerpc/include/uapi/asm/auxvec.h b/arch/powerpc/include/uapi/asm/auxvec.h index ce17d2c9eb4e..be6e94ecec42 100644 --- a/arch/powerpc/include/uapi/asm/auxvec.h +++ b/arch/powerpc/include/uapi/asm/auxvec.h @@ -16,6 +16,37 @@ */ #define AT_SYSINFO_EHDR 33 -#define AT_VECTOR_SIZE_ARCH 6 /* entries in ARCH_DLINFO */ +/* + * AT_*CACHEBSIZE above represent the cache *block* size which is + * the size that is affected by the cache management instructions. + * + * It doesn't necessarily match the cache *line* size which is + * more of a performance tuning hint. Additionally the latter can + * be different for the different cache levels. + * + * The set of entries below represents more extensive information + * about the caches, in the form of two entries per cache type, + * one entry containing the cache size in bytes, and the other + * containing the cache line size in bytes in the bottom 16 bits + * and the cache associativity in the next 16 bits. + * + * The associativity is such that if N is the 16-bit value, the + * cache is N way set associative. A value of 0xffff means fully + * associative, a value of 1 means directly mapped. + * + * For all these fields, a value of 0 means that the information + * is not known. + */ + +#define AT_L1I_CACHESIZE 40 +#define AT_L1I_CACHEGEOMETRY 41 +#define AT_L1D_CACHESIZE 42 +#define AT_L1D_CACHEGEOMETRY 43 +#define AT_L2_CACHESIZE 44 +#define AT_L2_CACHEGEOMETRY 45 +#define AT_L3_CACHESIZE 46 +#define AT_L3_CACHEGEOMETRY 47 + +#define AT_VECTOR_SIZE_ARCH 14 /* entries in ARCH_DLINFO */ #endif diff --git a/arch/powerpc/include/uapi/asm/elf.h b/arch/powerpc/include/uapi/asm/elf.h index 3a9e44c45c78..b2c6fdd5ac30 100644 --- a/arch/powerpc/include/uapi/asm/elf.h +++ b/arch/powerpc/include/uapi/asm/elf.h @@ -162,29 +162,6 @@ typedef elf_vrreg_t elf_vrregset_t32[ELF_NVRREG32]; typedef elf_fpreg_t elf_vsrreghalf_t32[ELF_NVSRHALFREG]; #endif - -/* - * The requirements here are: - * - keep the final alignment of sp (sp & 0xf) - * - make sure the 32-bit value at the first 16 byte aligned position of - * AUXV is greater than 16 for glibc compatibility. - * AT_IGNOREPPC is used for that. - * - for compatibility with glibc ARCH_DLINFO must always be defined on PPC, - * even if DLINFO_ARCH_ITEMS goes to zero or is undefined. - * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes - */ -#define ARCH_DLINFO \ -do { \ - /* Handle glibc compatibility. */ \ - NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC); \ - NEW_AUX_ENT(AT_IGNOREPPC, AT_IGNOREPPC); \ - /* Cache size items */ \ - NEW_AUX_ENT(AT_DCACHEBSIZE, dcache_bsize); \ - NEW_AUX_ENT(AT_ICACHEBSIZE, icache_bsize); \ - NEW_AUX_ENT(AT_UCACHEBSIZE, ucache_bsize); \ - VDSO_AUX_ENT(AT_SYSINFO_EHDR, current->mm->context.vdso_base); \ -} while (0) - /* PowerPC64 relocations defined by the ABIs */ #define R_PPC64_NONE R_PPC_NONE #define R_PPC64_ADDR32 R_PPC_ADDR32 /* 32bit absolute address. 
*/ diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index 3603b6f51b11..cc0908b6c2a0 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -413,6 +413,26 @@ struct kvm_get_htab_header { __u16 n_invalid; }; +/* For KVM_PPC_CONFIGURE_V3_MMU */ +struct kvm_ppc_mmuv3_cfg { + __u64 flags; + __u64 process_table; /* second doubleword of partition table entry */ +}; + +/* Flag values for KVM_PPC_CONFIGURE_V3_MMU */ +#define KVM_PPC_MMUV3_RADIX 1 /* 1 = radix mode, 0 = HPT */ +#define KVM_PPC_MMUV3_GTSE 2 /* global translation shootdown enb. */ + +/* For KVM_PPC_GET_RMMU_INFO */ +struct kvm_ppc_rmmu_info { + struct kvm_ppc_radix_geom { + __u8 page_shift; + __u8 level_bits[4]; + __u8 pad[3]; + } geometries[8]; + __u32 ap_encodings[8]; +}; + /* Per-vcpu XICS interrupt controller state */ #define KVM_REG_PPC_ICP_STATE (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0x8c) diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index f4c2b52e58b3..811f441a125f 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -15,7 +15,7 @@ CFLAGS_btext.o += -fPIC endif CFLAGS_cputable.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) -CFLAGS_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) +CFLAGS_prom_init.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_btext.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) CFLAGS_prom.o += $(DISABLE_LATENT_ENTROPY_PLUGIN) @@ -96,6 +96,7 @@ obj-$(CONFIG_KGDB) += kgdb.o obj-$(CONFIG_BOOTX_TEXT) += btext.o obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_KPROBES) += kprobes.o +obj-$(CONFIG_OPTPROBES) += optprobes.o optprobes_head.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 8d58c61908f7..cbc7c42cdb74 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -204,7 +204,7 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) int i, size; #ifdef __powerpc64__ - size = ppc64_caches.dline_size; + size = ppc64_caches.l1d.block_size; #else size = L1_CACHE_BYTES; #endif diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9e8e771f8acb..f25239b3a06f 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -160,12 +160,12 @@ int main(void) DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); #ifdef CONFIG_PPC64 - DEFINE(DCACHEL1LINESIZE, offsetof(struct ppc64_caches, dline_size)); - DEFINE(DCACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_dline_size)); - DEFINE(DCACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, dlines_per_page)); - DEFINE(ICACHEL1LINESIZE, offsetof(struct ppc64_caches, iline_size)); - DEFINE(ICACHEL1LOGLINESIZE, offsetof(struct ppc64_caches, log_iline_size)); - DEFINE(ICACHEL1LINESPERPAGE, offsetof(struct ppc64_caches, ilines_per_page)); + DEFINE(DCACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, l1d.block_size)); + DEFINE(DCACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, l1d.log_block_size)); + DEFINE(DCACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, l1d.blocks_per_page)); + DEFINE(ICACHEL1BLOCKSIZE, offsetof(struct ppc64_caches, l1i.block_size)); + DEFINE(ICACHEL1LOGBLOCKSIZE, offsetof(struct ppc64_caches, l1i.log_block_size)); + DEFINE(ICACHEL1BLOCKSPERPAGE, offsetof(struct ppc64_caches, l1i.blocks_per_page)); /* paca */ DEFINE(PACA_SIZE, sizeof(struct paca_struct)); DEFINE(PACAPACAINDEX, offsetof(struct paca_struct, 
paca_index)); @@ -495,6 +495,7 @@ int main(void) DEFINE(KVM_NEED_FLUSH, offsetof(struct kvm, arch.need_tlb_flush.bits)); DEFINE(KVM_ENABLED_HCALLS, offsetof(struct kvm, arch.enabled_hcalls)); DEFINE(KVM_VRMA_SLB_V, offsetof(struct kvm, arch.vrma_slb_v)); + DEFINE(KVM_RADIX, offsetof(struct kvm, arch.radix)); DEFINE(VCPU_DSISR, offsetof(struct kvm_vcpu, arch.shregs.dsisr)); DEFINE(VCPU_DAR, offsetof(struct kvm_vcpu, arch.shregs.dar)); DEFINE(VCPU_VPA, offsetof(struct kvm_vcpu, arch.vpa.pinned_addr)); @@ -534,6 +535,7 @@ int main(void) DEFINE(VCPU_SLB_NR, offsetof(struct kvm_vcpu, arch.slb_nr)); DEFINE(VCPU_FAULT_DSISR, offsetof(struct kvm_vcpu, arch.fault_dsisr)); DEFINE(VCPU_FAULT_DAR, offsetof(struct kvm_vcpu, arch.fault_dar)); + DEFINE(VCPU_FAULT_GPA, offsetof(struct kvm_vcpu, arch.fault_gpa)); DEFINE(VCPU_INTR_MSR, offsetof(struct kvm_vcpu, arch.intr_msr)); DEFINE(VCPU_LAST_INST, offsetof(struct kvm_vcpu, arch.last_inst)); DEFINE(VCPU_TRAP, offsetof(struct kvm_vcpu, arch.trap)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index d39d6118c6e9..857bf7c5b946 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -93,7 +93,7 @@ USE_FIXED_SECTION(real_vectors) __start_interrupts: /* No virt vectors corresponding with 0x0..0x100 */ -EXC_VIRT_NONE(0x4000, 0x4100) +EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP @@ -114,15 +114,15 @@ EXC_VIRT_NONE(0x4000, 0x4100) #define IDLETEST NOTEST #endif -EXC_REAL_BEGIN(system_reset, 0x100, 0x200) +EXC_REAL_BEGIN(system_reset, 0x100, 0x100) SET_SCRATCH0(r13) GET_PACA(r13) clrrdi r13,r13,1 /* Last bit of HSPRG0 is set if waking from winkle */ EXCEPTION_PROLOG_PSERIES_PACA(PACA_EXGEN, system_reset_common, EXC_STD, IDLETEST, 0x100) -EXC_REAL_END(system_reset, 0x100, 0x200) -EXC_VIRT_NONE(0x4100, 0x4200) +EXC_REAL_END(system_reset, 0x100, 0x100) +EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) @@ -142,7 +142,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) lbz r0,HSTATE_HWTHREAD_REQ(r13) cmpwi r0,0 beq 1f - b kvm_start_guest + BRANCH_TO_KVM(r10, kvm_start_guest) 1: #endif @@ -166,7 +166,7 @@ TRAMP_REAL_BEGIN(system_reset_fwnmi) #endif /* CONFIG_PPC_PSERIES */ -EXC_REAL_BEGIN(machine_check, 0x200, 0x300) +EXC_REAL_BEGIN(machine_check, 0x200, 0x100) /* This is moved out of line as it can be patched by FW, but * some code path might still want to branch into the original * vector @@ -186,8 +186,8 @@ BEGIN_FTR_SECTION FTR_SECTION_ELSE b machine_check_pSeries_0 ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -EXC_REAL_END(machine_check, 0x200, 0x300) -EXC_VIRT_NONE(0x4200, 0x4300) +EXC_REAL_END(machine_check, 0x200, 0x100) +EXC_VIRT_NONE(0x4200, 0x100) TRAMP_REAL_BEGIN(machine_check_powernv_early) BEGIN_FTR_SECTION EXCEPTION_PROLOG_1(PACA_EXMC, NOTEST, 0x200) @@ -381,12 +381,12 @@ EXC_COMMON_BEGIN(machine_check_handle_early) lbz r3,PACA_THREAD_IDLE_STATE(r13) cmpwi r3,PNV_THREAD_NAP bgt 10f - IDLE_STATE_ENTER_SEQ(PPC_NAP) + IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) /* No return */ 10: cmpwi r3,PNV_THREAD_SLEEP bgt 2f - IDLE_STATE_ENTER_SEQ(PPC_SLEEP) + IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) /* No return */ 2: @@ -400,7 +400,7 @@ EXC_COMMON_BEGIN(machine_check_handle_early) */ ori r13,r13,1 SET_PACA(r13) - IDLE_STATE_ENTER_SEQ(PPC_WINKLE) + IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) /* No return */ 4: #endif @@ -483,8 +483,8 @@ EXC_COMMON_BEGIN(unrecover_mce) b 1b -EXC_REAL(data_access, 0x300, 0x380) -EXC_VIRT(data_access, 0x4300, 0x4380, 0x300) 
+EXC_REAL(data_access, 0x300, 0x80) +EXC_VIRT(data_access, 0x4300, 0x80, 0x300) TRAMP_KVM_SKIP(PACA_EXGEN, 0x300) EXC_COMMON_BEGIN(data_access_common) @@ -512,7 +512,7 @@ MMU_FTR_SECTION_ELSE ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) -EXC_REAL_BEGIN(data_access_slb, 0x380, 0x400) +EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) @@ -533,9 +533,9 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x400) mtctr r10 bctr #endif -EXC_REAL_END(data_access_slb, 0x380, 0x400) +EXC_REAL_END(data_access_slb, 0x380, 0x80) -EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x4400) +EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) @@ -556,12 +556,12 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x4400) mtctr r10 bctr #endif -EXC_VIRT_END(data_access_slb, 0x4380, 0x4400) +EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) -EXC_REAL(instruction_access, 0x400, 0x480) -EXC_VIRT(instruction_access, 0x4400, 0x4480, 0x400) +EXC_REAL(instruction_access, 0x400, 0x80) +EXC_VIRT(instruction_access, 0x4400, 0x80, 0x400) TRAMP_KVM(PACA_EXGEN, 0x400) EXC_COMMON_BEGIN(instruction_access_common) @@ -580,7 +580,7 @@ MMU_FTR_SECTION_ELSE ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX) -EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x500) +EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) @@ -596,9 +596,9 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x500) mtctr r10 bctr #endif -EXC_REAL_END(instruction_access_slb, 0x480, 0x500) +EXC_REAL_END(instruction_access_slb, 0x480, 0x80) -EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x4500) +EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) @@ -614,7 +614,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x4500) mtctr r10 bctr #endif -EXC_VIRT_END(instruction_access_slb, 0x4480, 0x4500) +EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) @@ -711,23 +711,19 @@ EXC_COMMON_BEGIN(bad_addr_slb) bl slb_miss_bad_addr b ret_from_except -EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x600) +EXC_REAL_BEGIN(hardware_interrupt, 0x500, 0x100) .globl hardware_interrupt_hv; hardware_interrupt_hv: BEGIN_FTR_SECTION _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_HV, SOFTEN_TEST_HV) -do_kvm_H0x500: - KVM_HANDLER(PACA_EXGEN, EXC_HV, 0x502) FTR_SECTION_ELSE _MASKABLE_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR) -do_kvm_0x500: - KVM_HANDLER(PACA_EXGEN, EXC_STD, 0x500) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) -EXC_REAL_END(hardware_interrupt, 0x500, 0x600) +EXC_REAL_END(hardware_interrupt, 0x500, 0x100) -EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x4600) +EXC_VIRT_BEGIN(hardware_interrupt, 0x4500, 0x100) .globl hardware_interrupt_relon_hv; hardware_interrupt_relon_hv: BEGIN_FTR_SECTION @@ -735,13 +731,15 @@ hardware_interrupt_relon_hv: FTR_SECTION_ELSE _MASKABLE_RELON_EXCEPTION_PSERIES(0x500, hardware_interrupt_common, EXC_STD, SOFTEN_TEST_PR) ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -EXC_VIRT_END(hardware_interrupt, 0x4500, 0x4600) +EXC_VIRT_END(hardware_interrupt, 0x4500, 0x100) +TRAMP_KVM(PACA_EXGEN, 0x500) +TRAMP_KVM_HV(PACA_EXGEN, 0x500) EXC_COMMON_ASYNC(hardware_interrupt_common, 0x500, do_IRQ) 
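The hunks above and below all make the same mechanical change: every exception-vector macro now takes the vector's size in bytes instead of its end address, so each converted call satisfies end == start + size. A quick way to sanity-check a conversion, as an illustrative sketch (the VEC_END helper and the asserts are not part of the patch):

    /* Illustrative only: relate the old (start, end) arguments to the
     * new (start, size) arguments used throughout exceptions-64s.S.
     */
    #define VEC_END(start, size)    ((start) + (size))

    /* data_access: EXC_REAL(..., 0x300, 0x380) became EXC_REAL(..., 0x300, 0x80) */
    _Static_assert(VEC_END(0x300, 0x80) == 0x380, "real-mode data_access vector");
    /* virtual-mode vectors keep their 0x4000 offset: 0x4300 + 0x80 == 0x4380 */
    _Static_assert(VEC_END(0x4300, 0x80) == 0x4380, "virt-mode data_access vector");
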
-EXC_REAL(alignment, 0x600, 0x700) -EXC_VIRT(alignment, 0x4600, 0x4700, 0x600) +EXC_REAL(alignment, 0x600, 0x100) +EXC_VIRT(alignment, 0x4600, 0x100, 0x600) TRAMP_KVM(PACA_EXGEN, 0x600) EXC_COMMON_BEGIN(alignment_common) mfspr r10,SPRN_DAR @@ -760,8 +758,8 @@ EXC_COMMON_BEGIN(alignment_common) b ret_from_except -EXC_REAL(program_check, 0x700, 0x800) -EXC_VIRT(program_check, 0x4700, 0x4800, 0x700) +EXC_REAL(program_check, 0x700, 0x100) +EXC_VIRT(program_check, 0x4700, 0x100, 0x700) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) @@ -772,8 +770,8 @@ EXC_COMMON_BEGIN(program_check_common) b ret_from_except -EXC_REAL(fp_unavailable, 0x800, 0x900) -EXC_VIRT(fp_unavailable, 0x4800, 0x4900, 0x800) +EXC_REAL(fp_unavailable, 0x800, 0x100) +EXC_VIRT(fp_unavailable, 0x4800, 0x100, 0x800) TRAMP_KVM(PACA_EXGEN, 0x800) EXC_COMMON_BEGIN(fp_unavailable_common) EXCEPTION_PROLOG_COMMON(0x800, PACA_EXGEN) @@ -805,20 +803,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif -EXC_REAL_MASKABLE(decrementer, 0x900, 0x980) -EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x4980, 0x900) +EXC_REAL_MASKABLE(decrementer, 0x900, 0x80) +EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900) TRAMP_KVM(PACA_EXGEN, 0x900) EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) -EXC_REAL_HV(hdecrementer, 0x980, 0xa00) -EXC_VIRT_HV(hdecrementer, 0x4980, 0x4a00, 0x980) +EXC_REAL_HV(hdecrementer, 0x980, 0x80) +EXC_VIRT_HV(hdecrementer, 0x4980, 0x80, 0x980) TRAMP_KVM_HV(PACA_EXGEN, 0x980) EXC_COMMON(hdecrementer_common, 0x980, hdec_interrupt) -EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0xb00) -EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x4b00, 0xa00) +EXC_REAL_MASKABLE(doorbell_super, 0xa00, 0x100) +EXC_VIRT_MASKABLE(doorbell_super, 0x4a00, 0x100, 0xa00) TRAMP_KVM(PACA_EXGEN, 0xa00) #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, doorbell_exception) @@ -827,11 +825,36 @@ EXC_COMMON_ASYNC(doorbell_super_common, 0xa00, unknown_exception) #endif -EXC_REAL(trap_0b, 0xb00, 0xc00) -EXC_VIRT(trap_0b, 0x4b00, 0x4c00, 0xb00) +EXC_REAL(trap_0b, 0xb00, 0x100) +EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems + * that support it) before changing to HMT_MEDIUM. That allows the KVM + * code to save that value into the guest state (it is the guest's PPR + * value). Otherwise just change to HMT_MEDIUM as userspace has + * already saved the PPR. + */ +#define SYSCALL_KVMTEST \ + SET_SCRATCH0(r13); \ + GET_PACA(r13); \ + std r9,PACA_EXGEN+EX_R9(r13); \ + OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + HMT_MEDIUM; \ + std r10,PACA_EXGEN+EX_R10(r13); \ + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ + mfcr r9; \ + KVMTEST_PR(0xc00); \ + GET_SCRATCH0(r13) + +#else +#define SYSCALL_KVMTEST \ + HMT_MEDIUM +#endif + #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) @@ -884,50 +907,30 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ b system_call_common ; #endif -EXC_REAL_BEGIN(system_call, 0xc00, 0xd00) - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. 
- */ -#ifdef CONFIG_KVM_BOOK3S_64_HANDLER - SET_SCRATCH0(r13) - GET_PACA(r13) - std r9,PACA_EXGEN+EX_R9(r13) - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); - HMT_MEDIUM; - std r10,PACA_EXGEN+EX_R10(r13) - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); - mfcr r9 - KVMTEST_PR(0xc00) - GET_SCRATCH0(r13) -#else - HMT_MEDIUM; -#endif +EXC_REAL_BEGIN(system_call, 0xc00, 0x100) + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_RFID SYSCALL_PSERIES_3 -EXC_REAL_END(system_call, 0xc00, 0xd00) +EXC_REAL_END(system_call, 0xc00, 0x100) -EXC_VIRT_BEGIN(system_call, 0x4c00, 0x4d00) - HMT_MEDIUM +EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) + SYSCALL_KVMTEST SYSCALL_PSERIES_1 SYSCALL_PSERIES_2_DIRECT SYSCALL_PSERIES_3 -EXC_VIRT_END(system_call, 0x4c00, 0x4d00) +EXC_VIRT_END(system_call, 0x4c00, 0x100) TRAMP_KVM(PACA_EXGEN, 0xc00) -EXC_REAL(single_step, 0xd00, 0xe00) -EXC_VIRT(single_step, 0x4d00, 0x4e00, 0xd00) +EXC_REAL(single_step, 0xd00, 0x100) +EXC_VIRT(single_step, 0x4d00, 0x100, 0xd00) TRAMP_KVM(PACA_EXGEN, 0xd00) EXC_COMMON(single_step_common, 0xd00, single_step_exception) -EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0xe20) -EXC_VIRT_NONE(0x4e00, 0x4e20) +EXC_REAL_OOL_HV(h_data_storage, 0xe00, 0x20) +EXC_VIRT_OOL_HV(h_data_storage, 0x4e00, 0x20, 0xe00) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0xe00) EXC_COMMON_BEGIN(h_data_storage_common) mfspr r10,SPRN_HDAR @@ -942,14 +945,14 @@ EXC_COMMON_BEGIN(h_data_storage_common) b ret_from_except -EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0xe40) -EXC_VIRT_NONE(0x4e20, 0x4e40) +EXC_REAL_OOL_HV(h_instr_storage, 0xe20, 0x20) +EXC_VIRT_OOL_HV(h_instr_storage, 0x4e20, 0x20, 0xe20) TRAMP_KVM_HV(PACA_EXGEN, 0xe20) EXC_COMMON(h_instr_storage_common, 0xe20, unknown_exception) -EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0xe60) -EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x4e60, 0xe40) +EXC_REAL_OOL_HV(emulation_assist, 0xe40, 0x20) +EXC_VIRT_OOL_HV(emulation_assist, 0x4e40, 0x20, 0xe40) TRAMP_KVM_HV(PACA_EXGEN, 0xe40) EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) @@ -959,9 +962,9 @@ EXC_COMMON(emulation_assist_common, 0xe40, emulation_assist_interrupt) * first, and then eventaully from there to the trampoline to get into virtual * mode. */ -__EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0xe80, hmi_exception_early) -__TRAMP_REAL_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60) -EXC_VIRT_NONE(0x4e60, 0x4e80) +__EXC_REAL_OOL_HV_DIRECT(hmi_exception, 0xe60, 0x20, hmi_exception_early) +__TRAMP_REAL_OOL_MASKABLE_HV(hmi_exception, 0xe60) +EXC_VIRT_NONE(0x4e60, 0x20) TRAMP_KVM_HV(PACA_EXGEN, 0xe60) TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_1(PACA_EXGEN, KVMTEST_HV, 0xe60) @@ -979,7 +982,7 @@ TRAMP_REAL_BEGIN(hmi_exception_early) EXCEPTION_PROLOG_COMMON_2(PACA_EXGEN) EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD - bl hmi_exception_realmode + BRANCH_LINK_TO_FAR(r4, hmi_exception_realmode) /* Windup the stack. 
*/ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1015,8 +1018,8 @@ hmi_exception_after_realmode: EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) -EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0xea0) -EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x4ea0, 0xe80) +EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) +EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) TRAMP_KVM_HV(PACA_EXGEN, 0xe80) #ifdef CONFIG_PPC_DOORBELL EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, doorbell_exception) @@ -1025,24 +1028,26 @@ EXC_COMMON_ASYNC(h_doorbell_common, 0xe80, unknown_exception) #endif -EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0xec0) -EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x4ec0, 0xea0) +EXC_REAL_OOL_MASKABLE_HV(h_virt_irq, 0xea0, 0x20) +EXC_VIRT_OOL_MASKABLE_HV(h_virt_irq, 0x4ea0, 0x20, 0xea0) TRAMP_KVM_HV(PACA_EXGEN, 0xea0) EXC_COMMON_ASYNC(h_virt_irq_common, 0xea0, do_IRQ) -EXC_REAL_NONE(0xec0, 0xf00) -EXC_VIRT_NONE(0x4ec0, 0x4f00) +EXC_REAL_NONE(0xec0, 0x20) +EXC_VIRT_NONE(0x4ec0, 0x20) +EXC_REAL_NONE(0xee0, 0x20) +EXC_VIRT_NONE(0x4ee0, 0x20) -EXC_REAL_OOL(performance_monitor, 0xf00, 0xf20) -EXC_VIRT_OOL(performance_monitor, 0x4f00, 0x4f20, 0xf00) +EXC_REAL_OOL(performance_monitor, 0xf00, 0x20) +EXC_VIRT_OOL(performance_monitor, 0x4f00, 0x20, 0xf00) TRAMP_KVM(PACA_EXGEN, 0xf00) EXC_COMMON_ASYNC(performance_monitor_common, 0xf00, performance_monitor_exception) -EXC_REAL_OOL(altivec_unavailable, 0xf20, 0xf40) -EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x4f40, 0xf20) +EXC_REAL_OOL(altivec_unavailable, 0xf20, 0x20) +EXC_VIRT_OOL(altivec_unavailable, 0x4f20, 0x20, 0xf20) TRAMP_KVM(PACA_EXGEN, 0xf20) EXC_COMMON_BEGIN(altivec_unavailable_common) EXCEPTION_PROLOG_COMMON(0xf20, PACA_EXGEN) @@ -1078,8 +1083,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) b ret_from_except -EXC_REAL_OOL(vsx_unavailable, 0xf40, 0xf60) -EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x4f60, 0xf40) +EXC_REAL_OOL(vsx_unavailable, 0xf40, 0x20) +EXC_VIRT_OOL(vsx_unavailable, 0x4f40, 0x20, 0xf40) TRAMP_KVM(PACA_EXGEN, 0xf40) EXC_COMMON_BEGIN(vsx_unavailable_common) EXCEPTION_PROLOG_COMMON(0xf40, PACA_EXGEN) @@ -1114,41 +1119,50 @@ END_FTR_SECTION_IFSET(CPU_FTR_VSX) b ret_from_except -EXC_REAL_OOL(facility_unavailable, 0xf60, 0xf80) -EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x4f80, 0xf60) +EXC_REAL_OOL(facility_unavailable, 0xf60, 0x20) +EXC_VIRT_OOL(facility_unavailable, 0x4f60, 0x20, 0xf60) TRAMP_KVM(PACA_EXGEN, 0xf60) EXC_COMMON(facility_unavailable_common, 0xf60, facility_unavailable_exception) -EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0xfa0) -EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x4fa0, 0xf80) +EXC_REAL_OOL_HV(h_facility_unavailable, 0xf80, 0x20) +EXC_VIRT_OOL_HV(h_facility_unavailable, 0x4f80, 0x20, 0xf80) TRAMP_KVM_HV(PACA_EXGEN, 0xf80) EXC_COMMON(h_facility_unavailable_common, 0xf80, facility_unavailable_exception) -EXC_REAL_NONE(0xfa0, 0x1200) -EXC_VIRT_NONE(0x4fa0, 0x5200) +EXC_REAL_NONE(0xfa0, 0x20) +EXC_VIRT_NONE(0x4fa0, 0x20) +EXC_REAL_NONE(0xfc0, 0x20) +EXC_VIRT_NONE(0x4fc0, 0x20) +EXC_REAL_NONE(0xfe0, 0x20) +EXC_VIRT_NONE(0x4fe0, 0x20) + +EXC_REAL_NONE(0x1000, 0x100) +EXC_VIRT_NONE(0x5000, 0x100) +EXC_REAL_NONE(0x1100, 0x100) +EXC_VIRT_NONE(0x5100, 0x100) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_system_error, 0x1200, 0x1300) -EXC_VIRT_NONE(0x5200, 0x5300) +EXC_REAL_HV(cbe_system_error, 0x1200, 0x100) +EXC_VIRT_NONE(0x5200, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1200) EXC_COMMON(cbe_system_error_common, 0x1200, cbe_system_error_exception) #else /* 
CONFIG_CBE_RAS */ -EXC_REAL_NONE(0x1200, 0x1300) -EXC_VIRT_NONE(0x5200, 0x5300) +EXC_REAL_NONE(0x1200, 0x100) +EXC_VIRT_NONE(0x5200, 0x100) #endif -EXC_REAL(instruction_breakpoint, 0x1300, 0x1400) -EXC_VIRT(instruction_breakpoint, 0x5300, 0x5400, 0x1300) +EXC_REAL(instruction_breakpoint, 0x1300, 0x100) +EXC_VIRT(instruction_breakpoint, 0x5300, 0x100, 0x1300) TRAMP_KVM_SKIP(PACA_EXGEN, 0x1300) EXC_COMMON(instruction_breakpoint_common, 0x1300, instruction_breakpoint_exception) -EXC_REAL_NONE(0x1400, 0x1500) -EXC_VIRT_NONE(0x5400, 0x5500) +EXC_REAL_NONE(0x1400, 0x100) +EXC_VIRT_NONE(0x5400, 0x100) -EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x1600) +EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x100) mtspr SPRN_SPRG_HSCRATCH0,r13 EXCEPTION_PROLOG_0(PACA_EXGEN) EXCEPTION_PROLOG_1(PACA_EXGEN, NOTEST, 0x1500) @@ -1163,14 +1177,14 @@ EXC_REAL_BEGIN(denorm_exception_hv, 0x1500, 0x1600) KVMTEST_PR(0x1500) EXCEPTION_PROLOG_PSERIES_1(denorm_common, EXC_HV) -EXC_REAL_END(denorm_exception_hv, 0x1500, 0x1600) +EXC_REAL_END(denorm_exception_hv, 0x1500, 0x100) #ifdef CONFIG_PPC_DENORMALISATION -EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x5600) +EXC_VIRT_BEGIN(denorm_exception, 0x5500, 0x100) b exc_real_0x1500_denorm_exception_hv -EXC_VIRT_END(denorm_exception, 0x5500, 0x5600) +EXC_VIRT_END(denorm_exception, 0x5500, 0x100) #else -EXC_VIRT_NONE(0x5500, 0x5600) +EXC_VIRT_NONE(0x5500, 0x100) #endif TRAMP_KVM_SKIP(PACA_EXGEN, 0x1500) @@ -1243,18 +1257,18 @@ EXC_COMMON_HV(denorm_common, 0x1500, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_maintenance, 0x1600, 0x1700) -EXC_VIRT_NONE(0x5600, 0x5700) +EXC_REAL_HV(cbe_maintenance, 0x1600, 0x100) +EXC_VIRT_NONE(0x5600, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1600) EXC_COMMON(cbe_maintenance_common, 0x1600, cbe_maintenance_exception) #else /* CONFIG_CBE_RAS */ -EXC_REAL_NONE(0x1600, 0x1700) -EXC_VIRT_NONE(0x5600, 0x5700) +EXC_REAL_NONE(0x1600, 0x100) +EXC_VIRT_NONE(0x5600, 0x100) #endif -EXC_REAL(altivec_assist, 0x1700, 0x1800) -EXC_VIRT(altivec_assist, 0x5700, 0x5800, 0x1700) +EXC_REAL(altivec_assist, 0x1700, 0x100) +EXC_VIRT(altivec_assist, 0x5700, 0x100, 0x1700) TRAMP_KVM(PACA_EXGEN, 0x1700) #ifdef CONFIG_ALTIVEC EXC_COMMON(altivec_assist_common, 0x1700, altivec_assist_exception) @@ -1264,13 +1278,13 @@ EXC_COMMON(altivec_assist_common, 0x1700, unknown_exception) #ifdef CONFIG_CBE_RAS -EXC_REAL_HV(cbe_thermal, 0x1800, 0x1900) -EXC_VIRT_NONE(0x5800, 0x5900) +EXC_REAL_HV(cbe_thermal, 0x1800, 0x100) +EXC_VIRT_NONE(0x5800, 0x100) TRAMP_KVM_HV_SKIP(PACA_EXGEN, 0x1800) EXC_COMMON(cbe_thermal_common, 0x1800, cbe_thermal_exception) #else /* CONFIG_CBE_RAS */ -EXC_REAL_NONE(0x1800, 0x1900) -EXC_VIRT_NONE(0x5800, 0x5900) +EXC_REAL_NONE(0x1800, 0x100) +EXC_VIRT_NONE(0x5800, 0x100) #endif diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8f0c7c5d93f2..8ff0dd4e77a7 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -406,12 +406,35 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) void crash_fadump(struct pt_regs *regs, const char *str) { struct fadump_crash_info_header *fdh = NULL; + int old_cpu, this_cpu; if (!fw_dump.dump_registered || !fw_dump.fadumphdr_addr) return; + /* + * old_cpu == -1 means this is the first CPU which has come here, + * go ahead and trigger fadump. + * + * old_cpu != -1 means some other CPU is already on its way + * to trigger fadump, just keep looping here. 
+ */ + this_cpu = smp_processor_id(); + old_cpu = cmpxchg(&crashing_cpu, -1, this_cpu); + + if (old_cpu != -1) { + /* + * We can't loop here indefinitely. Wait as long as fadump + * is in force. If we race with fadump un-registration this + * loop will break and then we go down to normal panic path + * and reboot. If fadump is in force the first crashing + * cpu will definitely trigger fadump. + */ + while (fw_dump.dump_registered) + cpu_relax(); + return; + } + fdh = __va(fw_dump.fadumphdr_addr); - crashing_cpu = smp_processor_id(); fdh->crashing_cpu = crashing_cpu; crash_save_vmcoreinfo(); diff --git a/arch/powerpc/kernel/hw_breakpoint.c b/arch/powerpc/kernel/hw_breakpoint.c index 4d3aa05e28be..53cc9270aac8 100644 --- a/arch/powerpc/kernel/hw_breakpoint.c +++ b/arch/powerpc/kernel/hw_breakpoint.c @@ -228,8 +228,10 @@ int hw_breakpoint_handler(struct die_args *args) rcu_read_lock(); bp = __this_cpu_read(bp_per_reg); - if (!bp) + if (!bp) { + rc = NOTIFY_DONE; goto out; + } info = counter_arch_bp(bp); /* diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 72dac0b58061..5f61cc0349c0 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -40,9 +40,7 @@ #define _WORC GPR11 #define _PTCR GPR12 -#define PSSCR_HV_TEMPLATE PSSCR_ESL | PSSCR_EC | \ - PSSCR_PSLL_MASK | PSSCR_TR_MASK | \ - PSSCR_MTL_MASK +#define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 .text @@ -205,7 +203,7 @@ pnv_enter_arch207_idle_mode: stb r3,PACA_THREAD_IDLE_STATE(r13) cmpwi cr3,r3,PNV_THREAD_SLEEP bge cr3,2f - IDLE_STATE_ENTER_SEQ(PPC_NAP) + IDLE_STATE_ENTER_SEQ_NORET(PPC_NAP) /* No return */ 2: /* Sleep or winkle */ @@ -239,7 +237,7 @@ pnv_fastsleep_workaround_at_entry: common_enter: /* common code for all the threads entering sleep or winkle */ bgt cr3,enter_winkle - IDLE_STATE_ENTER_SEQ(PPC_SLEEP) + IDLE_STATE_ENTER_SEQ_NORET(PPC_SLEEP) fastsleep_workaround_at_entry: ori r15,r15,PNV_CORE_IDLE_LOCK_BIT @@ -250,7 +248,7 @@ fastsleep_workaround_at_entry: /* Fast sleep workaround */ li r3,1 li r4,1 - bl opal_rm_config_cpu_idle_state + bl opal_config_cpu_idle_state /* Clear Lock bit */ li r0,0 @@ -261,10 +259,10 @@ fastsleep_workaround_at_entry: enter_winkle: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ(PPC_WINKLE) + IDLE_STATE_ENTER_SEQ_NORET(PPC_WINKLE) /* - * r3 - requested stop state + * r3 - PSSCR value corresponding to the requested stop state. */ power_enter_stop: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -274,13 +272,22 @@ power_enter_stop: stb r4,HSTATE_HWTHREAD_STATE(r13) #endif /* + * Check if we are executing the lite variant with ESL=EC=0 + */ + andis. r4,r3,PSSCR_EC_ESL_MASK_SHIFTED + clrldi r3,r3,60 /* r3 = Bits[60:63] = Requested Level (RL) */ + bne 1f + IDLE_STATE_ENTER_SEQ(PPC_STOP) + li r3,0 /* Since we didn't lose state, return 0 */ + b pnv_wakeup_noloss +/* * Check if the requested state is a deep idle state. */ - LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) +1: LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) cmpd r3,r4 bge 2f - IDLE_STATE_ENTER_SEQ(PPC_STOP) + IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) 2: /* * Entering deep idle state. 
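The power9_idle_stop change in the next hunk replaces the fixed PSSCR_HV_TEMPLATE with a (value, mask) pair passed in r3/r4: the current PSSCR is read, the masked fields are cleared with andc, and the requested value is ORed in. In C, the three-instruction sequence is roughly the sketch below (psscr_compose is an illustrative name, not part of the patch; the value is assumed to be pre-masked by the caller):

    /* Sketch of the mfspr/andc/or sequence in power9_idle_stop: keep the
     * PSSCR fields outside the mask, overlay the requested fields.
     */
    static inline unsigned long psscr_compose(unsigned long cur,
                                              unsigned long val,
                                              unsigned long mask)
    {
            return (cur & ~mask) | val;     /* val assumed pre-masked */
    }
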
@@ -302,7 +309,7 @@ lwarx_loop_stop: bl save_sprs_to_stack - IDLE_STATE_ENTER_SEQ(PPC_STOP) + IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) _GLOBAL(power7_idle) /* Now check if user or arch enabled NAP mode */ @@ -353,16 +360,17 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ ld r3,ORIG_GPR3(r1); /* Restore original r3 */ \ 20: nop; - /* - * r3 - requested stop state + * r3 - The PSSCR value corresponding to the stop state. + * r4 - The PSSCR mask corresponding to the stop state. */ _GLOBAL(power9_idle_stop) - LOAD_REG_IMMEDIATE(r4, PSSCR_HV_TEMPLATE) - or r4,r4,r3 - mtspr SPRN_PSSCR, r4 - li r4, 1 + mfspr r5,SPRN_PSSCR + andc r5,r5,r4 + or r3,r3,r5 + mtspr SPRN_PSSCR,r3 LOAD_REG_ADDR(r5,power_enter_stop) + li r4,1 b pnv_powersave_common /* No return */ /* @@ -544,7 +552,7 @@ timebase_resync: */ ble cr3,clear_lock /* Time base re-sync */ - bl opal_rm_resync_timebase; + bl opal_resync_timebase; /* * If waking up from sleep, per core state is not lost, skip to * clear_lock. @@ -633,7 +641,7 @@ hypervisor_state_restored: fastsleep_workaround_at_exit: li r3,1 li r4,0 - bl opal_rm_config_cpu_idle_state + bl opal_config_cpu_idle_state b timebase_resync /* diff --git a/arch/powerpc/kernel/iomap.c b/arch/powerpc/kernel/iomap.c index 3963f0b68d52..a1854d1ded8b 100644 --- a/arch/powerpc/kernel/iomap.c +++ b/arch/powerpc/kernel/iomap.c @@ -8,6 +8,7 @@ #include <linux/export.h> #include <asm/io.h> #include <asm/pci-bridge.h> +#include <asm/isa-bridge.h> /* * Here comes the ppc64 implementation of the IOMAP diff --git a/arch/powerpc/kernel/isa-bridge.c b/arch/powerpc/kernel/isa-bridge.c index ae1316106e2b..bb6f8993412e 100644 --- a/arch/powerpc/kernel/isa-bridge.c +++ b/arch/powerpc/kernel/isa-bridge.c @@ -29,6 +29,7 @@ #include <asm/pci-bridge.h> #include <asm/machdep.h> #include <asm/ppc-pci.h> +#include <asm/isa-bridge.h> unsigned long isa_io_base; /* NULL if no ISA bus */ EXPORT_SYMBOL(isa_io_base); @@ -167,6 +168,97 @@ void __init isa_bridge_find_early(struct pci_controller *hose) } /** + * isa_bridge_init_non_pci - Find and map the ISA IO space for a non-PCI + * ISA bridge described by a device-tree node. This is + * called by the arch code on platforms where the + * legacy ISA IO space does not sit behind a PCI + * bridge + */ +void __init isa_bridge_init_non_pci(struct device_node *np) +{ + const __be32 *ranges, *pbasep = NULL; + int rlen, i, rs; + u32 na, ns, pna; + u64 cbase, pbase, size = 0; + + /* If we already have an ISA bridge, bail out */ + if (isa_bridge_devnode != NULL) + return; + + pna = of_n_addr_cells(np); + if (of_property_read_u32(np, "#address-cells", &na) || + of_property_read_u32(np, "#size-cells", &ns)) { + pr_warn("ISA: Non-PCI bridge %s is missing address format\n", + np->full_name); + return; + } + + /* Check it's a supported address format */ + if (na != 2 || ns != 1) { + pr_warn("ISA: Non-PCI bridge %s has unsupported address format\n", + np->full_name); + return; + } + rs = na + ns + pna; + + /* Grab the ranges property */ + ranges = of_get_property(np, "ranges", &rlen); + if (ranges == NULL || rlen < rs) { + pr_warn("ISA: Non-PCI bridge %s has absent or invalid ranges\n", + np->full_name); + return; + } + + /* Parse it. We are only looking for IO space */ + for (i = 0; (i + rs - 1) < rlen; i += rs) { + if (be32_to_cpup(ranges + i) != 1) + continue; + cbase = be32_to_cpup(ranges + i + 1); + size = of_read_number(ranges + i + na + pna, ns); + pbasep = ranges + i + na; + break; + } + + /* Got something ? 
*/ + if (!size || !pbasep) { + pr_warn("ISA: Non-PCI bridge %s has no usable IO range\n", + np->full_name); + return; + } + + /* Align size and make sure it's cropped to 64K */ + size = PAGE_ALIGN(size); + if (size > 0x10000) + size = 0x10000; + + /* Map pbase */ + pbase = of_translate_address(np, pbasep); + if (pbase == OF_BAD_ADDR) { + pr_warn("ISA: Non-PCI bridge %s failed to translate IO base\n", + np->full_name); + return; + } + + /* We need page alignment */ + if ((cbase & ~PAGE_MASK) || (pbase & ~PAGE_MASK)) { + pr_warn("ISA: Non-PCI bridge %s has non aligned IO range\n", + np->full_name); + return; + } + + /* Got it */ + isa_bridge_devnode = np; + + /* Set the global ISA io base to indicate we have an ISA bridge + * and map it + */ + isa_io_base = ISA_IO_BASE; + __ioremap_at(pbase, (void *)ISA_IO_BASE, + size, pgprot_val(pgprot_noncached(__pgprot(0)))); + + pr_debug("ISA: Non-PCI bridge is %s\n", np->full_name); +} + +/** * isa_bridge_find_late - Find and map the ISA IO space upon discovery of * a new ISA bridge */ diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 735ff3d3f77d..fce05a38851c 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -285,6 +285,7 @@ asm(".global kretprobe_trampoline\n" ".type kretprobe_trampoline, @function\n" "kretprobe_trampoline:\n" "nop\n" + "blr\n" ".size kretprobe_trampoline, .-kretprobe_trampoline\n"); /* @@ -337,6 +338,13 @@ static int __kprobes trampoline_probe_handler(struct kprobe *p, kretprobe_assert(ri, orig_ret_address, trampoline_address); regs->nip = orig_ret_address; + /* + * Make LR point to the orig_ret_address. + * When the 'nop' inside the kretprobe_trampoline + * is optimized, we can do a 'blr' after executing the + * detour buffer code. + */ + regs->link = orig_ret_address; reset_current_kprobe(); kretprobe_hash_unlock(current, &flags); @@ -467,15 +475,6 @@ int __kprobes kprobe_fault_handler(struct pt_regs *regs, int trapnr) return 0; } -/* - * Wrapper routine to for handling exceptions. - */ -int __kprobes kprobe_exceptions_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return NOTIFY_DONE; -} - unsigned long arch_deref_entry_point(void *entry) { return ppc_global_function_entry(entry); diff --git a/arch/powerpc/kernel/legacy_serial.c b/arch/powerpc/kernel/legacy_serial.c index bc525ea0dc09..0694d20f85b6 100644 --- a/arch/powerpc/kernel/legacy_serial.c +++ b/arch/powerpc/kernel/legacy_serial.c @@ -233,7 +233,8 @@ static int __init add_legacy_isa_port(struct device_node *np, * * Note: Don't even try on P8 lpc, we know it's not directly mapped */ - if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc")) { + if (!of_device_is_compatible(isa_brg, "ibm,power8-lpc") || + of_get_property(isa_brg, "ranges", NULL)) { taddr = of_translate_address(np, reg); if (taddr == OF_BAD_ADDR) taddr = 0; diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 32be2a844947..ae179cb1bb3c 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -80,12 +80,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) * each other. 
*/ ld r10,PPC64_CACHES@toc(r2) - lwz r7,DCACHEL1LINESIZE(r10)/* Get cache line size */ + lwz r7,DCACHEL1BLOCKSIZE(r10)/* Get cache block size */ addi r5,r7,-1 andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ - lwz r9,DCACHEL1LOGLINESIZE(r10) /* Get log-2 of cache line size */ + lwz r9,DCACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of cache block size */ srw. r8,r8,r9 /* compute line count */ beqlr /* nothing to do? */ mtctr r8 @@ -96,12 +96,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) /* Now invalidate the instruction cache */ - lwz r7,ICACHEL1LINESIZE(r10) /* Get Icache line size */ + lwz r7,ICACHEL1BLOCKSIZE(r10) /* Get Icache block size */ addi r5,r7,-1 andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 - lwz r9,ICACHEL1LOGLINESIZE(r10) /* Get log-2 of Icache line size */ + lwz r9,ICACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of Icache block size */ srw. r8,r8,r9 /* compute line count */ beqlr /* nothing to do? */ mtctr r8 @@ -128,12 +128,12 @@ _GLOBAL(flush_dcache_range) * Different systems have different cache line sizes */ ld r10,PPC64_CACHES@toc(r2) - lwz r7,DCACHEL1LINESIZE(r10) /* Get dcache line size */ + lwz r7,DCACHEL1BLOCKSIZE(r10) /* Get dcache block size */ addi r5,r7,-1 andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ - lwz r9,DCACHEL1LOGLINESIZE(r10) /* Get log-2 of dcache line size */ + lwz r9,DCACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of dcache block size */ srw. r8,r8,r9 /* compute line count */ beqlr /* nothing to do? */ mtctr r8 @@ -156,12 +156,12 @@ EXPORT_SYMBOL(flush_dcache_range) */ _GLOBAL(flush_dcache_phys_range) ld r10,PPC64_CACHES@toc(r2) - lwz r7,DCACHEL1LINESIZE(r10) /* Get dcache line size */ + lwz r7,DCACHEL1BLOCKSIZE(r10) /* Get dcache block size */ addi r5,r7,-1 andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ - lwz r9,DCACHEL1LOGLINESIZE(r10) /* Get log-2 of dcache line size */ + lwz r9,DCACHEL1LOGBLOCKSIZE(r10) /* Get log-2 of dcache block size */ srw. r8,r8,r9 /* compute line count */ beqlr /* nothing to do? */ mfmsr r5 /* Disable MMU Data Relocation */ @@ -184,12 +184,12 @@ _GLOBAL(flush_dcache_phys_range) _GLOBAL(flush_inval_dcache_range) ld r10,PPC64_CACHES@toc(r2) - lwz r7,DCACHEL1LINESIZE(r10) /* Get dcache line size */ + lwz r7,DCACHEL1BLOCKSIZE(r10) /* Get dcache block size */ addi r5,r7,-1 andc r6,r3,r5 /* round low to line bdy */ subf r8,r6,r4 /* compute length */ add r8,r8,r5 /* ensure we get enough */ - lwz r9,DCACHEL1LOGLINESIZE(r10)/* Get log-2 of dcache line size */ + lwz r9,DCACHEL1LOGBLOCKSIZE(r10)/* Get log-2 of dcache block size */ srw. r8,r8,r9 /* compute line count */ beqlr /* nothing to do? 
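The LINESIZE-to-BLOCKSIZE rename matters because dcbst/icbi operate on cache blocks, which the architecture distinguishes from coherence lines. A C sketch of the loop pattern the assembly above implements (illustrative only; the real code stays in assembly for performance):

static inline void flush_dcache_blocks(unsigned long start,
				       unsigned long stop,
				       unsigned long block_size)
{
	/* round down to a block boundary, then touch every block */
	unsigned long addr = start & ~(block_size - 1);

	for (; addr < stop; addr += block_size)
		asm volatile("dcbst 0,%0" : : "r" (addr) : "memory");
	asm volatile("sync" : : : "memory");
}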
*/ sync @@ -225,8 +225,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) /* Flush the dcache */ ld r7,PPC64_CACHES@toc(r2) clrrdi r3,r3,PAGE_SHIFT /* Page align */ - lwz r4,DCACHEL1LINESPERPAGE(r7) /* Get # dcache lines per page */ - lwz r5,DCACHEL1LINESIZE(r7) /* Get dcache line size */ + lwz r4,DCACHEL1BLOCKSPERPAGE(r7) /* Get # dcache blocks per page */ + lwz r5,DCACHEL1BLOCKSIZE(r7) /* Get dcache block size */ mr r6,r3 mtctr r4 0: dcbst 0,r6 @@ -236,8 +236,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) /* Now invalidate the icache */ - lwz r4,ICACHEL1LINESPERPAGE(r7) /* Get # icache lines per page */ - lwz r5,ICACHEL1LINESIZE(r7) /* Get icache line size */ + lwz r4,ICACHEL1BLOCKSPERPAGE(r7) /* Get # icache blocks per page */ + lwz r5,ICACHEL1BLOCKSIZE(r7) /* Get icache block size */ mtctr r4 1: icbi 0,r3 add r3,r3,r5 diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c new file mode 100644 index 000000000000..2282bf4e63cd --- /dev/null +++ b/arch/powerpc/kernel/optprobes.c @@ -0,0 +1,347 @@ +/* + * Code for Kernel probes Jump optimization. + * + * Copyright 2017, Anju T, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kprobes.h> +#include <linux/jump_label.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/list.h> +#include <asm/kprobes.h> +#include <asm/ptrace.h> +#include <asm/cacheflush.h> +#include <asm/code-patching.h> +#include <asm/sstep.h> +#include <asm/ppc-opcode.h> + +#define TMPL_CALL_HDLR_IDX \ + (optprobe_template_call_handler - optprobe_template_entry) +#define TMPL_EMULATE_IDX \ + (optprobe_template_call_emulate - optprobe_template_entry) +#define TMPL_RET_IDX \ + (optprobe_template_ret - optprobe_template_entry) +#define TMPL_OP_IDX \ + (optprobe_template_op_address - optprobe_template_entry) +#define TMPL_INSN_IDX \ + (optprobe_template_insn - optprobe_template_entry) +#define TMPL_END_IDX \ + (optprobe_template_end - optprobe_template_entry) + +DEFINE_INSN_CACHE_OPS(ppc_optinsn); + +static bool insn_page_in_use; + +static void *__ppc_alloc_insn_page(void) +{ + if (insn_page_in_use) + return NULL; + insn_page_in_use = true; + return &optinsn_slot; +} + +static void __ppc_free_insn_page(void *page __maybe_unused) +{ + insn_page_in_use = false; +} + +struct kprobe_insn_cache kprobe_ppc_optinsn_slots = { + .mutex = __MUTEX_INITIALIZER(kprobe_ppc_optinsn_slots.mutex), + .pages = LIST_HEAD_INIT(kprobe_ppc_optinsn_slots.pages), + /* insn_size initialized later */ + .alloc = __ppc_alloc_insn_page, + .free = __ppc_free_insn_page, + .nr_garbage = 0, +}; + +/* + * Check if we can optimize this probe. Returns NIP post-emulation if this can + * be optimized and 0 otherwise. + */ +static unsigned long can_optimize(struct kprobe *p) +{ + struct pt_regs regs; + struct instruction_op op; + unsigned long nip = 0; + + /* + * kprobe placed for kretprobe during boot time + * has a 'nop' instruction, which can be emulated. + * So further checks can be skipped. + */ + if (p->addr == (kprobe_opcode_t *)&kretprobe_trampoline) + return (unsigned long)p->addr + sizeof(kprobe_opcode_t); + + /* + * We only support optimizing kernel addresses, but not + * module addresses. + * + * FIXME: Optimize kprobes placed in module addresses. 
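The FIXME above exists because the detour slot lives in kernel .text while module text sits in a distant vmalloc region, outside the reach of a direct branch. A sketch of the reachability rule (assuming it matches the semantics of the is_offset_in_branch_range() helper used below): a powerpc 'b' instruction encodes a 26-bit signed, word-aligned displacement, i.e. +/-32MB.

/* Sketch only: reachability of a powerpc 'b' instruction. */
static bool branch_reachable(unsigned long from, unsigned long to)
{
	long offset = (long)to - (long)from;

	/* 26-bit signed immediate, low two bits must be zero */
	return offset >= -0x2000000 && offset <= 0x1fffffc &&
	       !(offset & 0x3);
}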
+ */ + if (!is_kernel_addr((unsigned long)p->addr)) + return 0; + + memset(&regs, 0, sizeof(struct pt_regs)); + regs.nip = (unsigned long)p->addr; + regs.trap = 0x0; + regs.msr = MSR_KERNEL; + + /* + * Kprobes placed on conditional branch instructions are + * not optimized, as we cannot predict the nip ahead of time + * with dummy pt_regs and so cannot ensure that the branch + * back from the detour buffer falls within branch range + * (i.e. 32MB). A branch back from the trampoline is set up in + * the detour buffer to the nip returned by analyse_instr() here. + * + * Ensure that the instruction is not a conditional branch, + * and that it can be emulated. + */ + if (!is_conditional_branch(*p->ainsn.insn) && + analyse_instr(&op, &regs, *p->ainsn.insn)) + nip = regs.nip; + + return nip; +} + +static void optimized_callback(struct optimized_kprobe *op, + struct pt_regs *regs) +{ + struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); + unsigned long flags; + + /* This is possible if op is under delayed unoptimizing */ + if (kprobe_disabled(&op->kp)) + return; + + local_irq_save(flags); + hard_irq_disable(); + + if (kprobe_running()) { + kprobes_inc_nmissed_count(&op->kp); + } else { + __this_cpu_write(current_kprobe, &op->kp); + regs->nip = (unsigned long)op->kp.addr; + kcb->kprobe_status = KPROBE_HIT_ACTIVE; + opt_pre_handler(&op->kp, regs); + __this_cpu_write(current_kprobe, NULL); + } + + /* + * No need for an explicit __hard_irq_enable() here. + * local_irq_restore() will re-enable interrupts, + * if they were hard disabled. + */ + local_irq_restore(flags); +} +NOKPROBE_SYMBOL(optimized_callback); + +void arch_remove_optimized_kprobe(struct optimized_kprobe *op) +{ + if (op->optinsn.insn) { + free_ppc_optinsn_slot(op->optinsn.insn, 1); + op->optinsn.insn = NULL; + } +} + +/* + * emulate_step() takes the instruction to be emulated as its + * second parameter, so load register 'r4' with the + * instruction. + */ +void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) +{ + /* addis r4,0,(insn)@h */ + *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) | + ((val >> 16) & 0xffff); + + /* ori r4,r4,(insn)@l */ + *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) | + (val & 0xffff); +} + +/* + * Generate instructions to load the provided immediate 64-bit value + * into register 'r3' and patch these instructions at 'addr'. + */ +void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) +{ + /* lis r3,(op)@highest */ + *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) | + ((val >> 48) & 0xffff); + + /* ori r3,r3,(op)@higher */ + *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | + ((val >> 32) & 0xffff); + + /* rldicr r3,r3,32,31 */ + *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) | + __PPC_SH64(32) | __PPC_ME64(31); + + /* oris r3,r3,(op)@h */ + *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) | + ((val >> 16) & 0xffff); + + /* ori r3,r3,(op)@l */ + *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | + (val & 0xffff); +} + +int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) +{ + kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; + kprobe_opcode_t *op_callback_addr, *emulate_step_addr; + long b_offset; + unsigned long nip; + + kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; + + nip = can_optimize(p); + if (!nip) + return -EILSEQ; + + /* Allocate instruction slot for detour buffer */ + buff = get_ppc_optinsn_slot(); + if (!buff) + return -ENOMEM; + + /* + * OPTPROBE uses 'b' instruction to branch to optinsn.insn.
+ * + * The target address has to be relatively nearby, to permit use + * of branch instruction in powerpc, because the address is specified + * in an immediate field in the instruction opcode itself, ie 24 bits + * in the opcode specify the address. Therefore the address should + * be within 32MB on either side of the current instruction. + */ + b_offset = (unsigned long)buff - (unsigned long)p->addr; + if (!is_offset_in_branch_range(b_offset)) + goto error; + + /* Check if the return address is also within 32MB range */ + b_offset = (unsigned long)(buff + TMPL_RET_IDX) - + (unsigned long)nip; + if (!is_offset_in_branch_range(b_offset)) + goto error; + + /* Setup template */ + memcpy(buff, optprobe_template_entry, + TMPL_END_IDX * sizeof(kprobe_opcode_t)); + + /* + * Fixup the template with instructions to: + * 1. load the address of the actual probepoint + */ + patch_imm64_load_insns((unsigned long)op, buff + TMPL_OP_IDX); + + /* + * 2. branch to optimized_callback() and emulate_step() + */ + kprobe_lookup_name("optimized_callback", op_callback_addr); + kprobe_lookup_name("emulate_step", emulate_step_addr); + if (!op_callback_addr || !emulate_step_addr) { + WARN(1, "kprobe_lookup_name() failed\n"); + goto error; + } + + branch_op_callback = create_branch((unsigned int *)buff + TMPL_CALL_HDLR_IDX, + (unsigned long)op_callback_addr, + BRANCH_SET_LINK); + + branch_emulate_step = create_branch((unsigned int *)buff + TMPL_EMULATE_IDX, + (unsigned long)emulate_step_addr, + BRANCH_SET_LINK); + + if (!branch_op_callback || !branch_emulate_step) + goto error; + + buff[TMPL_CALL_HDLR_IDX] = branch_op_callback; + buff[TMPL_EMULATE_IDX] = branch_emulate_step; + + /* + * 3. load instruction to be emulated into relevant register, and + */ + patch_imm32_load_insns(*p->ainsn.insn, buff + TMPL_INSN_IDX); + + /* + * 4. branch back from trampoline + */ + buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX, + (unsigned long)nip, 0); + + flush_icache_range((unsigned long)buff, + (unsigned long)(&buff[TMPL_END_IDX])); + + op->optinsn.insn = buff; + + return 0; + +error: + free_ppc_optinsn_slot(buff, 0); + return -ERANGE; + +} + +int arch_prepared_optinsn(struct arch_optimized_insn *optinsn) +{ + return optinsn->insn != NULL; +} + +/* + * On powerpc, Optprobes always replaces one instruction (4 bytes + * aligned and 4 bytes long). It is impossible to encounter another + * kprobe in this address range. So always return 0. 
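Putting the fixups of arch_prepare_optimized_kprobe() together, the finished detour buffer has the shape sketched below (offsets are the TMPL_*_IDX values computed from the template labels). The helper function is only a worked check of the 64-bit immediate decomposition performed by patch_imm64_load_insns(); it is not part of the patch.

/*
 * Detour buffer layout (sketch):
 *   [entry .. TMPL_OP_IDX)    build a pt_regs frame on the stack
 *   [TMPL_OP_IDX]             load &op into r3 (5 instructions)
 *   [TMPL_CALL_HDLR_IDX]      bl optimized_callback
 *   [TMPL_INSN_IDX]           load probed insn into r4 (2 instructions)
 *   [TMPL_EMULATE_IDX]        bl emulate_step
 *   [TMPL_RET_IDX]            b <nip after emulation>
 */
static u64 rebuild_imm64(u16 highest, u16 higher, u16 h, u16 l)
{
	u64 v = ((u64)highest << 16) | higher;	/* lis + ori      */

	v <<= 32;				/* rldicr ,32,31  */
	return v | ((u64)h << 16) | l;		/* oris + ori     */
}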
+ */ +int arch_check_optimized_kprobe(struct optimized_kprobe *op) +{ + return 0; +} + +void arch_optimize_kprobes(struct list_head *oplist) +{ + struct optimized_kprobe *op; + struct optimized_kprobe *tmp; + + list_for_each_entry_safe(op, tmp, oplist, list) { + /* + * Backup instructions which will be replaced + * by jump address + */ + memcpy(op->optinsn.copied_insn, op->kp.addr, + RELATIVEJUMP_SIZE); + patch_instruction(op->kp.addr, + create_branch((unsigned int *)op->kp.addr, + (unsigned long)op->optinsn.insn, 0)); + list_del_init(&op->list); + } +} + +void arch_unoptimize_kprobe(struct optimized_kprobe *op) +{ + arch_arm_kprobe(&op->kp); +} + +void arch_unoptimize_kprobes(struct list_head *oplist, + struct list_head *done_list) +{ + struct optimized_kprobe *op; + struct optimized_kprobe *tmp; + + list_for_each_entry_safe(op, tmp, oplist, list) { + arch_unoptimize_kprobe(op); + list_move(&op->list, done_list); + } +} + +int arch_within_optimized_kprobe(struct optimized_kprobe *op, + unsigned long addr) +{ + return ((unsigned long)op->kp.addr <= addr && + (unsigned long)op->kp.addr + RELATIVEJUMP_SIZE > addr); +} diff --git a/arch/powerpc/kernel/optprobes_head.S b/arch/powerpc/kernel/optprobes_head.S new file mode 100644 index 000000000000..53e429b5a29d --- /dev/null +++ b/arch/powerpc/kernel/optprobes_head.S @@ -0,0 +1,135 @@ +/* + * Code to prepare detour buffer for optprobes in Kernel. + * + * Copyright 2017, Anju T, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/ppc_asm.h> +#include <asm/ptrace.h> +#include <asm/asm-offsets.h> + +#define OPT_SLOT_SIZE 65536 + + .balign 4 + + /* + * Reserve an area to allocate slots for detour buffer. + * This is part of .text section (rather than vmalloc area) + * as this needs to be within 32MB of the probed address. + */ + .global optinsn_slot +optinsn_slot: + .space OPT_SLOT_SIZE + + /* + * Optprobe template: + * This template gets copied into one of the slots in optinsn_slot + * and gets fixed up with real optprobe structures et al. + */ + .global optprobe_template_entry +optprobe_template_entry: + /* Create an in-memory pt_regs */ + stdu r1,-INT_FRAME_SIZE(r1) + SAVE_GPR(0,r1) + /* Save the previous SP into stack */ + addi r0,r1,INT_FRAME_SIZE + std r0,GPR1(r1) + SAVE_10GPRS(2,r1) + SAVE_10GPRS(12,r1) + SAVE_10GPRS(22,r1) + /* Save SPRS */ + mfmsr r5 + std r5,_MSR(r1) + li r5,0x700 + std r5,_TRAP(r1) + li r5,0 + std r5,ORIG_GPR3(r1) + std r5,RESULT(r1) + mfctr r5 + std r5,_CTR(r1) + mflr r5 + std r5,_LINK(r1) + mfspr r5,SPRN_XER + std r5,_XER(r1) + mfcr r5 + std r5,_CCR(r1) + lbz r5,PACASOFTIRQEN(r13) + std r5,SOFTE(r1) + mfdar r5 + std r5,_DAR(r1) + mfdsisr r5 + std r5,_DSISR(r1) + + .global optprobe_template_op_address +optprobe_template_op_address: + /* + * Parameters to optimized_callback(): + * 1. optimized_kprobe structure in r3 + */ + nop + nop + nop + nop + nop + /* 2. pt_regs pointer in r4 */ + addi r4,r1,STACK_FRAME_OVERHEAD + + .global optprobe_template_call_handler +optprobe_template_call_handler: + /* Branch to optimized_callback() */ + nop + + /* + * Parameters for instruction emulation: + * 1. Pass SP in register r3. 
+ */ + addi r3,r1,STACK_FRAME_OVERHEAD + + .global optprobe_template_insn +optprobe_template_insn: + /* 2. Pass instruction to be emulated in r4 */ + nop + nop + + .global optprobe_template_call_emulate +optprobe_template_call_emulate: + /* Branch to emulate_step() */ + nop + + /* + * All done. + * Now, restore the registers... + */ + ld r5,_MSR(r1) + mtmsr r5 + ld r5,_CTR(r1) + mtctr r5 + ld r5,_LINK(r1) + mtlr r5 + ld r5,_XER(r1) + mtxer r5 + ld r5,_CCR(r1) + mtcr r5 + ld r5,_DAR(r1) + mtdar r5 + ld r5,_DSISR(r1) + mtdsisr r5 + REST_GPR(0,r1) + REST_10GPRS(2,r1) + REST_10GPRS(12,r1) + REST_10GPRS(22,r1) + /* Restore the previous SP */ + addi r1,r1,INT_FRAME_SIZE + + .global optprobe_template_ret +optprobe_template_ret: + /* ... and jump back from trampoline */ + nop + + .global optprobe_template_end +optprobe_template_end: diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index ac83eb04a8b8..616de028f7f8 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -649,6 +649,7 @@ static void __init early_cmdline_parse(void) struct option_vector1 { u8 byte1; u8 arch_versions; + u8 arch_versions3; } __packed; struct option_vector2 { @@ -691,6 +692,9 @@ struct option_vector5 { u8 reserved2; __be16 reserved3; u8 subprocessors; + u8 byte22; + u8 intarch; + u8 mmu; } __packed; struct option_vector6 { @@ -700,7 +704,7 @@ struct option_vector6 { } __packed; struct ibm_arch_vec { - struct { u32 mask, val; } pvrs[10]; + struct { u32 mask, val; } pvrs[12]; u8 num_vectors; @@ -750,6 +754,14 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .val = cpu_to_be32(0x004d0000), }, { + .mask = cpu_to_be32(0xffff0000), /* POWER9 */ + .val = cpu_to_be32(0x004e0000), + }, + { + .mask = cpu_to_be32(0xffffffff), /* all 3.00-compliant */ + .val = cpu_to_be32(0x0f000005), + }, + { .mask = cpu_to_be32(0xffffffff), /* all 2.07-compliant */ .val = cpu_to_be32(0x0f000004), }, @@ -774,6 +786,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .byte1 = 0, .arch_versions = OV1_PPC_2_00 | OV1_PPC_2_01 | OV1_PPC_2_02 | OV1_PPC_2_03 | OV1_PPC_2_04 | OV1_PPC_2_05 | OV1_PPC_2_06 | OV1_PPC_2_07, + .arch_versions3 = OV1_PPC_3_00, }, .vec2_len = VECTOR_LENGTH(sizeof(struct option_vector2)), @@ -826,7 +839,7 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { 0, #endif .associativity = OV5_FEAT(OV5_TYPE1_AFFINITY) | OV5_FEAT(OV5_PRRN), - .bin_opts = 0, + .bin_opts = OV5_FEAT(OV5_RESIZE_HPT), .micro_checkpoint = 0, .reserved0 = 0, .max_cpus = cpu_to_be32(NR_CPUS), /* number of cores supported */ @@ -836,6 +849,9 @@ struct ibm_arch_vec __cacheline_aligned ibm_architecture_vec = { .reserved2 = 0, .reserved3 = 0, .subprocessors = 1, + .intarch = 0, + .mmu = OV5_FEAT(OV5_MMU_RADIX_300) | OV5_FEAT(OV5_MMU_HASH_300) | + OV5_FEAT(OV5_MMU_PROC_TBL) | OV5_FEAT(OV5_MMU_GTSE), }, /* option vector 6: IBM PAPR hints */ diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c index 112cc3b2ee1a..b8a4987f58cf 100644 --- a/arch/powerpc/kernel/rtas.c +++ b/arch/powerpc/kernel/rtas.c @@ -1145,31 +1145,29 @@ asmlinkage int ppc_rtas(struct rtas_args __user *uargs) void __init rtas_initialize(void) { unsigned long rtas_region = RTAS_INSTANTIATE_MAX; + u32 base, size, entry; + int no_base, no_size, no_entry; /* Get RTAS dev node and fill up our "rtas" structure with infos * about it.
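The rtas_initialize() rework that follows leans on the of_property_read_u32() convention that 0 means success, which is why the result variables are named no_base, no_size and no_entry. A minimal sketch of the pattern, using the same property as the code below:

static int example_read(struct device_node *np)
{
	u32 base;
	int no_base = of_property_read_u32(np, "linux,rtas-base", &base);

	if (no_base)	/* non-zero return: property absent or malformed */
		return -ENODEV;
	return 0;
}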
*/ rtas.dev = of_find_node_by_name(NULL, "rtas"); - if (rtas.dev) { - const __be32 *basep, *entryp, *sizep; - - basep = of_get_property(rtas.dev, "linux,rtas-base", NULL); - sizep = of_get_property(rtas.dev, "rtas-size", NULL); - if (basep != NULL && sizep != NULL) { - rtas.base = __be32_to_cpu(*basep); - rtas.size = __be32_to_cpu(*sizep); - entryp = of_get_property(rtas.dev, - "linux,rtas-entry", NULL); - if (entryp == NULL) /* Ugh */ - rtas.entry = rtas.base; - else - rtas.entry = __be32_to_cpu(*entryp); - } else - rtas.dev = NULL; - } if (!rtas.dev) return; + no_base = of_property_read_u32(rtas.dev, "linux,rtas-base", &base); + no_size = of_property_read_u32(rtas.dev, "rtas-size", &size); + if (no_base || no_size) { + of_node_put(rtas.dev); + rtas.dev = NULL; + return; + } + + rtas.base = base; + rtas.size = size; + no_entry = of_property_read_u32(rtas.dev, "linux,rtas-entry", &entry); + rtas.entry = no_entry ? rtas.base : entry; + /* If RTAS was found, allocate the RMO buffer for it and look for * the stop-self token if any */ diff --git a/arch/powerpc/kernel/rtasd.c b/arch/powerpc/kernel/rtasd.c index 2bf1f9b5b34b..3650732639ed 100644 --- a/arch/powerpc/kernel/rtasd.c +++ b/arch/powerpc/kernel/rtasd.c @@ -21,6 +21,7 @@ #include <linux/cpu.h> #include <linux/workqueue.h> #include <linux/slab.h> +#include <linux/topology.h> #include <linux/uaccess.h> #include <asm/io.h> @@ -282,6 +283,7 @@ static void prrn_work_fn(struct work_struct *work) * the RTAS event. */ pseries_devicetree_update(-prrn_update_scope); + arch_update_cpu_topology(); } static DECLARE_WORK(prrn_work, prrn_work_fn); @@ -434,7 +436,10 @@ static void do_event_scan(void) } if (error == 0) { - pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, 0); + if (rtas_error_type((struct rtas_error_log *)logdata) != + RTAS_TYPE_PRRN) + pSeries_log_error(logdata, ERR_TYPE_RTAS_LOG, + 0); handle_rtas_event((struct rtas_error_log *)logdata); } diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index f516ac508ae3..4697da895133 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -87,6 +87,15 @@ EXPORT_SYMBOL(machine_id); int boot_cpuid = -1; EXPORT_SYMBOL_GPL(boot_cpuid); +/* + * These are used in binfmt_elf.c to put aux entries on the stack + * for each elf executable being started. + */ +int dcache_bsize; +int icache_bsize; +int ucache_bsize; + + unsigned long klimit = (unsigned long) _end; /* diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index 7fcf1f7f01c1..2f88f6cf1a42 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -59,14 +59,6 @@ EXPORT_SYMBOL(DMA_MODE_READ); EXPORT_SYMBOL(DMA_MODE_WRITE); /* - * These are used in binfmt_elf.c to put aux entries on the stack - * for each elf executable being started. - */ -int dcache_bsize; -int icache_bsize; -int ucache_bsize; - -/* * We're called here very early in the boot. * * Note that the kernel may be running at an address which is different diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 6824157e4d2e..b9855f1b290a 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -77,25 +77,18 @@ int spinning_secondaries; u64 ppc64_pft_size; -/* Pick defaults since we might want to patch instructions - * before we've read this from the device tree. 
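The new initializer just below groups the L1 parameters into one per-cache descriptor. A sketch of the descriptor implied by its uses in this patch (field names taken from init_cache_info() later in this hunk; the real declaration lives in a header that is not part of this diff):

/* Sketch only, derived from the uses in this patch. */
struct ppc_cache_info {
	u32 size;		/* total size in bytes */
	u32 line_size;		/* coherence granule */
	u32 block_size;		/* dcbst/icbi granule */
	u32 log_block_size;	/* log2(block_size) */
	u32 blocks_per_page;	/* PAGE_SIZE / block_size */
	u32 sets;		/* 0 is used for fully associative */
	u32 assoc;		/* size / (sets * line_size) */
};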
- */ struct ppc64_caches ppc64_caches = { - .dline_size = 0x40, - .log_dline_size = 6, - .iline_size = 0x40, - .log_iline_size = 6 + .l1d = { + .block_size = 0x40, + .log_block_size = 6, + }, + .l1i = { + .block_size = 0x40, + .log_block_size = 6 + }, }; EXPORT_SYMBOL_GPL(ppc64_caches); -/* - * These are used in binfmt_elf.c to put aux entries on the stack - * for each elf executable being started. - */ -int dcache_bsize; -int icache_bsize; -int ucache_bsize; - #if defined(CONFIG_PPC_BOOK3E) && defined(CONFIG_SMP) void __init setup_tlb_core_data(void) { @@ -408,74 +401,135 @@ void smp_release_cpus(void) * cache informations about the CPU that will be used by cache flush * routines and/or provided to userland */ + +static void init_cache_info(struct ppc_cache_info *info, u32 size, u32 lsize, + u32 bsize, u32 sets) +{ + info->size = size; + info->sets = sets; + info->line_size = lsize; + info->block_size = bsize; + info->log_block_size = __ilog2(bsize); + info->blocks_per_page = PAGE_SIZE / bsize; + + if (sets == 0) + info->assoc = 0xffff; + else + info->assoc = size / (sets * lsize); +} + +static bool __init parse_cache_info(struct device_node *np, + bool icache, + struct ppc_cache_info *info) +{ + static const char *ipropnames[] __initdata = { + "i-cache-size", + "i-cache-sets", + "i-cache-block-size", + "i-cache-line-size", + }; + static const char *dpropnames[] __initdata = { + "d-cache-size", + "d-cache-sets", + "d-cache-block-size", + "d-cache-line-size", + }; + const char **propnames = icache ? ipropnames : dpropnames; + const __be32 *sizep, *lsizep, *bsizep, *setsp; + u32 size, lsize, bsize, sets; + bool success = true; + + size = 0; + sets = -1u; + lsize = bsize = cur_cpu_spec->dcache_bsize; + sizep = of_get_property(np, propnames[0], NULL); + if (sizep != NULL) + size = be32_to_cpu(*sizep); + setsp = of_get_property(np, propnames[1], NULL); + if (setsp != NULL) + sets = be32_to_cpu(*setsp); + bsizep = of_get_property(np, propnames[2], NULL); + lsizep = of_get_property(np, propnames[3], NULL); + if (bsizep == NULL) + bsizep = lsizep; + if (lsizep != NULL) + lsize = be32_to_cpu(*lsizep); + if (bsizep != NULL) + bsize = be32_to_cpu(*bsizep); + if (sizep == NULL || bsizep == NULL || lsizep == NULL) + success = false; + + /* + * OF is weird .. it represents fully associative caches + * as "1 way" which doesn't make much sense and doesn't + * leave room for direct mapped. We'll assume that 0 + * in OF means direct mapped for that reason. + */ + if (sets == 1) + sets = 0; + else if (sets == 0) + sets = 1; + + init_cache_info(info, size, lsize, bsize, sets); + + return success; +} + void __init initialize_cache_info(void) { - struct device_node *np; - unsigned long num_cpus = 0; + struct device_node *cpu = NULL, *l2, *l3 = NULL; + u32 pvr; DBG(" -> initialize_cache_info()\n"); - for_each_node_by_type(np, "cpu") { - num_cpus += 1; + /* + * All shipping POWER8 machines have a firmware bug that + * puts incorrect information in the device-tree. 
This will + * be (hopefully) fixed for future chips but for now hard + * code the values if we are running on one of these + */ + pvr = PVR_VER(mfspr(SPRN_PVR)); + if (pvr == PVR_POWER8 || pvr == PVR_POWER8E || + pvr == PVR_POWER8NVL) { + /* size lsize blk sets */ + init_cache_info(&ppc64_caches.l1i, 0x8000, 128, 128, 32); + init_cache_info(&ppc64_caches.l1d, 0x10000, 128, 128, 64); + init_cache_info(&ppc64_caches.l2, 0x80000, 128, 0, 512); + init_cache_info(&ppc64_caches.l3, 0x800000, 128, 0, 8192); + } else + cpu = of_find_node_by_type(NULL, "cpu"); + + /* + * We're assuming *all* of the CPUs have the same + * d-cache and i-cache sizes... -Peter + */ + if (cpu) { + if (!parse_cache_info(cpu, false, &ppc64_caches.l1d)) + DBG("Argh, can't find dcache properties !\n"); + + if (!parse_cache_info(cpu, true, &ppc64_caches.l1i)) + DBG("Argh, can't find icache properties !\n"); /* - * We're assuming *all* of the CPUs have the same - * d-cache and i-cache sizes... -Peter + * Try to find the L2 and L3 if any. Assume they are + * unified and use the D-side properties. */ - if (num_cpus == 1) { - const __be32 *sizep, *lsizep; - u32 size, lsize; - - size = 0; - lsize = cur_cpu_spec->dcache_bsize; - sizep = of_get_property(np, "d-cache-size", NULL); - if (sizep != NULL) - size = be32_to_cpu(*sizep); - lsizep = of_get_property(np, "d-cache-block-size", - NULL); - /* fallback if block size missing */ - if (lsizep == NULL) - lsizep = of_get_property(np, - "d-cache-line-size", - NULL); - if (lsizep != NULL) - lsize = be32_to_cpu(*lsizep); - if (sizep == NULL || lsizep == NULL) - DBG("Argh, can't find dcache properties ! " - "sizep: %p, lsizep: %p\n", sizep, lsizep); - - ppc64_caches.dsize = size; - ppc64_caches.dline_size = lsize; - ppc64_caches.log_dline_size = __ilog2(lsize); - ppc64_caches.dlines_per_page = PAGE_SIZE / lsize; - - size = 0; - lsize = cur_cpu_spec->icache_bsize; - sizep = of_get_property(np, "i-cache-size", NULL); - if (sizep != NULL) - size = be32_to_cpu(*sizep); - lsizep = of_get_property(np, "i-cache-block-size", - NULL); - if (lsizep == NULL) - lsizep = of_get_property(np, - "i-cache-line-size", - NULL); - if (lsizep != NULL) - lsize = be32_to_cpu(*lsizep); - if (sizep == NULL || lsizep == NULL) - DBG("Argh, can't find icache properties ! 
" - "sizep: %p, lsizep: %p\n", sizep, lsizep); - - ppc64_caches.isize = size; - ppc64_caches.iline_size = lsize; - ppc64_caches.log_iline_size = __ilog2(lsize); - ppc64_caches.ilines_per_page = PAGE_SIZE / lsize; + l2 = of_find_next_cache_node(cpu); + of_node_put(cpu); + if (l2) { + parse_cache_info(l2, false, &ppc64_caches.l2); + l3 = of_find_next_cache_node(l2); + of_node_put(l2); + } + if (l3) { + parse_cache_info(l3, false, &ppc64_caches.l3); + of_node_put(l3); } } /* For use by binfmt_elf */ - dcache_bsize = ppc64_caches.dline_size; - icache_bsize = ppc64_caches.iline_size; + dcache_bsize = ppc64_caches.l1d.block_size; + icache_bsize = ppc64_caches.l1i.block_size; DBG(" <- initialize_cache_info()\n"); } diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 4111d30badfa..22b01a3962f0 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -736,16 +736,14 @@ static int __init vdso_init(void) if (firmware_has_feature(FW_FEATURE_LPAR)) vdso_data->platform |= 1; vdso_data->physicalMemorySize = memblock_phys_mem_size(); - vdso_data->dcache_size = ppc64_caches.dsize; - vdso_data->dcache_line_size = ppc64_caches.dline_size; - vdso_data->icache_size = ppc64_caches.isize; - vdso_data->icache_line_size = ppc64_caches.iline_size; - - /* XXXOJN: Blocks should be added to ppc64_caches and used instead */ - vdso_data->dcache_block_size = ppc64_caches.dline_size; - vdso_data->icache_block_size = ppc64_caches.iline_size; - vdso_data->dcache_log_block_size = ppc64_caches.log_dline_size; - vdso_data->icache_log_block_size = ppc64_caches.log_iline_size; + vdso_data->dcache_size = ppc64_caches.l1d.size; + vdso_data->dcache_line_size = ppc64_caches.l1d.line_size; + vdso_data->icache_size = ppc64_caches.l1i.size; + vdso_data->icache_line_size = ppc64_caches.l1i.line_size; + vdso_data->dcache_block_size = ppc64_caches.l1d.block_size; + vdso_data->icache_block_size = ppc64_caches.l1i.block_size; + vdso_data->dcache_log_block_size = ppc64_caches.l1d.log_block_size; + vdso_data->icache_log_block_size = ppc64_caches.l1i.log_block_size; /* * Calculate the size of the 64 bits vDSO diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile index 7dd89b79d038..b87ccde2137a 100644 --- a/arch/powerpc/kvm/Makefile +++ b/arch/powerpc/kvm/Makefile @@ -70,7 +70,8 @@ endif kvm-hv-y += \ book3s_hv.o \ book3s_hv_interrupts.o \ - book3s_64_mmu_hv.o + book3s_64_mmu_hv.o \ + book3s_64_mmu_radix.o kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ book3s_hv_rm_xics.o diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 019f008775b9..b6b5c185bd92 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -239,6 +239,7 @@ void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar, kvmppc_set_dsisr(vcpu, flags); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_DATA_STORAGE); } +EXPORT_SYMBOL_GPL(kvmppc_core_queue_data_storage); /* used by kvm_hv */ void kvmppc_core_queue_inst_storage(struct kvm_vcpu *vcpu, ulong flags) { diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index b795dd1ac2ef..9df3d940acec 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -119,6 +119,9 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, u32 *htab_orderp) long err = -EBUSY; long order; + if (kvm_is_radix(kvm)) + return -EINVAL; + mutex_lock(&kvm->lock); if (kvm->arch.hpte_setup_done) { kvm->arch.hpte_setup_done = 0; @@ -152,12 +155,11 @@ long kvmppc_alloc_reset_hpt(struct kvm 
*kvm, u32 *htab_orderp) void kvmppc_free_hpt(struct kvm *kvm) { - kvmppc_free_lpid(kvm->arch.lpid); vfree(kvm->arch.revmap); if (kvm->arch.hpt_cma_alloc) kvm_release_hpt(virt_to_page(kvm->arch.hpt_virt), 1 << (kvm->arch.hpt_order - PAGE_SHIFT)); - else + else if (kvm->arch.hpt_virt) free_pages(kvm->arch.hpt_virt, kvm->arch.hpt_order - PAGE_SHIFT); } @@ -392,8 +394,8 @@ static int instruction_is_store(unsigned int instr) return (instr & mask) != 0; } -static int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, - unsigned long gpa, gva_t ea, int is_store) +int kvmppc_hv_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long gpa, gva_t ea, int is_store) { u32 last_inst; @@ -458,6 +460,9 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, unsigned long rcbits; long mmio_update; + if (kvm_is_radix(kvm)) + return kvmppc_book3s_radix_page_fault(run, vcpu, ea, dsisr); + /* * Real-mode code has already searched the HPT and found the * entry we're interested in. Lock the entry and check that @@ -695,12 +700,13 @@ static void kvmppc_rmap_reset(struct kvm *kvm) srcu_read_unlock(&kvm->srcu, srcu_idx); } +typedef int (*hva_handler_fn)(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn); + static int kvm_handle_hva_range(struct kvm *kvm, unsigned long start, unsigned long end, - int (*handler)(struct kvm *kvm, - unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { int ret; int retval = 0; @@ -725,9 +731,7 @@ static int kvm_handle_hva_range(struct kvm *kvm, gfn_end = hva_to_gfn_memslot(hva_end + PAGE_SIZE - 1, memslot); for (; gfn < gfn_end; ++gfn) { - gfn_t gfn_offset = gfn - memslot->base_gfn; - - ret = handler(kvm, &memslot->arch.rmap[gfn_offset], gfn); + ret = handler(kvm, memslot, gfn); retval |= ret; } } @@ -736,20 +740,21 @@ static int kvm_handle_hva_range(struct kvm *kvm, } static int kvm_handle_hva(struct kvm *kvm, unsigned long hva, - int (*handler)(struct kvm *kvm, unsigned long *rmapp, - unsigned long gfn)) + hva_handler_fn handler) { return kvm_handle_hva_range(kvm, hva, hva + 1, handler); } -static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long h, i, j; __be64 *hptep; unsigned long ptel, psize, rcbits; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; for (;;) { lock_rmap(rmapp); if (!(*rmapp & KVMPPC_RMAP_PRESENT)) { @@ -810,26 +815,36 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_unmap_hva_hv(struct kvm *kvm, unsigned long hva) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); return 0; } int kvm_unmap_hva_range_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - kvm_handle_hva_range(kvm, start, end, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva_range(kvm, start, end, handler); return 0; } void kvmppc_core_flush_memslot_hv(struct kvm *kvm, struct kvm_memory_slot *memslot) { - unsigned long *rmapp; unsigned long gfn; unsigned long n; + unsigned long *rmapp; - rmapp = memslot->arch.rmap; gfn = memslot->base_gfn; - for (n = memslot->npages; n; --n) { + rmapp = memslot->arch.rmap; + for (n = memslot->npages; n; --n, ++gfn) { + if (kvm_is_radix(kvm)) { + kvm_unmap_radix(kvm, memslot, gfn); + continue; + } /* * Testing the present bit without locking is OK because * the memslot has been marked invalid already, and hence @@ -837,20 +852,21 @@ void kvmppc_core_flush_memslot_hv(struct kvm *kvm, * thus the present bit can't go from 0 to 1. */ if (*rmapp & KVMPPC_RMAP_PRESENT) - kvm_unmap_rmapp(kvm, rmapp, gfn); + kvm_unmap_rmapp(kvm, memslot, gfn); ++rmapp; - ++gfn; } } -static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; __be64 *hptep; int ret = 0; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; retry: lock_rmap(rmapp); if (*rmapp & KVMPPC_RMAP_REFERENCED) { @@ -898,17 +914,22 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_age_hva_hv(struct kvm *kvm, unsigned long start, unsigned long end) { - return kvm_handle_hva_range(kvm, start, end, kvm_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_age_radix : kvm_age_rmapp; + return kvm_handle_hva_range(kvm, start, end, handler); } -static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, +static int kvm_test_age_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long gfn) { struct revmap_entry *rev = kvm->arch.revmap; unsigned long head, i, j; unsigned long *hp; int ret = 1; + unsigned long *rmapp; + rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn]; if (*rmapp & KVMPPC_RMAP_REFERENCED) return 1; @@ -934,12 +955,18 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, int kvm_test_age_hva_hv(struct kvm *kvm, unsigned long hva) { - return kvm_handle_hva(kvm, hva, kvm_test_age_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? kvm_test_age_radix : kvm_test_age_rmapp; + return kvm_handle_hva(kvm, hva, handler); } void kvm_set_spte_hva_hv(struct kvm *kvm, unsigned long hva, pte_t pte) { - kvm_handle_hva(kvm, hva, kvm_unmap_rmapp); + hva_handler_fn handler; + + handler = kvm_is_radix(kvm) ? 
kvm_unmap_radix : kvm_unmap_rmapp; + kvm_handle_hva(kvm, hva, handler); } static int vcpus_running(struct kvm *kvm) @@ -1040,7 +1067,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp) return npages_dirty; } -static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, +void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa, struct kvm_memory_slot *memslot, unsigned long *map) { @@ -1058,12 +1085,11 @@ static void harvest_vpa_dirty(struct kvmppc_vpa *vpa, __set_bit_le(gfn - memslot->base_gfn, map); } -long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, - unsigned long *map) +long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) { unsigned long i, j; unsigned long *rmapp; - struct kvm_vcpu *vcpu; preempt_disable(); rmapp = memslot->arch.rmap; @@ -1079,15 +1105,6 @@ long kvmppc_hv_get_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot, __set_bit_le(j, map); ++rmapp; } - - /* Harvest dirty bits from VPA and DTL updates */ - /* Note: we never modify the SLB shadow buffer areas */ - kvm_for_each_vcpu(i, vcpu, kvm) { - spin_lock(&vcpu->arch.vpa_update_lock); - harvest_vpa_dirty(&vcpu->arch.vpa, memslot, map); - harvest_vpa_dirty(&vcpu->arch.dtl, memslot, map); - spin_unlock(&vcpu->arch.vpa_update_lock); - } preempt_enable(); return 0; } @@ -1142,10 +1159,14 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa, srcu_idx = srcu_read_lock(&kvm->srcu); memslot = gfn_to_memslot(kvm, gfn); if (memslot) { - rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; - lock_rmap(rmap); - *rmap |= KVMPPC_RMAP_CHANGED; - unlock_rmap(rmap); + if (!kvm_is_radix(kvm)) { + rmap = &memslot->arch.rmap[gfn - memslot->base_gfn]; + lock_rmap(rmap); + *rmap |= KVMPPC_RMAP_CHANGED; + unlock_rmap(rmap); + } else if (memslot->dirty_bitmap) { + mark_page_dirty(kvm, gfn); + } } srcu_read_unlock(&kvm->srcu, srcu_idx); } @@ -1675,7 +1696,10 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu) vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */ - mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; + if (kvm_is_radix(vcpu->kvm)) + mmu->xlate = kvmppc_mmu_radix_xlate; + else + mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate; mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr; vcpu->arch.hflags |= BOOK3S_HFLAG_SLB; diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c new file mode 100644 index 000000000000..4344651f408c --- /dev/null +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -0,0 +1,716 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + * + * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + */ + +#include <linux/types.h> +#include <linux/string.h> +#include <linux/kvm.h> +#include <linux/kvm_host.h> + +#include <asm/kvm_ppc.h> +#include <asm/kvm_book3s.h> +#include <asm/page.h> +#include <asm/mmu.h> +#include <asm/pgtable.h> +#include <asm/pgalloc.h> + +/* + * Supported radix tree geometry. + * Like p9, we support either 5 or 9 bits at the first (lowest) level, + * for a page size of 64k or 4k. 
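The geometry constraint checked in kvmppc_mmu_radix_xlate() below (the walk must cover exactly a 52-bit effective address space) is where the level bits {5 or 9, 9, 9, 13} come from. As a worked check, with levels consuming bits from the top down:

	13 + 9 + 9 + 9 + 12 (4k page offset)  = 52
	13 + 9 + 9 + 5 + 16 (64k page offset) = 52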
+ */ +static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 }; + +int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, + struct kvmppc_pte *gpte, bool data, bool iswrite) +{ + struct kvm *kvm = vcpu->kvm; + u32 pid; + int ret, level, ps; + __be64 prte, rpte; + unsigned long root, pte, index; + unsigned long rts, bits, offset; + unsigned long gpa; + unsigned long proc_tbl_size; + + /* Work out effective PID */ + switch (eaddr >> 62) { + case 0: + pid = vcpu->arch.pid; + break; + case 3: + pid = 0; + break; + default: + return -EINVAL; + } + proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12); + if (pid * 16 >= proc_tbl_size) + return -EINVAL; + + /* Read partition table to find root of tree for effective PID */ + ret = kvm_read_guest(kvm, kvm->arch.process_table + pid * 16, + &prte, sizeof(prte)); + if (ret) + return ret; + + root = be64_to_cpu(prte); + rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | + ((root & RTS2_MASK) >> RTS2_SHIFT); + bits = root & RPDS_MASK; + root = root & RPDB_MASK; + + /* P9 DD1 interprets RTS (radix tree size) differently */ + offset = rts + 31; + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + offset -= 3; + + /* current implementations only support 52-bit space */ + if (offset != 52) + return -EINVAL; + + for (level = 3; level >= 0; --level) { + if (level && bits != p9_supported_radix_bits[level]) + return -EINVAL; + if (level == 0 && !(bits == 5 || bits == 9)) + return -EINVAL; + offset -= bits; + index = (eaddr >> offset) & ((1UL << bits) - 1); + /* check that low bits of page table base are zero */ + if (root & ((1UL << (bits + 3)) - 1)) + return -EINVAL; + ret = kvm_read_guest(kvm, root + index * 8, + &rpte, sizeof(rpte)); + if (ret) + return ret; + pte = __be64_to_cpu(rpte); + if (!(pte & _PAGE_PRESENT)) + return -ENOENT; + if (pte & _PAGE_PTE) + break; + bits = pte & 0x1f; + root = pte & 0x0fffffffffffff00ul; + } + /* need a leaf at lowest level; 512GB pages not supported */ + if (level < 0 || level == 3) + return -EINVAL; + + /* offset is now log base 2 of the page size */ + gpa = pte & 0x01fffffffffff000ul; + if (gpa & ((1ul << offset) - 1)) + return -EINVAL; + gpa += eaddr & ((1ul << offset) - 1); + for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) + if (offset == mmu_psize_defs[ps].shift) + break; + gpte->page_size = ps; + + gpte->eaddr = eaddr; + gpte->raddr = gpa; + + /* Work out permissions */ + gpte->may_read = !!(pte & _PAGE_READ); + gpte->may_write = !!(pte & _PAGE_WRITE); + gpte->may_execute = !!(pte & _PAGE_EXEC); + if (kvmppc_get_msr(vcpu) & MSR_PR) { + if (pte & _PAGE_PRIVILEGED) { + gpte->may_read = 0; + gpte->may_write = 0; + gpte->may_execute = 0; + } + } else { + if (!(pte & _PAGE_PRIVILEGED)) { + /* Check AMR/IAMR to see if strict mode is in force */ + if (vcpu->arch.amr & (1ul << 62)) + gpte->may_read = 0; + if (vcpu->arch.amr & (1ul << 63)) + gpte->may_write = 0; + if (vcpu->arch.iamr & (1ul << 62)) + gpte->may_execute = 0; + } + } + + return 0; +} + +#ifdef CONFIG_PPC_64K_PAGES +#define MMU_BASE_PSIZE MMU_PAGE_64K +#else +#define MMU_BASE_PSIZE MMU_PAGE_4K +#endif + +static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, + unsigned int pshift) +{ + int psize = MMU_BASE_PSIZE; + + if (pshift >= PMD_SHIFT) + psize = MMU_PAGE_2M; + addr &= ~0xfffUL; + addr |= mmu_psize_defs[psize].ap << 5; + asm volatile("ptesync": : :"memory"); + asm volatile(PPC_TLBIE_5(%0, %1, 0, 0, 1) + : : "r" (addr), "r" (kvm->arch.lpid) : "memory"); + asm volatile("ptesync": : :"memory"); +} + +unsigned long 
kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, + unsigned long clr, unsigned long set, + unsigned long addr, unsigned int shift) +{ + unsigned long old = 0; + + if (!(clr & _PAGE_PRESENT) && cpu_has_feature(CPU_FTR_POWER9_DD1) && + pte_present(*ptep)) { + /* have to invalidate it first */ + old = __radix_pte_update(ptep, _PAGE_PRESENT, 0); + kvmppc_radix_tlbie_page(kvm, addr, shift); + set |= _PAGE_PRESENT; + old &= _PAGE_PRESENT; + } + return __radix_pte_update(ptep, clr, set) | old; +} + +void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + radix__set_pte_at(kvm->mm, addr, ptep, pte, 0); +} + +static struct kmem_cache *kvm_pte_cache; + +static pte_t *kvmppc_pte_alloc(void) +{ + return kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL); +} + +static void kvmppc_pte_free(pte_t *ptep) +{ + kmem_cache_free(kvm_pte_cache, ptep); +} + +static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, + unsigned int level, unsigned long mmu_seq) +{ + pgd_t *pgd; + pud_t *pud, *new_pud = NULL; + pmd_t *pmd, *new_pmd = NULL; + pte_t *ptep, *new_ptep = NULL; + unsigned long old; + int ret; + + /* Traverse the guest's 2nd-level tree, allocate new levels needed */ + pgd = kvm->arch.pgtable + pgd_index(gpa); + pud = NULL; + if (pgd_present(*pgd)) + pud = pud_offset(pgd, gpa); + else + new_pud = pud_alloc_one(kvm->mm, gpa); + + pmd = NULL; + if (pud && pud_present(*pud)) + pmd = pmd_offset(pud, gpa); + else + new_pmd = pmd_alloc_one(kvm->mm, gpa); + + if (level == 0 && !(pmd && pmd_present(*pmd))) + new_ptep = kvmppc_pte_alloc(); + + /* Check if we might have been invalidated; let the guest retry if so */ + spin_lock(&kvm->mmu_lock); + ret = -EAGAIN; + if (mmu_notifier_retry(kvm, mmu_seq)) + goto out_unlock; + + /* Now traverse again under the lock and change the tree */ + ret = -ENOMEM; + if (pgd_none(*pgd)) { + if (!new_pud) + goto out_unlock; + pgd_populate(kvm->mm, pgd, new_pud); + new_pud = NULL; + } + pud = pud_offset(pgd, gpa); + if (pud_none(*pud)) { + if (!new_pmd) + goto out_unlock; + pud_populate(kvm->mm, pud, new_pmd); + new_pmd = NULL; + } + pmd = pmd_offset(pud, gpa); + if (pmd_large(*pmd)) { + /* Someone else has instantiated a large page here; retry */ + ret = -EAGAIN; + goto out_unlock; + } + if (level == 1 && !pmd_none(*pmd)) { + /* + * There's a page table page here, but we wanted + * to install a large page. Tell the caller and let + * it try installing a normal page if it wants. 
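kvmppc_create_pte() follows the standard KVM invalidation-race protocol: sample mmu_notifier_seq before resolving the host page, then re-check it under mmu_lock before committing the mapping. A condensed sketch of that protocol as used here:

	mmu_seq = kvm->mmu_notifier_seq;
	smp_rmb();
	/* ... resolve gfn to pfn, possibly sleeping ... */
	spin_lock(&kvm->mmu_lock);
	if (mmu_notifier_retry(kvm, mmu_seq)) {
		spin_unlock(&kvm->mmu_lock);
		return -EAGAIN;	/* caller lets the guest retry the access */
	}
	/* safe to install the PTE here */
	spin_unlock(&kvm->mmu_lock);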
+ */ + ret = -EBUSY; + goto out_unlock; + } + if (level == 0) { + if (pmd_none(*pmd)) { + if (!new_ptep) + goto out_unlock; + pmd_populate(kvm->mm, pmd, new_ptep); + new_ptep = NULL; + } + ptep = pte_offset_kernel(pmd, gpa); + if (pte_present(*ptep)) { + /* PTE was previously valid, so invalidate it */ + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, + 0, gpa, 0); + kvmppc_radix_tlbie_page(kvm, gpa, 0); + if (old & _PAGE_DIRTY) + mark_page_dirty(kvm, gpa >> PAGE_SHIFT); + } + kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); + } else { + kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); + } + ret = 0; + + out_unlock: + spin_unlock(&kvm->mmu_lock); + if (new_pud) + pud_free(kvm->mm, new_pud); + if (new_pmd) + pmd_free(kvm->mm, new_pmd); + if (new_ptep) + kvmppc_pte_free(new_ptep); + return ret; +} + +int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, + unsigned long ea, unsigned long dsisr) +{ + struct kvm *kvm = vcpu->kvm; + unsigned long mmu_seq, pte_size; + unsigned long gpa, gfn, hva, pfn; + struct kvm_memory_slot *memslot; + struct page *page = NULL, *pages[1]; + long ret, npages, ok; + unsigned int writing; + struct vm_area_struct *vma; + unsigned long flags; + pte_t pte, *ptep; + unsigned long pgflags; + unsigned int shift, level; + + /* Check for unusual errors */ + if (dsisr & DSISR_UNSUPP_MMU) { + pr_err("KVM: Got unsupported MMU fault\n"); + return -EFAULT; + } + if (dsisr & DSISR_BADACCESS) { + /* Reflect to the guest as DSI */ + pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr); + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + + /* Translate the logical address and get the page */ + gpa = vcpu->arch.fault_gpa & ~0xfffUL; + gpa &= ~0xF000000000000000ul; + gfn = gpa >> PAGE_SHIFT; + if (!(dsisr & DSISR_PGDIRFAULT)) + gpa |= ea & 0xfff; + memslot = gfn_to_memslot(kvm, gfn); + + /* No memslot means it's an emulated MMIO region */ + if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) { + if (dsisr & (DSISR_PGDIRFAULT | DSISR_BADACCESS | + DSISR_SET_RC)) { + /* + * Bad address in guest page table tree, or other + * unusual error - reflect it to the guest as DSI. + */ + kvmppc_core_queue_data_storage(vcpu, ea, dsisr); + return RESUME_GUEST; + } + return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, + dsisr & DSISR_ISSTORE); + } + + /* used to check for invalidations in progress */ + mmu_seq = kvm->mmu_notifier_seq; + smp_rmb(); + + writing = (dsisr & DSISR_ISSTORE) != 0; + hva = gfn_to_hva_memslot(memslot, gfn); + if (dsisr & DSISR_SET_RC) { + /* + * Need to set an R or C bit in the 2nd-level tables; + * if the relevant bits aren't already set in the linux + * page tables, fall through to do the gup_fast to + * set them in the linux page tables too. 
+ */ + ok = 0; + pgflags = _PAGE_ACCESSED; + if (writing) + pgflags |= _PAGE_DIRTY; + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, hva, + NULL, NULL); + if (ptep) { + pte = READ_ONCE(*ptep); + if (pte_present(pte) && + (pte_val(pte) & pgflags) == pgflags) + ok = 1; + } + local_irq_restore(flags); + if (ok) { + spin_lock(&kvm->mmu_lock); + if (mmu_notifier_retry(vcpu->kvm, mmu_seq)) { + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, + gpa, NULL, &shift); + if (ptep && pte_present(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, + gpa, shift); + spin_unlock(&kvm->mmu_lock); + return RESUME_GUEST; + } + spin_unlock(&kvm->mmu_lock); + } + } + + ret = -EFAULT; + pfn = 0; + pte_size = PAGE_SIZE; + pgflags = _PAGE_READ | _PAGE_EXEC; + level = 0; + npages = get_user_pages_fast(hva, 1, writing, pages); + if (npages < 1) { + /* Check if it's an I/O mapping */ + down_read(&current->mm->mmap_sem); + vma = find_vma(current->mm, hva); + if (vma && vma->vm_start <= hva && hva < vma->vm_end && + (vma->vm_flags & VM_PFNMAP)) { + pfn = vma->vm_pgoff + + ((hva - vma->vm_start) >> PAGE_SHIFT); + pgflags = pgprot_val(vma->vm_page_prot); + } + up_read(&current->mm->mmap_sem); + if (!pfn) + return -EFAULT; + } else { + page = pages[0]; + pfn = page_to_pfn(page); + if (PageHuge(page)) { + page = compound_head(page); + pte_size <<= compound_order(page); + /* See if we can insert a 2MB large-page PTE here */ + if (pte_size >= PMD_SIZE && + (gpa & PMD_MASK & PAGE_MASK) == + (hva & PMD_MASK & PAGE_MASK)) { + level = 1; + pfn &= ~((PMD_SIZE >> PAGE_SHIFT) - 1); + } + } + /* See if we can provide write access */ + if (writing) { + /* + * We assume gup_fast has set dirty on the host PTE. + */ + pgflags |= _PAGE_WRITE; + } else { + local_irq_save(flags); + ptep = __find_linux_pte_or_hugepte(current->mm->pgd, + hva, NULL, NULL); + if (ptep && pte_write(*ptep) && pte_dirty(*ptep)) + pgflags |= _PAGE_WRITE; + local_irq_restore(flags); + } + } + + /* + * Compute the PTE value that we need to insert. + */ + pgflags |= _PAGE_PRESENT | _PAGE_PTE | _PAGE_ACCESSED; + if (pgflags & _PAGE_WRITE) + pgflags |= _PAGE_DIRTY; + pte = pfn_pte(pfn, __pgprot(pgflags)); + + /* Allocate space in the tree and write the PTE */ + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + if (ret == -EBUSY) { + /* + * There's already a PMD where we wanted to install a large page; + * for now, fall back to installing a small page.
+ */ + level = 0; + pfn |= gfn & ((PMD_SIZE >> PAGE_SHIFT) - 1); + pte = pfn_pte(pfn, __pgprot(pgflags)); + ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); + } + if (ret == 0 || ret == -EAGAIN) + ret = RESUME_GUEST; + + if (page) { + /* + * We drop pages[0] here, not page because page might + * have been set to the head page of a compound, but + * we have to drop the reference on the correct tail + * page to match the get inside gup() + */ + put_page(pages[0]); + } + return ret; +} + +static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn, unsigned int order) +{ + unsigned long i, limit; + unsigned long *dp; + + if (!memslot->dirty_bitmap) + return; + limit = 1ul << order; + if (limit < BITS_PER_LONG) { + for (i = 0; i < limit; ++i) + mark_page_dirty(kvm, gfn + i); + return; + } + dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn); + limit /= BITS_PER_LONG; + for (i = 0; i < limit; ++i) + *dp++ = ~0ul; +} + +/* Called with kvm->lock held */ +int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + unsigned long old; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep)) { + old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + if (old & _PAGE_DIRTY) { + if (!shift) + mark_page_dirty(kvm, gfn); + else + mark_pages_dirty(kvm, memslot, + gfn, shift - PAGE_SHIFT); + } + } + return 0; +} + +/* Called with kvm->lock held */ +int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) { + kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0, + gpa, shift); + /* XXX need to flush tlb here? */ + ref = 1; + } + return ref; +} + +/* Called with kvm->lock held */ +int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, + unsigned long gfn) +{ + pte_t *ptep; + unsigned long gpa = gfn << PAGE_SHIFT; + unsigned int shift; + int ref = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_young(*ptep)) + ref = 1; + return ref; +} + +/* Returns the number of PAGE_SIZE pages that are dirty */ +static int kvm_radix_test_clear_dirty(struct kvm *kvm, + struct kvm_memory_slot *memslot, int pagenum) +{ + unsigned long gfn = memslot->base_gfn + pagenum; + unsigned long gpa = gfn << PAGE_SHIFT; + pte_t *ptep; + unsigned int shift; + int ret = 0; + + ptep = __find_linux_pte_or_hugepte(kvm->arch.pgtable, gpa, + NULL, &shift); + if (ptep && pte_present(*ptep) && pte_dirty(*ptep)) { + ret = 1; + if (shift) + ret = 1 << (shift - PAGE_SHIFT); + kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, + gpa, shift); + kvmppc_radix_tlbie_page(kvm, gpa, shift); + } + return ret; +} + +long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm, + struct kvm_memory_slot *memslot, unsigned long *map) +{ + unsigned long i, j; + unsigned long n, *p; + int npages; + + /* + * Radix accumulates dirty bits in the first half of the + * memslot's dirty_bitmap area, for when pages are paged + * out or modified by the host directly. Pick up these + * bits and add them to the map. 
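The other half of this convention lives in kvm_vm_ioctl_get_dirty_log_hv() later in the patch: the harvest loop below fills a 'map' buffer that the ioctl points at the second half of dirty_bitmap, keeping it clear of the accumulation area. A sketch of the consumer side:

	/* from kvm_vm_ioctl_get_dirty_log_hv() below: */
	n = kvm_dirty_bitmap_bytes(memslot);
	buf = memslot->dirty_bitmap + n / sizeof(long);	/* second half */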
+ */ + n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long); + p = memslot->dirty_bitmap; + for (i = 0; i < n; ++i) + map[i] |= xchg(&p[i], 0); + + for (i = 0; i < memslot->npages; i = j) { + npages = kvm_radix_test_clear_dirty(kvm, memslot, i); + + /* + * Note that if npages > 0 then i must be a multiple of npages, + * since huge pages are only used to back the guest at guest + * real addresses that are a multiple of their size. + * Since we have at most one PTE covering any given guest + * real address, if npages > 1 we can skip to i + npages. + */ + j = i + 1; + if (npages) + for (j = i; npages; ++j, --npages) + __set_bit_le(j, map); + } + return 0; +} + +static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info, + int psize, int *indexp) +{ + if (!mmu_psize_defs[psize].shift) + return; + info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift | + (mmu_psize_defs[psize].ap << 29); + ++(*indexp); +} + +int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info) +{ + int i; + + if (!radix_enabled()) + return -EINVAL; + memset(info, 0, sizeof(*info)); + + /* 4k page size */ + info->geometries[0].page_shift = 12; + info->geometries[0].level_bits[0] = 9; + for (i = 1; i < 4; ++i) + info->geometries[0].level_bits[i] = p9_supported_radix_bits[i]; + /* 64k page size */ + info->geometries[1].page_shift = 16; + for (i = 0; i < 4; ++i) + info->geometries[1].level_bits[i] = p9_supported_radix_bits[i]; + + i = 0; + add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i); + add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i); + + return 0; +} + +int kvmppc_init_vm_radix(struct kvm *kvm) +{ + kvm->arch.pgtable = pgd_alloc(kvm->mm); + if (!kvm->arch.pgtable) + return -ENOMEM; + return 0; +} + +void kvmppc_free_radix(struct kvm *kvm) +{ + unsigned long ig, iu, im; + pte_t *pte; + pmd_t *pmd; + pud_t *pud; + pgd_t *pgd; + + if (!kvm->arch.pgtable) + return; + pgd = kvm->arch.pgtable; + for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { + if (!pgd_present(*pgd)) + continue; + pud = pud_offset(pgd, 0); + for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++pud) { + if (!pud_present(*pud)) + continue; + pmd = pmd_offset(pud, 0); + for (im = 0; im < PTRS_PER_PMD; ++im, ++pmd) { + if (pmd_huge(*pmd)) { + pmd_clear(pmd); + continue; + } + if (!pmd_present(*pmd)) + continue; + pte = pte_offset_map(pmd, 0); + memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); + kvmppc_pte_free(pte); + pmd_clear(pmd); + } + pmd_free(kvm->mm, pmd_offset(pud, 0)); + pud_clear(pud); + } + pud_free(kvm->mm, pud_offset(pgd, 0)); + pgd_clear(pgd); + } + pgd_free(kvm->mm, kvm->arch.pgtable); +} + +static void pte_ctor(void *addr) +{ + memset(addr, 0, PTE_TABLE_SIZE); +} + +int kvmppc_radix_init(void) +{ + unsigned long size = sizeof(void *) << PTE_INDEX_SIZE; + + kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor); + if (!kvm_pte_cache) + return -ENOMEM; + return 0; +} + +void kvmppc_radix_exit(void) +{ + kmem_cache_destroy(kvm_pte_cache); +} diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ec34e39471a7..e4a79679342e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1135,7 +1135,7 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr, /* * Userspace can only modify DPFD (default prefetch depth), * ILE (interrupt little-endian) and TC (translation control). - * On POWER8 userspace can also modify AIL (alt. interrupt loc.) 
+ * On POWER8 and POWER9 userspace can also modify AIL (alt. interrupt loc.). */ mask = LPCR_DPFD | LPCR_ILE | LPCR_TC; if (cpu_has_feature(CPU_FTR_ARCH_207S)) @@ -1821,6 +1821,7 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, vcpu->arch.vcore = vcore; vcpu->arch.ptid = vcpu->vcpu_id - vcore->first_vcpuid; vcpu->arch.thread_cpu = -1; + vcpu->arch.prev_cpu = -1; vcpu->arch.cpu_type = KVM_CPU_3S_64; kvmppc_sanity_check(vcpu); @@ -1950,11 +1951,33 @@ static void kvmppc_release_hwthread(int cpu) tpaca->kvm_hstate.kvm_split_mode = NULL; } +static void do_nothing(void *x) +{ +} + +static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) +{ + int i; + + cpu = cpu_first_thread_sibling(cpu); + cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); + /* + * Make sure setting of bit in need_tlb_flush precedes + * testing of cpu_in_guest bits. The matching barrier on + * the other side is the first smp_mb() in kvmppc_run_core(). + */ + smp_mb(); + for (i = 0; i < threads_per_core; ++i) + if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) + smp_call_function_single(cpu + i, do_nothing, NULL, 1); +} + static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) { int cpu; struct paca_struct *tpaca; struct kvmppc_vcore *mvc = vc->master_vcore; + struct kvm *kvm = vc->kvm; cpu = vc->pcpu; if (vcpu) { @@ -1965,6 +1988,27 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) cpu += vcpu->arch.ptid; vcpu->cpu = mvc->pcpu; vcpu->arch.thread_cpu = cpu; + + /* + * With radix, the guest can do TLB invalidations itself, + * and it could choose to use the local form (tlbiel) if + * it is invalidating a translation that has only ever been + * used on one vcpu. However, that doesn't mean it has + * only ever been used on one physical cpu, since vcpus + * can move around between pcpus. To cope with this, when + * a vcpu moves from one pcpu to another, we need to tell + * any vcpus running on the same core as this vcpu previously + * ran to flush the TLB. The TLB is shared between threads, + * so we use a single bit in .need_tlb_flush for all 4 threads. + */ + if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) { + if (vcpu->arch.prev_cpu >= 0 && + cpu_first_thread_sibling(vcpu->arch.prev_cpu) != + cpu_first_thread_sibling(cpu)) + radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); + vcpu->arch.prev_cpu = cpu; + } + cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest); } tpaca = &paca[cpu]; tpaca->kvm_hstate.kvm_vcpu = vcpu; @@ -2552,6 +2596,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) kvmppc_release_hwthread(pcpu + i); if (sip && sip->napped[i]) kvmppc_ipi_thread(pcpu + i); + cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest); } kvmppc_set_host_core(pcpu); @@ -2877,7 +2922,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu) smp_mb(); /* On the first time here, set up HTAB and VRMA */ - if (!vcpu->kvm->arch.hpte_setup_done) { + if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) { r = kvmppc_hv_setup_htab_rma(vcpu); if (r) goto out; @@ -2939,6 +2984,13 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm, { struct kvm_ppc_one_seg_page_size *sps; + /* + * Since we don't yet support HPT guests on a radix host, + * return an error if the host uses radix. 
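radix_flush_cpu() above keys the need_tlb_flush cpumask on the first hardware thread of the core, since the TLB is shared by all threads. cpu_first_thread_sibling() amounts to rounding the CPU number down to a multiple of threads_per_core; a compilable sketch, assuming threads_per_core is a power of two as on POWER8/POWER9:

#include <stdio.h>

static int threads_per_core = 4;    /* POWER9 SMT4 cores */

/* analogous to cpu_first_thread_sibling(): all threads of one core
 * map to the same CPU number, so one cpumask bit covers the core */
static int first_thread_sibling(int cpu)
{
    return cpu & ~(threads_per_core - 1);
}

int main(void)
{
    int cpu;

    for (cpu = 0; cpu < 8; cpu++)
        printf("cpu %d -> core bit %d\n", cpu, first_thread_sibling(cpu));
    return 0;
}

This is also why the assembly later in the series clears the low two bits of the CPU index (clrrdi r6,r6,2) before testing need_tlb_flush on POWER9.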
+ */ + if (radix_enabled()) + return -EINVAL; + info->flags = KVM_PPC_PAGE_SIZES_REAL; if (mmu_has_feature(MMU_FTR_1T_SEGMENT)) info->flags |= KVM_PPC_1T_SEGMENTS; @@ -2961,8 +3013,10 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, { struct kvm_memslots *slots; struct kvm_memory_slot *memslot; - int r; + int i, r; unsigned long n; + unsigned long *buf; + struct kvm_vcpu *vcpu; mutex_lock(&kvm->slots_lock); @@ -2976,15 +3030,32 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm, if (!memslot->dirty_bitmap) goto out; + /* + * Use second half of bitmap area because radix accumulates + * bits in the first half. + */ n = kvm_dirty_bitmap_bytes(memslot); - memset(memslot->dirty_bitmap, 0, n); + buf = memslot->dirty_bitmap + n / sizeof(long); + memset(buf, 0, n); - r = kvmppc_hv_get_dirty_log(kvm, memslot, memslot->dirty_bitmap); + if (kvm_is_radix(kvm)) + r = kvmppc_hv_get_dirty_log_radix(kvm, memslot, buf); + else + r = kvmppc_hv_get_dirty_log_hpt(kvm, memslot, buf); if (r) goto out; + /* Harvest dirty bits from VPA and DTL updates */ + /* Note: we never modify the SLB shadow buffer areas */ + kvm_for_each_vcpu(i, vcpu, kvm) { + spin_lock(&vcpu->arch.vpa_update_lock); + kvmppc_harvest_vpa_dirty(&vcpu->arch.vpa, memslot, buf); + kvmppc_harvest_vpa_dirty(&vcpu->arch.dtl, memslot, buf); + spin_unlock(&vcpu->arch.vpa_update_lock); + } + r = -EFAULT; - if (copy_to_user(log->dirty_bitmap, memslot->dirty_bitmap, n)) + if (copy_to_user(log->dirty_bitmap, buf, n)) goto out; r = 0; @@ -3005,6 +3076,15 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free, static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot, unsigned long npages) { + /* + * For now, if radix_enabled() then we only support radix guests, + * and in that case we don't need the rmap array. + */ + if (radix_enabled()) { + slot->arch.rmap = NULL; + return 0; + } + slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap)); if (!slot->arch.rmap) return -ENOMEM; @@ -3037,7 +3117,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, if (npages) atomic64_inc(&kvm->arch.mmio_update); - if (npages && old->npages) { + if (npages && old->npages && !kvm_is_radix(kvm)) { /* * If modifying a memslot, reset all the rmap dirty bits. 
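The dirty-log ioctl above points buf at the second half of the allocation: the first n bytes accumulate radix dirty bits, the next n bytes hold the snapshot copied to userspace. Assuming the generic KVM layout of one bit per page rounded up to whole longs, the arithmetic looks like this (sketch):

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

/* mirrors kvm_dirty_bitmap_bytes(): one bit per page, rounded up to
 * a whole number of longs (assumption based on generic KVM code) */
static unsigned long dirty_bitmap_bytes(unsigned long npages)
{
    return ((npages + BITS_PER_LONG - 1) / BITS_PER_LONG) *
           sizeof(unsigned long);
}

int main(void)
{
    unsigned long npages = 1000;
    unsigned long n = dirty_bitmap_bytes(npages);

    /* the allocation is 2*n bytes: the first half accumulates, the
     * second half (buf = dirty_bitmap + n/sizeof(long)) is the
     * scratch snapshot handed to userspace */
    printf("n = %lu bytes, buf offset = %lu longs\n",
           n, n / sizeof(unsigned long));
    return 0;
}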
* If this is a new memslot, we don't need to do anything @@ -3046,7 +3126,7 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm, */ slots = kvm_memslots(kvm); memslot = id_to_memslot(slots, mem->slot); - kvmppc_hv_get_dirty_log(kvm, memslot, NULL); + kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL); } } @@ -3085,14 +3165,20 @@ static void kvmppc_setup_partition_table(struct kvm *kvm) { unsigned long dw0, dw1; - /* PS field - page size for VRMA */ - dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | - ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); - /* HTABSIZE and HTABORG fields */ - dw0 |= kvm->arch.sdr1; + if (!kvm_is_radix(kvm)) { + /* PS field - page size for VRMA */ + dw0 = ((kvm->arch.vrma_slb_v & SLB_VSID_L) >> 1) | + ((kvm->arch.vrma_slb_v & SLB_VSID_LP) << 1); + /* HTABSIZE and HTABORG fields */ + dw0 |= kvm->arch.sdr1; - /* Second dword has GR=0; other fields are unused since UPRT=0 */ - dw1 = 0; + /* Second dword as set by userspace */ + dw1 = kvm->arch.process_table; + } else { + dw0 = PATB_HR | radix__get_tree_size() | + __pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE; + dw1 = PATB_GR | kvm->arch.process_table; + } mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1); } @@ -3262,6 +3348,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) { unsigned long lpcr, lpid; char buf[32]; + int ret; /* Allocate the guest's logical partition ID */ @@ -3309,13 +3396,30 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) lpcr |= LPCR_HVICE; } + /* + * For now, if the host uses radix, the guest must be radix. + */ + if (radix_enabled()) { + kvm->arch.radix = 1; + lpcr &= ~LPCR_VPM1; + lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR; + ret = kvmppc_init_vm_radix(kvm); + if (ret) { + kvmppc_free_lpid(kvm->arch.lpid); + return ret; + } + kvmppc_setup_partition_table(kvm); + } + kvm->arch.lpcr = lpcr; /* * Work out how many sets the TLB has, for the use of * the TLB invalidation loop in book3s_hv_rmhandlers.S. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) + if (kvm_is_radix(kvm)) + kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */ + else if (cpu_has_feature(CPU_FTR_ARCH_300)) kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */ else if (cpu_has_feature(CPU_FTR_ARCH_207S)) kvm->arch.tlb_sets = POWER8_TLB_SETS; /* 512 */ @@ -3325,8 +3429,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. + * On POWER9, we only need to do this for HPT guests on a radix + * host, which is not yet supported. */ - kvm_hv_vm_activated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_activated(); /* * Create a debugfs directory for the VM @@ -3352,11 +3459,17 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - kvm_hv_vm_deactivated(); + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); - kvmppc_free_hpt(kvm); + kvmppc_free_lpid(kvm->arch.lpid); + + if (kvm_is_radix(kvm)) + kvmppc_free_radix(kvm); + else + kvmppc_free_hpt(kvm); kvmppc_free_pimap(kvm); } @@ -3385,11 +3498,6 @@ static int kvmppc_core_check_processor_compat_hv(void) if (!cpu_has_feature(CPU_FTR_HVMODE) || !cpu_has_feature(CPU_FTR_ARCH_206)) return -EIO; - /* - * Disable KVM for Power9 in radix mode. 
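kvmppc_setup_partition_table() above packs the two partition-table doublewords differently for hash and radix. A sketch of the radix packing; the bit positions (PATB_HR and PATB_GR as the top bit of their respective doublewords) are assumptions taken from the 64-bit book3s MMU headers, not definitions made by this patch:

#include <stdint.h>
#include <stdio.h>

#define PATB_HR (1ULL << 63)    /* dw0: host uses radix (assumed value) */
#define PATB_GR (1ULL << 63)    /* dw1: guest uses radix (assumed value) */

/* dw0: HR | RTS (tree size) | radix root PA | root index size (RPDS) */
static uint64_t radix_patb0(uint64_t tree_size, uint64_t pgd_pa,
                            uint64_t index_size)
{
    return PATB_HR | tree_size | pgd_pa | index_size;
}

/* dw1: GR | process-table base and size, as supplied by userspace */
static uint64_t radix_patb1(uint64_t process_table)
{
    return PATB_GR | process_table;
}

int main(void)
{
    printf("dw0 = %#llx\ndw1 = %#llx\n",
           (unsigned long long)radix_patb0(0, 0x1000000, 13),
           (unsigned long long)radix_patb1(0x2000000 | 12));
    return 0;
}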
- */ - if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled()) - return -EIO; return 0; } @@ -3657,6 +3765,41 @@ static void init_default_hcalls(void) } } +static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg) +{ + unsigned long lpcr; + int radix; + + /* If not on a POWER9, reject it */ + if (!cpu_has_feature(CPU_FTR_ARCH_300)) + return -ENODEV; + + /* If any unknown flags set, reject it */ + if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE)) + return -EINVAL; + + /* We can't change a guest to/from radix yet */ + radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX); + if (radix != kvm_is_radix(kvm)) + return -EINVAL; + + /* GR (guest radix) bit in process_table field must match */ + if (!!(cfg->process_table & PATB_GR) != radix) + return -EINVAL; + + /* Process table size field must be reasonable, i.e. <= 24 */ + if ((cfg->process_table & PRTS_MASK) > 24) + return -EINVAL; + + kvm->arch.process_table = cfg->process_table; + kvmppc_setup_partition_table(kvm); + + lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; + kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + + return 0; +} + static struct kvmppc_ops kvm_ops_hv = { .get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv, .set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv, @@ -3694,6 +3837,8 @@ static struct kvmppc_ops kvm_ops_hv = { .irq_bypass_add_producer = kvmppc_irq_bypass_add_producer_hv, .irq_bypass_del_producer = kvmppc_irq_bypass_del_producer_hv, #endif + .configure_mmu = kvmhv_configure_mmu, + .get_rmmu_info = kvmhv_get_rmmu_info, }; static int kvm_init_subcore_bitmap(void) @@ -3728,6 +3873,11 @@ static int kvm_init_subcore_bitmap(void) return 0; } +static int kvmppc_radix_possible(void) +{ + return cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled(); +} + static int kvmppc_book3s_init_hv(void) { int r; @@ -3767,12 +3917,19 @@ static int kvmppc_book3s_init_hv(void) init_vcore_lists(); r = kvmppc_mmu_hv_init(); + if (r) + return r; + + if (kvmppc_radix_possible()) + r = kvmppc_radix_init(); return r; } static void kvmppc_book3s_exit_hv(void) { kvmppc_free_host_rm_ops(); + if (kvmppc_radix_possible()) + kvmppc_radix_exit(); kvmppc_hv_ops = NULL; } diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5bb24be0b346..2f69fbc19bb0 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -200,7 +200,6 @@ static inline void rm_writeb(unsigned long paddr, u8 val) /* * Send an interrupt or message to another CPU. - * This can only be called in real mode. * The caller needs to include any barrier needed to order writes * to memory vs. the IPI/message. */ @@ -229,8 +228,7 @@ void kvmhv_rm_send_ipi(int cpu) if (xics_phys) rm_writeb(xics_phys + XICS_MFRR, IPI_PRIORITY); else - opal_rm_int_set_mfrr(get_hard_smp_processor_id(cpu), - IPI_PRIORITY); + opal_int_set_mfrr(get_hard_smp_processor_id(cpu), IPI_PRIORITY); } /* @@ -412,14 +410,13 @@ static long kvmppc_read_one_intr(bool *again) /* Now read the interrupt from the ICP */ xics_phys = local_paca->kvm_hstate.xics_phys; - if (!xics_phys) { - /* Use OPAL to read the XIRR */ - rc = opal_rm_int_get_xirr(&xirr, false); - if (rc < 0) - return 1; - } else { + rc = 0; + if (!xics_phys) + rc = opal_int_get_xirr(&xirr, false); + else xirr = _lwzcix(xics_phys + XICS_XIRR); - } + if (rc < 0) + return 1; /* * Save XIRR for later. Since we get control in reverse endian @@ -445,15 +442,16 @@ static long kvmppc_read_one_intr(bool *again) * If it is an IPI, clear the MFRR and EOI it. 
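For context, kvmhv_configure_mmu() above is reached via the new KVM_PPC_CONFIGURE_V3_MMU ioctl (wired up in powerpc.c further down). A hypothetical userspace sketch of a VMM enabling a radix guest; it assumes a <linux/kvm.h> built from this series, and the GR top bit in process_table is an assumption:

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>  /* assumed to define the new ioctl and struct */

/* Flag values come from the uapi additions in this series; the kernel
 * rejects a process-table size field greater than 24. */
static int enable_radix_mmu(int vm_fd, uint64_t prtb_base, int prtb_size)
{
    struct kvm_ppc_mmuv3_cfg cfg = {
        .flags = KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE,
        /* GR bit (assumed bit 63) | base | size field */
        .process_table = (1ULL << 63) | prtb_base | prtb_size,
    };

    return ioctl(vm_fd, KVM_PPC_CONFIGURE_V3_MMU, &cfg);
}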
*/ if (xisr == XICS_IPI) { + rc = 0; if (xics_phys) { _stbcix(xics_phys + XICS_MFRR, 0xff); _stwcix(xics_phys + XICS_XIRR, xirr); } else { - opal_rm_int_set_mfrr(hard_smp_processor_id(), 0xff); - rc = opal_rm_int_eoi(h_xirr); - /* If rc > 0, there is another interrupt pending */ - *again = rc > 0; + opal_int_set_mfrr(hard_smp_processor_id(), 0xff); + rc = opal_int_eoi(h_xirr); } + /* If rc > 0, there is another interrupt pending */ + *again = rc > 0; /* * Need to ensure side effects of above stores @@ -474,8 +472,8 @@ static long kvmppc_read_one_intr(bool *again) if (xics_phys) _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); else - opal_rm_int_set_mfrr(hard_smp_processor_id(), - IPI_PRIORITY); + opal_int_set_mfrr(hard_smp_processor_id(), + IPI_PRIORITY); /* Let side effects complete */ smp_mb(); return 1; diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c index 9ef3c4be952f..b095afcd4309 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c @@ -43,6 +43,7 @@ static void *real_vmalloc_addr(void *x) static int global_invalidates(struct kvm *kvm, unsigned long flags) { int global; + int cpu; /* * If there is only one vcore, and it's currently running, @@ -60,8 +61,14 @@ static int global_invalidates(struct kvm *kvm, unsigned long flags) /* any other core might now have stale TLB entries... */ smp_wmb(); cpumask_setall(&kvm->arch.need_tlb_flush); - cpumask_clear_cpu(local_paca->kvm_hstate.kvm_vcore->pcpu, - &kvm->arch.need_tlb_flush); + cpu = local_paca->kvm_hstate.kvm_vcore->pcpu; + /* + * On POWER9, threads are independent but the TLB is shared, + * so use the bit for the first thread to represent the core. + */ + if (cpu_has_feature(CPU_FTR_ARCH_300)) + cpu = cpu_first_thread_sibling(cpu); + cpumask_clear_cpu(cpu, &kvm->arch.need_tlb_flush); } return global; @@ -182,6 +189,8 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags, unsigned long mmu_seq; unsigned long rcbits, irq_flags = 0; + if (kvm_is_radix(kvm)) + return H_FUNCTION; psize = hpte_page_size(pteh, ptel); if (!psize) return H_PARAMETER; @@ -458,6 +467,8 @@ long kvmppc_do_h_remove(struct kvm *kvm, unsigned long flags, struct revmap_entry *rev; u64 pte, orig_pte, pte_r; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; hpte = (__be64 *)(kvm->arch.hpt_virt + (pte_index << 4)); @@ -529,6 +540,8 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu) struct revmap_entry *rev, *revs[4]; u64 hp0, hp1; + if (kvm_is_radix(kvm)) + return H_FUNCTION; global = global_invalidates(kvm, 0); for (i = 0; i < 4 && ret == H_SUCCESS; ) { n = 0; @@ -642,6 +655,8 @@ long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long v, r, rb, mask, bits; u64 pte_v, pte_r; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -711,6 +726,8 @@ long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags, int i, n = 1; struct revmap_entry *rev = NULL; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; if (flags & H_READ_4) { @@ -750,6 +767,8 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; + if (kvm_is_radix(kvm)) + return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; @@ -796,6 +815,8 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags, unsigned long *rmap; long ret = H_NOT_FOUND; + if (kvm_is_radix(kvm)) 
+ return H_FUNCTION; if (pte_index >= kvm->arch.hpt_npte) return H_PARAMETER; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 06edc4366639..29f43ed6d5eb 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -36,7 +36,7 @@ EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); -static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu); +static int xics_opal_set_server(unsigned int hw_irq, int server_cpu); /* -- ICS routines -- */ static void ics_rm_check_resend(struct kvmppc_xics *xics, @@ -70,11 +70,9 @@ static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) hcpu = hcore << threads_shift; kvmppc_host_rm_ops_hv->rm_core[hcore].rm_data = vcpu; smp_muxed_ipi_set_message(hcpu, PPC_MSG_RM_HOST_ACTION); - if (paca[hcpu].kvm_hstate.xics_phys) - icp_native_cause_ipi_rm(hcpu); - else - opal_rm_int_set_mfrr(get_hard_smp_processor_id(hcpu), - IPI_PRIORITY); + kvmppc_set_host_ipi(hcpu, 1); + smp_mb(); + kvmhv_rm_send_ipi(hcpu); } #else static inline void icp_send_hcore_msg(int hcore, struct kvm_vcpu *vcpu) { } @@ -730,7 +728,7 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) ++vcpu->stat.pthru_host; if (state->intr_cpu != pcpu) { ++vcpu->stat.pthru_bad_aff; - xics_opal_rm_set_server(state->host_irq, pcpu); + xics_opal_set_server(state->host_irq, pcpu); } state->intr_cpu = -1; } @@ -758,16 +756,16 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) if (xics_phys) { _stwcix(xics_phys + XICS_XIRR, xirr); } else { - rc = opal_rm_int_eoi(be32_to_cpu(xirr)); + rc = opal_int_eoi(be32_to_cpu(xirr)); *again = rc > 0; } } -static int xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu) +static int xics_opal_set_server(unsigned int hw_irq, int server_cpu) { unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2; - return opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY); + return opal_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY); } /* diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9338a818e05c..47414a6fe2dd 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -148,6 +148,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) addi r1, r1, 112 ld r7, HSTATE_HOST_MSR(r13) + /* + * If we came back from the guest via a relocation-on interrupt, + * we will be in virtual mode at this point, which makes it a + * little easier to get back to the caller. + */ + mfmsr r0 + andi. r0, r0, MSR_IR /* in real mode? */ + bne .Lvirt_return + cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL beq 11f @@ -181,6 +190,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtspr SPRN_HSRR1, r7 ba 0xe80 + /* Virtual-mode return - can't get here for HMI or machine check */ +.Lvirt_return: + cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL + beq 16f + cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL + beq 17f + andi. r0, r7, MSR_EE /* were interrupts hard-enabled? 
*/ + beq 18f + mtmsrd r7, 1 /* if so then re-enable them */ +18: mtlr r8 + blr + +16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */ + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4500_hardware_interrupt + +17: mtspr SPRN_HSRR0, r8 + mtspr SPRN_HSRR1, r7 + b exc_virt_0x4e80_h_doorbell + kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ @@ -518,6 +547,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) /* Stack frame offsets */ #define STACK_SLOT_TID (112-16) #define STACK_SLOT_PSSCR (112-24) +#define STACK_SLOT_PID (112-32) .global kvmppc_hv_entry kvmppc_hv_entry: @@ -530,6 +560,7 @@ kvmppc_hv_entry: * R1 = host R1 * R2 = TOC * all other volatile GPRS = free + * Does not preserve non-volatile GPRs or CR fields */ mflr r0 std r0, PPC_LR_STKOFF(r1) @@ -549,32 +580,38 @@ kvmppc_hv_entry: bl kvmhv_start_timing 1: #endif - /* Clear out SLB */ + + /* Use cr7 as an indication of radix mode */ + ld r5, HSTATE_KVM_VCORE(r13) + ld r9, VCORE_KVM(r5) /* pointer to struct kvm */ + lbz r0, KVM_RADIX(r9) + cmpwi cr7, r0, 0 + + /* Clear out SLB if hash */ + bne cr7, 2f li r6,0 slbmte r6,r6 slbia ptesync - +2: /* * POWER7/POWER8 host -> guest partition switch code. * We don't have to lock against concurrent tlbies, * but we do have to coordinate across hardware threads. */ /* Set bit in entry map iff exit map is zero. */ - ld r5, HSTATE_KVM_VCORE(r13) li r7, 1 lbz r6, HSTATE_PTID(r13) sld r7, r7, r6 - addi r9, r5, VCORE_ENTRY_EXIT -21: lwarx r3, 0, r9 + addi r8, r5, VCORE_ENTRY_EXIT +21: lwarx r3, 0, r8 cmpwi r3, 0x100 /* any threads starting to exit? */ bge secondary_too_late /* if so we're too late to the party */ or r3, r3, r7 - stwcx. r3, 0, r9 + stwcx. r3, 0, r8 bne 21b /* Primary thread switches to guest partition. */ - ld r9,VCORE_KVM(r5) /* pointer to struct kvm */ cmpwi r6,0 bne 10f lwz r7,KVM_LPID(r9) @@ -590,30 +627,44 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) /* See if we need to flush the TLB */ lhz r6,PACAPACAINDEX(r13) /* test_bit(cpu, need_tlb_flush) */ +BEGIN_FTR_SECTION + /* + * On POWER9, individual threads can come in here, but the + * TLB is shared between the 4 threads in a core, hence + * invalidating on one thread invalidates for all. + * Thus we make all 4 threads use the same bit here. + */ + clrrdi r6,r6,2 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) clrldi r7,r6,64-6 /* extract bit number (6 bits) */ srdi r6,r6,6 /* doubleword number */ sldi r6,r6,3 /* address offset */ add r6,r6,r9 addi r6,r6,KVM_NEED_FLUSH /* dword in kvm->arch.need_tlb_flush */ - li r0,1 - sld r0,r0,r7 + li r8,1 + sld r8,r8,r7 ld r7,0(r6) - and. r7,r7,r0 + and. r7,r7,r8 beq 22f -23: ldarx r7,0,r6 /* if set, clear the bit */ - andc r7,r7,r0 - stdcx. r7,0,r6 - bne 23b /* Flush the TLB of any entries for this LPID */ - lwz r6,KVM_TLB_SETS(r9) - li r0,0 /* RS for P9 version of tlbiel */ - mtctr r6 + lwz r0,KVM_TLB_SETS(r9) + mtctr r0 li r7,0x800 /* IS field = 0b10 */ ptesync -28: tlbiel r7 + li r0,0 /* RS for P9 version of tlbiel */ + bne cr7, 29f +28: tlbiel r7 /* On P9, rs=0, RIC=0, PRS=0, R=0 */ addi r7,r7,0x1000 bdnz 28b - ptesync + b 30f +29: PPC_TLBIEL(7,0,2,1,1) /* for radix, RIC=2, PRS=1, R=1 */ + addi r7,r7,0x1000 + bdnz 29b +30: ptesync +23: ldarx r7,0,r6 /* clear the bit after TLB flushed */ + andc r7,r7,r8 + stdcx. 
r7,0,r6 + bne 23b /* Add timebase offset onto timebase */ 22: ld r8,VCORE_TB_OFFSET(r5) @@ -658,7 +709,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) beq kvmppc_primary_no_guest kvmppc_got_guest: - /* Load up guest SLB entries */ + /* Load up guest SLB entries (N.B. slb_max will be 0 for radix) */ lwz r5,VCPU_SLB_MAX(r4) cmpwi r5,0 beq 9f @@ -696,8 +747,10 @@ kvmppc_got_guest: BEGIN_FTR_SECTION mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR + mfspr r7, SPRN_PID std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) + std r7, STACK_SLOT_PID(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION @@ -824,6 +877,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) mtspr SPRN_PID, r7 mtspr SPRN_WORT, r8 BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) +BEGIN_FTR_SECTION /* POWER8-only registers */ ld r5, VCPU_TCSCR(r4) ld r6, VCPU_ACOP(r4) @@ -1057,13 +1113,13 @@ hdec_soon: kvmppc_interrupt_hv: /* * Register contents: - * R12 = interrupt vector + * R12 = (guest CR << 32) | interrupt vector * R13 = PACA - * guest CR, R12 saved in shadow VCPU SCRATCH1/0 + * guest R12 saved in shadow VCPU SCRATCH0 + * guest CTR saved in shadow VCPU SCRATCH1 if RELOCATABLE * guest R13 saved in SPRN_SCRATCH0 */ std r9, HSTATE_SCRATCH2(r13) - lbz r9, HSTATE_IN_GUEST(r13) cmpwi r9, KVM_GUEST_MODE_HOST_HV beq kvmppc_bad_host_intr @@ -1094,8 +1150,9 @@ kvmppc_interrupt_hv: std r10, VCPU_GPR(R10)(r9) std r11, VCPU_GPR(R11)(r9) ld r3, HSTATE_SCRATCH0(r13) - lwz r4, HSTATE_SCRATCH1(r13) std r3, VCPU_GPR(R12)(r9) + /* CR is in the high half of r12 */ + srdi r4, r12, 32 stw r4, VCPU_CR(r9) BEGIN_FTR_SECTION ld r3, HSTATE_CFAR(r13) @@ -1114,6 +1171,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) mfspr r11, SPRN_SRR1 std r10, VCPU_SRR0(r9) std r11, VCPU_SRR1(r9) + /* trap is in the low half of r12, clear CR from the high half */ + clrldi r12, r12, 32 andi. r0, r12, 2 /* need to read HSRR0/1? */ beq 1f mfspr r10, SPRN_HSRR0 @@ -1149,7 +1208,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) 11: stw r3,VCPU_HEIR(r9) /* these are volatile across C function calls */ +#ifdef CONFIG_RELOCATABLE + ld r3, HSTATE_SCRATCH1(r13) + mtctr r3 +#else mfctr r3 +#endif mfxer r4 std r3, VCPU_CTR(r9) std r4, VCPU_XER(r9) @@ -1285,11 +1349,15 @@ mc_cont: mtspr SPRN_CTRLT,r6 4: /* Read the guest SLB and save it away */ + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi r0, 0 + li r5, 0 + bne 3f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 addi r7,r9,VCPU_SLB - li r5,0 1: slbmfee r8,r6 andis. r0,r8,SLB_ESID_V@h beq 2f @@ -1301,7 +1369,7 @@ mc_cont: addi r5,r5,1 2: addi r6,r6,1 bdnz 1b - stw r5,VCPU_SLB_MAX(r9) +3: stw r5,VCPU_SLB_MAX(r9) /* * Save the guest PURR/SPURR @@ -1550,9 +1618,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) + ld r7, STACK_SLOT_PID(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 + mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + PPC_INVALIDATE_ERAT +END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1) /* * POWER7/POWER8 guest -> host partition switch code. 
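Back in kvmppc_hv_entry, the lwarx/stwcx. loop at label 21: lets a thread join the vcore only while no thread has begun exiting (entry/exit word below 0x100). In C11-atomics terms the protocol is roughly the following (a sketch, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* low bits: per-thread entry map; values >= 0x100: threads exiting */
static bool try_enter(_Atomic unsigned int *entry_exit, int ptid)
{
    unsigned int old = atomic_load(entry_exit);

    do {
        if (old >= 0x100)   /* too late to the party */
            return false;
    } while (!atomic_compare_exchange_weak(entry_exit, &old,
                                           old | (1u << ptid)));
    return true;
}

int main(void)
{
    _Atomic unsigned int ee = 0;

    printf("%d %d\n", try_enter(&ee, 0), try_enter(&ee, 1)); /* 1 1 */
    atomic_store(&ee, 0x100);           /* an exit has started */
    printf("%d\n", try_enter(&ee, 2));  /* 0 */
    return 0;
}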
@@ -1663,6 +1736,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) isync /* load host SLB entries */ +BEGIN_MMU_FTR_SECTION + b 0f +END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) ld r8,PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED @@ -1675,7 +1751,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) slbmte r6,r5 1: addi r8,r8,16 .endr - +0: #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING /* Finish timing, if we have a vcpu */ ld r4, HSTATE_KVM_VCPU(r13) @@ -1702,11 +1778,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) * reflect the HDSI to the guest as a DSI. */ kvmppc_hdsi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR + bne .Lradix_hdsi /* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_DR /* data relocation enabled? */ beq 3f clrrdi r0, r4, 28 @@ -1776,13 +1860,29 @@ fast_interrupt_c_return: stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont +.Lradix_hdsi: + std r4, VCPU_FAULT_DAR(r9) + stw r6, VCPU_FAULT_DSISR(r9) +.Lradix_hisi: + mfspr r5, SPRN_ASDR + std r5, VCPU_FAULT_GPA(r9) + b guest_exit_cont + /* * Similarly for an HISI, reflect it to the guest as an ISI unless * it is an HPTE not found fault for a page that we have paged out. */ kvmppc_hisi: + ld r3, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r3) + cmpwi r0, 0 + bne .Lradix_hisi /* for radix, just save ASDR */ andis. r0, r11, SRR1_ISI_NOPT@h beq 1f +BEGIN_FTR_SECTION + mfspr r5, SPRN_ASDR /* on POWER9, use ASDR to get VSID */ + b 4f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) andi. r0, r11, MSR_IR /* instruction relocation enabled? */ beq 3f clrrdi r0, r10, 28 diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index ca8f174289bb..2a2b96d53999 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -167,20 +167,38 @@ kvmppc_handler_trampoline_enter_end: * * *****************************************************************************/ -.global kvmppc_handler_trampoline_exit -kvmppc_handler_trampoline_exit: - .global kvmppc_interrupt_pr kvmppc_interrupt_pr: + /* 64-bit entry. 
Register usage at this point: + * + * SPRG_SCRATCH0 = guest R13 + * R12 = (guest CR << 32) | exit handler id + * R13 = PACA + * HSTATE.SCRATCH0 = guest R12 + * HSTATE.SCRATCH1 = guest CTR if RELOCATABLE + */ +#ifdef CONFIG_PPC64 + /* Match 32-bit entry */ +#ifdef CONFIG_RELOCATABLE + std r9, HSTATE_SCRATCH2(r13) + ld r9, HSTATE_SCRATCH1(r13) + mtctr r9 + ld r9, HSTATE_SCRATCH2(r13) +#endif + rotldi r12, r12, 32 /* Flip R12 halves for stw */ + stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ + srdi r12, r12, 32 /* shift trap into low half */ +#endif +.global kvmppc_handler_trampoline_exit +kvmppc_handler_trampoline_exit: /* Register usage at this point: * - * SPRG_SCRATCH0 = guest R13 - * R12 = exit handler id - * R13 = shadow vcpu (32-bit) or PACA (64-bit) + * SPRG_SCRATCH0 = guest R13 + * R12 = exit handler id + * R13 = shadow vcpu (32-bit) or PACA (64-bit) * HSTATE.SCRATCH0 = guest R12 * HSTATE.SCRATCH1 = guest CR - * */ /* Save registers */ diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index cd892dec7cb6..40a5b2d75ed1 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -565,6 +565,13 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_PPC_HWRNG: r = kvmppc_hwrng_present(); break; + case KVM_CAP_PPC_MMU_RADIX: + r = !!(hv_enabled && radix_enabled()); + break; + case KVM_CAP_PPC_MMU_HASH_V3: + r = !!(hv_enabled && !radix_enabled() && + cpu_has_feature(CPU_FTR_ARCH_300)); + break; #endif case KVM_CAP_SYNC_MMU: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -1468,6 +1475,31 @@ long kvm_arch_vm_ioctl(struct file *filp, r = kvm_vm_ioctl_rtas_define_token(kvm, argp); break; } + case KVM_PPC_CONFIGURE_V3_MMU: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_mmuv3_cfg cfg; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->configure_mmu) + goto out; + r = -EFAULT; + if (copy_from_user(&cfg, argp, sizeof(cfg))) + goto out; + r = kvm->arch.kvm_ops->configure_mmu(kvm, &cfg); + break; + } + case KVM_PPC_GET_RMMU_INFO: { + struct kvm *kvm = filp->private_data; + struct kvm_ppc_rmmu_info info; + + r = -EINVAL; + if (!kvm->arch.kvm_ops->get_rmmu_info) + goto out; + r = kvm->arch.kvm_ops->get_rmmu_info(kvm, &info); + if (r >= 0 && copy_to_user(argp, &info, sizeof(info))) + r = -EFAULT; + break; + } default: { struct kvm *kvm = filp->private_data; r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg); diff --git a/arch/powerpc/lib/Makefile b/arch/powerpc/lib/Makefile index 309361e86523..0e649d72fe8d 100644 --- a/arch/powerpc/lib/Makefile +++ b/arch/powerpc/lib/Makefile @@ -21,9 +21,7 @@ obj64-y += copypage_64.o copyuser_64.o usercopy_64.o mem_64.o hweight_64.o \ obj64-$(CONFIG_SMP) += locks.o obj64-$(CONFIG_ALTIVEC) += vmx-helper.o -ifeq ($(CONFIG_GENERIC_CSUM),) obj-y += checksum_$(BITS).o checksum_wrappers.o -endif obj-$(CONFIG_PPC_EMULATE_SSTEP) += sstep.o ldstfp.o diff --git a/arch/powerpc/lib/checksum_64.S b/arch/powerpc/lib/checksum_64.S index d0d311e108ff..d7f1a966136e 100644 --- a/arch/powerpc/lib/checksum_64.S +++ b/arch/powerpc/lib/checksum_64.S @@ -36,7 +36,7 @@ _GLOBAL(__csum_partial) * work to calculate the correct checksum, we ignore that case * and take the potential slowdown of unaligned loads. */ - rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + rldicl. 
r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */ beq .Lcsum_aligned li r7,4 @@ -168,8 +168,12 @@ _GLOBAL(__csum_partial) beq .Lcsum_finish lbz r6,0(r3) +#ifdef __BIG_ENDIAN__ sldi r9,r6,8 /* Pad the byte out to 16 bits */ adde r0,r0,r9 +#else + adde r0,r0,r6 +#endif .Lcsum_finish: addze r0,r0 /* add in final carry */ @@ -224,7 +228,7 @@ _GLOBAL(csum_partial_copy_generic) * If the source and destination are relatively unaligned we only * align the source. This keeps things simple. */ - rldicl. r6,r3,64-1,64-2 /* r6 = (r3 & 0x3) >> 1 */ + rldicl. r6,r3,64-1,64-2 /* r6 = (r3 >> 1) & 0x3 */ beq .Lcopy_aligned li r9,4 @@ -386,8 +390,12 @@ dstnr; sth r6,0(r4) beq .Lcopy_finish srcnr; lbz r6,0(r3) +#ifdef __BIG_ENDIAN__ sldi r9,r6,8 /* Pad the byte out to 16 bits */ adde r0,r0,r9 +#else + adde r0,r0,r6 +#endif dstnr; stb r6,0(r4) .Lcopy_finish: diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index c1746df0f88e..0899315e1434 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -32,6 +32,49 @@ int patch_branch(unsigned int *addr, unsigned long target, int flags) return patch_instruction(addr, create_branch(addr, target, flags)); } +bool is_offset_in_branch_range(long offset) +{ + /* + * The powerpc branch instruction is: + * + * 0 6 30 31 + * +---------+----------------+---+---+ + * | opcode | LI |AA |LK | + * +---------+----------------+---+---+ + * Where AA = 0 and LK = 0 + * + * LI is a signed 24-bit integer. The real branch offset is computed + * by: imm32 = SignExtend(LI:'0b00', 32); + * + * So the maximum forward branch should be: + * (0x007fffff << 2) = 0x01fffffc = 0x1fffffc + * The maximum backward branch should be: + * (0xff800000 << 2) = 0xfe000000 = -0x2000000 + */ + return (offset >= -0x2000000 && offset <= 0x1fffffc && !(offset & 0x3)); +} + +/* + * Helper to check if a given instruction is a conditional branch + * Derived from the conditional checks in analyse_instr() + */ +bool __kprobes is_conditional_branch(unsigned int instr) +{ + unsigned int opcode = instr >> 26; + + if (opcode == 16) /* bc, bca, bcl, bcla */ + return true; + if (opcode == 19) { + switch ((instr >> 1) & 0x3ff) { + case 16: /* bclr, bclrl */ + case 528: /* bcctr, bcctrl */ + case 560: /* bctar, bctarl */ + return true; + } + } + return false; +} + unsigned int create_branch(const unsigned int *addr, unsigned long target, int flags) { @@ -43,7 +86,7 @@ unsigned int create_branch(const unsigned int *addr, offset = offset - (unsigned long)addr; /* Check we can represent the target in the instruction format */ - if (offset < -0x2000000 || offset > 0x1fffffc || offset & 0x3) + if (!is_offset_in_branch_range(offset)) return 0; /* Mask out the flags and target, so they don't step on each other.
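is_offset_in_branch_range() above simply encodes the +/-32 MiB, word-aligned reach of the I-form branch. A quick self-test of the boundary values derived in its comment:

#include <stdbool.h>
#include <stdio.h>

/* same predicate as the patch: signed 26-bit reach, word aligned */
static bool is_offset_in_branch_range(long offset)
{
    return offset >= -0x2000000 && offset <= 0x1fffffc &&
           !(offset & 0x3);
}

int main(void)
{
    printf("%d %d %d %d\n",
           is_offset_in_branch_range(0x1fffffc),   /* max forward: 1 */
           is_offset_in_branch_range(-0x2000000),  /* max backward: 1 */
           is_offset_in_branch_range(0x2000000),   /* too far: 0 */
           is_offset_in_branch_range(2));          /* unaligned: 0 */
    return 0;
}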
*/ diff --git a/arch/powerpc/lib/copypage_64.S b/arch/powerpc/lib/copypage_64.S index 21367b3a8146..4bcc9e76fb55 100644 --- a/arch/powerpc/lib/copypage_64.S +++ b/arch/powerpc/lib/copypage_64.S @@ -26,8 +26,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_VMX_COPY) ori r5,r5,PAGE_SIZE@l BEGIN_FTR_SECTION ld r10,PPC64_CACHES@toc(r2) - lwz r11,DCACHEL1LOGLINESIZE(r10) /* log2 of cache line size */ - lwz r12,DCACHEL1LINESIZE(r10) /* get cache line size */ + lwz r11,DCACHEL1LOGBLOCKSIZE(r10) /* log2 of cache block size */ + lwz r12,DCACHEL1BLOCKSIZE(r10) /* get cache block size */ li r9,0 srd r8,r5,r11 diff --git a/arch/powerpc/lib/sstep.c b/arch/powerpc/lib/sstep.c index 06c7e9b88408..846dba2c6360 100644 --- a/arch/powerpc/lib/sstep.c +++ b/arch/powerpc/lib/sstep.c @@ -1803,9 +1803,8 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) return 0; if (op.ea & (size - 1)) break; /* can't handle misaligned */ - err = -EFAULT; if (!address_ok(regs, op.ea, size)) - goto ldst_done; + return 0; err = 0; switch (size) { case 4: @@ -1828,9 +1827,8 @@ int __kprobes emulate_step(struct pt_regs *regs, unsigned int instr) return 0; if (op.ea & (size - 1)) break; /* can't handle misaligned */ - err = -EFAULT; if (!address_ok(regs, op.ea, size)) - goto ldst_done; + return 0; err = 0; switch (size) { case 4: diff --git a/arch/powerpc/lib/string_64.S b/arch/powerpc/lib/string_64.S index c100f4d5d5d0..d5b4d9498c54 100644 --- a/arch/powerpc/lib/string_64.S +++ b/arch/powerpc/lib/string_64.S @@ -152,9 +152,9 @@ err2; std r0,0(r3) addi r3,r3,8 addi r4,r4,-8 - /* Destination is 16 byte aligned, need to get it cacheline aligned */ -11: lwz r7,DCACHEL1LOGLINESIZE(r5) - lwz r9,DCACHEL1LINESIZE(r5) + /* Destination is 16 byte aligned, need to get it cache block aligned */ +11: lwz r7,DCACHEL1LOGBLOCKSIZE(r5) + lwz r9,DCACHEL1BLOCKSIZE(r5) /* * With worst case alignment the long clear loop takes a minimum diff --git a/arch/powerpc/mm/copro_fault.c b/arch/powerpc/mm/copro_fault.c index aaa7ec6788b9..697b70ad1195 100644 --- a/arch/powerpc/mm/copro_fault.c +++ b/arch/powerpc/mm/copro_fault.c @@ -67,11 +67,13 @@ int copro_handle_mm_fault(struct mm_struct *mm, unsigned long ea, if (!(vma->vm_flags & (VM_READ | VM_EXEC))) goto out_unlock; /* - * protfault should only happen due to us - * mapping a region readonly temporarily. PROT_NONE - * is also covered by the VMA check above. + * PROT_NONE is covered by the VMA check above, + * and hash should get a NOHPTE fault instead of + * a PROTFAULT in case a fixup is needed for things + * like autonuma. */ - WARN_ON_ONCE(dsisr & DSISR_PROTFAULT); + if (!radix_enabled()) + WARN_ON_ONCE(dsisr & DSISR_PROTFAULT); } ret = 0; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 62a50d6d1053..8dc758658972 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -407,15 +407,6 @@ good_area: (cpu_has_feature(CPU_FTR_NOEXECUTE) || !(vma->vm_flags & (VM_READ | VM_WRITE)))) goto bad_area; - -#ifdef CONFIG_PPC_STD_MMU - /* - * protfault should only happen due to us - * mapping a region readonly temporarily. PROT_NONE - * is also covered by the VMA check above.
- */ - WARN_ON_ONCE(error_code & DSISR_PROTFAULT); -#endif /* CONFIG_PPC_STD_MMU */ /* a write */ } else if (is_write) { if (!(vma->vm_flags & VM_WRITE)) goto bad_area; @@ -425,8 +416,40 @@ good_area: } else { if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))) goto bad_area; - WARN_ON_ONCE(error_code & DSISR_PROTFAULT); } +#ifdef CONFIG_PPC_STD_MMU + /* + * For hash translation mode, we should never get a + * PROTFAULT. Any update to the pte that reduces access will result in us + * removing the hash page table entry, thus resulting in a DSISR_NOHPTE + * fault instead of DSISR_PROTFAULT. + * + * A pte update that relaxes access will not invalidate the hash page + * table entry and hence can result in DSISR_PROTFAULT. + * ptep_set_access_flags() doesn't do an hpte flush. This is why we have + * the special !is_write in the conditional below. + * + * For platforms that don't support a coherent icache but do support a + * per-page noexec bit, we set things up so that the D/I cache sync + * happens via a fault. But that is handled via the low level + * hash fault code (hash_page_do_lazy_icache()) and we should not reach + * here in that case. + * + * For bad accesses that can result in a PROTFAULT, the vma->vm_flags + * checks above handle those, so we correctly fall through to the + * bad_area handling. + * + * For embedded MMUs with per-page exec support but no coherent + * icache, we do get a PROTFAULT and handle the D/I cache sync in + * set_pte_at() while taking the noexec/prot fault. Hence this WARN_ON + * is conditional on the server MMU. + * + * For radix, we can get a prot fault in the autonuma case, because the + * radix page table will have the pages marked no-access for user. + */ + if (!radix_enabled() && !is_write) + WARN_ON_ONCE(error_code & DSISR_PROTFAULT); +#endif /* CONFIG_PPC_STD_MMU */ /* * If for any reason at all we couldn't handle the fault, diff --git a/arch/powerpc/mm/hash_utils_64.c b/arch/powerpc/mm/hash_utils_64.c index 67e19a0821be..12d679df50bd 100644 --- a/arch/powerpc/mm/hash_utils_64.c +++ b/arch/powerpc/mm/hash_utils_64.c @@ -35,7 +35,9 @@ #include <linux/memblock.h> #include <linux/context_tracking.h> #include <linux/libfdt.h> +#include <linux/debugfs.h> +#include <asm/debug.h> #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/mmu.h> @@ -747,6 +749,35 @@ static unsigned long __init htab_get_table_size(void) } #ifdef CONFIG_MEMORY_HOTPLUG +void resize_hpt_for_hotplug(unsigned long new_mem_size) +{ + unsigned target_hpt_shift; + + if (!mmu_hash_ops.resize_hpt) + return; + + target_hpt_shift = htab_shift_for_mem_size(new_mem_size); + + /* + * To avoid lots of HPT resizes if the memory size is fluctuating + * across a boundary, we deliberately have some hysteresis + * here: we immediately increase the HPT size if the target + * shift exceeds the current shift, but we won't attempt to + * reduce unless the target shift is at least 2 below the + * current shift. + */ + if ((target_hpt_shift > ppc64_pft_size) + || (target_hpt_shift < (ppc64_pft_size - 1))) { + int rc; + + rc = mmu_hash_ops.resize_hpt(target_hpt_shift); + if (rc) + printk(KERN_WARNING + "Unable to resize hash page table to target order %d: %d\n", + target_hpt_shift, rc); + } +} + int hash__create_section_mapping(unsigned long start, unsigned long end) { int rc = htab_bolt_mapping(start, end, __pa(start), @@ -1795,3 +1826,34 @@ void hash__setup_initial_memory_limit(phys_addr_t first_memblock_base, /* Finally limit subsequent allocations */ memblock_set_current_limit(ppc64_rma_size); } + +#ifdef
CONFIG_DEBUG_FS + +static int hpt_order_get(void *data, u64 *val) +{ + *val = ppc64_pft_size; + return 0; +} + +static int hpt_order_set(void *data, u64 val) +{ + if (!mmu_hash_ops.resize_hpt) + return -ENODEV; + + return mmu_hash_ops.resize_hpt(val); +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_hpt_order, hpt_order_get, hpt_order_set, "%llu\n"); + +static int __init hash64_debugfs(void) +{ + if (!debugfs_create_file("hpt_order", 0600, powerpc_debugfs_root, + NULL, &fops_hpt_order)) { + pr_err("lpar: unable to create hpt_order debugfs file\n"); + } + + return 0; +} +machine_device_initcall(pseries, hash64_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/arch/powerpc/mm/hugetlbpage-hash64.c b/arch/powerpc/mm/hugetlbpage-hash64.c index 37b5f91e381b..a84bb44497f9 100644 --- a/arch/powerpc/mm/hugetlbpage-hash64.c +++ b/arch/powerpc/mm/hugetlbpage-hash64.c @@ -116,24 +116,3 @@ int __hash_page_huge(unsigned long ea, unsigned long access, unsigned long vsid, *ptep = __pte(new_pte & ~H_PAGE_BUSY); return 0; } - -#if defined(CONFIG_PPC_64K_PAGES) && defined(CONFIG_DEBUG_VM) -/* - * This enables us to catch the wrong page directory format - * Moved here so that we can use WARN() in the call. - */ -int hugepd_ok(hugepd_t hpd) -{ - bool is_hugepd; - unsigned long hpdval; - - hpdval = hpd_val(hpd); - - /* - * We should not find this format in page directory, warn otherwise. - */ - is_hugepd = (((hpdval & 0x3) == 0x0) && ((hpdval & HUGEPD_SHIFT_MASK) != 0)); - WARN(is_hugepd, "Found wrong page directory format\n"); - return 0; -} -#endif diff --git a/arch/powerpc/mm/init-common.c b/arch/powerpc/mm/init-common.c index f2108c40e697..eb8c6c8c4851 100644 --- a/arch/powerpc/mm/init-common.c +++ b/arch/powerpc/mm/init-common.c @@ -41,6 +41,7 @@ static void pmd_ctor(void *addr) } struct kmem_cache *pgtable_cache[MAX_PGTABLE_INDEX_SIZE]; +EXPORT_SYMBOL_GPL(pgtable_cache); /* used by kvm_hv module */ /* * Create a kmem_cache() for pagetables. This is not used for PTE @@ -86,7 +87,7 @@ void pgtable_cache_add(unsigned shift, void (*ctor)(void *)) pr_debug("Allocated pgtable cache for order %d\n", shift); } - +EXPORT_SYMBOL_GPL(pgtable_cache_add); /* used by kvm_hv module */ void pgtable_cache_init(void) { diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 8e1588021d1c..6aa3b76aa0d6 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -42,6 +42,8 @@ #include <linux/memblock.h> #include <linux/hugetlb.h> #include <linux/slab.h> +#include <linux/of_fdt.h> +#include <linux/libfdt.h> #include <asm/pgalloc.h> #include <asm/page.h> @@ -344,6 +346,30 @@ static int __init parse_disable_radix(char *p) } early_param("disable_radix", parse_disable_radix); +/* + * If we're running under a hypervisor, we need to check the contents of + * /chosen/ibm,architecture-vec-5 to see if the hypervisor is willing to do + * radix. If not, we clear the radix feature bit so we fall back to hash.
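early_check_vec5(), which follows, probes a single byte of that property. OV5_INDX() and OV5_FEAT() split an option-vector-5 feature code into a byte index and a bit mask (definitions mirrored from asm/prom.h; the OV5_MMU_RADIX_300 value below is a placeholder, not the real encoding):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define OV5_INDX(x)     ((x) >> 8)      /* byte index into the vector */
#define OV5_FEAT(x)     ((x) & 0xff)    /* bit mask within that byte */

#define OV5_MMU_RADIX_300   0x1840      /* placeholder feature code */

/* mirrors the size and bit test done by early_check_vec5() */
static bool vec5_has(const uint8_t *vec5, int size, int feature)
{
    return size > OV5_INDX(feature) &&
           (vec5[OV5_INDX(feature)] & OV5_FEAT(feature));
}

int main(void)
{
    uint8_t vec5[32] = { 0 };

    vec5[OV5_INDX(OV5_MMU_RADIX_300)] |= OV5_FEAT(OV5_MMU_RADIX_300);
    printf("radix allowed: %d\n", vec5_has(vec5, 32, OV5_MMU_RADIX_300));
    return 0;
}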
+ */ +static void early_check_vec5(void) +{ + unsigned long root, chosen; + int size; + const u8 *vec5; + + root = of_get_flat_dt_root(); + chosen = of_get_flat_dt_subnode_by_name(root, "chosen"); + if (chosen == -FDT_ERR_NOTFOUND) + return; + vec5 = of_get_flat_dt_prop(chosen, "ibm,architecture-vec-5", &size); + if (!vec5) + return; + if (size <= OV5_INDX(OV5_MMU_RADIX_300) || + !(vec5[OV5_INDX(OV5_MMU_RADIX_300)] & OV5_FEAT(OV5_MMU_RADIX_300))) + /* Hypervisor doesn't support radix */ + cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; +} + void __init mmu_early_init_devtree(void) { /* Disable radix mode based on kernel command line. */ @@ -351,6 +377,15 @@ void __init mmu_early_init_devtree(void) if (disable_radix || !(mfmsr() & MSR_HV)) cur_cpu_spec->mmu_features &= ~MMU_FTR_TYPE_RADIX; + /* + * Check /chosen/ibm,architecture-vec-5 if running as a guest. + * When running bare-metal, we can use radix if we like + * even though the ibm,architecture-vec-5 property created by + * skiboot doesn't have the necessary bits set. + */ + if (early_radix_enabled() && !(mfmsr() & MSR_HV)) + early_check_vec5(); + if (early_radix_enabled()) radix__early_init_devtree(); else diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5f844337de21..9ee536ec0739 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -134,6 +134,8 @@ int arch_add_memory(int nid, u64 start, u64 size, bool for_device) unsigned long nr_pages = size >> PAGE_SHIFT; int rc; + resize_hpt_for_hotplug(memblock_phys_mem_size()); + pgdata = NODE_DATA(nid); start = (unsigned long)__va(start); @@ -174,6 +176,8 @@ int arch_remove_memory(u64 start, u64 size) */ vm_unmap_aliases(); + resize_hpt_for_hotplug(memblock_phys_mem_size()); + return ret; } #endif diff --git a/arch/powerpc/mm/mmu_context_iommu.c b/arch/powerpc/mm/mmu_context_iommu.c index 104bad029ce9..7de7124ac91b 100644 --- a/arch/powerpc/mm/mmu_context_iommu.c +++ b/arch/powerpc/mm/mmu_context_iommu.c @@ -184,7 +184,7 @@ long mm_iommu_get(struct mm_struct *mm, unsigned long ua, unsigned long entries, * of the CMA zone if possible. NOTE: faulting in + migration * can be expensive. 
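The resize_hpt_for_hotplug() calls in arch_add_memory()/arch_remove_memory() above rely on the hysteresis added in hash_utils_64.c earlier: grow eagerly, shrink reluctantly, so memory wobbling around a size boundary doesn't thrash the HPT. The decision rule in isolation (sketch):

#include <stdio.h>

/* grow as soon as the target order exceeds the current one, but only
 * shrink once the target is at least two below it */
static int should_resize(int current_shift, int target_shift)
{
    return target_shift > current_shift ||
           target_shift < current_shift - 1;
}

int main(void)
{
    printf("%d %d %d\n",
           should_resize(27, 28),   /* grow: 1 */
           should_resize(27, 26),   /* in the hysteresis band: 0 */
           should_resize(27, 25));  /* shrink: 1 */
    return 0;
}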
Batching can be considered later */ - if (get_pageblock_migratetype(page) == MIGRATE_CMA) { + if (is_migrate_cma_page(page)) { if (mm_iommu_move_page_from_cma(page)) goto populate; if (1 != get_user_pages_fast(ua + (i << PAGE_SHIFT), diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index b1099cb2f393..9befaee237d6 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -290,7 +290,7 @@ int of_node_to_nid(struct device_node *device) return nid; } -EXPORT_SYMBOL_GPL(of_node_to_nid); +EXPORT_SYMBOL(of_node_to_nid); static int __init find_min_common_depth(void) { @@ -786,14 +786,9 @@ new_range: fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid); node_set_online(nid); - if (!(size = numa_enforce_memory_limit(start, size))) { - if (--ranges) - goto new_range; - else - continue; - } - - memblock_set_node(start, size, &memblock.memory, nid); + size = numa_enforce_memory_limit(start, size); + if (size) + memblock_set_node(start, size, &memblock.memory, nid); if (--ranges) goto new_range; @@ -1098,7 +1093,7 @@ int hot_add_scn_to_nid(unsigned long scn_addr) nid = hot_add_node_scn_to_nid(scn_addr); } - if (nid < 0 || !node_online(nid)) + if (nid < 0 || !node_possible(nid)) nid = first_online_node; return nid; diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 653ff6c74ebe..b798ff674fab 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -131,7 +131,7 @@ void mmu_cleanup_all(void) int create_section_mapping(unsigned long start, unsigned long end) { if (radix_enabled()) - return -ENODEV; + return radix__create_section_mapping(start, end); return hash__create_section_mapping(start, end); } @@ -139,7 +139,7 @@ int create_section_mapping(unsigned long start, unsigned long end) int remove_section_mapping(unsigned long start, unsigned long end) { if (radix_enabled()) - return -ENODEV; + return radix__remove_section_mapping(start, end); return hash__remove_section_mapping(start, end); } diff --git a/arch/powerpc/mm/pgtable-radix.c b/arch/powerpc/mm/pgtable-radix.c index 34f1a0dbc898..feeda90cd06d 100644 --- a/arch/powerpc/mm/pgtable-radix.c +++ b/arch/powerpc/mm/pgtable-radix.c @@ -18,6 +18,7 @@ #include <asm/machdep.h> #include <asm/mmu.h> #include <asm/firmware.h> +#include <asm/powernv.h> #include <trace/events/thp.h> @@ -107,54 +108,66 @@ set_the_pte: return 0; } +static inline void __meminit print_mapping(unsigned long start, + unsigned long end, + unsigned long size) +{ + if (end <= start) + return; + + pr_info("Mapped range 0x%lx - 0x%lx with 0x%lx\n", start, end, size); +} + +static int __meminit create_physical_mapping(unsigned long start, + unsigned long end) +{ + unsigned long addr, mapping_size = 0; + + start = _ALIGN_UP(start, PAGE_SIZE); + for (addr = start; addr < end; addr += mapping_size) { + unsigned long gap, previous_size; + int rc; + + gap = end - addr; + previous_size = mapping_size; + + if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE && + mmu_psize_defs[MMU_PAGE_1G].shift) + mapping_size = PUD_SIZE; + else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE && + mmu_psize_defs[MMU_PAGE_2M].shift) + mapping_size = PMD_SIZE; + else + mapping_size = PAGE_SIZE; + + if (mapping_size != previous_size) { + print_mapping(start, addr, previous_size); + start = addr; + } + + rc = radix__map_kernel_page((unsigned long)__va(addr), addr, + PAGE_KERNEL_X, mapping_size); + if (rc) + return rc; + } + + print_mapping(start, addr, mapping_size); + return 0; +} + static void __init 
radix_init_pgtable(void) { - int loop_count; - u64 base, end, start_addr; unsigned long rts_field; struct memblock_region *reg; - unsigned long linear_page_size; /* We don't support slb for radix */ mmu_slb_size = 0; /* * Create the linear mapping, using standard page size for now */ - loop_count = 0; - for_each_memblock(memory, reg) { - - start_addr = reg->base; - -redo: - if (loop_count < 1 && mmu_psize_defs[MMU_PAGE_1G].shift) - linear_page_size = PUD_SIZE; - else if (loop_count < 2 && mmu_psize_defs[MMU_PAGE_2M].shift) - linear_page_size = PMD_SIZE; - else - linear_page_size = PAGE_SIZE; - - base = _ALIGN_UP(start_addr, linear_page_size); - end = _ALIGN_DOWN(reg->base + reg->size, linear_page_size); - - pr_info("Mapping range 0x%lx - 0x%lx with 0x%lx\n", - (unsigned long)base, (unsigned long)end, - linear_page_size); - - while (base < end) { - radix__map_kernel_page((unsigned long)__va(base), - base, PAGE_KERNEL_X, - linear_page_size); - base += linear_page_size; - } - /* - * map the rest using lower page size - */ - if (end < reg->base + reg->size) { - start_addr = end; - loop_count++; - goto redo; - } - } + for_each_memblock(memory, reg) + WARN_ON(create_physical_mapping(reg->base, + reg->base + reg->size)); /* * Allocate Partition table and process table for the * host. @@ -401,6 +414,8 @@ void __init radix__early_init_mmu(void) mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR); radix_init_partition_table(); radix_init_amor(); + } else { + radix_init_pseries(); } memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE); @@ -438,6 +453,7 @@ void radix__mmu_cleanup_all(void) lpcr = mfspr(SPRN_LPCR); mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT); mtspr(SPRN_PTCR, 0); + powernv_set_nmmu_ptcr(0); radix__flush_tlb_all(); } } @@ -467,6 +483,173 @@ void radix__setup_initial_memory_limit(phys_addr_t first_memblock_base, memblock_set_current_limit(first_memblock_base + first_memblock_size); } +#ifdef CONFIG_MEMORY_HOTPLUG +static void free_pte_table(pte_t *pte_start, pmd_t *pmd) +{ + pte_t *pte; + int i; + + for (i = 0; i < PTRS_PER_PTE; i++) { + pte = pte_start + i; + if (!pte_none(*pte)) + return; + } + + pte_free_kernel(&init_mm, pte_start); + pmd_clear(pmd); +} + +static void free_pmd_table(pmd_t *pmd_start, pud_t *pud) +{ + pmd_t *pmd; + int i; + + for (i = 0; i < PTRS_PER_PMD; i++) { + pmd = pmd_start + i; + if (!pmd_none(*pmd)) + return; + } + + pmd_free(&init_mm, pmd_start); + pud_clear(pud); +} + +static void remove_pte_table(pte_t *pte_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte; + + pte = pte_start + pte_index(addr); + for (; addr < end; addr = next, pte++) { + next = (addr + PAGE_SIZE) & PAGE_MASK; + if (next > end) + next = end; + + if (!pte_present(*pte)) + continue; + + if (!PAGE_ALIGNED(addr) || !PAGE_ALIGNED(next)) { + /* + * The vmemmap_free() and remove_section_mapping() + * codepaths call us with aligned addresses. 
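create_physical_mapping(), earlier in this hunk, replaces the old three-pass loop with a single walk that picks the largest page size both alignment and the remaining gap allow. The selection policy in isolation (sketch; sizes hard-coded rather than taken from mmu_psize_defs):

#include <stdio.h>

#define SZ_4K   0x1000UL
#define SZ_2M   0x200000UL
#define SZ_1G   0x40000000UL

/* pick the largest page size whose alignment and remaining gap both
 * allow it -- the same policy as create_physical_mapping() */
static unsigned long pick_mapping_size(unsigned long addr, unsigned long end)
{
    unsigned long gap = end - addr;

    if (!(addr & (SZ_1G - 1)) && gap >= SZ_1G)
        return SZ_1G;
    if (!(addr & (SZ_2M - 1)) && gap >= SZ_2M)
        return SZ_2M;
    return SZ_4K;
}

int main(void)
{
    unsigned long addr = 0, end = 0x80200000UL;  /* 2 GiB + 2 MiB */

    while (addr < end) {
        unsigned long sz = pick_mapping_size(addr, end);

        printf("map %#lx..%#lx (%#lx)\n", addr, addr + sz, sz);
        addr += sz;
    }
    return 0;
}

Run against that range it emits two 1 GiB mappings followed by one 2 MiB mapping, which is the coalescing the new print_mapping() output reports.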
+ */ + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, pte); + } +} + +static void remove_pmd_table(pmd_t *pmd_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pte_t *pte_base; + pmd_t *pmd; + + pmd = pmd_start + pmd_index(addr); + for (; addr < end; addr = next, pmd++) { + next = pmd_addr_end(addr, end); + + if (!pmd_present(*pmd)) + continue; + + if (pmd_huge(*pmd)) { + if (!IS_ALIGNED(addr, PMD_SIZE) || + !IS_ALIGNED(next, PMD_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, (pte_t *)pmd); + continue; + } + + pte_base = (pte_t *)pmd_page_vaddr(*pmd); + remove_pte_table(pte_base, addr, next); + free_pte_table(pte_base, pmd); + } +} + +static void remove_pud_table(pud_t *pud_start, unsigned long addr, + unsigned long end) +{ + unsigned long next; + pmd_t *pmd_base; + pud_t *pud; + + pud = pud_start + pud_index(addr); + for (; addr < end; addr = next, pud++) { + next = pud_addr_end(addr, end); + + if (!pud_present(*pud)) + continue; + + if (pud_huge(*pud)) { + if (!IS_ALIGNED(addr, PUD_SIZE) || + !IS_ALIGNED(next, PUD_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, (pte_t *)pud); + continue; + } + + pmd_base = (pmd_t *)pud_page_vaddr(*pud); + remove_pmd_table(pmd_base, addr, next); + free_pmd_table(pmd_base, pud); + } +} + +static void remove_pagetable(unsigned long start, unsigned long end) +{ + unsigned long addr, next; + pud_t *pud_base; + pgd_t *pgd; + + spin_lock(&init_mm.page_table_lock); + + for (addr = start; addr < end; addr = next) { + next = pgd_addr_end(addr, end); + + pgd = pgd_offset_k(addr); + if (!pgd_present(*pgd)) + continue; + + if (pgd_huge(*pgd)) { + if (!IS_ALIGNED(addr, PGDIR_SIZE) || + !IS_ALIGNED(next, PGDIR_SIZE)) { + WARN_ONCE(1, "%s: unaligned range\n", __func__); + continue; + } + + pte_clear(&init_mm, addr, (pte_t *)pgd); + continue; + } + + pud_base = (pud_t *)pgd_page_vaddr(*pgd); + remove_pud_table(pud_base, addr, next); + } + + spin_unlock(&init_mm.page_table_lock); + radix__flush_tlb_kernel_range(start, end); +} + +int __ref radix__create_section_mapping(unsigned long start, unsigned long end) +{ + return create_physical_mapping(start, end); +} + +int radix__remove_section_mapping(unsigned long start, unsigned long end) +{ + remove_pagetable(start, end); + return 0; +} +#endif /* CONFIG_MEMORY_HOTPLUG */ + #ifdef CONFIG_SPARSEMEM_VMEMMAP int __meminit radix__vmemmap_create_mapping(unsigned long start, unsigned long page_size, @@ -482,7 +665,7 @@ int __meminit radix__vmemmap_create_mapping(unsigned long start, #ifdef CONFIG_MEMORY_HOTPLUG void radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size) { - /* FIXME!! intel does more. We should free page tables mapping vmemmap ? 
*/ + remove_pagetable(start, start + page_size); } #endif #endif diff --git a/arch/powerpc/mm/pgtable_64.c b/arch/powerpc/mm/pgtable_64.c index 8bca7f58afc4..db93cf747a03 100644 --- a/arch/powerpc/mm/pgtable_64.c +++ b/arch/powerpc/mm/pgtable_64.c @@ -52,6 +52,7 @@ #include <asm/sections.h> #include <asm/firmware.h> #include <asm/dma.h> +#include <asm/powernv.h> #include "mmu_decl.h" @@ -436,6 +437,7 @@ void pgtable_free_tlb(struct mmu_gather *tlb, void *table, int shift) void __init mmu_partition_table_init(void) { unsigned long patb_size = 1UL << PATB_SIZE_SHIFT; + unsigned long ptcr; BUILD_BUG_ON_MSG((PATB_SIZE_SHIFT > 36), "Partition table size too large."); partition_tb = __va(memblock_alloc_base(patb_size, patb_size, @@ -448,19 +450,31 @@ void __init mmu_partition_table_init(void) * update partition table control register, * 64 K size. */ - mtspr(SPRN_PTCR, __pa(partition_tb) | (PATB_SIZE_SHIFT - 12)); + ptcr = __pa(partition_tb) | (PATB_SIZE_SHIFT - 12); + mtspr(SPRN_PTCR, ptcr); + powernv_set_nmmu_ptcr(ptcr); } void mmu_partition_table_set_entry(unsigned int lpid, unsigned long dw0, unsigned long dw1) { + unsigned long old = be64_to_cpu(partition_tb[lpid].patb0); + partition_tb[lpid].patb0 = cpu_to_be64(dw0); partition_tb[lpid].patb1 = cpu_to_be64(dw1); - /* Global flush of TLBs and partition table caches for this lpid */ + /* + * Global flush of TLBs and partition table caches for this lpid. + * The type of flush (hash or radix) depends on what the previous + * use of this partition ID was, not the new use. + */ asm volatile("ptesync" : : : "memory"); - asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : - "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + if (old & PATB_HR) + asm volatile(PPC_TLBIE_5(%0,%1,2,0,1) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); + else + asm volatile(PPC_TLBIE_5(%0,%1,2,0,0) : : + "r" (TLBIEL_INVAL_SET_LPID), "r" (lpid)); asm volatile("eieio; tlbsync; ptesync" : : : "memory"); } EXPORT_SYMBOL_GPL(mmu_partition_table_set_entry); diff --git a/arch/powerpc/mm/subpage-prot.c b/arch/powerpc/mm/subpage-prot.c index 5c096c01e8bd..94210940112f 100644 --- a/arch/powerpc/mm/subpage-prot.c +++ b/arch/powerpc/mm/subpage-prot.c @@ -248,9 +248,8 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) nw = (next - addr) >> PAGE_SHIFT; up_write(&mm->mmap_sem); - err = -EFAULT; if (__copy_from_user(spp, map, nw * sizeof(u32))) - goto out2; + return -EFAULT; map += nw; down_write(&mm->mmap_sem); @@ -262,6 +261,5 @@ long sys_subpage_prot(unsigned long addr, unsigned long len, u32 __user *map) err = 0; out: up_write(&mm->mmap_sem); - out2: return err; } diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 89f70073dec8..30cf03f53428 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -157,8 +157,7 @@ #define PPC_SRAD(d, a, s) EMIT(PPC_INST_SRAD | ___PPC_RA(d) | \ ___PPC_RS(a) | ___PPC_RB(s)) #define PPC_SRADI(d, a, i) EMIT(PPC_INST_SRADI | ___PPC_RA(d) | \ - ___PPC_RS(a) | __PPC_SH(i) | \ - (((i) & 0x20) >> 4)) + ___PPC_RS(a) | __PPC_SH64(i)) #define PPC_RLWINM(d, a, i, mb, me) EMIT(PPC_INST_RLWINM | ___PPC_RA(d) | \ ___PPC_RS(a) | __PPC_SH(i) | \ __PPC_MB(mb) | __PPC_ME(me)) @@ -166,11 +165,11 @@ ___PPC_RS(a) | __PPC_SH(i) | \ __PPC_MB(mb) | __PPC_ME(me)) #define PPC_RLDICL(d, a, i, mb) EMIT(PPC_INST_RLDICL | ___PPC_RA(d) | \ - ___PPC_RS(a) | __PPC_SH(i) | \ - __PPC_MB64(mb) | (((i) & 0x20) >> 4)) + ___PPC_RS(a) | __PPC_SH64(i) | \ + __PPC_MB64(mb)) #define PPC_RLDICR(d, a, i, me) EMIT(PPC_INST_RLDICR | 
___PPC_RA(d) | \ - ___PPC_RS(a) | __PPC_SH(i) | \ - __PPC_ME64(me) | (((i) & 0x20) >> 4)) + ___PPC_RS(a) | __PPC_SH64(i) | \ + __PPC_ME64(me)) /* slwi = rlwinm Rx, Ry, n, 0, 31-n */ #define PPC_SLWI(d, a, i) PPC_RLWINM(d, a, i, 0, 31-(i)) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 7e706f36e364..f9941b3b5770 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -662,16 +662,17 @@ void bpf_jit_compile(struct bpf_prog *fp) */ bpf_jit_dump(flen, proglen, pass, code_base); - if (image) { - bpf_flush_icache(code_base, code_base + (proglen/4)); + bpf_flush_icache(code_base, code_base + (proglen/4)); + #ifdef CONFIG_PPC64 - /* Function descriptor nastiness: Address + TOC */ - ((u64 *)image)[0] = (u64)code_base; - ((u64 *)image)[1] = local_paca->kernel_toc; + /* Function descriptor nastiness: Address + TOC */ + ((u64 *)image)[0] = (u64)code_base; + ((u64 *)image)[1] = local_paca->kernel_toc; #endif - fp->bpf_func = (void *)image; - fp->jited = 1; - } + + fp->bpf_func = (void *)image; + fp->jited = 1; + out: kfree(addrs); return; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index c34166ef76fc..aee2bb817ac6 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -1044,16 +1044,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ bpf_jit_dump(flen, proglen, pass, code_base); - if (image) { - bpf_flush_icache(bpf_hdr, image + alloclen); #ifdef PPC64_ELF_ABI_v1 - /* Function descriptor nastiness: Address + TOC */ - ((u64 *)image)[0] = (u64)code_base; - ((u64 *)image)[1] = local_paca->kernel_toc; + /* Function descriptor nastiness: Address + TOC */ + ((u64 *)image)[0] = (u64)code_base; + ((u64 *)image)[1] = local_paca->kernel_toc; #endif - fp->bpf_func = (void *)image; - fp->jited = 1; - } + + fp->bpf_func = (void *)image; + fp->jited = 1; + + bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + (bpf_hdr->pages * PAGE_SIZE)); out: kfree(addrs); diff --git a/arch/powerpc/platforms/maple/pci.c b/arch/powerpc/platforms/maple/pci.c index a0589aac4163..69794d9389c2 100644 --- a/arch/powerpc/platforms/maple/pci.c +++ b/arch/powerpc/platforms/maple/pci.c @@ -24,6 +24,7 @@ #include <asm/machdep.h> #include <asm/iommu.h> #include <asm/ppc-pci.h> +#include <asm/isa-bridge.h> #include "maple.h" diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 479c25601612..4ee837e6391a 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -237,15 +237,21 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); +/* + * The default stop state that will be used by ppc_md.power_save + * function on platforms that support stop instruction. + */ +u64 pnv_default_stop_val; +u64 pnv_default_stop_mask; /* * Used for ppc_md.power_save which needs a function with no parameters */ static void power9_idle(void) { - /* Requesting stop state 0 */ - power9_idle_stop(0); + power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask); } + /* * First deep stop state. Used to figure out when to save/restore * hypervisor context. @@ -253,9 +259,11 @@ static void power9_idle(void) u64 pnv_first_deep_stop_state = MAX_STOP_STATE; /* - * Deepest stop idle state. Used when a cpu is offlined + * psscr value and mask of the deepest stop idle state. + * Used when a cpu is offlined. 
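+ *
+ * The {val, mask} pair lets the kernel merge the firmware-supplied
+ * PSSCR bits with the register's current contents before executing
+ * the stop instruction; conceptually (a sketch, not the literal
+ * idle-entry code):
+ *
+ *	psscr = (mfspr(SPRN_PSSCR) & ~psscr_mask) | psscr_val;
+ *	mtspr(SPRN_PSSCR, psscr);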
*/ -u64 pnv_deepest_stop_state; +u64 pnv_deepest_stop_psscr_val; +u64 pnv_deepest_stop_psscr_mask; /* * Power ISA 3.0 idle initialization. @@ -292,53 +300,157 @@ u64 pnv_deepest_stop_state; * Bits 60:63 - Requested Level * Used to specify which power-saving level must be entered on executing * stop instruction + */ + +int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags) +{ + int err = 0; + + /* + * psscr_mask == 0xf indicates an older firmware. + * Set remaining fields of psscr to the default values. + * See NOTE above definition of PSSCR_HV_DEFAULT_VAL + */ + if (*psscr_mask == 0xf) { + *psscr_val = *psscr_val | PSSCR_HV_DEFAULT_VAL; + *psscr_mask = PSSCR_HV_DEFAULT_MASK; + return err; + } + + /* + * New firmware is expected to set the psscr_val bits correctly. + * Validate that the following invariants are correctly maintained by + * the new firmware. + * - ESL bit value matches the EC bit value. + * - ESL bit is set for all the deep stop states. + */ + if (GET_PSSCR_ESL(*psscr_val) != GET_PSSCR_EC(*psscr_val)) { + err = ERR_EC_ESL_MISMATCH; + } else if ((flags & OPAL_PM_LOSE_FULL_CONTEXT) && + GET_PSSCR_ESL(*psscr_val) == 0) { + err = ERR_DEEP_STATE_ESL_MISMATCH; + } + + return err; +} + +/* + * pnv_arch300_idle_init: Initializes the default idle state, first + * deep idle state and deepest idle state on + * ISA 3.0 CPUs. * * @np: /ibm,opal/power-mgt device node * @flags: cpu-idle-state-flags array * @dt_idle_states: Number of idle state entries * Returns 0 on success */ -static int __init pnv_arch300_idle_init(struct device_node *np, u32 *flags, +static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags, int dt_idle_states) { u64 *psscr_val = NULL; + u64 *psscr_mask = NULL; + u32 *residency_ns = NULL; + u64 max_residency_ns = 0; int rc = 0, i; + bool default_stop_found = false, deepest_stop_found = false; - psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), - GFP_KERNEL); - if (!psscr_val) { + psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); + psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); + residency_ns = kcalloc(dt_idle_states, sizeof(*residency_ns), + GFP_KERNEL); + + if (!psscr_val || !psscr_mask || !residency_ns) { rc = -1; goto out; } + if (of_property_read_u64_array(np, "ibm,cpu-idle-state-psscr", psscr_val, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-states-psscr in DT\n"); + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n"); + rc = -1; + goto out; + } + + if (of_property_read_u64_array(np, + "ibm,cpu-idle-state-psscr-mask", + psscr_mask, dt_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr-mask in DT\n"); + rc = -1; + goto out; + } + + if (of_property_read_u32_array(np, + "ibm,cpu-idle-state-residency-ns", + residency_ns, dt_idle_states)) { + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-residency-ns in DT\n"); rc = -1; goto out; } /* - * Set pnv_first_deep_stop_state and pnv_deepest_stop_state. + * Set pnv_first_deep_stop_state, pnv_deepest_stop_psscr_{val,mask}, + * and the pnv_default_stop_{val,mask}. + * * pnv_first_deep_stop_state should be set to the first stop * level to cause hypervisor state loss. - * pnv_deepest_stop_state should be set to the deepest stop - * stop state. + * + * pnv_deepest_stop_{val,mask} should be set to values corresponding to + * the deepest stop state. 
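+ *
+ * ("Deepest" here is selected as the valid state with the largest
+ * firmware-reported target residency, as the loop below shows.)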
+ * + * pnv_default_stop_{val,mask} should be set to values corresponding to + * the shallowest (OPAL_PM_STOP_INST_FAST) loss-less stop state. */ pnv_first_deep_stop_state = MAX_STOP_STATE; for (i = 0; i < dt_idle_states; i++) { + int err; u64 psscr_rl = psscr_val[i] & PSSCR_RL_MASK; if ((flags[i] & OPAL_PM_LOSE_FULL_CONTEXT) && (pnv_first_deep_stop_state > psscr_rl)) pnv_first_deep_stop_state = psscr_rl; - if (pnv_deepest_stop_state < psscr_rl) - pnv_deepest_stop_state = psscr_rl; + err = validate_psscr_val_mask(&psscr_val[i], &psscr_mask[i], + flags[i]); + if (err) { + report_invalid_psscr_val(psscr_val[i], err); + continue; + } + + if (max_residency_ns < residency_ns[i]) { + max_residency_ns = residency_ns[i]; + pnv_deepest_stop_psscr_val = psscr_val[i]; + pnv_deepest_stop_psscr_mask = psscr_mask[i]; + deepest_stop_found = true; + } + + if (!default_stop_found && + (flags[i] & OPAL_PM_STOP_INST_FAST)) { + pnv_default_stop_val = psscr_val[i]; + pnv_default_stop_mask = psscr_mask[i]; + default_stop_found = true; + } + } + + if (!default_stop_found) { + pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; + pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; + pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + pnv_default_stop_val, pnv_default_stop_mask); + } + + if (!deepest_stop_found) { + pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; + pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; + pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + pnv_deepest_stop_psscr_val, + pnv_deepest_stop_psscr_mask); } out: kfree(psscr_val); + kfree(psscr_mask); + kfree(residency_ns); return rc; } @@ -373,7 +485,7 @@ static void __init pnv_probe_idle_states(void) } if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (pnv_arch300_idle_init(np, flags, dt_idle_states)) + if (pnv_power9_idle_init(np, flags, dt_idle_states)) goto out; } diff --git a/arch/powerpc/platforms/powernv/opal-hmi.c b/arch/powerpc/platforms/powernv/opal-hmi.c index c0a8201cb4d9..88f3c61eec95 100644 --- a/arch/powerpc/platforms/powernv/opal-hmi.c +++ b/arch/powerpc/platforms/powernv/opal-hmi.c @@ -180,7 +180,8 @@ static void print_hmi_event_info(struct OpalHMIEvent *hmi_evt) "An XSCOM operation completed", "SCOM has set a reserved FIR bit to cause recovery", "Debug trigger has set a reserved FIR bit to cause recovery", - "A hypervisor resource error occurred" + "A hypervisor resource error occurred", + "CAPP recovery process is in progress", }; /* Print things out */ diff --git a/arch/powerpc/platforms/powernv/opal-irqchip.c b/arch/powerpc/platforms/powernv/opal-irqchip.c index 998316bf2dad..ecdcba9d1220 100644 --- a/arch/powerpc/platforms/powernv/opal-irqchip.c +++ b/arch/powerpc/platforms/powernv/opal-irqchip.c @@ -183,8 +183,9 @@ void opal_event_shutdown(void) int __init opal_event_init(void) { struct device_node *dn, *opal_node; - const __be32 *irqs; - int i, irqlen, rc = 0; + const char **names; + u32 *irqs; + int i, rc; opal_node = of_find_node_by_path("/ibm,opal"); if (!opal_node) { @@ -209,31 +210,56 @@ int __init opal_event_init(void) goto out; } - /* Get interrupt property */ - irqs = of_get_property(opal_node, "opal-interrupts", &irqlen); - opal_irq_count = irqs ? 
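		/* (The hand-computed count above is replaced below by
		 * of_property_count_u32_elems() plus an optional
		 * "opal-interrupts-names" lookup, so each handler can be
		 * registered as "opal-<name>" instead of a bare "opal".) */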
(irqlen / 4) : 0; + /* Get opal-interrupts property and names if present */ + rc = of_property_count_u32_elems(opal_node, "opal-interrupts"); + if (rc < 0) + goto out; + + opal_irq_count = rc; pr_debug("Found %d interrupts reserved for OPAL\n", opal_irq_count); - /* Install interrupt handlers */ + irqs = kcalloc(opal_irq_count, sizeof(*irqs), GFP_KERNEL); + names = kcalloc(opal_irq_count, sizeof(*names), GFP_KERNEL); opal_irqs = kcalloc(opal_irq_count, sizeof(*opal_irqs), GFP_KERNEL); - for (i = 0; irqs && i < opal_irq_count; i++, irqs++) { - unsigned int irq, virq; + + if (WARN_ON(!irqs || !names || !opal_irqs)) + goto out_free; + + rc = of_property_read_u32_array(opal_node, "opal-interrupts", + irqs, opal_irq_count); + if (rc < 0) { + pr_err("Error %d reading opal-interrupts array\n", rc); + goto out_free; + } + + /* It's not an error for the names to be missing */ + of_property_read_string_array(opal_node, "opal-interrupts-names", + names, opal_irq_count); + + /* Install interrupt handlers */ + for (i = 0; i < opal_irq_count; i++) { + unsigned int virq; + char *name; /* Get hardware and virtual IRQ */ - irq = be32_to_cpup(irqs); - virq = irq_create_mapping(NULL, irq); + virq = irq_create_mapping(NULL, irqs[i]); if (!virq) { - pr_warn("Failed to map irq 0x%x\n", irq); + pr_warn("Failed to map irq 0x%x\n", irqs[i]); continue; } + if (names[i] && strlen(names[i])) + name = kasprintf(GFP_KERNEL, "opal-%s", names[i]); + else + name = kasprintf(GFP_KERNEL, "opal"); + /* Install interrupt handler */ rc = request_irq(virq, opal_interrupt, IRQF_TRIGGER_LOW, - "opal", NULL); + name, NULL); if (rc) { irq_dispose_mapping(virq); pr_warn("Error %d requesting irq %d (0x%x)\n", - rc, virq, irq); + rc, virq, irqs[i]); continue; } @@ -241,6 +267,9 @@ int __init opal_event_init(void) opal_irqs[i] = virq; } +out_free: + kfree(irqs); + kfree(names); out: of_node_put(opal_node); return rc; diff --git a/arch/powerpc/platforms/powernv/opal-lpc.c b/arch/powerpc/platforms/powernv/opal-lpc.c index 4886eb8b6381..a91d7876fae2 100644 --- a/arch/powerpc/platforms/powernv/opal-lpc.c +++ b/arch/powerpc/platforms/powernv/opal-lpc.c @@ -18,11 +18,11 @@ #include <asm/machdep.h> #include <asm/firmware.h> -#include <asm/xics.h> #include <asm/opal.h> #include <asm/prom.h> #include <linux/uaccess.h> #include <asm/debug.h> +#include <asm/isa-bridge.h> static int opal_lpc_chip_id = -1; @@ -386,7 +386,7 @@ static int opal_lpc_init_debugfs(void) machine_device_initcall(powernv, opal_lpc_init_debugfs); #endif /* CONFIG_DEBUG_FS */ -void opal_lpc_init(void) +void __init opal_lpc_init(void) { struct device_node *np; @@ -406,9 +406,17 @@ void opal_lpc_init(void) if (opal_lpc_chip_id < 0) return; - /* Setup special IO ops */ - ppc_pci_io = opal_lpc_io; - isa_io_special = true; - - pr_info("OPAL: Power8 LPC bus found, chip ID %d\n", opal_lpc_chip_id); + /* Does it support direct mapping ? 
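+	 * (i.e. does the LPC node carry a "ranges" property translating
+	 * LPC space into MMIO?  A hypothetical direct-mapped node might
+	 * carry, say, ranges = <0x0 0x0 0x80060300 0x0 0x10000000>; the
+	 * exact cells are firmware-specific.  With "ranges" present the
+	 * generic ISA bridge code maps the bus directly; without it every
+	 * port access round-trips through an OPAL_LPC_READ/WRITE call.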
*/ + if (of_get_property(np, "ranges", NULL)) { + pr_info("OPAL: Found memory mapped LPC bus on chip %d\n", + opal_lpc_chip_id); + isa_bridge_init_non_pci(np); + } else { + pr_info("OPAL: Found non-mapped LPC bus on chip %d\n", + opal_lpc_chip_id); + + /* Setup special IO ops */ + ppc_pci_io = opal_lpc_io; + isa_io_special = true; + } } diff --git a/arch/powerpc/platforms/powernv/opal-msglog.c b/arch/powerpc/platforms/powernv/opal-msglog.c index 39d6ff9e5630..7a9cde0cfbd1 100644 --- a/arch/powerpc/platforms/powernv/opal-msglog.c +++ b/arch/powerpc/platforms/powernv/opal-msglog.c @@ -123,6 +123,10 @@ void __init opal_msglog_init(void) return; } + /* Report maximum size */ + opal_msglog_attr.size = be32_to_cpu(mc->ibuf_size) + + be32_to_cpu(mc->obuf_size); + opal_memcons = mc; } diff --git a/arch/powerpc/platforms/powernv/opal-wrappers.S b/arch/powerpc/platforms/powernv/opal-wrappers.S index 3aa40f1b20f5..6693f75e93d1 100644 --- a/arch/powerpc/platforms/powernv/opal-wrappers.S +++ b/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -58,14 +58,16 @@ END_FTR_SECTION(0, 1); \ #define OPAL_CALL(name, token) \ _GLOBAL_TOC(name); \ + mfmsr r12; \ mflr r0; \ + andi. r11,r12,MSR_IR|MSR_DR; \ std r0,PPC_LR_STKOFF(r1); \ li r0,token; \ + beq opal_real_call; \ OPAL_BRANCH(opal_tracepoint_entry) \ - mfcr r12; \ - stw r12,8(r1); \ + mfcr r11; \ + stw r11,8(r1); \ li r11,0; \ - mfmsr r12; \ ori r11,r11,MSR_EE; \ std r12,PACASAVEDMSR(r13); \ andc r12,r12,r11; \ @@ -98,6 +100,30 @@ opal_return: mtcr r4; rfid +opal_real_call: + mfcr r11 + stw r11,8(r1) + /* Set opal return address */ + LOAD_REG_ADDR(r11, opal_return_realmode) + mtlr r11 + li r11,MSR_LE + andc r12,r12,r11 + mtspr SPRN_HSRR1,r12 + LOAD_REG_ADDR(r11,opal) + ld r12,8(r11) + ld r2,0(r11) + mtspr SPRN_HSRR0,r12 + hrfid + +opal_return_realmode: + FIXUP_ENDIAN + ld r2,PACATOC(r13); + lwz r11,8(r1); + ld r12,PPC_LR_STKOFF(r1) + mtcr r11; + mtlr r12 + blr + #ifdef CONFIG_TRACEPOINTS opal_tracepoint_entry: stdu r1,-STACKFRAMESIZE(r1) @@ -146,7 +172,7 @@ opal_tracepoint_entry: opal_tracepoint_return: std r3,STK_REG(R31)(r1) mr r4,r3 - ld r0,STK_REG(R23)(r1) + ld r3,STK_REG(R23)(r1) bl __trace_opal_exit ld r3,STK_REG(R31)(r1) addi r1,r1,STACKFRAMESIZE @@ -155,36 +181,6 @@ opal_tracepoint_return: blr #endif -#define OPAL_CALL_REAL(name, token) \ - _GLOBAL_TOC(name); \ - mflr r0; \ - std r0,PPC_LR_STKOFF(r1); \ - li r0,token; \ - mfcr r12; \ - stw r12,8(r1); \ - \ - /* Set opal return address */ \ - LOAD_REG_ADDR(r11, opal_return_realmode); \ - mtlr r11; \ - mfmsr r12; \ - li r11,MSR_LE; \ - andc r12,r12,r11; \ - mtspr SPRN_HSRR1,r12; \ - LOAD_REG_ADDR(r11,opal); \ - ld r12,8(r11); \ - ld r2,0(r11); \ - mtspr SPRN_HSRR0,r12; \ - hrfid - -opal_return_realmode: - FIXUP_ENDIAN - ld r2,PACATOC(r13); - lwz r11,8(r1); - ld r12,PPC_LR_STKOFF(r1) - mtcr r11; - mtlr r12 - blr - OPAL_CALL(opal_invalid_call, OPAL_INVALID_CALL); OPAL_CALL(opal_console_write, OPAL_CONSOLE_WRITE); @@ -208,7 +204,6 @@ OPAL_CALL(opal_pci_config_write_byte, OPAL_PCI_CONFIG_WRITE_BYTE); OPAL_CALL(opal_pci_config_write_half_word, OPAL_PCI_CONFIG_WRITE_HALF_WORD); OPAL_CALL(opal_pci_config_write_word, OPAL_PCI_CONFIG_WRITE_WORD); OPAL_CALL(opal_set_xive, OPAL_SET_XIVE); -OPAL_CALL_REAL(opal_rm_set_xive, OPAL_SET_XIVE); OPAL_CALL(opal_get_xive, OPAL_GET_XIVE); OPAL_CALL(opal_register_exception_handler, OPAL_REGISTER_OPAL_EXCEPTION_HANDLER); OPAL_CALL(opal_pci_eeh_freeze_status, OPAL_PCI_EEH_FREEZE_STATUS); @@ -264,7 +259,6 @@ OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE); 
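/*
 * Editor's note: OPAL_CALL() above now samples the MSR and branches to
 * opal_real_call when both MSR_IR and MSR_DR are clear, i.e. when the
 * caller is already running in real mode.  A single wrapper therefore
 * serves both translation-on and translation-off callers, which is
 * what lets the OPAL_CALL_REAL()/opal_rm_* duplicates be deleted
 * throughout this table.
 */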
OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE); OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE); OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE); -OPAL_CALL_REAL(opal_rm_resync_timebase, OPAL_RESYNC_TIMEBASE); OPAL_CALL(opal_check_token, OPAL_CHECK_TOKEN); OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT); OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO); @@ -280,9 +274,7 @@ OPAL_CALL(opal_sensor_read, OPAL_SENSOR_READ); OPAL_CALL(opal_get_param, OPAL_GET_PARAM); OPAL_CALL(opal_set_param, OPAL_SET_PARAM); OPAL_CALL(opal_handle_hmi, OPAL_HANDLE_HMI); -OPAL_CALL_REAL(opal_rm_handle_hmi, OPAL_HANDLE_HMI); OPAL_CALL(opal_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); -OPAL_CALL_REAL(opal_rm_config_cpu_idle_state, OPAL_CONFIG_CPU_IDLE_STATE); OPAL_CALL(opal_slw_set_reg, OPAL_SLW_SET_REG); OPAL_CALL(opal_register_dump_region, OPAL_REGISTER_DUMP_REGION); OPAL_CALL(opal_unregister_dump_region, OPAL_UNREGISTER_DUMP_REGION); @@ -304,11 +296,8 @@ OPAL_CALL(opal_pci_get_presence_state, OPAL_PCI_GET_PRESENCE_STATE); OPAL_CALL(opal_pci_get_power_state, OPAL_PCI_GET_POWER_STATE); OPAL_CALL(opal_pci_set_power_state, OPAL_PCI_SET_POWER_STATE); OPAL_CALL(opal_int_get_xirr, OPAL_INT_GET_XIRR); -OPAL_CALL_REAL(opal_rm_int_get_xirr, OPAL_INT_GET_XIRR); OPAL_CALL(opal_int_set_cppr, OPAL_INT_SET_CPPR); OPAL_CALL(opal_int_eoi, OPAL_INT_EOI); -OPAL_CALL_REAL(opal_rm_int_eoi, OPAL_INT_EOI); OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); -OPAL_CALL_REAL(opal_rm_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); -OPAL_CALL_REAL(opal_rm_pci_tce_kill, OPAL_PCI_TCE_KILL); +OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); diff --git a/arch/powerpc/platforms/powernv/opal.c b/arch/powerpc/platforms/powernv/opal.c index 282293572dc8..86d9fde93c17 100644 --- a/arch/powerpc/platforms/powernv/opal.c +++ b/arch/powerpc/platforms/powernv/opal.c @@ -875,6 +875,17 @@ int opal_error_code(int rc) } } +void powernv_set_nmmu_ptcr(unsigned long ptcr) +{ + int rc; + + if (firmware_has_feature(FW_FEATURE_OPAL)) { + rc = opal_nmmu_set_ptcr(-1UL, ptcr); + if (rc != OPAL_SUCCESS && rc != OPAL_UNSUPPORTED) + pr_warn("%s: Unable to set nest mmu ptcr\n", __func__); + } +} + EXPORT_SYMBOL_GPL(opal_poll_events); EXPORT_SYMBOL_GPL(opal_rtc_read); EXPORT_SYMBOL_GPL(opal_rtc_write); diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index b07680cd2518..8278f43ad4b8 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1326,7 +1326,9 @@ static int pnv_pci_vf_assign_m64(struct pci_dev *pdev, u16 num_vfs) else m64_bars = 1; - pdn->m64_map = kmalloc(sizeof(*pdn->m64_map) * m64_bars, GFP_KERNEL); + pdn->m64_map = kmalloc_array(m64_bars, + sizeof(*pdn->m64_map), + GFP_KERNEL); if (!pdn->m64_map) return -ENOMEM; /* Initialize the m64_map to IODA_INVALID_M64 */ @@ -1593,8 +1595,9 @@ int pnv_pci_sriov_enable(struct pci_dev *pdev, u16 num_vfs) /* Allocating pe_num_map */ if (pdn->m64_single_mode) - pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map) * num_vfs, - GFP_KERNEL); + pdn->pe_num_map = kmalloc_array(num_vfs, + sizeof(*pdn->pe_num_map), + GFP_KERNEL); else pdn->pe_num_map = kmalloc(sizeof(*pdn->pe_num_map), GFP_KERNEL); @@ -1950,7 +1953,12 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, struct pnv_phb *phb = pe->phb; unsigned int shift = tbl->it_page_shift; - if (phb->type == PNV_PHB_NPU) { + /* + * NVLink1 can use the TCE kill register directly as + * it's the same as PHB3. 
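+	 * (Hence the test below is on phb->model rather than phb->type:
+	 * both NPU generations share the NPU PHB type, but only the
+	 * PHB3-like model may use the register path.)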
NVLink2 is different and + * should go via the OPAL call. + */ + if (phb->model == PNV_PHB_MODEL_NPU) { /* * The NVLink hardware does not support TCE kill * per TCE entry so we have to invalidate @@ -1962,11 +1970,6 @@ static void pnv_pci_ioda2_tce_invalidate(struct iommu_table *tbl, if (phb->model == PNV_PHB_MODEL_PHB3 && phb->regs) pnv_pci_phb3_tce_invalidate(pe, rm, shift, index, npages); - else if (rm) - opal_rm_pci_tce_kill(phb->opal_id, - OPAL_PCI_TCE_KILL_PAGES, - pe->pe_number, 1u << shift, - index << shift, npages); else opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL_PAGES, @@ -3671,6 +3674,8 @@ static void __init pnv_pci_init_ioda_phb(struct device_node *np, phb->model = PNV_PHB_MODEL_PHB3; else if (of_device_is_compatible(np, "ibm,power8-npu-pciex")) phb->model = PNV_PHB_MODEL_NPU; + else if (of_device_is_compatible(np, "ibm,power9-npu-pciex")) + phb->model = PNV_PHB_MODEL_NPU2; else phb->model = PNV_PHB_MODEL_UNKNOWN; diff --git a/arch/powerpc/platforms/powernv/pci.c b/arch/powerpc/platforms/powernv/pci.c index c6d554fe585c..eb835e977e33 100644 --- a/arch/powerpc/platforms/powernv/pci.c +++ b/arch/powerpc/platforms/powernv/pci.c @@ -940,6 +940,13 @@ void __init pnv_pci_init(void) for_each_compatible_node(np, NULL, "ibm,ioda2-npu-phb") pnv_pci_init_npu_phb(np); + /* + * Look for NPU2 PHBs which we treat mostly as NPU PHBs with + * the exception of TCE kill which requires an OPAL call. + */ + for_each_compatible_node(np, NULL, "ibm,ioda2-npu2-phb") + pnv_pci_init_npu_phb(np); + /* Configure IOMMU DMA hooks */ set_pci_dma_ops(&dma_iommu_ops); } diff --git a/arch/powerpc/platforms/powernv/pci.h b/arch/powerpc/platforms/powernv/pci.h index e64df7894d6e..e1d3e5526b54 100644 --- a/arch/powerpc/platforms/powernv/pci.h +++ b/arch/powerpc/platforms/powernv/pci.h @@ -19,6 +19,7 @@ enum pnv_phb_model { PNV_PHB_MODEL_P7IOC, PNV_PHB_MODEL_PHB3, PNV_PHB_MODEL_NPU, + PNV_PHB_MODEL_NPU2, }; #define PNV_PCI_DIAG_BUF_SIZE 8192 diff --git a/arch/powerpc/platforms/powernv/powernv.h b/arch/powerpc/platforms/powernv/powernv.h index da7c843ac7f1..613052232475 100644 --- a/arch/powerpc/platforms/powernv/powernv.h +++ b/arch/powerpc/platforms/powernv/powernv.h @@ -18,7 +18,8 @@ static inline void pnv_pci_shutdown(void) { } #endif extern u32 pnv_get_supported_cpuidle_states(void); -extern u64 pnv_deepest_stop_state; +extern u64 pnv_deepest_stop_psscr_val; +extern u64 pnv_deepest_stop_psscr_mask; extern void pnv_lpc_init(void); diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index eec0e8d0454d..e39e6c428af1 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -184,15 +184,17 @@ static void pnv_smp_cpu_kill_self(void) ppc64_runlatch_off(); - if (cpu_has_feature(CPU_FTR_ARCH_300)) - srr1 = power9_idle_stop(pnv_deepest_stop_state); - else if (idle_states & OPAL_PM_WINKLE_ENABLED) + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + pnv_deepest_stop_psscr_mask); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { srr1 = power7_winkle(); - else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) + } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { srr1 = power7_sleep(); - else + } else { srr1 = power7_nap(1); + } ppc64_runlatch_on(); diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig index e1c280a95d58..30ec04f1c67c 100644 --- 
a/arch/powerpc/platforms/pseries/Kconfig +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -17,7 +17,6 @@ config PPC_PSERIES select PPC_UDBG_16550 select PPC_NATIVE select PPC_DOORBELL - select HAVE_CONTEXT_TRACKING select HOTPLUG_CPU if SMP select ARCH_RANDOM select PPC_DOORBELL diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c index 4839db385bb0..4ac419c7eb4c 100644 --- a/arch/powerpc/platforms/pseries/cmm.c +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -76,7 +76,7 @@ module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. " "[Default=" __stringify(CMM_DEFAULT_DELAY) "]"); module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove " +MODULE_PARM_DESC(hotplug_delay, "Delay (in seconds) after memory hotplug remove " "before loaning resumes. " "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]"); module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR); diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c index 5cb2e4beffc5..d3a81e746fc4 100644 --- a/arch/powerpc/platforms/pseries/dlpar.c +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -551,7 +551,13 @@ dlpar_store_out: return rc ? rc : count; } -static CLASS_ATTR(dlpar, S_IWUSR, NULL, dlpar_store); +static ssize_t dlpar_show(struct class *class, struct class_attribute *attr, + char *buf) +{ + return sprintf(buf, "%s\n", "memory,cpu"); +} + +static CLASS_ATTR(dlpar, S_IWUSR | S_IRUSR, dlpar_show, dlpar_store); static int __init pseries_dlpar_init(void) { diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c index ea7f09bd73b1..63cc82ad58ac 100644 --- a/arch/powerpc/platforms/pseries/firmware.c +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -64,6 +64,7 @@ hypertas_fw_features_table[] = { {FW_FEATURE_VPHN, "hcall-vphn"}, {FW_FEATURE_SET_MODE, "hcall-set-mode"}, {FW_FEATURE_BEST_ENERGY, "hcall-best-energy-1*"}, + {FW_FEATURE_HPT_RESIZE, "hcall-hpt-resize"}, }; /* Build up the firmware features bitmask using the contents of @@ -126,7 +127,7 @@ static void __init fw_vec5_feature_init(const char *vec5, unsigned long len) index = OV5_INDX(vec5_fw_features_table[i].feature); feat = OV5_FEAT(vec5_fw_features_table[i].feature); - if (vec5[index] & feat) + if (index < len && (vec5[index] & feat)) powerpc_firmware_features |= vec5_fw_features_table[i].val; } diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 2617f9f356bd..3381c20edbc0 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -446,9 +446,7 @@ static int dlpar_remove_lmb(struct of_drconf_cell *lmb) /* Update memory regions for memory remove */ memblock_remove(lmb->base_addr, block_sz); - dlpar_release_drc(lmb->drc_index); dlpar_remove_device_tree_lmb(lmb); - return 0; } @@ -516,6 +514,7 @@ static int dlpar_memory_remove_by_count(u32 lmbs_to_remove, if (!lmbs[i].reserved) continue; + dlpar_release_drc(lmbs[i].drc_index); pr_info("Memory at %llx was hot-removed\n", lmbs[i].base_addr); @@ -545,6 +544,9 @@ static int dlpar_memory_remove_by_index(u32 drc_index, struct property *prop) if (lmbs[i].drc_index == drc_index) { lmb_found = 1; rc = dlpar_remove_lmb(&lmbs[i]); + if (!rc) + dlpar_release_drc(lmbs[i].drc_index); + break; } } @@ -561,6 +563,44 @@ static int 
dlpar_memory_remove_by_index(u32 drc_index, struct property *prop) return rc; } +static int dlpar_memory_readd_by_index(u32 drc_index, struct property *prop) +{ + struct of_drconf_cell *lmbs; + u32 num_lmbs, *p; + int lmb_found; + int i, rc; + + pr_info("Attempting to update LMB, drc index %x\n", drc_index); + + p = prop->value; + num_lmbs = *p++; + lmbs = (struct of_drconf_cell *)p; + + lmb_found = 0; + for (i = 0; i < num_lmbs; i++) { + if (lmbs[i].drc_index == drc_index) { + lmb_found = 1; + rc = dlpar_remove_lmb(&lmbs[i]); + if (!rc) { + rc = dlpar_add_lmb(&lmbs[i]); + if (rc) + dlpar_release_drc(lmbs[i].drc_index); + } + break; + } + } + + if (!lmb_found) + rc = -EINVAL; + + if (rc) + pr_info("Failed to update memory at %llx\n", + lmbs[i].base_addr); + else + pr_info("Memory at %llx was updated\n", lmbs[i].base_addr); + + return rc; +} #else static inline int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) @@ -599,10 +639,6 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) if (lmb->flags & DRCONF_MEM_ASSIGNED) return -EINVAL; - rc = dlpar_acquire_drc(lmb->drc_index); - if (rc) - return rc; - rc = dlpar_add_device_tree_lmb(lmb); if (rc) { pr_err("Couldn't update device tree for drc index %x\n", @@ -618,12 +654,10 @@ static int dlpar_add_lmb(struct of_drconf_cell *lmb) /* Add the memory */ rc = add_memory(nid, lmb->base_addr, block_sz); - if (rc) { + if (rc) dlpar_remove_device_tree_lmb(lmb); - dlpar_release_drc(lmb->drc_index); - } else { + else lmb->flags |= DRCONF_MEM_ASSIGNED; - } return rc; } @@ -655,10 +689,16 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) return -EINVAL; for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) { - rc = dlpar_add_lmb(&lmbs[i]); + rc = dlpar_acquire_drc(lmbs[i].drc_index); if (rc) continue; + rc = dlpar_add_lmb(&lmbs[i]); + if (rc) { + dlpar_release_drc(lmbs[i].drc_index); + continue; + } + lmbs_added++; /* Mark this lmb so we can remove it later if all of the @@ -678,6 +718,8 @@ static int dlpar_memory_add_by_count(u32 lmbs_to_add, struct property *prop) if (rc) pr_err("Failed to remove LMB, drc index %x\n", be32_to_cpu(lmbs[i].drc_index)); + else + dlpar_release_drc(lmbs[i].drc_index); } rc = -EINVAL; } else { @@ -711,7 +753,13 @@ static int dlpar_memory_add_by_index(u32 drc_index, struct property *prop) for (i = 0; i < num_lmbs; i++) { if (lmbs[i].drc_index == drc_index) { lmb_found = 1; - rc = dlpar_add_lmb(&lmbs[i]); + rc = dlpar_acquire_drc(lmbs[i].drc_index); + if (!rc) { + rc = dlpar_add_lmb(&lmbs[i]); + if (rc) + dlpar_release_drc(lmbs[i].drc_index); + } + break; } } @@ -769,6 +817,9 @@ int dlpar_memory(struct pseries_hp_errorlog *hp_elog) else rc = -EINVAL; break; + case PSERIES_HP_ELOG_ACTION_READD: + rc = dlpar_memory_readd_by_index(drc_index, prop); + break; default: pr_err("Invalid action (%d) specified\n", hp_elog->action); rc = -EINVAL; diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index 5dc1c3c6e716..251060cf1713 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -27,6 +27,8 @@ #include <linux/console.h> #include <linux/export.h> #include <linux/jump_label.h> +#include <linux/delay.h> +#include <linux/stop_machine.h> #include <asm/processor.h> #include <asm/mmu.h> #include <asm/page.h> @@ -609,6 +611,135 @@ static int __init disable_bulk_remove(char *str) __setup("bulk_remove=", disable_bulk_remove); +#define HPT_RESIZE_TIMEOUT 10000 /* ms */ + +struct hpt_resize_state { + 
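+	/* requested HPT size as a power-of-2 shift; commit_rc carries
+	 * the H_RESIZE_HPT_COMMIT return code out of stop_machine() */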
unsigned long shift; + int commit_rc; +}; + +static int pseries_lpar_resize_hpt_commit(void *data) +{ + struct hpt_resize_state *state = data; + + state->commit_rc = plpar_resize_hpt_commit(0, state->shift); + if (state->commit_rc != H_SUCCESS) + return -EIO; + + /* Hypervisor has transitioned the HTAB, update our globals */ + ppc64_pft_size = state->shift; + htab_size_bytes = 1UL << ppc64_pft_size; + htab_hash_mask = (htab_size_bytes >> 7) - 1; + + return 0; +} + +/* Must be called in user context */ +static int pseries_lpar_resize_hpt(unsigned long shift) +{ + struct hpt_resize_state state = { + .shift = shift, + .commit_rc = H_FUNCTION, + }; + unsigned int delay, total_delay = 0; + int rc; + ktime_t t0, t1, t2; + + might_sleep(); + + if (!firmware_has_feature(FW_FEATURE_HPT_RESIZE)) + return -ENODEV; + + printk(KERN_INFO "lpar: Attempting to resize HPT to shift %lu\n", + shift); + + t0 = ktime_get(); + + rc = plpar_resize_hpt_prepare(0, shift); + while (H_IS_LONG_BUSY(rc)) { + delay = get_longbusy_msecs(rc); + total_delay += delay; + if (total_delay > HPT_RESIZE_TIMEOUT) { + /* prepare with shift==0 cancels an in-progress resize */ + rc = plpar_resize_hpt_prepare(0, 0); + if (rc != H_SUCCESS) + printk(KERN_WARNING + "lpar: Unexpected error %d cancelling timed out HPT resize\n", + rc); + return -ETIMEDOUT; + } + msleep(delay); + rc = plpar_resize_hpt_prepare(0, shift); + }; + + switch (rc) { + case H_SUCCESS: + /* Continue on */ + break; + + case H_PARAMETER: + return -EINVAL; + case H_RESOURCE: + return -EPERM; + default: + printk(KERN_WARNING + "lpar: Unexpected error %d from H_RESIZE_HPT_PREPARE\n", + rc); + return -EIO; + } + + t1 = ktime_get(); + + rc = stop_machine(pseries_lpar_resize_hpt_commit, &state, NULL); + + t2 = ktime_get(); + + if (rc != 0) { + switch (state.commit_rc) { + case H_PTEG_FULL: + printk(KERN_WARNING + "lpar: Hash collision while resizing HPT\n"); + return -ENOSPC; + + default: + printk(KERN_WARNING + "lpar: Unexpected error %d from H_RESIZE_HPT_COMMIT\n", + state.commit_rc); + return -EIO; + }; + } + + printk(KERN_INFO + "lpar: HPT resize to shift %lu complete (%lld ms / %lld ms)\n", + shift, (long long) ktime_ms_delta(t1, t0), + (long long) ktime_ms_delta(t2, t1)); + + return 0; +} + +/* Actually only used for radix, so far */ +static int pseries_lpar_register_process_table(unsigned long base, + unsigned long page_size, unsigned long table_size) +{ + long rc; + unsigned long flags = PROC_TABLE_NEW; + + if (radix_enabled()) + flags |= PROC_TABLE_RADIX | PROC_TABLE_GTSE; + for (;;) { + rc = plpar_hcall_norets(H_REGISTER_PROC_TBL, flags, base, + page_size, table_size); + if (!H_IS_LONG_BUSY(rc)) + break; + mdelay(get_longbusy_msecs(rc)); + } + if (rc != H_SUCCESS) { + pr_err("Failed to register process table (rc=%ld)\n", rc); + BUG(); + } + return rc; +} + void __init hpte_init_pseries(void) { mmu_hash_ops.hpte_invalidate = pSeries_lpar_hpte_invalidate; @@ -620,6 +751,13 @@ void __init hpte_init_pseries(void) mmu_hash_ops.flush_hash_range = pSeries_lpar_flush_hash_range; mmu_hash_ops.hpte_clear_all = pseries_hpte_clear_all; mmu_hash_ops.hugepage_invalidate = pSeries_lpar_hugepage_invalidate; + mmu_hash_ops.resize_hpt = pseries_lpar_resize_hpt; +} + +void radix_init_pseries(void) +{ + pr_info("Using radix MMU under hypervisor\n"); + register_process_table = pseries_lpar_register_process_table; } #ifdef CONFIG_PPC_SMLPAR diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index a560a98bcf3b..5a0c7ba429ce 100644 --- 
a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -39,6 +39,7 @@ struct update_props_workarea { #define ADD_DT_NODE 0x03000000 #define MIGRATION_SCOPE (1) +#define PRRN_SCOPE -2 static int mobility_rtas_call(int token, char *buf, s32 scope) { @@ -236,6 +237,35 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) return rc; } +static void prrn_update_node(__be32 phandle) +{ + struct pseries_hp_errorlog *hp_elog; + struct device_node *dn; + + /* + * If a node is found from a the given phandle, the phandle does not + * represent the drc index of an LMB and we can ignore. + */ + dn = of_find_node_by_phandle(be32_to_cpu(phandle)); + if (dn) { + of_node_put(dn); + return; + } + + hp_elog = kzalloc(sizeof(*hp_elog), GFP_KERNEL); + if(!hp_elog) + return; + + hp_elog->resource = PSERIES_HP_ELOG_RESOURCE_MEM; + hp_elog->action = PSERIES_HP_ELOG_ACTION_READD; + hp_elog->id_type = PSERIES_HP_ELOG_ID_DRC_INDEX; + hp_elog->_drc_u.drc_index = phandle; + + queue_hotplug_event(hp_elog, NULL, NULL); + + kfree(hp_elog); +} + int pseries_devicetree_update(s32 scope) { char *rtas_buf; @@ -274,6 +304,10 @@ int pseries_devicetree_update(s32 scope) break; case UPDATE_DT_NODE: update_dt_node(phandle, scope); + + if (scope == PRRN_SCOPE) + prrn_update_node(phandle); + break; case ADD_DT_NODE: drc_index = *data++; diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 7736352f7279..b4d362ed03a1 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -66,6 +66,7 @@ #include <asm/reg.h> #include <asm/plpar_wrappers.h> #include <asm/kexec.h> +#include <asm/isa-bridge.h> #include "pseries.h" diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 3f864c36d847..1be0499f5397 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -1403,7 +1403,7 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr, struct pt_regs regs; while (max_to_print--) { - if (sp < PAGE_OFFSET) { + if (!is_kernel_addr(sp)) { if (sp != 0) printf("SP (%lx) is in userspace\n", sp); break; @@ -1431,12 +1431,12 @@ static void xmon_show_stack(unsigned long sp, unsigned long lr, mread(newsp + LRSAVE_OFFSET, &nextip, sizeof(unsigned long)); if (lr == ip) { - if (lr < PAGE_OFFSET + if (!is_kernel_addr(lr) || (fnstart <= lr && lr < fnend)) printip = 0; } else if (lr == nextip) { printip = 0; - } else if (lr >= PAGE_OFFSET + } else if (is_kernel_addr(lr) && !(fnstart <= lr && lr < fnend)) { printf("[link register ] "); xmon_print_symbol(lr, " ", "\n"); @@ -1496,7 +1496,7 @@ static void print_bug_trap(struct pt_regs *regs) if (regs->msr & MSR_PR) return; /* not in kernel */ addr = regs->nip; /* address of trap instruction */ - if (addr < PAGE_OFFSET) + if (!is_kernel_addr(addr)) return; bug = find_bug(regs->nip); if (bug == NULL) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 0835a37a5f3a..370593006f5f 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -19,7 +19,12 @@ #include <asm/firmware.h> #include <asm/opal.h> #include <asm/runlatch.h> +#include <asm/cpuidle.h> +/* + * Expose only those Hardware idle states via the cpuidle framework + * that have latency value below POWERNV_THRESHOLD_LATENCY_NS. 
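+ * (i.e. 200 us.  States above the threshold are not lost: the powernv
+ * platform code can still use them, e.g. for offlined CPUs; they are
+ * just not offered to the cpuidle governor.)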
+ */ #define POWERNV_THRESHOLD_LATENCY_NS 200000 static struct cpuidle_driver powernv_idle_driver = { @@ -30,7 +35,12 @@ static struct cpuidle_driver powernv_idle_driver = { static int max_idle_state; static struct cpuidle_state *cpuidle_state_table; -static u64 stop_psscr_table[CPUIDLE_STATE_MAX]; +struct stop_psscr_table { + u64 val; + u64 mask; +}; + +static struct stop_psscr_table stop_psscr_table[CPUIDLE_STATE_MAX]; static u64 snooze_timeout; static bool snooze_timeout_en; @@ -102,7 +112,8 @@ static int stop_loop(struct cpuidle_device *dev, int index) { ppc64_runlatch_off(); - power9_idle_stop(stop_psscr_table[index]); + power9_idle_stop(stop_psscr_table[index].val, + stop_psscr_table[index].mask); ppc64_runlatch_on(); return index; } @@ -167,6 +178,25 @@ static int powernv_cpuidle_driver_init(void) return 0; } +static inline void add_powernv_state(int index, const char *name, + unsigned int flags, + int (*idle_fn)(struct cpuidle_device *, + struct cpuidle_driver *, + int), + unsigned int target_residency, + unsigned int exit_latency, + u64 psscr_val, u64 psscr_mask) +{ + strlcpy(powernv_states[index].name, name, CPUIDLE_NAME_LEN); + strlcpy(powernv_states[index].desc, name, CPUIDLE_NAME_LEN); + powernv_states[index].flags = flags; + powernv_states[index].target_residency = target_residency; + powernv_states[index].exit_latency = exit_latency; + powernv_states[index].enter = idle_fn; + stop_psscr_table[index].val = psscr_val; + stop_psscr_table[index].mask = psscr_mask; +} + static int powernv_add_idle_states(void) { struct device_node *power_mgt; @@ -176,7 +206,9 @@ static int powernv_add_idle_states(void) u32 residency_ns[CPUIDLE_STATE_MAX]; u32 flags[CPUIDLE_STATE_MAX]; u64 psscr_val[CPUIDLE_STATE_MAX]; + u64 psscr_mask[CPUIDLE_STATE_MAX]; const char *names[CPUIDLE_STATE_MAX]; + u32 has_stop_states = 0; int i, rc; /* Currently we have snooze statically defined */ @@ -223,19 +255,30 @@ static int powernv_add_idle_states(void) /* * If the idle states use stop instruction, probe for psscr values - * which are necessary to specify required stop level. + * and psscr mask which are necessary to specify required stop level. */ - if (flags[0] & (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP)) + has_stop_states = (flags[0] & + (OPAL_PM_STOP_INST_FAST | OPAL_PM_STOP_INST_DEEP)); + if (has_stop_states) { if (of_property_read_u64_array(power_mgt, "ibm,cpu-idle-state-psscr", psscr_val, dt_idle_states)) { - pr_warn("cpuidle-powernv: missing ibm,cpu-idle-states-psscr in DT\n"); + pr_warn("cpuidle-powernv: missing ibm,cpu-idle-state-psscr in DT\n"); + goto out; + } + + if (of_property_read_u64_array(power_mgt, + "ibm,cpu-idle-state-psscr-mask", + psscr_mask, dt_idle_states)) { + pr_warn("cpuidle-powernv:Missing ibm,cpu-idle-state-psscr-mask in DT\n"); goto out; } + } rc = of_property_read_u32_array(power_mgt, "ibm,cpu-idle-state-residency-ns", residency_ns, dt_idle_states); for (i = 0; i < dt_idle_states; i++) { + unsigned int exit_latency, target_residency; /* * If an idle state has exit latency beyond * POWERNV_THRESHOLD_LATENCY_NS then don't use it @@ -243,28 +286,43 @@ static int powernv_add_idle_states(void) */ if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS) continue; + /* + * Firmware passes residency and latency values in ns. + * cpuidle expects it in us. 
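+ * For example, a firmware-reported exit latency of 100000 ns is
+ * registered with cpuidle as exit_latency = 100 us.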
+ */ + exit_latency = latency_ns[i] / 1000; + if (!rc) + target_residency = residency_ns[i] / 1000; + else + target_residency = 0; + + if (has_stop_states) { + int err = validate_psscr_val_mask(&psscr_val[i], + &psscr_mask[i], + flags[i]); + if (err) { + report_invalid_psscr_val(psscr_val[i], err); + continue; + } + } /* - * Cpuidle accepts exit_latency and target_residency in us. - * Use default target_residency values if f/w does not expose it. + * For nap and fastsleep, use default target_residency + * values if f/w does not expose it. */ if (flags[i] & OPAL_PM_NAP_ENABLED) { + if (!rc) + target_residency = 100; /* Add NAP state */ - strcpy(powernv_states[nr_idle_states].name, "Nap"); - strcpy(powernv_states[nr_idle_states].desc, "Nap"); - powernv_states[nr_idle_states].flags = 0; - powernv_states[nr_idle_states].target_residency = 100; - powernv_states[nr_idle_states].enter = nap_loop; + add_powernv_state(nr_idle_states, "Nap", + CPUIDLE_FLAG_NONE, nap_loop, + target_residency, exit_latency, 0, 0); } else if ((flags[i] & OPAL_PM_STOP_INST_FAST) && !(flags[i] & OPAL_PM_TIMEBASE_STOP)) { - strncpy(powernv_states[nr_idle_states].name, - names[i], CPUIDLE_NAME_LEN); - strncpy(powernv_states[nr_idle_states].desc, - names[i], CPUIDLE_NAME_LEN); - powernv_states[nr_idle_states].flags = 0; - - powernv_states[nr_idle_states].enter = stop_loop; - stop_psscr_table[nr_idle_states] = psscr_val[i]; + add_powernv_state(nr_idle_states, names[i], + CPUIDLE_FLAG_NONE, stop_loop, + target_residency, exit_latency, + psscr_val[i], psscr_mask[i]); } /* @@ -274,32 +332,21 @@ static int powernv_add_idle_states(void) #ifdef CONFIG_TICK_ONESHOT if (flags[i] & OPAL_PM_SLEEP_ENABLED || flags[i] & OPAL_PM_SLEEP_ENABLED_ER1) { + if (!rc) + target_residency = 300000; /* Add FASTSLEEP state */ - strcpy(powernv_states[nr_idle_states].name, "FastSleep"); - strcpy(powernv_states[nr_idle_states].desc, "FastSleep"); - powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIMER_STOP; - powernv_states[nr_idle_states].target_residency = 300000; - powernv_states[nr_idle_states].enter = fastsleep_loop; + add_powernv_state(nr_idle_states, "FastSleep", + CPUIDLE_FLAG_TIMER_STOP, + fastsleep_loop, + target_residency, exit_latency, 0, 0); } else if ((flags[i] & OPAL_PM_STOP_INST_DEEP) && (flags[i] & OPAL_PM_TIMEBASE_STOP)) { - strncpy(powernv_states[nr_idle_states].name, - names[i], CPUIDLE_NAME_LEN); - strncpy(powernv_states[nr_idle_states].desc, - names[i], CPUIDLE_NAME_LEN); - - powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIMER_STOP; - powernv_states[nr_idle_states].enter = stop_loop; - stop_psscr_table[nr_idle_states] = psscr_val[i]; + add_powernv_state(nr_idle_states, names[i], + CPUIDLE_FLAG_TIMER_STOP, stop_loop, + target_residency, exit_latency, + psscr_val[i], psscr_mask[i]); } #endif - powernv_states[nr_idle_states].exit_latency = - ((unsigned int)latency_ns[i]) / 1000; - - if (!rc) { - powernv_states[nr_idle_states].target_residency = - ((unsigned int)residency_ns[i]) / 1000; - } - nr_idle_states++; } out: diff --git a/drivers/macintosh/Kconfig b/drivers/macintosh/Kconfig index 5d80810934df..97a420c11eed 100644 --- a/drivers/macintosh/Kconfig +++ b/drivers/macintosh/Kconfig @@ -30,14 +30,6 @@ config ADB_MACII Quadra 610, Quadra 650, Quadra 700, Quadra 800, Centris 610 and Centris 650. -config ADB_MACIISI - bool "Include Mac IIsi ADB driver" - depends on ADB && MAC && BROKEN - help - Say Y here if want your kernel to support Macintosh systems that use - the Mac IIsi style ADB. 
This includes the IIsi, IIvi, IIvx, Classic - II, LC, LC II, LC III, Performa 460, and the Performa 600. - config ADB_IOP bool "Include IOP (IIfx/Quadra 9x0) ADB driver" depends on ADB && MAC @@ -60,17 +52,15 @@ config ADB_PMU68K # we want to change this to something like CONFIG_SYSCTRL_CUDA/PMU config ADB_CUDA - bool "Support for CUDA based Macs and PowerMacs" + bool "Support for Cuda/Egret based Macs and PowerMacs" depends on (ADB || PPC_PMAC) && !PPC_PMAC64 help - This provides support for CUDA based Macintosh and Power Macintosh - systems. This includes many m68k based Macs (Color Classic, Mac TV, - Performa 475, Performa 520, Performa 550, Performa 575, - Performa 588, Quadra 605, Quadra 630, Quadra/Centris 660AV, and - Quadra 840AV), most OldWorld PowerMacs, the first generation iMacs, - the Blue&White G3 and the "Yikes" G4 (PCI Graphics). All later - models should use CONFIG_ADB_PMU instead. It is safe to say Y here - even if your machine doesn't have a CUDA. + This provides support for Cuda/Egret based Macintosh and + Power Macintosh systems. This includes most m68k based Macs, + most Old World PowerMacs, the first generation iMacs, the + Blue & White G3 and the "Yikes" G4 (PCI Graphics). All later + models should use CONFIG_ADB_PMU instead. It is safe to say Y + here even if your machine doesn't have a Cuda or Egret device. If unsure say Y. diff --git a/drivers/macintosh/Makefile b/drivers/macintosh/Makefile index 383ba920085b..516eb65bcacc 100644 --- a/drivers/macintosh/Makefile +++ b/drivers/macintosh/Makefile @@ -20,7 +20,6 @@ obj-$(CONFIG_PMAC_SMU) += smu.o obj-$(CONFIG_ADB) += adb.o obj-$(CONFIG_ADB_MACII) += via-macii.o -obj-$(CONFIG_ADB_MACIISI) += via-maciisi.o obj-$(CONFIG_ADB_IOP) += adb-iop.o obj-$(CONFIG_ADB_PMU68K) += via-pmu68k.o obj-$(CONFIG_ADB_MACIO) += macio-adb.o diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 226179b975a0..152414e6378a 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -48,7 +48,6 @@ EXPORT_SYMBOL(adb_client_list); extern struct adb_driver via_macii_driver; -extern struct adb_driver via_maciisi_driver; extern struct adb_driver via_cuda_driver; extern struct adb_driver adb_iop_driver; extern struct adb_driver via_pmu_driver; @@ -59,9 +58,6 @@ static struct adb_driver *adb_driver_list[] = { #ifdef CONFIG_ADB_MACII &via_macii_driver, #endif -#ifdef CONFIG_ADB_MACIISI - &via_maciisi_driver, -#endif #ifdef CONFIG_ADB_CUDA &via_cuda_driver, #endif diff --git a/drivers/macintosh/via-cuda.c b/drivers/macintosh/via-cuda.c index 2088e23a8002..c60415958dfe 100644 --- a/drivers/macintosh/via-cuda.c +++ b/drivers/macintosh/via-cuda.c @@ -1,10 +1,10 @@ /* - * Device driver for the via-cuda on Apple Powermacs. + * Device driver for the Cuda and Egret system controllers found on PowerMacs + * and 68k Macs. * - * The VIA (versatile interface adapter) interfaces to the CUDA, - * a 6805 microprocessor core which controls the ADB (Apple Desktop - * Bus) which connects to the keyboard and mouse. The CUDA also - * controls system power and the RTC (real time clock) chip. + * The Cuda or Egret is a 6805 microcontroller interfaced to the 6522 VIA. + * This MCU controls system power, Parameter RAM, Real Time Clock and the + * Apple Desktop Bus (ADB) that connects to the keyboard and mouse. * * Copyright (C) 1996 Paul Mackerras. 
*/ @@ -50,10 +50,27 @@ static DEFINE_SPINLOCK(cuda_lock); #define IER (14*RS) /* Interrupt enable register */ #define ANH (15*RS) /* A-side data, no handshake */ -/* Bits in B data register: all active low */ -#define TREQ 0x08 /* Transfer request (input) */ -#define TACK 0x10 /* Transfer acknowledge (output) */ -#define TIP 0x20 /* Transfer in progress (output) */ +/* + * When the Cuda design replaced the Egret, some signal names and + * logic sense changed. They all serve the same purposes, however. + * + * VIA pin | Egret pin + * ----------------+------------------------------------------ + * PB3 (input) | Transceiver session (active low) + * PB4 (output) | VIA full (active high) + * PB5 (output) | System session (active high) + * + * VIA pin | Cuda pin + * ----------------+------------------------------------------ + * PB3 (input) | Transfer request (active low) + * PB4 (output) | Byte acknowledge (active low) + * PB5 (output) | Transfer in progress (active low) + */ + +/* Bits in Port B data register */ +#define TREQ 0x08 /* Transfer request */ +#define TACK 0x10 /* Transfer acknowledge */ +#define TIP 0x20 /* Transfer in progress */ /* Bits in ACR */ #define SR_CTRL 0x1c /* Shift register control bits */ @@ -65,6 +82,74 @@ static DEFINE_SPINLOCK(cuda_lock); #define IER_CLR 0 /* clear bits in IER */ #define SR_INT 0x04 /* Shift register full/empty */ +/* Duration of byte acknowledgement pulse (us) */ +#define EGRET_TACK_ASSERTED_DELAY 300 +#define EGRET_TACK_NEGATED_DELAY 400 + +/* Interval from interrupt to start of session (us) */ +#define EGRET_SESSION_DELAY 450 + +#ifdef CONFIG_PPC +#define mcu_is_egret false +#else +static bool mcu_is_egret; +#endif + +static inline bool TREQ_asserted(u8 portb) +{ + return !(portb & TREQ); +} + +static inline void assert_TIP(void) +{ + if (mcu_is_egret) { + udelay(EGRET_SESSION_DELAY); + out_8(&via[B], in_8(&via[B]) | TIP); + } else + out_8(&via[B], in_8(&via[B]) & ~TIP); +} + +static inline void assert_TIP_and_TACK(void) +{ + if (mcu_is_egret) { + udelay(EGRET_SESSION_DELAY); + out_8(&via[B], in_8(&via[B]) | TIP | TACK); + } else + out_8(&via[B], in_8(&via[B]) & ~(TIP | TACK)); +} + +static inline void assert_TACK(void) +{ + if (mcu_is_egret) { + udelay(EGRET_TACK_NEGATED_DELAY); + out_8(&via[B], in_8(&via[B]) | TACK); + } else + out_8(&via[B], in_8(&via[B]) & ~TACK); +} + +static inline void toggle_TACK(void) +{ + out_8(&via[B], in_8(&via[B]) ^ TACK); +} + +static inline void negate_TACK(void) +{ + if (mcu_is_egret) { + udelay(EGRET_TACK_ASSERTED_DELAY); + out_8(&via[B], in_8(&via[B]) & ~TACK); + } else + out_8(&via[B], in_8(&via[B]) | TACK); +} + +static inline void negate_TIP_and_TACK(void) +{ + if (mcu_is_egret) { + udelay(EGRET_TACK_ASSERTED_DELAY); + out_8(&via[B], in_8(&via[B]) & ~(TIP | TACK)); + } else + out_8(&via[B], in_8(&via[B]) | TIP | TACK); +} + static enum cuda_state { idle, sent_first_byte, @@ -120,11 +205,13 @@ int __init find_via_cuda(void) struct adb_request req; int err; - if (macintosh_config->adb_type != MAC_ADB_CUDA) + if (macintosh_config->adb_type != MAC_ADB_CUDA && + macintosh_config->adb_type != MAC_ADB_EGRET) return 0; via = via1; cuda_state = idle; + mcu_is_egret = macintosh_config->adb_type == MAC_ADB_EGRET; err = cuda_init_via(); if (err) { @@ -221,7 +308,7 @@ static int __init via_cuda_start(void) return -EAGAIN; } - printk("Macintosh CUDA driver v0.5 for Unified ADB.\n"); + pr_info("Macintosh Cuda and Egret driver.\n"); cuda_fully_inited = 1; return 0; @@ -237,7 +324,8 @@ cuda_probe(void) if (sys_ctrler != 
SYS_CTRLER_CUDA) return -ENODEV; #else - if (macintosh_config->adb_type != MAC_ADB_CUDA) + if (macintosh_config->adb_type != MAC_ADB_CUDA && + macintosh_config->adb_type != MAC_ADB_EGRET) return -ENODEV; #endif if (via == NULL) @@ -246,12 +334,39 @@ cuda_probe(void) } #endif /* CONFIG_ADB */ +static int __init sync_egret(void) +{ + if (TREQ_asserted(in_8(&via[B]))) { + /* Complete the inbound transfer */ + assert_TIP_and_TACK(); + while (1) { + negate_TACK(); + mdelay(1); + (void)in_8(&via[SR]); + assert_TACK(); + if (!TREQ_asserted(in_8(&via[B]))) + break; + } + negate_TIP_and_TACK(); + } else if (in_8(&via[B]) & TIP) { + /* Terminate the outbound transfer */ + negate_TACK(); + assert_TACK(); + mdelay(1); + negate_TIP_and_TACK(); + } + /* Clear shift register interrupt */ + if (in_8(&via[IFR]) & SR_INT) + (void)in_8(&via[SR]); + return 0; +} + #define WAIT_FOR(cond, what) \ do { \ int x; \ for (x = 1000; !(cond); --x) { \ if (x == 0) { \ - printk("Timeout waiting for " what "\n"); \ + pr_err("Timeout waiting for " what "\n"); \ return -ENXIO; \ } \ udelay(100); \ @@ -261,10 +376,6 @@ cuda_probe(void) static int __init cuda_init_via(void) { - out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ); /* TACK & TIP out */ - out_8(&via[B], in_8(&via[B]) | TACK | TIP); /* negate them */ - out_8(&via[ACR] ,(in_8(&via[ACR]) & ~SR_CTRL) | SR_EXT); /* SR data in */ - (void)in_8(&via[SR]); /* clear any left-over data */ #ifdef CONFIG_PPC out_8(&via[IER], 0x7f); /* disable interrupts from VIA */ (void)in_8(&via[IER]); @@ -272,16 +383,25 @@ __init cuda_init_via(void) out_8(&via[IER], SR_INT); /* disable SR interrupt from VIA */ #endif + out_8(&via[DIRB], (in_8(&via[DIRB]) | TACK | TIP) & ~TREQ); /* TACK & TIP out */ + out_8(&via[ACR], (in_8(&via[ACR]) & ~SR_CTRL) | SR_EXT); /* SR data in */ + (void)in_8(&via[SR]); /* clear any left-over data */ + + if (mcu_is_egret) + return sync_egret(); + + negate_TIP_and_TACK(); + /* delay 4ms and then clear any pending interrupt */ mdelay(4); (void)in_8(&via[SR]); out_8(&via[IFR], SR_INT); /* sync with the CUDA - assert TACK without TIP */ - out_8(&via[B], in_8(&via[B]) & ~TACK); + assert_TACK(); /* wait for the CUDA to assert TREQ in response */ - WAIT_FOR((in_8(&via[B]) & TREQ) == 0, "CUDA response to sync"); + WAIT_FOR(TREQ_asserted(in_8(&via[B])), "CUDA response to sync"); /* wait for the interrupt and then clear it */ WAIT_FOR(in_8(&via[IFR]) & SR_INT, "CUDA response to sync (2)"); @@ -289,14 +409,13 @@ __init cuda_init_via(void) out_8(&via[IFR], SR_INT); /* finish the sync by negating TACK */ - out_8(&via[B], in_8(&via[B]) | TACK); + negate_TACK(); /* wait for the CUDA to negate TREQ and the corresponding interrupt */ - WAIT_FOR(in_8(&via[B]) & TREQ, "CUDA response to sync (3)"); + WAIT_FOR(!TREQ_asserted(in_8(&via[B])), "CUDA response to sync (3)"); WAIT_FOR(in_8(&via[IFR]) & SR_INT, "CUDA response to sync (4)"); (void)in_8(&via[SR]); out_8(&via[IFR], SR_INT); - out_8(&via[B], in_8(&via[B]) | TIP); /* should be unnecessary */ return 0; } @@ -357,6 +476,7 @@ cuda_reset_adb_bus(void) return 0; } #endif /* CONFIG_ADB */ + /* Construct and send a cuda request */ int cuda_request(struct adb_request *req, void (*done)(struct adb_request *), @@ -413,47 +533,43 @@ cuda_write(struct adb_request *req) static void cuda_start(void) { - struct adb_request *req; - /* assert cuda_state == idle */ - /* get the packet to send */ - req = current_req; - if (req == 0) + if (current_req == NULL) return; - if ((in_8(&via[B]) & TREQ) == 0) + data_index = 0; + if 
(TREQ_asserted(in_8(&via[B]))) return; /* a byte is coming in from the CUDA */ /* set the shift register to shift out and send a byte */ out_8(&via[ACR], in_8(&via[ACR]) | SR_OUT); - out_8(&via[SR], req->data[0]); - out_8(&via[B], in_8(&via[B]) & ~TIP); + out_8(&via[SR], current_req->data[data_index++]); + if (mcu_is_egret) + assert_TIP_and_TACK(); + else + assert_TIP(); cuda_state = sent_first_byte; } void cuda_poll(void) { - /* cuda_interrupt only takes a normal lock, we disable - * interrupts here to avoid re-entering and thus deadlocking. - */ - if (cuda_irq) - disable_irq(cuda_irq); - cuda_interrupt(0, NULL); - if (cuda_irq) - enable_irq(cuda_irq); + cuda_interrupt(0, NULL); } EXPORT_SYMBOL(cuda_poll); +#define ARRAY_FULL(a, p) ((p) - (a) == ARRAY_SIZE(a)) + static irqreturn_t cuda_interrupt(int irq, void *arg) { - int status; + unsigned long flags; + u8 status; struct adb_request *req = NULL; unsigned char ibuf[16]; int ibuf_len = 0; int complete = 0; - spin_lock(&cuda_lock); + spin_lock_irqsave(&cuda_lock, flags); /* On powermacs, this handler is registered for the VIA IRQ. But they use * just the shift register IRQ -- other VIA interrupt sources are disabled. @@ -466,52 +582,50 @@ cuda_interrupt(int irq, void *arg) #endif { if ((in_8(&via[IFR]) & SR_INT) == 0) { - spin_unlock(&cuda_lock); + spin_unlock_irqrestore(&cuda_lock, flags); return IRQ_NONE; } else { out_8(&via[IFR], SR_INT); } } - - status = (~in_8(&via[B]) & (TIP|TREQ)) | (in_8(&via[ACR]) & SR_OUT); - /* printk("cuda_interrupt: state=%d status=%x\n", cuda_state, status); */ + + status = in_8(&via[B]) & (TIP | TACK | TREQ); + switch (cuda_state) { case idle: - /* CUDA has sent us the first byte of data - unsolicited */ - if (status != TREQ) - printk("cuda: state=idle, status=%x\n", status); + /* System controller has unsolicited data for us */ (void)in_8(&via[SR]); - out_8(&via[B], in_8(&via[B]) & ~TIP); +idle_state: + assert_TIP(); cuda_state = reading; reply_ptr = cuda_rbuf; reading_reply = 0; break; case awaiting_reply: - /* CUDA has sent us the first byte of data of a reply */ - if (status != TREQ) - printk("cuda: state=awaiting_reply, status=%x\n", status); + /* System controller has reply data for us */ (void)in_8(&via[SR]); - out_8(&via[B], in_8(&via[B]) & ~TIP); + assert_TIP(); cuda_state = reading; reply_ptr = current_req->reply; reading_reply = 1; break; case sent_first_byte: - if (status == TREQ + TIP + SR_OUT) { + if (TREQ_asserted(status)) { /* collision */ out_8(&via[ACR], in_8(&via[ACR]) & ~SR_OUT); (void)in_8(&via[SR]); - out_8(&via[B], in_8(&via[B]) | TIP | TACK); + negate_TIP_and_TACK(); cuda_state = idle; + /* Egret does not raise an "aborted" interrupt */ + if (mcu_is_egret) + goto idle_state; } else { - /* assert status == TIP + SR_OUT */ - if (status != TIP + SR_OUT) - printk("cuda: state=sent_first_byte status=%x\n", status); - out_8(&via[SR], current_req->data[1]); - out_8(&via[B], in_8(&via[B]) ^ TACK); - data_index = 2; + out_8(&via[SR], current_req->data[data_index++]); + toggle_TACK(); + if (mcu_is_egret) + assert_TACK(); cuda_state = sending; } break; @@ -521,7 +635,7 @@ cuda_interrupt(int irq, void *arg) if (data_index >= req->nbytes) { out_8(&via[ACR], in_8(&via[ACR]) & ~SR_OUT); (void)in_8(&via[SR]); - out_8(&via[B], in_8(&via[B]) | TACK | TIP); + negate_TIP_and_TACK(); req->sent = 1; if (req->reply_expected) { cuda_state = awaiting_reply; @@ -534,26 +648,37 @@ cuda_interrupt(int irq, void *arg) } } else { out_8(&via[SR], req->data[data_index++]); - out_8(&via[B], in_8(&via[B]) ^ TACK); + 
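+			/* Cuda acks each byte by toggling TACK; Egret
+			 * expects a distinct pulse, hence the extra
+			 * assert_TACK() below. */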
toggle_TACK(); + if (mcu_is_egret) + assert_TACK(); } break; case reading: - *reply_ptr++ = in_8(&via[SR]); - if (status == TIP) { + if (reading_reply ? ARRAY_FULL(current_req->reply, reply_ptr) + : ARRAY_FULL(cuda_rbuf, reply_ptr)) + (void)in_8(&via[SR]); + else + *reply_ptr++ = in_8(&via[SR]); + if (!TREQ_asserted(status)) { + if (mcu_is_egret) + assert_TACK(); /* that's all folks */ - out_8(&via[B], in_8(&via[B]) | TACK | TIP); + negate_TIP_and_TACK(); cuda_state = read_done; + /* Egret does not raise a "read done" interrupt */ + if (mcu_is_egret) + goto read_done_state; } else { - /* assert status == TIP | TREQ */ - if (status != TIP + TREQ) - printk("cuda: state=reading status=%x\n", status); - out_8(&via[B], in_8(&via[B]) ^ TACK); + toggle_TACK(); + if (mcu_is_egret) + negate_TACK(); } break; case read_done: (void)in_8(&via[SR]); +read_done_state: if (reading_reply) { req = current_req; req->reply_len = reply_ptr - req->reply; @@ -570,6 +695,7 @@ cuda_interrupt(int irq, void *arg) } current_req = req->next; complete = 1; + reading_reply = 0; } else { /* This is tricky. We must break the spinlock to call * cuda_input. However, doing so means we might get @@ -581,21 +707,19 @@ cuda_interrupt(int irq, void *arg) ibuf_len = reply_ptr - cuda_rbuf; memcpy(ibuf, cuda_rbuf, ibuf_len); } - if (status == TREQ) { - out_8(&via[B], in_8(&via[B]) & ~TIP); + reply_ptr = cuda_rbuf; + cuda_state = idle; + cuda_start(); + if (cuda_state == idle && TREQ_asserted(in_8(&via[B]))) { + assert_TIP(); cuda_state = reading; - reply_ptr = cuda_rbuf; - reading_reply = 0; - } else { - cuda_state = idle; - cuda_start(); } break; default: - printk("cuda_interrupt: unknown cuda_state %d?\n", cuda_state); + pr_err("cuda_interrupt: unknown cuda_state %d?\n", cuda_state); } - spin_unlock(&cuda_lock); + spin_unlock_irqrestore(&cuda_lock, flags); if (complete && req) { void (*done)(struct adb_request *) = req->done; mb(); @@ -614,8 +738,6 @@ cuda_interrupt(int irq, void *arg) static void cuda_input(unsigned char *buf, int nb) { - int i; - switch (buf[0]) { case ADB_PACKET: #ifdef CONFIG_XMON @@ -632,10 +754,14 @@ cuda_input(unsigned char *buf, int nb) #endif /* CONFIG_ADB */ break; + case TIMER_PACKET: + /* Egret sends these periodically. Might be useful as a 'heartbeat' + * to trigger a recovery for the VIA shift register errata. + */ + break; + default: - printk("data from cuda (%d bytes):", nb); - for (i = 0; i < nb; ++i) - printk(" %.2x", buf[i]); - printk("\n"); + print_hex_dump(KERN_INFO, "cuda_input: ", DUMP_PREFIX_NONE, 32, 1, + buf, nb, false); } } diff --git a/drivers/macintosh/via-maciisi.c b/drivers/macintosh/via-maciisi.c deleted file mode 100644 index 34d02a91b29f..000000000000 --- a/drivers/macintosh/via-maciisi.c +++ /dev/null @@ -1,677 +0,0 @@ -/* - * Device driver for the IIsi-style ADB on some Mac LC and II-class machines - * - * Based on via-cuda.c and via-macii.c, as well as the original - * adb-bus.c, which in turn is somewhat influenced by (but uses no - * code from) the NetBSD HWDIRECT ADB code. Original IIsi driver work - * was done by Robert Thompson and integrated into the old style - * driver by Michael Schmitz. - * - * Original sources (c) Alan Cox, Paul Mackerras, and others. - * - * Rewritten for Unified ADB by David Huggins-Daines <dhd@debian.org> - * - * 7/13/2000- extensive changes by Andrew McPherson <andrew@macduff.dhs.org> - * Works about 30% of the time now. 
- */ - -#include <linux/types.h> -#include <linux/errno.h> -#include <linux/kernel.h> -#include <linux/adb.h> -#include <linux/cuda.h> -#include <linux/delay.h> -#include <linux/interrupt.h> -#include <asm/macintosh.h> -#include <asm/macints.h> -#include <asm/mac_via.h> - -static volatile unsigned char *via; - -/* VIA registers - spaced 0x200 bytes apart - only the ones we actually use */ -#define RS 0x200 /* skip between registers */ -#define B 0 /* B-side data */ -#define A RS /* A-side data */ -#define DIRB (2*RS) /* B-side direction (1=output) */ -#define DIRA (3*RS) /* A-side direction (1=output) */ -#define SR (10*RS) /* Shift register */ -#define ACR (11*RS) /* Auxiliary control register */ -#define IFR (13*RS) /* Interrupt flag register */ -#define IER (14*RS) /* Interrupt enable register */ - -/* Bits in B data register: all active low */ -#define TREQ 0x08 /* Transfer request (input) */ -#define TACK 0x10 /* Transfer acknowledge (output) */ -#define TIP 0x20 /* Transfer in progress (output) */ -#define ST_MASK 0x30 /* mask for selecting ADB state bits */ - -/* Bits in ACR */ -#define SR_CTRL 0x1c /* Shift register control bits */ -#define SR_EXT 0x0c /* Shift on external clock */ -#define SR_OUT 0x10 /* Shift out if 1 */ - -/* Bits in IFR and IER */ -#define IER_SET 0x80 /* set bits in IER */ -#define IER_CLR 0 /* clear bits in IER */ -#define SR_INT 0x04 /* Shift register full/empty */ -#define SR_DATA 0x08 /* Shift register data */ -#define SR_CLOCK 0x10 /* Shift register clock */ - -#define ADB_DELAY 150 - -#undef DEBUG_MACIISI_ADB - -static struct adb_request* current_req; -static struct adb_request* last_req; -static unsigned char maciisi_rbuf[16]; -static unsigned char *reply_ptr; -static int data_index; -static int reading_reply; -static int reply_len; -static int tmp; -static int need_sync; - -static enum maciisi_state { - idle, - sending, - reading, -} maciisi_state; - -static int maciisi_probe(void); -static int maciisi_init(void); -static int maciisi_send_request(struct adb_request* req, int sync); -static void maciisi_sync(struct adb_request *req); -static int maciisi_write(struct adb_request* req); -static irqreturn_t maciisi_interrupt(int irq, void* arg); -static void maciisi_input(unsigned char *buf, int nb); -static int maciisi_init_via(void); -static void maciisi_poll(void); -static int maciisi_start(void); - -struct adb_driver via_maciisi_driver = { - "Mac IIsi", - maciisi_probe, - maciisi_init, - maciisi_send_request, - NULL, /* maciisi_adb_autopoll, */ - maciisi_poll, - NULL /* maciisi_reset_adb_bus */ -}; - -static int -maciisi_probe(void) -{ - if (macintosh_config->adb_type != MAC_ADB_IISI) - return -ENODEV; - - via = via1; - return 0; -} - -static int -maciisi_init(void) -{ - int err; - - if (via == NULL) - return -ENODEV; - - if ((err = maciisi_init_via())) { - printk(KERN_ERR "maciisi_init: maciisi_init_via() failed, code %d\n", err); - via = NULL; - return err; - } - - if (request_irq(IRQ_MAC_ADB, maciisi_interrupt, 0, "ADB", - maciisi_interrupt)) { - printk(KERN_ERR "maciisi_init: can't get irq %d\n", IRQ_MAC_ADB); - return -EAGAIN; - } - - printk("adb: Mac IIsi driver v0.2 for Unified ADB.\n"); - return 0; -} - -/* Flush data from the ADB controller */ -static void -maciisi_stfu(void) -{ - int status = via[B] & (TIP|TREQ); - - if (status & TREQ) { -#ifdef DEBUG_MACIISI_ADB - printk (KERN_DEBUG "maciisi_stfu called with TREQ high!\n"); -#endif - return; - } - - udelay(ADB_DELAY); - via[ACR] &= ~SR_OUT; - via[IER] = IER_CLR | SR_INT; - - 
udelay(ADB_DELAY); - - status = via[B] & (TIP|TREQ); - - if (!(status & TREQ)) - { - via[B] |= TIP; - - while(1) - { - int poll_timeout = ADB_DELAY * 5; - /* Poll for SR interrupt */ - while (!(via[IFR] & SR_INT) && poll_timeout-- > 0) - status = via[B] & (TIP|TREQ); - - tmp = via[SR]; /* Clear shift register */ -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "maciisi_stfu: status %x timeout %d data %x\n", - status, poll_timeout, tmp); -#endif - if(via[B] & TREQ) - break; - - /* ACK on-off */ - via[B] |= TACK; - udelay(ADB_DELAY); - via[B] &= ~TACK; - } - - /* end frame */ - via[B] &= ~TIP; - udelay(ADB_DELAY); - } - - via[IER] = IER_SET | SR_INT; -} - -/* All specifically VIA-related initialization goes here */ -static int -maciisi_init_via(void) -{ - int i; - - /* Set the lines up. We want TREQ as input TACK|TIP as output */ - via[DIRB] = (via[DIRB] | TACK | TIP) & ~TREQ; - /* Shift register on input */ - via[ACR] = (via[ACR] & ~SR_CTRL) | SR_EXT; -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "maciisi_init_via: initial status %x\n", via[B] & (TIP|TREQ)); -#endif - /* Wipe any pending data and int */ - tmp = via[SR]; - /* Enable keyboard interrupts */ - via[IER] = IER_SET | SR_INT; - /* Set initial state: idle */ - via[B] &= ~(TACK|TIP); - /* Clear interrupt bit */ - via[IFR] = SR_INT; - - for(i = 0; i < 60; i++) { - udelay(ADB_DELAY); - maciisi_stfu(); - udelay(ADB_DELAY); - if(via[B] & TREQ) - break; - } - if (i == 60) - printk(KERN_ERR "maciisi_init_via: bus jam?\n"); - - maciisi_state = idle; - need_sync = 0; - - return 0; -} - -/* Send a request, possibly waiting for a reply */ -static int -maciisi_send_request(struct adb_request* req, int sync) -{ - int i; - -#ifdef DEBUG_MACIISI_ADB - static int dump_packet = 0; -#endif - - if (via == NULL) { - req->complete = 1; - return -ENXIO; - } - -#ifdef DEBUG_MACIISI_ADB - if (dump_packet) { - printk(KERN_DEBUG "maciisi_send_request:"); - for (i = 0; i < req->nbytes; i++) { - printk(" %.2x", req->data[i]); - } - printk(" sync %d\n", sync); - } -#endif - - req->reply_expected = 1; - - i = maciisi_write(req); - if (i) - { - /* Normally, if a packet requires syncing, that happens at the end of - * maciisi_send_request. But if the transfer fails, it will be restarted - * by maciisi_interrupt(). We use need_sync to tell maciisi_interrupt - * when to sync a packet that it sends out. - * - * Suggestions on a better way to do this are welcome. - */ - if(i == -EBUSY && sync) - need_sync = 1; - else - need_sync = 0; - return i; - } - if(sync) - maciisi_sync(req); - - return 0; -} - -/* Poll the ADB chip until the request completes */ -static void maciisi_sync(struct adb_request *req) -{ - int count = 0; - -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "maciisi_sync called\n"); -#endif - - /* If for some reason the ADB chip shuts up on us, we want to avoid an endless loop. */ - while (!req->complete && count++ < 50) { - maciisi_poll(); - } - /* This could be BAD... when the ADB controller doesn't respond - * for this long, it's probably not coming back :-( */ - if (count > 50) /* Hopefully shouldn't happen */ - printk(KERN_ERR "maciisi_send_request: poll timed out!\n"); -} - -int -maciisi_request(struct adb_request *req, void (*done)(struct adb_request *), - int nbytes, ...) 
-{ - va_list list; - int i; - - req->nbytes = nbytes; - req->done = done; - req->reply_expected = 0; - va_start(list, nbytes); - for (i = 0; i < nbytes; i++) - req->data[i++] = va_arg(list, int); - va_end(list); - - return maciisi_send_request(req, 1); -} - -/* Enqueue a request, and run the queue if possible */ -static int -maciisi_write(struct adb_request* req) -{ - unsigned long flags; - int i; - - /* We will accept CUDA packets - the VIA sends them to us, so - it figures that we should be able to send them to it */ - if (req->nbytes < 2 || req->data[0] > CUDA_PACKET) { - printk(KERN_ERR "maciisi_write: packet too small or not an ADB or CUDA packet\n"); - req->complete = 1; - return -EINVAL; - } - req->next = NULL; - req->sent = 0; - req->complete = 0; - req->reply_len = 0; - - local_irq_save(flags); - - if (current_req) { - last_req->next = req; - last_req = req; - } else { - current_req = req; - last_req = req; - } - if (maciisi_state == idle) - { - i = maciisi_start(); - if(i != 0) - { - local_irq_restore(flags); - return i; - } - } - else - { -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "maciisi_write: would start, but state is %d\n", maciisi_state); -#endif - local_irq_restore(flags); - return -EBUSY; - } - - local_irq_restore(flags); - - return 0; -} - -static int -maciisi_start(void) -{ - struct adb_request* req; - int status; - -#ifdef DEBUG_MACIISI_ADB - status = via[B] & (TIP | TREQ); - - printk(KERN_DEBUG "maciisi_start called, state=%d, status=%x, ifr=%x\n", maciisi_state, status, via[IFR]); -#endif - - if (maciisi_state != idle) { - /* shouldn't happen */ - printk(KERN_ERR "maciisi_start: maciisi_start called when driver busy!\n"); - return -EBUSY; - } - - req = current_req; - if (req == NULL) - return -EINVAL; - - status = via[B] & (TIP|TREQ); - if (!(status & TREQ)) { -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "maciisi_start: bus busy - aborting\n"); -#endif - return -EBUSY; - } - - /* Okay, send */ -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "maciisi_start: sending\n"); -#endif - /* Set state to active */ - via[B] |= TIP; - /* ACK off */ - via[B] &= ~TACK; - /* Delay */ - udelay(ADB_DELAY); - /* Shift out and send */ - via[ACR] |= SR_OUT; - via[SR] = req->data[0]; - data_index = 1; - /* ACK on */ - via[B] |= TACK; - maciisi_state = sending; - - return 0; -} - -void -maciisi_poll(void) -{ - unsigned long flags; - - local_irq_save(flags); - if (via[IFR] & SR_INT) { - maciisi_interrupt(0, NULL); - } - else /* avoid calling this function too quickly in a loop */ - udelay(ADB_DELAY); - - local_irq_restore(flags); -} - -/* Shift register interrupt - this is *supposed* to mean that the - register is either full or empty. 
In practice, I have no idea what - it means :( */ -static irqreturn_t -maciisi_interrupt(int irq, void* arg) -{ - int status; - struct adb_request *req; -#ifdef DEBUG_MACIISI_ADB - static int dump_reply = 0; -#endif - int i; - unsigned long flags; - - local_irq_save(flags); - - status = via[B] & (TIP|TREQ); -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "state %d status %x ifr %x\n", maciisi_state, status, via[IFR]); -#endif - - if (!(via[IFR] & SR_INT)) { - /* Shouldn't happen, we hope */ - printk(KERN_ERR "maciisi_interrupt: called without interrupt flag set\n"); - local_irq_restore(flags); - return IRQ_NONE; - } - - /* Clear the interrupt */ - /* via[IFR] = SR_INT; */ - - switch_start: - switch (maciisi_state) { - case idle: - if (status & TIP) - printk(KERN_ERR "maciisi_interrupt: state is idle but TIP asserted!\n"); - - if(!reading_reply) - udelay(ADB_DELAY); - /* Shift in */ - via[ACR] &= ~SR_OUT; - /* Signal start of frame */ - via[B] |= TIP; - /* Clear the interrupt (throw this value on the floor, it's useless) */ - tmp = via[SR]; - /* ACK adb chip, high-low */ - via[B] |= TACK; - udelay(ADB_DELAY); - via[B] &= ~TACK; - reply_len = 0; - maciisi_state = reading; - if (reading_reply) { - reply_ptr = current_req->reply; - } else { - reply_ptr = maciisi_rbuf; - } - break; - - case sending: - /* via[SR]; */ - /* Set ACK off */ - via[B] &= ~TACK; - req = current_req; - - if (!(status & TREQ)) { - /* collision */ - printk(KERN_ERR "maciisi_interrupt: send collision\n"); - /* Set idle and input */ - via[ACR] &= ~SR_OUT; - tmp = via[SR]; - via[B] &= ~TIP; - /* Must re-send */ - reading_reply = 0; - reply_len = 0; - maciisi_state = idle; - udelay(ADB_DELAY); - /* process this now, because the IFR has been cleared */ - goto switch_start; - } - - udelay(ADB_DELAY); - - if (data_index >= req->nbytes) { - /* Sent the whole packet, put the bus back in idle state */ - /* Shift in, we are about to read a reply (hopefully) */ - via[ACR] &= ~SR_OUT; - tmp = via[SR]; - /* End of frame */ - via[B] &= ~TIP; - req->sent = 1; - maciisi_state = idle; - if (req->reply_expected) { - /* Note: only set this once we've - successfully sent the packet */ - reading_reply = 1; - } else { - current_req = req->next; - if (req->done) - (*req->done)(req); - /* Do any queued requests now */ - i = maciisi_start(); - if(i == 0 && need_sync) { - /* Packet needs to be synced */ - maciisi_sync(current_req); - } - if(i != -EBUSY) - need_sync = 0; - } - } else { - /* Sending more stuff */ - /* Shift out */ - via[ACR] |= SR_OUT; - /* Write */ - via[SR] = req->data[data_index++]; - /* Signal 'byte ready' */ - via[B] |= TACK; - } - break; - - case reading: - /* Shift in */ - /* via[ACR] &= ~SR_OUT; */ /* Not in 2.2 */ - if (reply_len++ > 16) { - printk(KERN_ERR "maciisi_interrupt: reply too long, aborting read\n"); - via[B] |= TACK; - udelay(ADB_DELAY); - via[B] &= ~(TACK|TIP); - maciisi_state = idle; - i = maciisi_start(); - if(i == 0 && need_sync) { - /* Packet needs to be synced */ - maciisi_sync(current_req); - } - if(i != -EBUSY) - need_sync = 0; - break; - } - /* Read data */ - *reply_ptr++ = via[SR]; - status = via[B] & (TIP|TREQ); - /* ACK on/off */ - via[B] |= TACK; - udelay(ADB_DELAY); - via[B] &= ~TACK; - if (!(status & TREQ)) - break; /* more stuff to deal with */ - - /* end of frame */ - via[B] &= ~TIP; - tmp = via[SR]; /* That's what happens in 2.2 */ - udelay(ADB_DELAY); /* Give controller time to recover */ - - /* end of packet, deal with it */ - if (reading_reply) { - req = current_req; - req->reply_len = 
reply_ptr - req->reply; - if (req->data[0] == ADB_PACKET) { - /* Have to adjust the reply from ADB commands */ - if (req->reply_len <= 2 || (req->reply[1] & 2) != 0) { - /* the 0x2 bit indicates no response */ - req->reply_len = 0; - } else { - /* leave just the command and result bytes in the reply */ - req->reply_len -= 2; - memmove(req->reply, req->reply + 2, req->reply_len); - } - } -#ifdef DEBUG_MACIISI_ADB - if (dump_reply) { - int i; - printk(KERN_DEBUG "maciisi_interrupt: reply is "); - for (i = 0; i < req->reply_len; ++i) - printk(" %.2x", req->reply[i]); - printk("\n"); - } -#endif - req->complete = 1; - current_req = req->next; - if (req->done) - (*req->done)(req); - /* Obviously, we got it */ - reading_reply = 0; - } else { - maciisi_input(maciisi_rbuf, reply_ptr - maciisi_rbuf); - } - maciisi_state = idle; - status = via[B] & (TIP|TREQ); - if (!(status & TREQ)) { - /* Timeout?! More likely, another packet coming in already */ -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "extra data after packet: status %x ifr %x\n", - status, via[IFR]); -#endif -#if 0 - udelay(ADB_DELAY); - via[B] |= TIP; - - maciisi_state = reading; - reading_reply = 0; - reply_ptr = maciisi_rbuf; -#else - /* Process the packet now */ - reading_reply = 0; - goto switch_start; -#endif - /* We used to do this... but the controller might actually have data for us */ - /* maciisi_stfu(); */ - } - else { - /* Do any queued requests now if possible */ - i = maciisi_start(); - if(i == 0 && need_sync) { - /* Packet needs to be synced */ - maciisi_sync(current_req); - } - if(i != -EBUSY) - need_sync = 0; - } - break; - - default: - printk("maciisi_interrupt: unknown maciisi_state %d?\n", maciisi_state); - } - local_irq_restore(flags); - return IRQ_HANDLED; -} - -static void -maciisi_input(unsigned char *buf, int nb) -{ -#ifdef DEBUG_MACIISI_ADB - int i; -#endif - - switch (buf[0]) { - case ADB_PACKET: - adb_input(buf+2, nb-2, buf[1] & 0x40); - break; - default: -#ifdef DEBUG_MACIISI_ADB - printk(KERN_DEBUG "data from IIsi ADB (%d bytes):", nb); - for (i = 0; i < nb; ++i) - printk(" %.2x", buf[i]); - printk("\n"); -#endif - break; - } -} diff --git a/drivers/misc/cxl/Makefile b/drivers/misc/cxl/Makefile index 56e9a4732ef0..c14fd6b65b5a 100644 --- a/drivers/misc/cxl/Makefile +++ b/drivers/misc/cxl/Makefile @@ -2,9 +2,10 @@ ccflags-y := $(call cc-disable-warning, unused-const-variable) ccflags-$(CONFIG_PPC_WERROR) += -Werror cxl-y += main.o file.o irq.o fault.o native.o -cxl-y += context.o sysfs.o debugfs.o pci.o trace.o +cxl-y += context.o sysfs.o pci.o trace.o cxl-y += vphb.o phb.o api.o cxl-$(CONFIG_PPC_PSERIES) += flash.o guest.o of.o hcalls.o +cxl-$(CONFIG_DEBUG_FS) += debugfs.o obj-$(CONFIG_CXL) += cxl.o obj-$(CONFIG_CXL_BASE) += base.o diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index 1b35e33d2434..bcc030eacab7 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -11,7 +11,6 @@ #include <linux/slab.h> #include <linux/file.h> #include <misc/cxl.h> -#include <asm/pnv-pci.h> #include <linux/msi.h> #include <linux/module.h> #include <linux/mount.h> diff --git a/drivers/misc/cxl/cxl.h b/drivers/misc/cxl/cxl.h index b24d76723fb0..6c722d96b775 100644 --- a/drivers/misc/cxl/cxl.h +++ b/drivers/misc/cxl/cxl.h @@ -418,6 +418,8 @@ struct cxl_afu { struct dentry *debugfs; struct mutex contexts_lock; spinlock_t afu_cntl_lock; + /* Used to block access to AFU config space while deconfigured */ + struct rw_semaphore configured_rwsem; /* AFU error buffer fields and bin attribute for sysfs */ u64 
eb_len, eb_offset; @@ -800,12 +802,67 @@ int afu_register_irqs(struct cxl_context *ctx, u32 count); void afu_release_irqs(struct cxl_context *ctx, void *cookie); void afu_irq_name_free(struct cxl_context *ctx); +#ifdef CONFIG_DEBUG_FS + int cxl_debugfs_init(void); void cxl_debugfs_exit(void); int cxl_debugfs_adapter_add(struct cxl *adapter); void cxl_debugfs_adapter_remove(struct cxl *adapter); int cxl_debugfs_afu_add(struct cxl_afu *afu); void cxl_debugfs_afu_remove(struct cxl_afu *afu); +void cxl_stop_trace(struct cxl *cxl); +void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir); +void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir); +void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir); + +#else /* CONFIG_DEBUG_FS */ + +static inline int __init cxl_debugfs_init(void) +{ + return 0; +} + +static inline void cxl_debugfs_exit(void) +{ +} + +static inline int cxl_debugfs_adapter_add(struct cxl *adapter) +{ + return 0; +} + +static inline void cxl_debugfs_adapter_remove(struct cxl *adapter) +{ +} + +static inline int cxl_debugfs_afu_add(struct cxl_afu *afu) +{ + return 0; +} + +static inline void cxl_debugfs_afu_remove(struct cxl_afu *afu) +{ +} + +static inline void cxl_stop_trace(struct cxl *cxl) +{ +} + +static inline void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, + struct dentry *dir) +{ +} + +static inline void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, + struct dentry *dir) +{ +} + +static inline void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir) +{ +} + +#endif /* CONFIG_DEBUG_FS */ void cxl_handle_fault(struct work_struct *work); void cxl_prefault(struct cxl_context *ctx, u64 wed); @@ -870,12 +927,8 @@ int cxl_data_cache_flush(struct cxl *adapter); int cxl_afu_disable(struct cxl_afu *afu); int cxl_psl_purge(struct cxl_afu *afu); -void cxl_debugfs_add_adapter_psl_regs(struct cxl *adapter, struct dentry *dir); -void cxl_debugfs_add_adapter_xsl_regs(struct cxl *adapter, struct dentry *dir); -void cxl_debugfs_add_afu_psl_regs(struct cxl_afu *afu, struct dentry *dir); void cxl_native_psl_irq_dump_regs(struct cxl_context *ctx); void cxl_native_err_irq_dump_regs(struct cxl *adapter); -void cxl_stop_trace(struct cxl *cxl); int cxl_pci_vphb_add(struct cxl_afu *afu); void cxl_pci_vphb_remove(struct cxl_afu *afu); void cxl_release_mapping(struct cxl_context *ctx); diff --git a/drivers/misc/cxl/main.c b/drivers/misc/cxl/main.c index 62e0dfb5f15b..2a6bf1d0a3a4 100644 --- a/drivers/misc/cxl/main.c +++ b/drivers/misc/cxl/main.c @@ -268,7 +268,8 @@ struct cxl_afu *cxl_alloc_afu(struct cxl *adapter, int slice) idr_init(&afu->contexts_idr); mutex_init(&afu->contexts_lock); spin_lock_init(&afu->afu_cntl_lock); - + init_rwsem(&afu->configured_rwsem); + down_write(&afu->configured_rwsem); afu->prefault_mode = CXL_PREFAULT_NONE; afu->irqs_max = afu->adapter->user_irqs; diff --git a/drivers/misc/cxl/pci.c b/drivers/misc/cxl/pci.c index 80a87ab25b83..cca938845ffd 100644 --- a/drivers/misc/cxl/pci.c +++ b/drivers/misc/cxl/pci.c @@ -1129,6 +1129,7 @@ static int pci_configure_afu(struct cxl_afu *afu, struct cxl *adapter, struct pc if ((rc = cxl_native_register_psl_irq(afu))) goto err2; + up_write(&afu->configured_rwsem); return 0; err2: @@ -1141,6 +1142,7 @@ err1: static void pci_deconfigure_afu(struct cxl_afu *afu) { + down_write(&afu->configured_rwsem); cxl_native_release_psl_irq(afu); if (afu->adapter->native->sl_ops->release_serr_irq) afu->adapter->native->sl_ops->release_serr_irq(afu); 
@@ -1610,6 +1612,9 @@ static void cxl_pci_remove_adapter(struct cxl *adapter) cxl_sysfs_adapter_remove(adapter); cxl_debugfs_adapter_remove(adapter); + /* Flush adapter datacache as it's about to be removed */ + cxl_data_cache_flush(adapter); + cxl_deconfigure_adapter(adapter); device_unregister(&adapter->dev); diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c index 3519acebfdab..639a343b7836 100644 --- a/drivers/misc/cxl/vphb.c +++ b/drivers/misc/cxl/vphb.c @@ -76,23 +76,22 @@ static int cxl_pcie_cfg_record(u8 bus, u8 devfn) return (bus << 8) + devfn; } -static int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn, - struct cxl_afu **_afu, int *_record) +static inline struct cxl_afu *pci_bus_to_afu(struct pci_bus *bus) { - struct pci_controller *phb; - struct cxl_afu *afu; - int record; + struct pci_controller *phb = bus ? pci_bus_to_host(bus) : NULL; - phb = pci_bus_to_host(bus); - if (phb == NULL) - return PCIBIOS_DEVICE_NOT_FOUND; + return phb ? phb->private_data : NULL; +} + +static inline int cxl_pcie_config_info(struct pci_bus *bus, unsigned int devfn, + struct cxl_afu *afu, int *_record) +{ + int record; - afu = (struct cxl_afu *)phb->private_data; record = cxl_pcie_cfg_record(bus->number, devfn); if (record > afu->crs_num) return PCIBIOS_DEVICE_NOT_FOUND; - *_afu = afu; *_record = record; return 0; } @@ -106,9 +105,14 @@ static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn, u16 val16; u32 val32; - rc = cxl_pcie_config_info(bus, devfn, &afu, &record); + afu = pci_bus_to_afu(bus); + /* Grab a reader lock on afu. */ + if (afu == NULL || !down_read_trylock(&afu->configured_rwsem)) + return PCIBIOS_DEVICE_NOT_FOUND; + + rc = cxl_pcie_config_info(bus, devfn, afu, &record); if (rc) - return rc; + goto out; switch (len) { case 1: @@ -127,10 +131,9 @@ static int cxl_pcie_read_config(struct pci_bus *bus, unsigned int devfn, WARN_ON(1); } - if (rc) - return PCIBIOS_DEVICE_NOT_FOUND; - - return PCIBIOS_SUCCESSFUL; +out: + up_read(&afu->configured_rwsem); + return rc ? PCIBIOS_DEVICE_NOT_FOUND : PCIBIOS_SUCCESSFUL; } static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn, @@ -139,9 +142,14 @@ static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn, int rc, record; struct cxl_afu *afu; - rc = cxl_pcie_config_info(bus, devfn, &afu, &record); + afu = pci_bus_to_afu(bus); + /* Grab a reader lock on afu. */ + if (afu == NULL || !down_read_trylock(&afu->configured_rwsem)) + return PCIBIOS_DEVICE_NOT_FOUND; + + rc = cxl_pcie_config_info(bus, devfn, afu, &record); if (rc) - return rc; + goto out; switch (len) { case 1: @@ -157,10 +165,9 @@ static int cxl_pcie_write_config(struct pci_bus *bus, unsigned int devfn, WARN_ON(1); } - if (rc) - return PCIBIOS_SET_FAILED; - - return PCIBIOS_SUCCESSFUL; +out: + up_read(&afu->configured_rwsem); + return rc ?
PCIBIOS_SET_FAILED : PCIBIOS_SUCCESSFUL; } static struct pci_ops cxl_pcie_pci_ops = diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index da346f2817a8..fc1e5d7fc1c7 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -62,6 +62,7 @@ struct cpuidle_state { }; /* Idle State Flags */ +#define CPUIDLE_FLAG_NONE (0x00) #define CPUIDLE_FLAG_COUPLED (0x02) /* state applies to multiple cpus */ #define CPUIDLE_FLAG_TIMER_STOP (0x04) /* timer is stopped on this state */ diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index cac48eda1075..e0035808c814 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -871,6 +871,8 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_S390_USER_INSTR0 130 #define KVM_CAP_MSI_DEVID 131 #define KVM_CAP_PPC_HTM 132 +#define KVM_CAP_PPC_MMU_RADIX 134 +#define KVM_CAP_PPC_MMU_HASH_V3 135 #ifdef KVM_CAP_IRQ_ROUTING @@ -1187,6 +1189,10 @@ struct kvm_s390_ucas_mapping { #define KVM_ARM_SET_DEVICE_ADDR _IOW(KVMIO, 0xab, struct kvm_arm_device_addr) /* Available with KVM_CAP_PPC_RTAS */ #define KVM_PPC_RTAS_DEFINE_TOKEN _IOW(KVMIO, 0xac, struct kvm_rtas_token_args) +/* Available with KVM_CAP_PPC_MMU_RADIX or KVM_CAP_PPC_MMU_HASH_V3 */ +#define KVM_PPC_CONFIGURE_V3_MMU _IOW(KVMIO, 0xaf, struct kvm_ppc_mmuv3_cfg) +/* Available with KVM_CAP_PPC_MMU_RADIX */ +#define KVM_PPC_GET_RMMU_INFO _IOW(KVMIO, 0xb0, struct kvm_ppc_rmmu_info) /* ioctl for vm fd */ #define KVM_CREATE_DEVICE _IOWR(KVMIO, 0xe0, struct kvm_create_device) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index ebb4dadca66b..699c5bc51a92 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1740,6 +1740,12 @@ void unregister_kprobes(struct kprobe **kps, int num) } EXPORT_SYMBOL_GPL(unregister_kprobes); +int __weak __kprobes kprobe_exceptions_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return NOTIFY_DONE; +} + static struct notifier_block kprobe_exceptions_nb = { .notifier_call = kprobe_exceptions_notify, .priority = 0x7fffffff /* we need to be notified first */ diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index a084f7a511d8..82335533620e 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -8,7 +8,7 @@ ifdef CONFIG_GCC_PLUGINS gcc-plugin-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += latent_entropy_plugin.so gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_LATENT_ENTROPY) += -DLATENT_ENTROPY_PLUGIN - ifdef CONFIG_PAX_LATENT_ENTROPY + ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY DISABLE_LATENT_ENTROPY_PLUGIN += -fplugin-arg-latent_entropy_plugin-disable endif @@ -51,6 +51,14 @@ gcc-plugins-check: FORCE ifdef CONFIG_GCC_PLUGINS ifeq ($(PLUGINCC),) ifneq ($(GCC_PLUGINS_CFLAGS),) + # Various gccs between 4.5 and 5.1 have bugs on powerpc due to missing + # header files. gcc <= 4.6 doesn't work at all, gccs from 4.8 to 5.1 have + # issues with 64-bit targets. + ifeq ($(ARCH),powerpc) + ifeq ($(call cc-ifversion, -le, 0501, y), y) + @echo "Cannot use CONFIG_GCC_PLUGINS: plugin support on gcc <= 5.1 is buggy on powerpc, please upgrade to gcc 5.2 or newer" >&2 && exit 1 + endif + endif ifeq ($(call cc-ifversion, -ge, 0405, y), y) $(Q)$(srctree)/scripts/gcc-plugin.sh --show-error "$(__PLUGINCC)" "$(HOSTCXX)" "$(CC)" || true @echo "Cannot use CONFIG_GCC_PLUGINS: your gcc installation does not support plugins, perhaps the necessary headers are missing?" >&2 && exit 1
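
The via-cuda.c changes above keep the bounded-polling WAIT_FOR() macro (its timeout message now logged via pr_err()) while moving the handshake onto the TREQ_asserted()/assert_TACK()/negate_TACK() helpers. One non-obvious property of that macro is that it may only be expanded inside a function returning int, because on timeout it executes return -ENXIO from the enclosing function. What follows is a minimal, self-contained userspace sketch of the same idiom, with usleep() standing in for udelay() and a dummy device_ready() predicate standing in for the VIA register reads; it illustrates the pattern only and is not kernel code.

#include <stdio.h>
#include <stdbool.h>
#include <unistd.h>
#include <errno.h>

/* Same shape as the driver's WAIT_FOR(): poll the condition up to
 * 1000 times, sleeping ~100us between attempts, and bail out of the
 * *enclosing* function with -ENXIO on timeout. */
#define WAIT_FOR(cond, what) \
do { \
	int x; \
	for (x = 1000; !(cond); --x) { \
		if (x == 0) { \
			fprintf(stderr, "Timeout waiting for " what "\n"); \
			return -ENXIO; \
		} \
		usleep(100); \
	} \
} while (0)

static int countdown = 5;

/* Hypothetical stand-in for TREQ_asserted(in_8(&via[B])). */
static bool device_ready(void)
{
	return --countdown <= 0;
}

/* Like cuda_init_via(), any caller of WAIT_FOR() must return int. */
static int sync_device(void)
{
	WAIT_FOR(device_ready(), "device response to sync");
	return 0;
}

int main(void)
{
	return sync_device() ? 1 : 0;
}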
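
The cxl changes add configured_rwsem to struct cxl_afu: cxl_alloc_afu() initialises it and takes it for writing, pci_configure_afu() releases the write lock once config space is usable, pci_deconfigure_afu() re-takes it, and the vphb config accessors take a non-blocking reader lock with down_read_trylock(), so a config read or write either runs against a fully configured AFU or fails fast with PCIBIOS_DEVICE_NOT_FOUND. Below is a minimal single-threaded userspace analogue of that gating pattern using a POSIX rwlock; every name here (configure(), deconfigure(), config_read(), config_space) is a hypothetical stand-in, not cxl API.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t configured_rwlock = PTHREAD_RWLOCK_INITIALIZER;
static int config_space[64]; /* stands in for the AFU config records */

/* Mirrors cxl_alloc_afu() + pci_configure_afu(): the device starts
 * "deconfigured" (write lock held) and is published to readers by
 * releasing the write lock. */
static void configure(void)
{
	config_space[0] = 0x1014; /* hypothetical vendor ID */
	pthread_rwlock_unlock(&configured_rwlock);
}

/* Mirrors pci_deconfigure_afu(): block new readers and wait for
 * in-flight accesses to drain before tearing anything down. */
static void deconfigure(void)
{
	pthread_rwlock_wrlock(&configured_rwlock);
}

/* Mirrors cxl_pcie_read_config(): must not block, must fail cleanly
 * while the device is deconfigured. */
static int config_read(int reg, int *val)
{
	if (pthread_rwlock_tryrdlock(&configured_rwlock) != 0)
		return -1; /* the driver returns PCIBIOS_DEVICE_NOT_FOUND */
	*val = config_space[reg];
	pthread_rwlock_unlock(&configured_rwlock);
	return 0;
}

int main(void)
{
	int val;

	/* Start deconfigured, as cxl_alloc_afu() does with down_write(). */
	pthread_rwlock_wrlock(&configured_rwlock);

	printf("before configure: %s\n",
	       config_read(0, &val) ? "not found" : "ok");
	configure();
	printf("after configure: %s (val=%#x)\n",
	       config_read(0, &val) ? "not found" : "ok", val);
	deconfigure();
	printf("after deconfigure: %s\n",
	       config_read(0, &val) ? "not found" : "ok");
	return 0;
}

(In the driver the readers run from other threads, which is why the try-lock form matters: a config accessor arriving during deconfiguration must fail immediately rather than sleep.)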
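
The kprobes.c hunk gives kprobe_exceptions_notify() a __weak default that simply returns NOTIFY_DONE, so architectures with nothing to do no longer have to provide an empty implementation; any architecture that does define the function overrides the weak stub at link time. Here is a tiny sketch of that linker-level pattern, assuming GCC/clang attribute syntax; handle_exception() is a hypothetical name, not the kernel symbol.

#include <stdio.h>

/* Weak default, analogous to the new kprobe_exceptions_notify() stub:
 * used only if no strong definition exists anywhere in the link. */
__attribute__((weak)) int handle_exception(int val)
{
	printf("default handler: ignoring event %d\n", val);
	return 0; /* plays the role of NOTIFY_DONE */
}

int main(void)
{
	/* A strong handle_exception() in another object file linked into
	 * this binary would silently replace the weak one above. */
	return handle_exception(42);
}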