From ec4867355244755fb5c06037ad2fff760701b465 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:44 +0530 Subject: powerpc/powernv/idle: Decouple Timebase restore & Per-core SPRs restore On POWER8, in case of - nap: both timebase and hypervisor state is retained. - fast-sleep: timebase is lost. But the hypervisor state is retained. - winkle: timebase and hypervisor state is lost. Hence, the current code for handling exit from a idle state assumes that if the timebase value is retained, then so is the hypervisor state. Thus, the current code doesn't restore per-core hypervisor state in such cases. But that is no longer the case on POWER9 where we do have stop states in which timebase value is retained, but the hypervisor state is lost. So we have to ensure that the per-core hypervisor state gets restored in such cases. Fix this by ensuring that even in the case when timebase is retained, we explicitly check if we are waking up from a deep stop that loses per-core hypervisor state (indicated by cr4 being eq or gt), and if this is the case, we restore the per-core hypervisor state. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 4898d676dcae..afd029f1039b 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -731,13 +731,14 @@ timebase_resync: * Use cr3 which indicates that we are waking up with atleast partial * hypervisor state loss to determine if TIMEBASE RESYNC is needed. */ - ble cr3,clear_lock + ble cr3,.Ltb_resynced /* Time base re-sync */ bl opal_resync_timebase; /* - * If waking up from sleep, per core state is not lost, skip to - * clear_lock. + * If waking up from sleep (POWER8), per core state + * is not lost, skip to clear_lock. */ +.Ltb_resynced: blt cr4,clear_lock /* -- cgit v1.2.3 From cb0be7ec03077a31712183bfbe7801061e2966b8 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:45 +0530 Subject: powerpc/powernv/idle: Restore LPCR on wakeup from deep-stop On wakeup from a deep stop state which is supposed to lose the hypervisor state, we don't restore the LPCR to the old value but set it to a "sane" value via cur_cpu_spec->cpu_restore(). The problem is that the "sane" value doesn't include UPRT and the HR bits which are required to run correctly in Radix mode. Fix this on POWER9 onwards by restoring the LPCR value whatever it was before executing the stop instruction. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index afd029f1039b..6c9920d9221c 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -31,6 +31,7 @@ * registers for winkle support. 
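 *
 * Editor's sketch (illustrative, not part of the patch) of how these
 * slots pair up: a value is parked before entering a deep idle state
 * and pulled back on wakeup, e.g. for the _LPCR slot this patch adds:
 *
 *	mfspr	r3,SPRN_LPCR		(save_sprs_to_stack, pre-stop)
 *	std	r3,_LPCR(r1)
 *	...
 *	ld	r4,_LPCR(r1)		(wakeup from a deep stop)
 *	mtspr	SPRN_LPCR,r4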
*/ #define _SDR1 GPR3 +#define _PTCR GPR3 #define _RPR GPR4 #define _SPURR GPR5 #define _PURR GPR6 @@ -39,7 +40,7 @@ #define _AMOR GPR9 #define _WORT GPR10 #define _WORC GPR11 -#define _PTCR GPR12 +#define _LPCR GPR12 #define PSSCR_EC_ESL_MASK_SHIFTED (PSSCR_EC | PSSCR_ESL) >> 16 @@ -55,12 +56,14 @@ save_sprs_to_stack: * here since any thread in the core might wake up first */ BEGIN_FTR_SECTION - mfspr r3,SPRN_PTCR - std r3,_PTCR(r1) /* * Note - SDR1 is dropped in Power ISA v3. Hence not restoring * SDR1 here */ + mfspr r3,SPRN_PTCR + std r3,_PTCR(r1) + mfspr r3,SPRN_LPCR + std r3,_LPCR(r1) FTR_SECTION_ELSE mfspr r3,SPRN_SDR1 std r3,_SDR1(r1) @@ -813,6 +816,10 @@ no_segments: mtctr r12 bctrl +BEGIN_FTR_SECTION + ld r4,_LPCR(r1) + mtspr SPRN_LPCR,r4 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: mtspr SPRN_SRR1,r16 -- cgit v1.2.3 From 22c6663dc69a042a7b4158f162582b4b1ba7a4b7 Mon Sep 17 00:00:00 2001 From: "Gautham R. Shenoy" Date: Tue, 16 May 2017 14:19:47 +0530 Subject: powerpc/powernv/idle: Use Requested Level for restoring state on P9 DD1 On Power9 DD1 due to a hardware bug the Power-Saving Level Status field (PLS) of the PSSCR for a thread waking up from a deep state can under-report if some other thread in the core is in a shallow stop state. The scenario in which this can manifest is as follows: 1) All the threads of the core are in deep stop. 2) One of the threads is woken up. The PLS for this thread will correctly reflect that it is waking up from deep stop. 3) The thread that has woken up now executes a shallow stop. 4) When some other thread in the core is woken, its PLS will reflect the shallow stop state. Thus, the subsequent thread for which the PLS is under-reporting the wakeup state will not restore the hypervisor resources. Hence, on DD1 systems, use the Requested Level (RL) field as a workaround to restore the contents of the hypervisor resources on the wakeup from the stop state. Signed-off-by: Gautham R. Shenoy Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/paca.h | 2 ++ arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/idle_book3s.S | 13 ++++++++++++- 3 files changed, 15 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 1c09f8fe2ee8..77f60a0f1405 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -177,6 +177,8 @@ struct paca_struct { * to the sibling threads' paca. 
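 *
 * (Editor's recap, in C-ish pseudocode, of the flow this patch adds;
 * the real code below is asm: power9_idle_stop() records the PSSCR it
 * is about to write,
 *
 *	paca->requested_psscr = psscr;	- i.e. std r3,PACA_REQ_PSSCR(r13)
 *
 * and the DD1 wakeup path then trusts the Requested Level,
 * requested_psscr & 0xf, instead of the under-reporting PLS field.)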
*/ struct paca_struct **thread_sibling_pacas; + /* The PSSCR value that the kernel requested before going to stop */ + u64 requested_psscr; #endif #ifdef CONFIG_PPC_STD_MMU_64 diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 709e23425317..e15c178ba079 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -742,6 +742,7 @@ int main(void) OFFSET(PACA_THREAD_MASK, paca_struct, thread_mask); OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); + OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 6c9920d9221c..98a6d07ecb5c 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -379,6 +379,7 @@ _GLOBAL(power9_idle_stop) mfspr r5,SPRN_PSSCR andc r5,r5,r4 or r3,r3,r5 + std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 LOAD_REG_ADDR(r5,power_enter_stop) li r4,1 @@ -498,12 +499,22 @@ pnv_restore_hyp_resource_arch300: LOAD_REG_ADDRBASE(r5,pnv_first_deep_stop_state) ld r4,ADDROFF(pnv_first_deep_stop_state)(r5) - mfspr r5,SPRN_PSSCR +BEGIN_FTR_SECTION_NESTED(71) + /* + * Assume that we are waking up from the state + * same as the Requested Level (RL) in the PSSCR + * which are Bits 60-63 + */ + ld r5,PACA_REQ_PSSCR(r13) + rldicl r5,r5,0,60 +FTR_SECTION_ELSE_NESTED(71) /* * 0-3 bits correspond to Power-Saving Level Status * which indicates the idle state we are waking up from */ + mfspr r5, SPRN_PSSCR rldicl r5,r5,4,60 +ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_POWER9_DD1, 71) cmpd cr4,r5,r4 bge cr4,pnv_wakeup_tb_loss /* returns to caller */ -- cgit v1.2.3 From 6e2f03e292ef46eed2b31b0a344a91d514f9cd81 Mon Sep 17 00:00:00 2001 From: Ivan Mikhaylov Date: Fri, 19 May 2017 18:47:05 +0300 Subject: powerpc/[booke|4xx]: Don't clobber TCR[WP] when setting TCR[DIE] Prevent a kernel panic caused by unintentionally clearing TCR watchdog bits. At this point in the kernel boot, the watchdog may have already been enabled by u-boot. The original code's attempt to write to the TCR register results in an inadvertent clearing of the watchdog configuration bits, causing the 476 to reset. Signed-off-by: Ivan Mikhaylov Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/time.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 2b33cfaac7b8..60714b8c9a2f 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -739,12 +739,20 @@ static int __init get_freq(char *name, int cells, unsigned long *val) static void start_cpu_decrementer(void) { #if defined(CONFIG_BOOKE) || defined(CONFIG_40x) + unsigned int tcr; + /* Clear any pending timer interrupts */ mtspr(SPRN_TSR, TSR_ENW | TSR_WIS | TSR_DIS | TSR_FIS); - /* Enable decrementer interrupt */ - mtspr(SPRN_TCR, TCR_DIE); -#endif /* defined(CONFIG_BOOKE) || defined(CONFIG_40x) */ + tcr = mfspr(SPRN_TCR); + /* + * The watchdog may have already been enabled by u-boot. So leave + * TRC[WP] (Watchdog Period) alone. 
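 *
 * Worked example (editor's illustration, assumed starting value): say
 * u-boot armed its watchdog with TCR[WP] = 0b11; the old unconditional
 *
 *	mtspr(SPRN_TCR, TCR_DIE);
 *
 * rewrote WP to 0b00 and provoked the reset described above, while
 *
 *	mtspr(SPRN_TCR, (mfspr(SPRN_TCR) & TCR_WP_MASK) | TCR_DIE);
 *
 * (the three statements below, folded into one) leaves TCR[WP]
 * untouched.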
+ */ + tcr &= TCR_WP_MASK; /* Clear all bits except for TCR[WP] */ + tcr |= TCR_DIE; /* Enable decrementer */ + mtspr(SPRN_TCR, tcr); +#endif } void __init generic_calibrate_decr(void) -- cgit v1.2.3 From e8c688251d0e8baca1cd68992c9ef4078a0361c8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 01:56:48 +1000 Subject: powerpc/64: Place sfpr section explicitly with the linker script Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vmlinux.lds.S | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 2f793be3d2b1..bcfda21c3179 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -115,6 +115,14 @@ SECTIONS KPROBES_TEXT IRQENTRY_TEXT SOFTIRQENTRY_TEXT + /* + * -Os builds call FP save/restore functions. The powerpc64 + * linker generates those on demand in the .sfpr section. + * .sfpr gets placed at the beginning of a group of input + * sections, which can break start-of-text offset if it is + * included with the main text sections, so put it by itself. + */ + *(.sfpr); MEM_KEEP(init.text) MEM_KEEP(exit.text) -- cgit v1.2.3 From 951eedebcdea06fdcc742c82dc347509ce0e1ba4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 29 May 2017 17:39:40 +1000 Subject: powerpc/64: Handle linker stubs in low .text code Very large kernels may require linker stubs for branches from HEAD text code. The linker may place these stubs before the HEAD text sections, which breaks the assumption that HEAD text is located at 0 (or the .text section being located at 0x7000/0x8000 on Book3S kernels). Provide an option to create a small section just before the .text section with an empty 256 - 4 bytes, and adjust the start of the .text section to match. The linker will tend to put stubs in that section and not break our relative-to-absolute offset assumptions. This causes a small waste of space on common kernels, but allows large kernels to build and boot. For now, it is an EXPERT config option, defaulting to =n, but a reference is provided for it in the build-time check for such breakage. This is good enough for allyesconfig and custom users / hackers. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 11 +++++++++++ arch/powerpc/include/asm/head-64.h | 18 ++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 5 +++++ 3 files changed, 34 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d090275ace44..0153275e9f75 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -455,6 +455,17 @@ config PPC_TRANSACTIONAL_MEM ---help--- Support user-mode Transactional Memory on POWERPC. +config LD_HEAD_STUB_CATCH + bool "Reserve 256 bytes to cope with linker stubs in HEAD text" if EXPERT + depends on PPC64 + default n + help + Very large kernels can cause linker branch stubs to be generated by + code in head_64.S, which moves the head text sections out of their + specified location. This option can work around the problem. + + If unsure, say "N". + config DISABLE_MPROFILE_KERNEL bool "Disable use of mprofile-kernel for kernel tracing" depends on PPC64 && CPU_LITTLE_ENDIAN diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 86eb87382031..68828aa6e056 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -63,11 +63,29 @@ . 
= 0x0; \ start_##sname: +/* + * .linker_stub_catch section is used to catch linker stubs from being + * inserted in our .text section, above the start_text label (which breaks + * the ABS_ADDR calculation). See kernel/vmlinux.lds.S and tools/head_check.sh + * for more details. We would prefer to just keep a cacheline (0x80), but + * 0x100 seems to be how the linker aligns branch stub groups. + */ +#ifdef CONFIG_LD_HEAD_STUB_CATCH +#define OPEN_TEXT_SECTION(start) \ + .section ".linker_stub_catch","ax",@progbits; \ +linker_stub_catch: \ + . = 0x4; \ + text_start = (start) + 0x100; \ + .section ".text","ax",@progbits; \ + .balign 0x100; \ +start_text: +#else #define OPEN_TEXT_SECTION(start) \ text_start = (start); \ .section ".text","ax",@progbits; \ . = 0x0; \ start_text: +#endif #define ZERO_FIXED_SECTION(sname, start, end) \ sname##_start = (start); \ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index bcfda21c3179..be8578e02ab6 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -103,6 +103,11 @@ SECTIONS * section placement to work. */ .text BLOCK(0) : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_LD_HEAD_STUB_CATCH + *(.linker_stub_catch); + . = . ; +#endif + #else .text : AT(ADDR(.text) - LOAD_OFFSET) { ALIGN_FUNCTION(); -- cgit v1.2.3 From c494adefef9fcd0de172132e20f102d44c62fa2f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 03:40:39 +1000 Subject: powerpc/64: Tool to check head sections location sanity Use a tool to check that the location of "fixed sections" are where we expected them to be, which catches cases the linker script can't (stubs being added to start of .text section), and which ends up being neater. Sample output: ERROR: start_text address is c000000000008100, should be c000000000008000 ERROR: see comments in arch/powerpc/tools/head_check.sh Signed-off-by: Nicholas Piggin [mpe: Fold in fix from Nick for 4.6 era toolchains] Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile.postlink | 8 +++- arch/powerpc/include/asm/head-64.h | 4 +- arch/powerpc/kernel/vmlinux.lds.S | 22 ----------- arch/powerpc/tools/head_check.sh | 78 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 87 insertions(+), 25 deletions(-) create mode 100644 arch/powerpc/tools/head_check.sh (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Makefile.postlink b/arch/powerpc/Makefile.postlink index 50336930e6f7..5db43ebbe2df 100644 --- a/arch/powerpc/Makefile.postlink +++ b/arch/powerpc/Makefile.postlink @@ -10,6 +10,9 @@ __archpost: -include include/config/auto.conf include scripts/Kbuild.include +quiet_cmd_head_check = CHKHEAD $@ + cmd_head_check = $(CONFIG_SHELL) $(srctree)/arch/powerpc/tools/head_check.sh "$(NM)" "$@" + quiet_cmd_relocs_check = CHKREL $@ ifdef CONFIG_PPC_BOOK3S_64 cmd_relocs_check = \ @@ -24,6 +27,9 @@ endif vmlinux: FORCE @true +ifdef CONFIG_PPC64 + $(call cmd,head_check) +endif ifdef CONFIG_RELOCATABLE $(call if_changed,relocs_check) endif @@ -32,7 +38,7 @@ endif @true clean: - @true + rm -f .tmp_symbols.txt PHONY += FORCE clean diff --git a/arch/powerpc/include/asm/head-64.h b/arch/powerpc/include/asm/head-64.h index 68828aa6e056..7ab95798f170 100644 --- a/arch/powerpc/include/asm/head-64.h +++ b/arch/powerpc/include/asm/head-64.h @@ -49,8 +49,8 @@ * CLOSE_FIXED_SECTION() or elsewhere, there may be something * unexpected being added there. Remove the '. = x_len' line, rebuild, and * check what is pushing the section down. 
- * - If the build dies in linking, check arch/powerpc/kernel/vmlinux.lds.S - * for instructions. + * - If the build dies in linking, check arch/powerpc/tools/head_check.sh + * comments. * - If the kernel crashes or hangs in very early boot, it could be linker * stubs at the start of the main text. */ diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index be8578e02ab6..e69155f0db36 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -58,7 +58,6 @@ SECTIONS #ifdef CONFIG_PPC64 KEEP(*(.head.text.first_256B)); #ifdef CONFIG_PPC_BOOK3E -# define END_FIXED 0x100 #else KEEP(*(.head.text.real_vectors)); *(.head.text.real_trampolines); @@ -66,12 +65,8 @@ SECTIONS *(.head.text.virt_trampolines); # if defined(CONFIG_PPC_PSERIES) || defined(CONFIG_PPC_POWERNV) KEEP(*(.head.data.fwnmi_page)); -# define END_FIXED 0x8000 -# else -# define END_FIXED 0x7000 # endif #endif - ASSERT((. == END_FIXED), "vmlinux.lds.S: fixed section overflow error"); #else /* !CONFIG_PPC64 */ HEAD_TEXT #endif @@ -79,23 +74,6 @@ SECTIONS __head_end = .; - /* - * If the build dies here, it's likely code in head_64.S is referencing - * labels it can't reach, and the linker inserting stubs without the - * assembler's knowledge. To debug, remove the above assert and - * rebuild. Look for branch stubs in the fixed section region. - * - * Linker stub generation could be allowed in "trampoline" - * sections if absolutely necessary, but this would require - * some rework of the fixed sections. Before resorting to this, - * consider references that have sufficient addressing range, - * (e.g., hand coded trampolines) so the linker does not have - * to add stubs. - * - * Linker stubs at the top of the main text section are currently not - * detected, and will result in a crash at boot due to offsets being - * wrong. - */ #ifdef CONFIG_PPC64 /* * BLOCK(0) overrides the default output section alignment because diff --git a/arch/powerpc/tools/head_check.sh b/arch/powerpc/tools/head_check.sh new file mode 100644 index 000000000000..ad9e57209aa4 --- /dev/null +++ b/arch/powerpc/tools/head_check.sh @@ -0,0 +1,78 @@ +# Copyright © 2016 IBM Corporation + +# This program is free software; you can redistribute it and/or +# modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation; either version +# 2 of the License, or (at your option) any later version. + +# This script checks the head of a vmlinux for linker stubs that +# break our placement of fixed-location code for 64-bit. + +# based on relocs_check.pl +# Copyright © 2009 IBM Corporation + +# NOTE! +# +# If the build dies here, it's likely code in head_64.S/exception-64*.S or +# nearby, is branching to labels it can't reach directly, which results in the +# linker inserting branch stubs. This can move code around in ways that break +# the fixed section calculations (head-64.h). To debug this, disassemble the +# vmlinux and look for branch stubs (long_branch, plt_branch, etc.) in the +# fixed section region (0 - 0x8000ish). Check what code is calling those stubs, +# and perhaps change so a direct branch can reach. +# +# A ".linker_stub_catch" section is used to catch some stubs generated by +# early .text code, which tend to get placed at the start of the section. +# If there are too many such stubs, they can overflow this section. Expanding +# it may help (or reducing the number of stub branches). 
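#
# (Editor's aside, an assumed workflow rather than part of this check: a
# quick way to see whether the linker emitted stubs at all is something
# like
#
#	objdump -d vmlinux | grep -E 'long_branch|plt_branch' | head
#
# and then checking whether any of them landed inside the fixed sections.)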
+# +# Linker stubs use the TOC pointer, so even if fixed section code could +# tolerate them being inserted into head code, they can't be allowed in low +# level entry code (boot, interrupt vectors, etc) until r2 is set up. This +# could cause the kernel to die in early boot. + +# Turn this on if you want more debug output: +# set -x + +if [ $# -lt 2 ]; then + echo "$0 [path to nm] [path to vmlinux]" 1>&2 + exit 1 +fi + +# Have Kbuild supply the path to nm so we handle cross compilation. +nm="$1" +vmlinux="$2" + +# gcc-4.6-era toolchain make _stext an A (absolute) symbol rather than T +$nm "$vmlinux" | grep -e " [TA] _stext$" -e " t start_first_256B$" -e " a text_start$" -e " t start_text$" -m4 > .tmp_symbols.txt + + +vma=$(cat .tmp_symbols.txt | grep -e " [TA] _stext$" | cut -d' ' -f1) + +expected_start_head_addr=$vma + +start_head_addr=$(cat .tmp_symbols.txt | grep " t start_first_256B$" | cut -d' ' -f1) + +if [ "$start_head_addr" != "$expected_start_head_addr" ]; then + echo "ERROR: head code starts at $start_head_addr, should be $expected_start_head_addr" + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + + exit 1 +fi + +top_vma=$(echo $vma | cut -d'0' -f1) + +expected_start_text_addr=$(cat .tmp_symbols.txt | grep " a text_start$" | cut -d' ' -f1 | sed "s/^0/$top_vma/") + +start_text_addr=$(cat .tmp_symbols.txt | grep " t start_text$" | cut -d' ' -f1) + +if [ "$start_text_addr" != "$expected_start_text_addr" ]; then + echo "ERROR: start_text address is $start_text_addr, should be $expected_start_text_addr" + echo "ERROR: try to enable LD_HEAD_STUB_CATCH config option" + echo "ERROR: see comments in arch/powerpc/tools/head_check.sh" + + exit 1 +fi + +rm -f .tmp_symbols.txt -- cgit v1.2.3 From 83a092cf95f28696ddc36c8add0cf03ac034897f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 12 May 2017 03:40:40 +1000 Subject: powerpc: Link warning for orphan sections Add --orphan-handling=warn to final link flags. This ensures we can handle all sections explicitly. This would have caught subtle breakage such as 7de3b27bac47da9de08409df1d69664acbb72197 at build-time. Also bring existing orphan sections into the fold: - .text.hot and .text.unlikely are compiler generated sections. - .sdata2, .dynsbss, .plt are used by PPC32 - We previously did not specify DWARF_DEBUG or STABS_DEBUG - DWARF_DEBUG did not include all DWARF sections that can be emitted - A number of sections are unused and can be discarded. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/Makefile | 1 + arch/powerpc/kernel/vmlinux.lds.S | 16 ++++++++++++++-- include/asm-generic/vmlinux.lds.h | 12 ++++++++++++ 3 files changed, 27 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Makefile b/arch/powerpc/Makefile index eaa1865e4a8d..8d4ed73d5490 100644 --- a/arch/powerpc/Makefile +++ b/arch/powerpc/Makefile @@ -98,6 +98,7 @@ endif LDFLAGS_vmlinux-y := -Bstatic LDFLAGS_vmlinux-$(CONFIG_RELOCATABLE) := -pie LDFLAGS_vmlinux := $(LDFLAGS_vmlinux-y) +LDFLAGS_vmlinux += $(call ld-option,--orphan-handling=warn) ifeq ($(CONFIG_PPC64),y) ifeq ($(call cc-option-yn,-mcmodel=medium),y) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index e69155f0db36..ace6b6579961 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -91,7 +91,7 @@ SECTIONS ALIGN_FUNCTION(); #endif /* careful! 
__ftr_alt_* sections need to be close to .text */ - *(.text .fixup __ftr_alt_* .ref.text) + *(.text.hot .text .text.fixup .text.unlikely .fixup __ftr_alt_* .ref.text); SCHED_TEXT CPUIDLE_TEXT LOCK_TEXT @@ -258,7 +258,9 @@ SECTIONS .data : AT(ADDR(.data) - LOAD_OFFSET) { DATA_DATA *(.sdata) + *(.sdata2) *(.got.plt) *(.got) + *(.plt) } #else .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -321,6 +323,16 @@ SECTIONS _end = . ; PROVIDE32 (end = .); - /* Sections to be discarded. */ + STABS_DEBUG + + DWARF_DEBUG + DISCARDS + /DISCARD/ : { + *(*.EMB.apuinfo) + *(.glink .iplt .plt .rela* .comment) + *(.gnu.version*) + *(.gnu.attributes) + *(.eh_frame) + } } diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 314a0b9219c6..9862afb3ae05 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -595,6 +595,7 @@ #define SBSS(sbss_align) \ . = ALIGN(sbss_align); \ .sbss : AT(ADDR(.sbss) - LOAD_OFFSET) { \ + *(.dynsbss) \ *(.sbss) \ *(.scommon) \ } @@ -641,11 +642,22 @@ .debug_str 0 : { *(.debug_str) } \ .debug_loc 0 : { *(.debug_loc) } \ .debug_macinfo 0 : { *(.debug_macinfo) } \ + .debug_pubtypes 0 : { *(.debug_pubtypes) } \ + /* DWARF 3 */ \ + .debug_ranges 0 : { *(.debug_ranges) } \ /* SGI/MIPS DWARF 2 extensions */ \ .debug_weaknames 0 : { *(.debug_weaknames) } \ .debug_funcnames 0 : { *(.debug_funcnames) } \ .debug_typenames 0 : { *(.debug_typenames) } \ .debug_varnames 0 : { *(.debug_varnames) } \ + /* GNU DWARF 2 extensions */ \ + .debug_gnu_pubnames 0 : { *(.debug_gnu_pubnames) } \ + .debug_gnu_pubtypes 0 : { *(.debug_gnu_pubtypes) } \ + /* DWARF 4 */ \ + .debug_types 0 : { *(.debug_types) } \ + /* DWARF 5 */ \ + .debug_macro 0 : { *(.debug_macro) } \ + .debug_addr 0 : { *(.debug_addr) } /* Stabs debugging sections. */ #define STABS_DEBUG \ -- cgit v1.2.3 From 362957c27ed0d9ff485d3266ed22d944cbfea6cc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 5 Aug 2016 13:28:05 +0200 Subject: powerpc/40x: Clear MSR_DR in one insn instead of two Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/misc_32.S | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 84db14e435f5..3f7a9a2d2435 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -244,8 +244,7 @@ _GLOBAL(_nmask_and_or_msr) */ _GLOBAL(real_readb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync @@ -262,8 +261,7 @@ _GLOBAL(real_readb) */ _GLOBAL(real_writeb) mfmsr r7 - ori r0,r7,MSR_DR - xori r0,r0,MSR_DR + rlwinm r0,r7,0,~MSR_DR sync mtmsr r0 sync -- cgit v1.2.3 From 45cb08f4791ce6a15c54598b4cb73db4b4b8294f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Thu, 16 Mar 2017 09:55:45 +0100 Subject: powerpc: Handle simultaneous interrupts at once It often happens to have simultaneous interrupts, for instance when having double Ethernet attachment. With the current implementation, we suffer the cost of kernel entry/exit for each interrupt. This patch introduces a loop in __do_irq() to handle all interrupts at once before returning. 
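Condensed from the diff below, the handling loop becomes:

	irq = ppc_md.get_irq();
	if (unlikely(!irq))
		__this_cpu_inc(irq_stat.spurious_irqs);
	else
		do {
			generic_handle_irq(irq);
			irq = ppc_md.get_irq();	/* anything else pending? */
		} while (irq);

so two back-to-back interrupts cost a single kernel entry/exit.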
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df30fe3..ab2ed9afd3c2 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -481,7 +481,11 @@ void __do_irq(struct pt_regs *regs) if (unlikely(!irq)) __this_cpu_inc(irq_stat.spurious_irqs); else - generic_handle_irq(irq); + do { + generic_handle_irq(irq); + + irq = ppc_md.get_irq(); + } while (irq); trace_irq_exit(regs); -- cgit v1.2.3 From 98b8cd7f75643e0a442d7a4c1cef2c9d53b7e92b Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Sat, 27 May 2017 17:46:15 +0200 Subject: powerpc/fadump: Return error when fadump registration fails - log an error message when registration fails and no error code listed in the switch is returned - translate the hv error code to posix error code and return it from fw_register - return the posix error code from fw_register to the process writing to sysfs - return EEXIST on re-registration - return success on deregistration when fadump is not registered - return ENODEV when no memory is reserved for fadump Signed-off-by: Michal Suchanek Tested-by: Hari Bathini [mpe: Use pr_err() to shrink the error print] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 466569e26278..8a5d0e029b93 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -377,9 +377,9 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); -static void register_fw_dump(struct fadump_mem_struct *fdm) +static int register_fw_dump(struct fadump_mem_struct *fdm) { - int rc; + int rc, err; unsigned int wait_time; pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -396,7 +396,11 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) } while (wait_time); + err = -EIO; switch (rc) { + default: + pr_err("Failed to register. Unknown Error(%d).\n", rc); + break; case -1: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Hardware Error(%d).\n", rc); @@ -404,18 +408,22 @@ static void register_fw_dump(struct fadump_mem_struct *fdm) case -3: printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); + err = -EINVAL; break; case -9: printk(KERN_ERR "firmware-assisted kernel dump is already " " registered."); fw_dump.dump_registered = 1; + err = -EEXIST; break; case 0: printk(KERN_INFO "firmware-assisted kernel dump registration" " is successful\n"); fw_dump.dump_registered = 1; + err = 0; break; } + return err; } void crash_fadump(struct pt_regs *regs, const char *str) @@ -956,7 +964,7 @@ static unsigned long init_fadump_header(unsigned long addr) return addr; } -static void register_fadump(void) +static int register_fadump(void) { unsigned long addr; void *vaddr; @@ -966,7 +974,7 @@ static void register_fadump(void) * assisted dump. */ if (!fw_dump.reserve_dump_area_size) - return; + return -ENODEV; fadump_setup_crash_memory_ranges(); @@ -979,7 +987,7 @@ static void register_fadump(void) fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. 
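 *
 * (Editor's note, illustrative: with the return value propagated all
 * the way out, a failing
 *
 *	echo 1 > /sys/kernel/fadump_registered
 *
 * now reports the mapped error to the writing process, e.g. EEXIST
 * when the dump is already registered, instead of silently
 * "succeeding".)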
*/ - register_fw_dump(&fdm); + return register_fw_dump(&fdm); } static int fadump_unregister_dump(struct fadump_mem_struct *fdm) @@ -1161,7 +1169,6 @@ static ssize_t fadump_register_store(struct kobject *kobj, switch (buf[0]) { case '0': if (fw_dump.dump_registered == 0) { - ret = -EINVAL; goto unlock_out; } /* Un-register Firmware-assisted dump */ @@ -1169,11 +1176,11 @@ static ssize_t fadump_register_store(struct kobject *kobj, break; case '1': if (fw_dump.dump_registered == 1) { - ret = -EINVAL; + ret = -EEXIST; goto unlock_out; } /* Register Firmware-assisted dump */ - register_fadump(); + ret = register_fadump(); break; default: ret = -EINVAL; -- cgit v1.2.3 From 81d9eca502fcc360950ef476124626d97856e139 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 22 May 2017 15:04:23 +0530 Subject: powerpc/fadump: Add a warning when 'fadump_reserve_mem=' is used With commit 11550dc0a00b ("powerpc/fadump: reuse crashkernel parameter for fadump memory reservation"), 'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter. Add a warning if 'fadump_reserve_mem=' is still used. Fixes: 11550dc0a00b ("powerpc/fadump: reuse crashkernel parameter for fadump memory reservation") Suggested-by: Prarit Bhargava Signed-off-by: Hari Bathini [mpe: Unsplit long printk strings] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 8a5d0e029b93..1bdbe0b257e0 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -212,6 +212,9 @@ static inline unsigned long fadump_calculate_reserve_size(void) int ret; unsigned long long base, size; + if (fw_dump.reserve_bootvar) + pr_warn("'fadump_reserve_mem=' parameter is deprecated in favor of 'crashkernel=' parameter.\n"); + /* * Check if the size is specified through crashkernel= cmdline * option. If yes, then use that but ignore base as fadump @@ -220,8 +223,17 @@ static inline unsigned long fadump_calculate_reserve_size(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + if (fw_dump.reserve_bootvar) + pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); + fw_dump.reserve_bootvar = (unsigned long)size; return fw_dump.reserve_bootvar; + } else if (fw_dump.reserve_bootvar) { + /* + * 'fadump_reserve_mem=' is being used to reserve memory + * for firmware-assisted dump. + */ + return fw_dump.reserve_bootvar; } /* divide by 20 to get 5% of value */ @@ -377,6 +389,19 @@ static int __init early_fadump_param(char *p) } early_param("fadump", early_fadump_param); +/* + * Look for fadump_reserve_mem= cmdline option + * TODO: Remove references to 'fadump_reserve_mem=' parameter, + * the sooner 'crashkernel=' parameter is accustomed to. 
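 *
 * e.g. (editor's illustration) these two command lines now reserve the
 * same 4G for fadump, the second one with a deprecation warning:
 *
 *	fadump=on crashkernel=4G
 *	fadump=on fadump_reserve_mem=4G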
+ */ +static int __init early_fadump_reserve_mem(char *p) +{ + if (p) + fw_dump.reserve_bootvar = memparse(p, &p); + return 0; +} +early_param("fadump_reserve_mem", early_fadump_reserve_mem); + static int register_fw_dump(struct fadump_mem_struct *fdm) { int rc, err; -- cgit v1.2.3 From e7467dc6947d7074417aa4cda44b851010fd0795 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Mon, 22 May 2017 15:04:47 +0530 Subject: powerpc/fadump: Update comment about offset where fadump is reserved With commit f6e6bedb7731 ("powerpc/fadump: Reserve memory at an offset closer to bottom of RAM"), memory for fadump is no longer reserved at the top of RAM. But there are still a few places which say so. Change them appropriately. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- Documentation/powerpc/firmware-assisted-dump.txt | 4 ++-- arch/powerpc/kernel/fadump.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/Documentation/powerpc/firmware-assisted-dump.txt b/Documentation/powerpc/firmware-assisted-dump.txt index 9cabaf8a207e..bdd344aa18d9 100644 --- a/Documentation/powerpc/firmware-assisted-dump.txt +++ b/Documentation/powerpc/firmware-assisted-dump.txt @@ -61,8 +61,8 @@ as follows: boot successfully. For syntax of crashkernel= parameter, refer to Documentation/kdump/kdump.txt. If any offset is provided in crashkernel= parameter, it will be ignored - as fadump reserves memory at end of RAM for boot memory - dump preservation in case of a crash. + as fadump uses a predefined offset to reserve memory + for boot memory dump preservation in case of a crash. -- After the low memory (boot memory) area has been saved, the firmware will reset PCI and other hardware state. It will diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 1bdbe0b257e0..1ec6ea67159c 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -217,8 +217,8 @@ static inline unsigned long fadump_calculate_reserve_size(void) /* * Check if the size is specified through crashkernel= cmdline - * option. If yes, then use that but ignore base as fadump - * reserves memory at end of RAM. + * option. If yes, then use that but ignore base as fadump reserves + * memory at a predefined offset. */ ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); -- cgit v1.2.3 From 48a316e350974739235c234430ec0e129f864a43 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 2 Jun 2017 13:00:27 +0530 Subject: powerpc/fadump: Set an upper limit for boot memory size By default, 5% of system RAM is reserved for preserving boot memory. Alternatively, a user can specify the amount of memory to reserve. See Documentation/powerpc/firmware-assisted-dump.txt for details. In addition to the memory reserved for preserving boot memory, some more memory is reserved, to save HPTE region, CPU state data and ELF core headers. 
Memory Reservation during first kernel looks like below: Low memory Top of memory 0 boot memory size | | | |<--Reserved dump area -->| V V | Permanent Reservation V +-----------+----------/ /----------+---+----+-----------+----+ | | |CPU|HPTE| DUMP |ELF | +-----------+----------/ /----------+---+----+-----------+----+ | ^ | | \ / ------------------------------------------- Boot memory content gets transferred to reserved area by firmware at the time of crash This implicitly means that the sum of the sizes of boot memory, CPU state data, HPTE region, DUMP preserving area and ELF core headers can't be greater than the total memory size. But currently, a user is allowed to specify any value as boot memory size. So, the above rule is violated when a boot memory size around 50% of the total available memory is specified. As the kernel is not handling this currently, it may lead to undefined behavior. Fix it by setting an upper limit for boot memory size to 25% of the total available memory. Also, instead of using memblock_end_of_DRAM(), which doesn't take the holes, if any, in the memory layout into account, use memblock_phys_mem_size() to calculate the percentage of total available memory. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/fadump.h | 3 +++ arch/powerpc/kernel/fadump.c | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index 60b91084f33c..a3de219073af 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,9 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) /* Firmware provided dump sections */ diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 1ec6ea67159c..12837d52e84a 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -223,10 +223,24 @@ static inline unsigned long fadump_calculate_reserve_size(void) ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), &size, &base); if (ret == 0 && size > 0) { + unsigned long max_size; + if (fw_dump.reserve_bootvar) pr_info("Using 'crashkernel=' parameter for memory reservation.\n"); fw_dump.reserve_bootvar = (unsigned long)size; + + /* + * Adjust if the boot memory size specified is above + * the upper limit. 
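 *
 * (Editor's worked example: on a 64GB machine with MAX_BOOT_MEM_RATIO
 * of 4, max_size = 64GB / 4 = 16GB, so a "crashkernel=32G" request is
 * clamped to 16GB and the adjustment is logged.)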
+ */ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + return fw_dump.reserve_bootvar; } else if (fw_dump.reserve_bootvar) { /* @@ -237,7 +251,7 @@ static inline unsigned long fadump_calculate_reserve_size(void) } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFFFFFUL; -- cgit v1.2.3 From 90df4bfb4d9e00a1ab6885900b808bef2b62a21c Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 29 May 2017 16:26:44 +1000 Subject: powerpc/64s: Machine check handle ifetch from foreign real address for POWER9 The i-side 0111b machine check, which is "Instruction Fetch to foreign address space", was missed by 7b9f71f974 ("powerpc/64s: POWER9 machine check handler"). The POWER9 processor core considers host real addresses with a nonzero value in RA(8:12) as foreign address space, accessible only by the copy and paste instructions. The copy and paste instruction pair can be used to invoke the Nest accelerators via the Virtual Accelerator Switchboard (VAS). It is an error for any regular load/store or ifetch to go to a foreign addresses. When relocation is on, this causes an MMU exception. When relocation is off, a machine check exception. It is possible to trigger this machine check by branching to a foreign address with MSR[IR]=0. Fixes: 7b9f71f974a1 ("powerpc/64s: POWER9 machine check handler") Reported-by: Mahesh Salgaonkar Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/mce.h | 15 ++++++++------- arch/powerpc/kernel/mce.c | 1 + arch/powerpc/kernel/mce_power.c | 3 +++ 3 files changed, 12 insertions(+), 7 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/mce.h b/arch/powerpc/include/asm/mce.h index 81eff8631434..190d69a7f701 100644 --- a/arch/powerpc/include/asm/mce.h +++ b/arch/powerpc/include/asm/mce.h @@ -90,13 +90,14 @@ enum MCE_UserErrorType { enum MCE_RaErrorType { MCE_RA_ERROR_INDETERMINATE = 0, MCE_RA_ERROR_IFETCH = 1, - MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 2, - MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 3, - MCE_RA_ERROR_LOAD = 4, - MCE_RA_ERROR_STORE = 5, - MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 6, - MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 7, - MCE_RA_ERROR_LOAD_STORE_FOREIGN = 8, + MCE_RA_ERROR_IFETCH_FOREIGN = 2, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH = 3, + MCE_RA_ERROR_PAGE_TABLE_WALK_IFETCH_FOREIGN = 4, + MCE_RA_ERROR_LOAD = 5, + MCE_RA_ERROR_STORE = 6, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE = 7, + MCE_RA_ERROR_PAGE_TABLE_WALK_LOAD_STORE_FOREIGN = 8, + MCE_RA_ERROR_LOAD_STORE_FOREIGN = 9, }; enum MCE_LinkErrorType { diff --git a/arch/powerpc/kernel/mce.c b/arch/powerpc/kernel/mce.c index 5f9eada3519b..92f185875694 100644 --- a/arch/powerpc/kernel/mce.c +++ b/arch/powerpc/kernel/mce.c @@ -268,6 +268,7 @@ void machine_check_print_event_info(struct machine_check_event *evt, static const char *mc_ra_types[] = { "Indeterminate", "Instruction fetch (bad)", + "Instruction fetch (foreign)", "Page table walk ifetch (bad)", "Page table walk ifetch (foreign)", "Load (bad)", diff --git a/arch/powerpc/kernel/mce_power.c b/arch/powerpc/kernel/mce_power.c index f913139bb0c2..d24e689e893f 100644 --- a/arch/powerpc/kernel/mce_power.c +++ b/arch/powerpc/kernel/mce_power.c @@ -236,6 +236,9 @@ static 
const struct mce_ierror_table mce_p9_ierror_table[] = { { 0x00000000081c0000, 0x0000000000180000, true, MCE_ERROR_TYPE_UE, MCE_UE_ERROR_PAGE_TABLE_WALK_IFETCH, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, +{ 0x00000000081c0000, 0x00000000001c0000, true, + MCE_ERROR_TYPE_RA, MCE_RA_ERROR_IFETCH_FOREIGN, + MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, { 0x00000000081c0000, 0x0000000008000000, true, MCE_ERROR_TYPE_LINK,MCE_LINK_ERROR_IFETCH_TIMEOUT, MCE_INITIATOR_CPU, MCE_SEV_ERROR_SYNC, }, -- cgit v1.2.3 From 0edc2ca9ccc9df99f7a94b6407ae2a0ff27d86b2 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 15 Jun 2017 16:20:46 +1000 Subject: Revert "powerpc: Handle simultaneous interrupts at once" This reverts commit 45cb08f4791ce6a15c54598b4cb73db4b4b8294f. For some reason this is causing IRQ problems on Freescale Book3E machines, eg on my p5020ds: irq 25: nobody cared (try booting with the "irqpoll" option) CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc3-gcc-6.3.1-00037-g45cb08f4791c #624 Call Trace: [c0000000fffdbb10] [c00000000049962c] .dump_stack+0xa8/0xe8 (unreliable) [c0000000fffdbba0] [c0000000000babf4] .__report_bad_irq+0x54/0x140 [c0000000fffdbc40] [c0000000000bb11c] .note_interrupt+0x324/0x380 [c0000000fffdbd00] [c0000000000b7110] .handle_irq_event_percpu+0x68/0x88 [c0000000fffdbd90] [c0000000000b718c] .handle_irq_event+0x5c/0xa8 [c0000000fffdbe10] [c0000000000bc01c] .handle_fasteoi_irq+0xe4/0x298 [c0000000fffdbe90] [c0000000000b59c4] .generic_handle_irq+0x50/0x74 [c0000000fffdbf10] [c0000000000075d8] .__do_irq+0x74/0x1f0 [c0000000fffdbf90] [c0000000000189f8] .call_do_irq+0x14/0x24 [c0000000f7173060] [c0000000000077e4] .do_IRQ+0x90/0x120 [c0000000f7173100] [c00000000001d93c] exc_0x500_common+0xfc/0x100 --- interrupt: 501 at .prepare_to_wait_event+0xc/0x14c LR = .fsl_elbc_run_command+0xc8/0x23c [c0000000f71734d0] [c00000000065f418] .nand_reset+0xb8/0x168 [c0000000f7173560] [c00000000065fec4] .nand_scan_ident+0x2b0/0x1638 [c0000000f7173650] [c000000000666cd8] .fsl_elbc_nand_probe+0x34c/0x5f0 ata2: SATA link up 1.5 Gbps (SStatus 113 SControl 300) [c0000000f7173750] [c0000000005a3c60] .platform_drv_probe+0x64/0xb0 [c0000000f71737d0] [c0000000005a12e0] .really_probe+0x290/0x334 [c0000000f7173870] [c0000000005a14a0] .__driver_attach+0x11c/0x120 [c0000000f7173900] [c00000000059e6a0] .bus_for_each_dev+0x98/0xfc [c0000000f71739a0] [c0000000005a0b3c] .driver_attach+0x34/0x4c [c0000000f7173a20] [c0000000005a04b0] .bus_add_driver+0x1ac/0x2e0 [c0000000f7173ac0] [c0000000005a2170] .driver_register+0x94/0x160 [c0000000f7173b40] [c0000000005a3be0] .__platform_driver_register+0x60/0x7c [c0000000f7173bc0] [c000000000d6aab4] .fsl_elbc_nand_driver_init+0x24/0x38 [c0000000f7173c30] [c000000000001934] .do_one_initcall+0x68/0x1b8 [c0000000f7173d00] [c000000000d210f8] .kernel_init_freeable+0x260/0x338 [c0000000f7173db0] [c0000000000021b0] .kernel_init+0x20/0xe70 [c0000000f7173e30] [c0000000000009bc] .ret_from_kernel_thread+0x58/0x9c handlers: [] .fsl_lbc_ctrl_irq Disabling IRQ #25 Ben also had concerns with the implementation being potentially slow on some PICs, so revert it for now. 
Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/irq.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ab2ed9afd3c2..5c291df30fe3 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -481,11 +481,7 @@ void __do_irq(struct pt_regs *regs) if (unlikely(!irq)) __this_cpu_inc(irq_stat.spurious_irqs); else - do { - generic_handle_irq(irq); - - irq = ppc_md.get_irq(); - } while (irq); + generic_handle_irq(irq); trace_irq_exit(regs); -- cgit v1.2.3 From acd7d8cef01537062e318143d700357d5a92bd6b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:35:04 +1000 Subject: powerpc/64s: Optimize hypercall/syscall entry After bc3551257a ("powerpc/64: Allow for relocation-on interrupts from guest to host"), a getppid() system call goes from 307 cycles to 358 cycles (+17%) on POWER8. This is due significantly to the scratch SPR used by the hypercall check. It turns out there are a some volatile registers common to both system call and hypercall (in particular, r12, cr0, ctr), which can be used to avoid the SPR and some other overheads. This brings getppid to 320 cycles (+4%). Testing hcall entry performance by running "sc 1" in guest userspace before this patch is 854 cycles, afterwards is 826. Also a small win there. POWER9 syscall is improved by about the same amount, hcall not tested. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 134 +++++++++++++++++++++++++---------- 1 file changed, 97 insertions(+), 37 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ae418b85c17c..2f700a15bfa3 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -821,46 +821,80 @@ EXC_VIRT(trap_0b, 0x4b00, 0x100, 0xb00) TRAMP_KVM(PACA_EXGEN, 0xb00) EXC_COMMON(trap_0b_common, 0xb00, unknown_exception) +/* + * system call / hypercall (0xc00, 0x4c00) + * + * The system call exception is invoked with "sc 0" and does not alter HV bit. + * There is support for kernel code to invoke system calls but there are no + * in-tree users. + * + * The hypercall is invoked with "sc 1" and sets HV=1. + * + * In HPT, sc 1 always goes to 0xc00 real mode. In RADIX, sc 1 can go to + * 0x4c00 virtual mode. + * + * Call convention: + * + * syscall register convention is in Documentation/powerpc/syscall64-abi.txt + * + * For hypercalls, the register convention is as follows: + * r0 volatile + * r1-2 nonvolatile + * r3 volatile parameter and return value for status + * r4-r10 volatile input and output value + * r11 volatile hypercall number and output value + * r12 volatile + * r13-r31 nonvolatile + * LR nonvolatile + * CTR volatile + * XER volatile + * CR0-1 CR5-7 volatile + * CR2-4 nonvolatile + * Other registers nonvolatile + * + * The intersection of volatile registers that don't contain possible + * inputs is: r12, cr0, xer, ctr. We may use these as scratch regs + * upon entry without saving. + */ #ifdef CONFIG_KVM_BOOK3S_64_HANDLER - /* - * If CONFIG_KVM_BOOK3S_64_HANDLER is set, save the PPR (on systems - * that support it) before changing to HMT_MEDIUM. That allows the KVM - * code to save that value into the guest state (it is the guest's PPR - * value). Otherwise just change to HMT_MEDIUM as userspace has - * already saved the PPR. 
- */ + /* + * There is a little bit of juggling to get syscall and hcall + * working well. Save r10 in ctr to be restored in case it is a + * hcall. + * + * Userspace syscalls have already saved the PPR, hcalls must save + * it before setting HMT_MEDIUM. + */ #define SYSCALL_KVMTEST \ - SET_SCRATCH0(r13); \ + mr r12,r13; \ GET_PACA(r13); \ - std r9,PACA_EXGEN+EX_R9(r13); \ - OPT_GET_SPR(r9, SPRN_PPR, CPU_FTR_HAS_PPR); \ + mtctr r10; \ + KVMTEST_PR(0xc00); /* uses r10, branch to do_kvm_0xc00_system_call */ \ HMT_MEDIUM; \ - std r10,PACA_EXGEN+EX_R10(r13); \ - OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r9, CPU_FTR_HAS_PPR); \ - mfcr r9; \ - KVMTEST_PR(0xc00); \ - GET_SCRATCH0(r13) + mr r9,r12; \ #else #define SYSCALL_KVMTEST \ - HMT_MEDIUM + HMT_MEDIUM; \ + mr r9,r13; \ + GET_PACA(r13); #endif #define LOAD_SYSCALL_HANDLER(reg) \ __LOAD_HANDLER(reg, system_call_common) -/* Syscall routine is used twice, in reloc-off and reloc-on paths */ -#define SYSCALL_PSERIES_1 \ +#define SYSCALL_FASTENDIAN_TEST \ BEGIN_FTR_SECTION \ cmpdi r0,0x1ebe ; \ beq- 1f ; \ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ - mr r9,r13 ; \ - GET_PACA(r13) ; \ - mfspr r11,SPRN_SRR0 ; \ -0: -#define SYSCALL_PSERIES_2_RFID \ +/* + * After SYSCALL_KVMTEST, we reach here with PACA in r13, r13 in r9, + * and HMT_MEDIUM. + */ +#define SYSCALL_REAL \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ LOAD_SYSCALL_HANDLER(r10) ; \ mtspr SPRN_SRR0,r10 ; \ @@ -869,11 +903,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ rfid ; \ b . ; /* prevent speculative execution */ -#define SYSCALL_PSERIES_3 \ +#define SYSCALL_FASTENDIAN \ /* Fast LE/BE switch system call */ \ 1: mfspr r12,SPRN_SRR1 ; \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ + mr r13,r9 ; \ rfid ; /* return to userspace */ \ b . ; /* prevent speculative execution */ @@ -882,16 +917,18 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ * We can't branch directly so we do it via the CTR which * is volatile across system calls. */ -#define SYSCALL_PSERIES_2_DIRECT \ - LOAD_SYSCALL_HANDLER(r12) ; \ - mtctr r12 ; \ +#define SYSCALL_VIRT \ + LOAD_SYSCALL_HANDLER(r10) ; \ + mtctr r10 ; \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; \ bctr ; #else /* We can branch directly */ -#define SYSCALL_PSERIES_2_DIRECT \ +#define SYSCALL_VIRT \ + mfspr r11,SPRN_SRR0 ; \ mfspr r12,SPRN_SRR1 ; \ li r10,MSR_RI ; \ mtmsrd r10,1 ; /* Set RI (EE=0) */ \ @@ -899,20 +936,43 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ #endif EXC_REAL_BEGIN(system_call, 0xc00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_RFID - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_REAL + SYSCALL_FASTENDIAN EXC_REAL_END(system_call, 0xc00, 0x100) EXC_VIRT_BEGIN(system_call, 0x4c00, 0x100) - SYSCALL_KVMTEST - SYSCALL_PSERIES_1 - SYSCALL_PSERIES_2_DIRECT - SYSCALL_PSERIES_3 + SYSCALL_KVMTEST /* loads PACA into r13, and saves r13 to r9 */ + SYSCALL_FASTENDIAN_TEST + SYSCALL_VIRT + SYSCALL_FASTENDIAN EXC_VIRT_END(system_call, 0x4c00, 0x100) -TRAMP_KVM(PACA_EXGEN, 0xc00) +#ifdef CONFIG_KVM_BOOK3S_64_HANDLER + /* + * This is a hcall, so register convention is as above, with these + * differences: + * r13 = PACA + * r12 = orig r13 + * ctr = orig r10 + */ +TRAMP_KVM_BEGIN(do_kvm_0xc00) + /* + * Save the PPR (on systems that support it) before changing to + * HMT_MEDIUM. That allows the KVM code to save that value into the + * guest state (it is the guest's PPR value). 
+ */ + OPT_GET_SPR(r0, SPRN_PPR, CPU_FTR_HAS_PPR) + HMT_MEDIUM + OPT_SAVE_REG_TO_PACA(PACA_EXGEN+EX_PPR, r0, CPU_FTR_HAS_PPR) + mfctr r10 + SET_SCRATCH0(r12) + std r9,PACA_EXGEN+EX_R9(r13) + mfcr r9 + std r10,PACA_EXGEN+EX_R10(r13) + KVM_HANDLER(PACA_EXGEN, EXC_STD, 0xc00) +#endif EXC_REAL(single_step, 0xd00, 0x100) -- cgit v1.2.3 From bc4f65e4cf9d6cc43e0e9ba0b8648cf9201cd55f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:35:05 +1000 Subject: powerpc/64: Avoid restore_math call if possible in syscall exit The syscall exit code that branches to restore_math is quite heavy on Book3S, consisting of 2 mtmsr instructions. Threads that don't use both FP and vector can get caught here if the kernel ever uses FP or vector. Lazy-FP/vec context switching also trips this case. So check for lazy FP and vector before switching RI for restore_math. Move most of this case out of line. For threads that do want to restore math registers, the MSR switches are still suboptimal. Future direction may be to use a soft-RI bit to avoid MSR switches in kernel (similar to soft-EE), but for now at least the no-restore POWER9 context switch rate increases by about 5% due to sched_yield(2) return performance. I haven't constructed a test to measure the syscall cost. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 62 +++++++++++++++++++++++++++++------------- arch/powerpc/kernel/process.c | 4 +++ 2 files changed, 47 insertions(+), 19 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index bfbad08a1207..6f70ea821a07 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -210,27 +210,17 @@ system_call: /* label this so stack traces look sane */ andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) bne- syscall_exit_work - andi. r0,r8,MSR_FP - beq 2f + /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ + li r7,MSR_FP #ifdef CONFIG_ALTIVEC - andis. r0,r8,MSR_VEC@h - bne 3f + oris r7,r7,MSR_VEC@h #endif -2: addi r3,r1,STACK_FRAME_OVERHEAD -#ifdef CONFIG_PPC_BOOK3S - li r10,MSR_RI - mtmsrd r10,1 /* Restore RI */ -#endif - bl restore_math -#ifdef CONFIG_PPC_BOOK3S - li r11,0 - mtmsrd r11,1 -#endif - ld r8,_MSR(r1) - ld r3,RESULT(r1) - li r11,-MAX_ERRNO + and r0,r8,r7 + cmpd r0,r7 + bne syscall_restore_math +.Lsyscall_restore_math_cont: -3: cmpld r3,r11 + cmpld r3,r11 ld r5,_CCR(r1) bge- syscall_error .Lsyscall_error_cont: @@ -263,7 +253,41 @@ syscall_error: neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont - + +syscall_restore_math: + /* + * Some initial tests from restore_math to avoid the heavyweight + * C code entry and MSR manipulations. + */ + LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK) + and. 
r0,r0,r8 + bne 1f + + ld r7,PACACURRENT(r13) + lbz r0,THREAD+THREAD_LOAD_FP(r7) +#ifdef CONFIG_ALTIVEC + lbz r6,THREAD+THREAD_LOAD_VEC(r7) + add r0,r0,r6 +#endif + cmpdi r0,0 + beq .Lsyscall_restore_math_cont + +1: addi r3,r1,STACK_FRAME_OVERHEAD +#ifdef CONFIG_PPC_BOOK3S + li r10,MSR_RI + mtmsrd r10,1 /* Restore RI */ +#endif + bl restore_math +#ifdef CONFIG_PPC_BOOK3S + li r11,0 + mtmsrd r11,1 +#endif + /* Restore volatiles, reload MSR from updated one */ + ld r8,_MSR(r1) + ld r3,RESULT(r1) + li r11,-MAX_ERRNO + b .Lsyscall_restore_math_cont + /* Traced system call support */ syscall_dotrace: bl save_nvgprs diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index baae104b16c7..5cbb8b1faf7e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -511,6 +511,10 @@ void restore_math(struct pt_regs *regs) { unsigned long msr; + /* + * Syscall exit makes a similar initial check before branching + * to restore_math. Keep them in synch. + */ if (!msr_tm_active(regs->msr) && !current->thread.load_fp && !loadvec(current->thread)) return; -- cgit v1.2.3 From e4c0fc5f72bca11432297168338aef46c12793a4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:06 +1000 Subject: powerpc/64s: Leave interrupts hard enabled in context switch for radix Commit 4387e9ff25 ("[POWERPC] Fix PMU + soft interrupt disable bug") hard disabled interrupts over the low level context switch, because the SLB management can't cope with a PMU interrupt accesing the stack in that window. Radix based kernel mapping does not use the SLB so it does not require interrupts hard disabled here. This is worth 1-2% in context switch performance on POWER9. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 8 ++++++++ arch/powerpc/kernel/process.c | 14 ++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 6f70ea821a07..91f9fdc2d027 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -607,6 +607,14 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_1T_SEGMENT) top of the kernel stack. */ addi r7,r7,THREAD_SIZE-SWITCH_FRAME_SIZE + /* + * PMU interrupts in radix may come in here. They will use r1, not + * PACAKSAVE, so this stack switch will not cause a problem. They + * will store to the process stack, which may then be migrated to + * another CPU. However the rq lock release on this CPU paired with + * the rq lock acquire on the new CPU before the stack becomes + * active on the new CPU, will order those stores. + */ mr r1,r8 /* start using new stack pointer */ std r7,PACAKSAVE(r13) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 5cbb8b1faf7e..45faa9a32a01 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1199,12 +1199,14 @@ struct task_struct *__switch_to(struct task_struct *prev, __switch_to_tm(prev, new); - /* - * We can't take a PMU exception inside _switch() since there is a - * window where the kernel stack SLB and the kernel stack are out - * of sync. Hard disable here. - */ - hard_irq_disable(); + if (!radix_enabled()) { + /* + * We can't take a PMU exception inside _switch() since there + * is a window where the kernel stack SLB and the kernel stack + * are out of sync. Hard disable here. + */ + hard_irq_disable(); + } /* * Call restore_sprs() before calling _switch(). 
If we move it after -- cgit v1.2.3 From 837e72f78a72ef43a0c5e179f3addadb2a225f80 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:07 +1000 Subject: powerpc/64: Drop reservation-clearing ldarx in context switch There is no need to explicitly break the reservation in _switch, because we are guaranteed that the context switch path will include a larx/stcx. Comment the guarantee and remove the reservation clear from _switch. This is worth 1-2% in context switch performance. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 91f9fdc2d027..273a35926534 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -521,15 +521,10 @@ _GLOBAL(_switch) #endif /* CONFIG_SMP */ /* - * If we optimise away the clear of the reservation in system - * calls because we know the CPU tracks the address of the - * reservation, then we need to clear it here to cover the - * case that the kernel context switch path has no larx - * instructions. + * The kernel context switch path must contain a spin_lock, + * which contains larx/stcx, which will clear any reservation + * of the task being switched. */ -BEGIN_FTR_SECTION - ldarx r6,0,r1 -END_FTR_SECTION_IFSET(CPU_FTR_STCX_CHECKS_ADDRESS) BEGIN_FTR_SECTION /* -- cgit v1.2.3 From 9145effd626d155484f73db24ab3e142ecda31db Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:08 +1000 Subject: powerpc/64: Drop explicit hwsync in context switch The sync (aka. hwsync, aka. heavyweight sync) in the context switch code to prevent MMIO access being reordered from the point of view of a single process if it gets migrated to a different CPU is not required because there is an hwsync performed earlier in the context switch path. Comment this so it's clear enough if anything changes on the scheduler or the powerpc sides. Remove the hwsync from _switch. This improves context switch performance by 2-3% on POWER8. Signed-off-by: Nicholas Piggin Acked-by: Peter Zijlstra (Intel) Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/barrier.h | 5 +++++ arch/powerpc/kernel/entry_64.S | 23 +++++++++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index c0deafc212b8..25d42bd3f114 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -74,6 +74,11 @@ do { \ ___p1; \ }) +/* + * This must resolve to hwsync on SMP for the context switch path. + * See _switch, and core scheduler context switch memory ordering + * comments. + */ #define smp_mb__before_spinlock() smp_mb() #include diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 273a35926534..fb143859cc68 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -512,13 +512,24 @@ _GLOBAL(_switch) std r23,_CCR(r1) std r1,KSP(r3) /* Set old stack pointer */ -#ifdef CONFIG_SMP - /* We need a sync somewhere here to make sure that if the - * previous task gets rescheduled on another CPU, it sees all - * stores it has performed on this one. + /* + * On SMP kernels, care must be taken because a task may be + * scheduled off CPUx and on to CPUy. Memory ordering must be + * considered. 
+ * + * Cacheable stores on CPUx will be visible when the task is + * scheduled on CPUy by virtue of the core scheduler barriers + * (see "Notes on Program-Order guarantees on SMP systems." in + * kernel/sched/core.c). + * + * Uncacheable stores in the case of involuntary preemption must + * be taken care of. The smp_mb__before_spin_lock() in __schedule() + * is implemented as hwsync on powerpc, which orders MMIO too. So + * long as there is an hwsync in the context switch path, it will + * be executed on the source CPU after the task has performed + * all MMIO ops on that CPU, and on the destination CPU before the + * task performs any MMIO ops there. */ - sync -#endif /* CONFIG_SMP */ /* * The kernel context switch path must contain a spin_lock, -- cgit v1.2.3 From 07d2a628bc0008f90754ac7982289f6cb0f46cf8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Fri, 9 Jun 2017 01:36:09 +1000 Subject: powerpc/64s: Avoid cpabort in context switch when possible The ISA v3.0B copy-paste facility only requires cpabort when switching to a process that has foreign real addresses mapped (direct access to accelerators), to clear a potential copy buffer filled by a previous thread. There is no accelerator driver implemented yet, so cpabort can be removed. It can be be re-added when a driver is implemented. POWER9 DD1 requires the copy buffer to always be cleared on context switch, but if accelerators are not in use, then an unpaired copy from a dummy region is sufficient to clear data out of the copy buffer. This increases context switch performance by about 5% on POWER9. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/ppc-opcode.h | 8 ++++---- arch/powerpc/kernel/entry_64.S | 9 --------- arch/powerpc/kernel/process.c | 27 ++++++++++++++++++++++++++- 3 files changed, 30 insertions(+), 14 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 3a8d278e7421..3b6bbf5a8683 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -189,8 +189,7 @@ /* sorted alphabetically */ #define PPC_INST_BHRBE 0x7c00025c #define PPC_INST_CLRBHRB 0x7c00035c -#define PPC_INST_COPY 0x7c00060c -#define PPC_INST_COPY_FIRST 0x7c20060c +#define PPC_INST_COPY 0x7c20060c #define PPC_INST_CP_ABORT 0x7c00068c #define PPC_INST_DCBA 0x7c0005ec #define PPC_INST_DCBA_MASK 0xfc0007fe @@ -223,8 +222,7 @@ #define PPC_INST_MSGSNDP 0x7c00011c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 -#define PPC_INST_PASTE 0x7c00070c -#define PPC_INST_PASTE_LAST 0x7c20070d +#define PPC_INST_PASTE 0x7c20070d #define PPC_INST_POPCNTB 0x7c0000f4 #define PPC_INST_POPCNTB_MASK 0xfc0007fe #define PPC_INST_POPCNTD 0x7c0003f4 @@ -392,6 +390,8 @@ /* Deal with instructions that older assemblers aren't aware of */ #define PPC_CP_ABORT stringify_in_c(.long PPC_INST_CP_ABORT) +#define PPC_COPY(a, b) stringify_in_c(.long PPC_INST_COPY | \ + ___PPC_RA(a) | ___PPC_RB(b)) #define PPC_DCBAL(a, b) stringify_in_c(.long PPC_INST_DCBAL | \ __PPC_RA(a) | __PPC_RB(b)) #define PPC_DCBZL(a, b) stringify_in_c(.long PPC_INST_DCBZL | \ diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fb143859cc68..da9486e2fd89 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -536,15 +536,6 @@ _GLOBAL(_switch) * which contains larx/stcx, which will clear any reservation * of the task being switched. 
*/ - -BEGIN_FTR_SECTION -/* - * A cp_abort (copy paste abort) here ensures that when context switching, a - * copy from one process can't leak into the paste of another. - */ - PPC_CP_ABORT -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - #ifdef CONFIG_PPC_BOOK3S /* Cancel all explict user streams as they will have no use after context * switch and will stop the HW from creating streams itself diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 45faa9a32a01..6273b5d5baec 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1137,6 +1137,11 @@ static inline void restore_sprs(struct thread_struct *old_thread, #endif } +#ifdef CONFIG_PPC_BOOK3S_64 +#define CP_SIZE 128 +static const u8 dummy_copy_buffer[CP_SIZE] __attribute__((aligned(CP_SIZE))); +#endif + struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *new) { @@ -1226,8 +1231,28 @@ struct task_struct *__switch_to(struct task_struct *prev, batch->active = 1; } - if (current_thread_info()->task->thread.regs) + if (current_thread_info()->task->thread.regs) { restore_math(current_thread_info()->task->thread.regs); + + /* + * The copy-paste buffer can only store into foreign real + * addresses, so unprivileged processes can not see the + * data or use it in any way unless they have foreign real + * mappings. We don't have a VAS driver that allocates those + * yet, so no cpabort is required. + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + /* + * DD1 allows paste into normal system memory, so we + * do an unpaired copy here to clear the buffer and + * prevent a covert channel being set up. + * + * cpabort is not used because it is quite expensive. + */ + asm volatile(PPC_COPY(%0, %1) + : : "r"(dummy_copy_buffer), "r"(0)); + } + } #endif /* CONFIG_PPC_STD_MMU_64 */ return last; -- cgit v1.2.3 From 2201f994a5742c03e660623c385fd6897dd1fa2f Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:45 +1000 Subject: powerpc/64s/idle: Move soft interrupt mask logic into C code This simplifies the asm and fixes irq-off tracing over sleep instructions. Also move powersave_nap check for POWER8 into C code, and move PSSCR register value calculation for POWER9 into C. Reviewed-by: Gautham R. 
Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hw_irq.h | 3 ++ arch/powerpc/include/asm/machdep.h | 1 + arch/powerpc/include/asm/processor.h | 10 ++-- arch/powerpc/kernel/idle_book3s.S | 82 ++++++-------------------------- arch/powerpc/kernel/irq.c | 33 ++++++++++++- arch/powerpc/platforms/powernv/idle.c | 71 ++++++++++++++++++++++++--- arch/powerpc/platforms/powernv/smp.c | 2 - arch/powerpc/platforms/powernv/subcore.c | 3 +- drivers/cpuidle/cpuidle-powernv.c | 12 ++--- 9 files changed, 128 insertions(+), 89 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index eba60416536e..f06112cf8734 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -129,6 +129,9 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) } extern bool prep_irq_for_idle(void); +extern bool prep_irq_for_idle_irqsoff(void); + +#define fini_irq_for_idle_irqsoff() trace_hardirqs_off(); extern void force_external_irq_replay(void); diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h index f90b22c722e1..cd2fc1cc1cc7 100644 --- a/arch/powerpc/include/asm/machdep.h +++ b/arch/powerpc/include/asm/machdep.h @@ -226,6 +226,7 @@ struct machdep_calls { extern void e500_idle(void); extern void power4_idle(void); extern void power7_idle(void); +extern void power9_idle(void); extern void ppc6xx_idle(void); extern void book3e_idle(void); diff --git a/arch/powerpc/include/asm/processor.h b/arch/powerpc/include/asm/processor.h index a2123f291ab0..c49165a7439c 100644 --- a/arch/powerpc/include/asm/processor.h +++ b/arch/powerpc/include/asm/processor.h @@ -481,11 +481,11 @@ extern unsigned long cpuidle_disable; enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF}; extern int powersave_nap; /* set if nap mode can be used in idle loop */ -extern unsigned long power7_nap(int check_irq); -extern unsigned long power7_sleep(void); -extern unsigned long power7_winkle(void); -extern unsigned long power9_idle_stop(unsigned long stop_psscr_val, - unsigned long stop_psscr_mask); +extern unsigned long power7_idle_insn(unsigned long type); /* PNV_THREAD_NAP/etc*/ +extern void power7_idle_type(unsigned long type); +extern unsigned long power9_idle_stop(unsigned long psscr_val); +extern void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask); extern void flush_instruction_cache(void); extern void hard_reset_now(void); diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 98a6d07ecb5c..35cf5bb7daed 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -109,13 +109,9 @@ core_idle_lock_held: /* * Pass requested state in r3: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 - * - Requested STOP state in POWER9 + * - Requested PSSCR value in POWER9 * - * To check IRQ_HAPPENED in r4 - * 0 - don't check - * 1 - check - * - * Address to 'rfid' to in r5 + * Address of idle handler to 'rfid' to in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -131,30 +127,7 @@ pnv_powersave_common: std r0,_LINK(r1) std r0,_NIP(r1) - /* Hard disable interrupts */ - mfmsr r9 - rldicl r9,r9,48,1 - rotldi r9,r9,16 - mtmsrd r9,1 /* hard-disable interrupts */ - - /* Check if something happened while soft-disabled */ - lbz r0,PACAIRQHAPPENED(r13) - andi. 
r0,r0,~PACA_IRQ_HARD_DIS@l - beq 1f - cmpwi cr0,r4,0 - beq 1f - addi r1,r1,INT_FRAME_SIZE - ld r0,16(r1) - li r3,0 /* Return 0 (no nap) */ - mtlr r0 - blr - -1: /* We mark irqs hard disabled as this is the state we'll - * be in when returning and we need to tell arch_local_irq_restore() - * about it - */ - li r0,PACA_IRQ_HARD_DIS - stb r0,PACAIRQHAPPENED(r13) + mfmsr r9 /* We haven't lost state ... yet */ li r0,0 @@ -163,8 +136,8 @@ pnv_powersave_common: /* Continue saving state */ SAVE_GPR(2, r1) SAVE_NVGPRS(r1) - mfcr r4 - std r4,_CCR(r1) + mfcr r5 + std r5,_CCR(r1) std r9,_MSR(r1) std r1,PACAR1(r13) @@ -178,7 +151,7 @@ pnv_powersave_common: li r6, MSR_RI andc r6, r9, r6 mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r5 + mtspr SPRN_SRR0, r4 mtspr SPRN_SRR1, r7 rfid @@ -322,35 +295,14 @@ lwarx_loop_stop: IDLE_STATE_ENTER_SEQ_NORET(PPC_STOP) -_GLOBAL(power7_idle) +/* + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired idle state (PNV_THREAD_NAP/SLEEP/WINKLE). + */ +_GLOBAL(power7_idle_insn) /* Now check if user or arch enabled NAP mode */ - LOAD_REG_ADDRBASE(r3,powersave_nap) - lwz r4,ADDROFF(powersave_nap)(r3) - cmpwi 0,r4,0 - beqlr - li r3, 1 - /* fall through */ - -_GLOBAL(power7_nap) - mr r4,r3 - li r3,PNV_THREAD_NAP - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_sleep) - li r3,PNV_THREAD_SLEEP - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) + LOAD_REG_ADDR(r4, pnv_enter_arch207_idle_mode) b pnv_powersave_common - /* No return */ - -_GLOBAL(power7_winkle) - li r3,PNV_THREAD_WINKLE - li r4,1 - LOAD_REG_ADDR(r5, pnv_enter_arch207_idle_mode) - b pnv_powersave_common - /* No return */ #define CHECK_HMI_INTERRUPT \ mfspr r0,SPRN_SRR1; \ @@ -372,17 +324,13 @@ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ 20: nop; /* - * r3 - The PSSCR value corresponding to the stop state. - * r4 - The PSSCR mask corrresonding to the stop state. + * Entered with MSR[EE]=0 and no soft-masked interrupts pending. + * r3 contains desired PSSCR register value. */ _GLOBAL(power9_idle_stop) - mfspr r5,SPRN_PSSCR - andc r5,r5,r4 - or r3,r3,r5 std r3, PACA_REQ_PSSCR(r13) mtspr SPRN_PSSCR,r3 - LOAD_REG_ADDR(r5,power_enter_stop) - li r4,1 + LOAD_REG_ADDR(r4,power_enter_stop) b pnv_powersave_common /* No return */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 5c291df30fe3..58dcac88bc79 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -322,7 +322,8 @@ bool prep_irq_for_idle(void) * First we need to hard disable to ensure no interrupt * occurs before we effectively enter the low power state */ - hard_irq_disable(); + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; /* * If anything happened while we were soft-disabled, @@ -347,6 +348,36 @@ bool prep_irq_for_idle(void) return true; } +/* + * This is for idle sequences that return with IRQs off, but the + * idle state itself wakes on interrupt. Tell the irq tracer that + * IRQs are enabled for the duration of idle so it does not get long + * off times. Must be paired with fini_irq_for_idle_irqsoff. 
+ */ +bool prep_irq_for_idle_irqsoff(void) +{ + WARN_ON(!irqs_disabled()); + + /* + * First we need to hard disable to ensure no interrupt + * occurs before we effectively enter the low power state + */ + __hard_irq_disable(); + local_paca->irq_happened |= PACA_IRQ_HARD_DIS; + + /* + * If anything happened while we were soft-disabled, + * we return now and do not enter the low power state. + */ + if (lazy_irq_pending()) + return false; + + /* Tell lockdep we are about to re-enable */ + trace_hardirqs_on(); + + return true; +} + /* * Force a replay of the external interrupt handler on this CPU. */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index 46946a587004..f875879ff1eb 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -23,6 +23,7 @@ #include #include #include +#include #include "powernv.h" #include "subcore.h" @@ -283,12 +284,68 @@ static DEVICE_ATTR(fastsleep_workaround_applyonce, 0600, show_fastsleep_workaround_applyonce, store_fastsleep_workaround_applyonce); +static unsigned long __power7_idle_type(unsigned long type) +{ + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return 0; + + ppc64_runlatch_off(); + srr1 = power7_idle_insn(type); + ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power7_idle_type(unsigned long type) +{ + __power7_idle_type(type); +} + +void power7_idle(void) +{ + if (!powersave_nap) + return; + + power7_idle_type(PNV_THREAD_NAP); +} + +static unsigned long __power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + unsigned long psscr; + unsigned long srr1; + + if (!prep_irq_for_idle_irqsoff()) + return 0; + + psscr = mfspr(SPRN_PSSCR); + psscr = (psscr & ~stop_psscr_mask) | stop_psscr_val; + + ppc64_runlatch_off(); + srr1 = power9_idle_stop(psscr); + ppc64_runlatch_on(); + + fini_irq_for_idle_irqsoff(); + + return srr1; +} + +void power9_idle_type(unsigned long stop_psscr_val, + unsigned long stop_psscr_mask) +{ + __power9_idle_type(stop_psscr_val, stop_psscr_mask); +} + /* * Used for ppc_md.power_save which needs a function with no parameters */ -static void power9_idle(void) +void power9_idle(void) { - power9_idle_stop(pnv_default_stop_val, pnv_default_stop_mask); + power9_idle_type(pnv_default_stop_val, pnv_default_stop_mask); } #ifdef CONFIG_HOTPLUG_CPU @@ -303,16 +360,17 @@ unsigned long pnv_cpu_offline(unsigned int cpu) u32 idle_states = pnv_get_supported_cpuidle_states(); if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + srr1 = __power9_idle_type(pnv_deepest_stop_psscr_val, pnv_deepest_stop_psscr_mask); } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); + srr1 = __power7_idle_type(PNV_THREAD_WINKLE); } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); + srr1 = __power7_idle_type(PNV_THREAD_SLEEP); } else if (idle_states & OPAL_PM_NAP_ENABLED) { - srr1 = power7_nap(1); + srr1 = __power7_idle_type(PNV_THREAD_NAP); } else { + ppc64_runlatch_off(); /* This is the fallback method. 
We emulate snooze */ while (!generic_check_cpu_restart(cpu)) { HMT_low(); @@ -320,6 +378,7 @@ unsigned long pnv_cpu_offline(unsigned int cpu) } srr1 = 0; HMT_medium(); + ppc64_runlatch_on(); } return srr1; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 4aff754b6f2c..f8752795decf 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -182,9 +182,7 @@ static void pnv_smp_cpu_kill_self(void) */ kvmppc_set_host_ipi(cpu, 0); - ppc64_runlatch_off(); srr1 = pnv_cpu_offline(cpu); - ppc64_runlatch_on(); /* * If the SRR1 value indicates that we woke up due to diff --git a/arch/powerpc/platforms/powernv/subcore.c b/arch/powerpc/platforms/powernv/subcore.c index 0babef11136f..d975d78188a9 100644 --- a/arch/powerpc/platforms/powernv/subcore.c +++ b/arch/powerpc/platforms/powernv/subcore.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -182,7 +183,7 @@ static void unsplit_core(void) cpu = smp_processor_id(); if (cpu_thread_in_core(cpu) != 0) { while (mfspr(SPRN_HID0) & mask) - power7_nap(0); + power7_idle_insn(PNV_THREAD_NAP); per_cpu(split_state, cpu).step = SYNC_STEP_UNSPLIT; return; diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index 45eaf06462ae..79152676f62b 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -73,9 +73,8 @@ static int nap_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - ppc64_runlatch_off(); - power7_idle(); - ppc64_runlatch_on(); + power7_idle_type(PNV_THREAD_NAP); + return index; } @@ -98,7 +97,8 @@ static int fastsleep_loop(struct cpuidle_device *dev, new_lpcr &= ~LPCR_PECE1; mtspr(SPRN_LPCR, new_lpcr); - power7_sleep(); + + power7_idle_type(PNV_THREAD_SLEEP); mtspr(SPRN_LPCR, old_lpcr); @@ -110,10 +110,8 @@ static int stop_loop(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index) { - ppc64_runlatch_off(); - power9_idle_stop(stop_psscr_table[index].val, + power9_idle_type(stop_psscr_table[index].val, stop_psscr_table[index].mask); - ppc64_runlatch_on(); return index; } -- cgit v1.2.3 From 771d4304d07f080b6ce751e12f3579cb012a1b22 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:47 +1000 Subject: powerpc/64s/idle: Process interrupts from system reset wakeup When the CPU wakes from low power state, it begins at the system reset interrupt with the exception that caused the wakeup encoded in SRR1. Today, powernv idle wakeup ignores the wakeup reason (except a special case for HMI), and the regular interrupt corresponding to the exception will fire after the idle wakeup exits. Change this to replay the interrupt from the idle wakeup before interrupts are hard-enabled. Test on POWER8 of context_switch selftests benchmark with polling idle disabled (e.g., always nap, giving cross-CPU IPIs) gives the following results: original wakeup direct Different threads, same core: 315k/s 264k/s Different cores: 235k/s 242k/s There is a slowdown for doorbell IPI (same core) case because system reset wakeup does not clear the message and the doorbell interrupt fires again needlessly. 
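For reference, the decode this patch implements can be sketched in C as
follows (illustrative only; SRR1_WAKEMASK_P8 comes from asm/reg.h, and
srr1_to_lazyirq[] is the table added below):

	/*
	 * SRR1[42:45] holds the wake reason on POWER8. In little-endian
	 * bit numbering that field is bits 21:18, hence the shift by 18.
	 */
	static unsigned int srr1_wake_idx(unsigned long srr1)
	{
		return (srr1 & SRR1_WAKEMASK_P8) >> 18;	/* mask 0x003c0000 */
	}

	/* e.g. srr1 = 0x00140000 (SRR1_WAKEDBELL): SRR1[42:45] = 0b0101,
	 * so idx = 5 and srr1_to_lazyirq[5] = PACA_IRQ_DBELL. */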
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/hw_irq.h | 1 + arch/powerpc/kernel/irq.c | 29 +++++++++++++++++++++++++++++ arch/powerpc/platforms/powernv/idle.c | 10 ++++++++-- 3 files changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index f06112cf8734..c1dd1929342d 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -130,6 +130,7 @@ static inline bool arch_irq_disabled_regs(struct pt_regs *regs) extern bool prep_irq_for_idle(void); extern bool prep_irq_for_idle_irqsoff(void); +extern void irq_set_pending_from_srr1(unsigned long srr1); #define fini_irq_for_idle_irqsoff() trace_hardirqs_off(); diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 58dcac88bc79..0bcec745a672 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -348,6 +348,7 @@ bool prep_irq_for_idle(void) return true; } +#ifdef CONFIG_PPC_BOOK3S /* * This is for idle sequences that return with IRQs off, but the * idle state itself wakes on interrupt. Tell the irq tracer that @@ -378,6 +379,34 @@ bool prep_irq_for_idle_irqsoff(void) return true; } +/* + * Take the SRR1 wakeup reason, index into this table to find the + * appropriate irq_happened bit. + */ +static const u8 srr1_to_lazyirq[0x10] = { + 0, 0, 0, + PACA_IRQ_DBELL, + 0, + PACA_IRQ_DBELL, + PACA_IRQ_DEC, + 0, + PACA_IRQ_EE, + PACA_IRQ_EE, + PACA_IRQ_HMI, + 0, 0, 0, 0, 0 }; + +void irq_set_pending_from_srr1(unsigned long srr1) +{ + unsigned int idx = (srr1 & SRR1_WAKEMASK_P8) >> 18; + + /* + * The 0 index (SRR1[42:45]=b0000) must always evaluate to 0, + * so this can be called unconditionally with srr1 wake reason. + */ + local_paca->irq_happened |= srr1_to_lazyirq[idx]; +} +#endif /* CONFIG_PPC_BOOK3S */ + /* * Force a replay of the external interrupt handler on this CPU. */ diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index f188d84d9c59..1028df82cd2f 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -302,7 +302,10 @@ static unsigned long __power7_idle_type(unsigned long type) void power7_idle_type(unsigned long type) { - __power7_idle_type(type); + unsigned long srr1; + + srr1 = __power7_idle_type(type); + irq_set_pending_from_srr1(srr1); } void power7_idle(void) @@ -337,7 +340,10 @@ static unsigned long __power9_idle_type(unsigned long stop_psscr_val, void power9_idle_type(unsigned long stop_psscr_val, unsigned long stop_psscr_mask) { - __power9_idle_type(stop_psscr_val, stop_psscr_mask); + unsigned long srr1; + + srr1 = __power9_idle_type(stop_psscr_val, stop_psscr_mask); + irq_set_pending_from_srr1(srr1); } /* -- cgit v1.2.3 From a9af97aa0a12c30178dd7ad9af8887d5b9c4647b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:48 +1000 Subject: powerpc/64s: msgclr when handling doorbell exceptions from system reset msgsnd doorbell exceptions are cleared when the doorbell interrupt is taken. However if a doorbell exception causes a system reset interrupt wake from power saving state, the message is not cleared. Processing the doorbell from the system reset interrupt requires msgclr to avoid taking the exception again. 
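In C terms the fix amounts to the following sketch (ppc_msgclr() is the
helper this patch adds to dbell.h; the call site shown is illustrative,
the real change is in the replay stubs added to exceptions-64s.S):

	/*
	 * Clear the pending server doorbell message before replaying the
	 * handler, so that re-enabling MSR[EE] does not take the same
	 * doorbell exception a second time.
	 */
	ppc_msgclr(PPC_DBELL_SERVER);
	/* ... then branch to the doorbell common handler ... */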
Testing this plus the previous wakup direct patch gives: original wakeup direct msgclr Different threads, same core: 315k/s 264k/s 345k/s Different cores: 235k/s 242k/s 242k/s Net speedup is +10% for same core, and +3% for different core. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/dbell.h | 13 +++++++++++++ arch/powerpc/include/asm/ppc-opcode.h | 3 +++ arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/exceptions-64s.S | 23 +++++++++++++++++++++-- 4 files changed, 38 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/dbell.h b/arch/powerpc/include/asm/dbell.h index f70cbfe0ec04..9f2ae0d25e15 100644 --- a/arch/powerpc/include/asm/dbell.h +++ b/arch/powerpc/include/asm/dbell.h @@ -56,6 +56,19 @@ static inline void ppc_msgsync(void) : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300)); } +static inline void _ppc_msgclr(u32 msg) +{ + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGCLR(%1), PPC_MSGCLRP(%1), %0) + : : "i" (CPU_FTR_HVMODE), "r" (msg)); +} + +static inline void ppc_msgclr(enum ppc_dbell type) +{ + u32 msg = PPC_DBELL_TYPE(type); + + _ppc_msgclr(msg); +} + #else /* CONFIG_PPC_BOOK3S */ #define PPC_DBELL_MSGTYPE PPC_DBELL diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h index 3b6bbf5a8683..4e2cf719c9b2 100644 --- a/arch/powerpc/include/asm/ppc-opcode.h +++ b/arch/powerpc/include/asm/ppc-opcode.h @@ -220,6 +220,7 @@ #define PPC_INST_MSGCLR 0x7c0001dc #define PPC_INST_MSGSYNC 0x7c0006ec #define PPC_INST_MSGSNDP 0x7c00011c +#define PPC_INST_MSGCLRP 0x7c00015c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 #define PPC_INST_PASTE 0x7c20070d @@ -409,6 +410,8 @@ ___PPC_RB(b)) #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ ___PPC_RB(b)) +#define PPC_MSGCLRP(b) stringify_in_c(.long PPC_INST_MSGCLRP | \ + ___PPC_RB(b)) #define PPC_POPCNTB(a, s) stringify_in_c(.long PPC_INST_POPCNTB | \ __PPC_RA(a) | __PPC_RS(s)) #define PPC_POPCNTD(a, s) stringify_in_c(.long PPC_INST_POPCNTD | \ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index e15c178ba079..9624851ca276 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -746,6 +746,7 @@ int main(void) #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); + DEFINE(PPC_DBELL_MSGTYPE, PPC_DBELL_MSGTYPE); #ifdef CONFIG_PPC_8xx DEFINE(VIRT_IMMR_BASE, (u64)__fix_to_virt(FIX_IMMR_BASE)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2f700a15bfa3..1752beefee69 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1611,6 +1611,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) bl kernel_bad_stack b 1b +/* + * When doorbell is triggered from system reset wakeup, the message is + * not cleared, so it would fire again when EE is enabled. + * + * When coming from local_irq_enable, there may be the same problem if + * we were hard disabled. + * + * Execute msgclr to clear pending exceptions before handling it. + */ +h_doorbell_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLR(3) + b h_doorbell_common + +doorbell_super_common_msgclr: + LOAD_REG_IMMEDIATE(r3, PPC_DBELL_MSGTYPE << (63-36)) + PPC_MSGCLRP(3) + b doorbell_super_common + /* * Called from arch_local_irq_enable when an interrupt needs * to be resent. 
r3 contains 0x500, 0x900, 0xa00 or 0xe80 to indicate @@ -1636,13 +1655,13 @@ _GLOBAL(__replay_interrupt) beq hardware_interrupt_common BEGIN_FTR_SECTION cmpwi r3,0xe80 - beq h_doorbell_common + beq h_doorbell_common_msgclr cmpwi r3,0xea0 beq h_virt_irq_common cmpwi r3,0xe60 beq hmi_exception_common FTR_SECTION_ELSE cmpwi r3,0xa00 - beq doorbell_super_common + beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) blr -- cgit v1.2.3 From b48bbb82e2b83537c500417d60218ad44446e572 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:49 +1000 Subject: powerpc/64s: Don't unbalance the return branch predictor in __replay_interrupt() The __replay_interrupt() code is branched to with bl, but the caller is returned to directly with rfid from the interrupt. Instead, rfid to a stub that returns to the caller with blr, which should keep the return branch predictor balanced. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 1752beefee69..cad3b4b82813 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1639,6 +1639,10 @@ doorbell_super_common_msgclr: * Note: While MSR:EE is off, we need to make sure that _MSR * in the generated frame has EE set to 1 or the exception * handler will not properly re-enable them. + * + * Note that we don't specify LR as the NIP (return address) for + * the interrupt because that would unbalance the return branch + * predictor. */ _GLOBAL(__replay_interrupt) /* We are going to jump to the exception common code which @@ -1646,7 +1650,7 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - mflr r11 + LOAD_REG_ADDR(r11, .L__replay_interrupt_return) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 @@ -1664,4 +1668,6 @@ FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) +.L__replay_interrupt_return: blr + -- cgit v1.2.3 From b51351e264009e890936af83b8d800b32034273d Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:50 +1000 Subject: powerpc/64s/idle: Branch to handler with virtual mode offset Have the system reset idle wakeup handlers branched to in real mode with the 0xc... kernel address applied. This allows simplifications of avoiding rfid when switching to virtual mode in the wakeup handler. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/exception-64s.h | 13 +++++++++++++ arch/powerpc/kernel/exceptions-64s.S | 6 ++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 183d73b6ed99..33473cbc0986 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -236,6 +236,19 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) #define kvmppc_interrupt kvmppc_interrupt_pr #endif +/* + * Branch to label using its 0xC000 address. This results in instruction + * address suitable for MSR[IR]=0 or 1, which allows relocation to be turned + * on using mtmsr rather than rfid. + * + * This could set the 0xc bits for !RELOCATABLE as an immediate, rather than + * load KBASE for a slight optimisation. 
+ */ +#define BRANCH_TO_C000(reg, label) \ + __LOAD_HANDLER(reg, label); \ + mtctr reg; \ + bctr + #ifdef CONFIG_RELOCATABLE #define BRANCH_TO_COMMON(reg, label) \ __LOAD_HANDLER(reg, label); \ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index cad3b4b82813..7807719ca855 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,7 +99,9 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. + * from nap/sleep/winkle, and branch to idle handler. The idle wakeup + * handler initially runs in real mode, but we branch to the 0xc000... + * address so we can turn on relocation with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ @@ -107,7 +109,7 @@ EXC_VIRT_NONE(0x4000, 0x100) rlwinm. r10,r10,47-31,30,31 ; \ beq- 1f ; \ cmpwi cr3,r10,2 ; \ - BRANCH_TO_COMMON(r10, system_reset_idle_common) ; \ + BRANCH_TO_C000(r10, system_reset_idle_common) ; \ 1: \ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206) #else -- cgit v1.2.3 From 9d29250136f60438fc0839871bae0a0e9cbbd47e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:51 +1000 Subject: powerpc/64s/idle: Avoid SRR usage in idle sleep/wake paths Idle code now always runs at the 0xc... effective address whether in real or virtual mode. This means rfid can be ditched, along with a lot of SRR manipulations. In the wakeup path, carry SRR1 around in r12. Use mtmsrd to change MSR states as required. This also balances the return prediction for the idle call, by doing blr rather than rfid to return to the idle caller. On POWER9, 2-process context switch on different cores, with snooze disabled, increases performance by 2%. Signed-off-by: Nicholas Piggin [mpe: Incorporate v2 fixes from Nick] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 1 + arch/powerpc/kernel/idle_book3s.S | 57 ++++++++++++++++----------------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 12 +++++-- 3 files changed, 38 insertions(+), 32 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7807719ca855..64365907cddc 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -130,6 +130,7 @@ EXC_VIRT_NONE(0x4100, 0x100) #ifdef CONFIG_PPC_P7_NAP EXC_COMMON_BEGIN(system_reset_idle_common) + mfspr r12,SPRN_SRR1 b pnv_powersave_wakeup #endif diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 35cf5bb7daed..ebe80b5d5ce4 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -111,7 +111,7 @@ core_idle_lock_held: * r3 - PNV_THREAD_NAP/SLEEP/WINKLE in POWER8 * - Requested PSSCR value in POWER9 * - * Address of idle handler to 'rfid' to in r4 + * Address of idle handler to branch to in realmode in r4 */ pnv_powersave_common: /* Use r3 to pass state nap/sleep/winkle */ @@ -121,14 +121,14 @@ pnv_powersave_common: * need to save PC, some CR bits and the NV GPRs, * but for now an interrupt frame will do. */ + mtctr r4 + mflr r0 std r0,16(r1) stdu r1,-INT_FRAME_SIZE(r1) std r0,_LINK(r1) std r0,_NIP(r1) - mfmsr r9 - /* We haven't lost state ... 
yet */ li r0,0 stb r0,PACA_NAPSTATELOST(r13) @@ -138,7 +138,6 @@ pnv_powersave_common: SAVE_NVGPRS(r1) mfcr r5 std r5,_CCR(r1) - std r9,_MSR(r1) std r1,PACAR1(r13) /* @@ -148,12 +147,8 @@ pnv_powersave_common: * the MMU context to the guest. */ LOAD_REG_IMMEDIATE(r7, MSR_IDLE) - li r6, MSR_RI - andc r6, r9, r6 - mtmsrd r6, 1 /* clear RI before setting SRR0/1 */ - mtspr SPRN_SRR0, r4 - mtspr SPRN_SRR1, r7 - rfid + mtmsrd r7,0 + bctr .globl pnv_enter_arch207_idle_mode pnv_enter_arch207_idle_mode: @@ -305,11 +300,10 @@ _GLOBAL(power7_idle_insn) b pnv_powersave_common #define CHECK_HMI_INTERRUPT \ - mfspr r0,SPRN_SRR1; \ BEGIN_FTR_SECTION_NESTED(66); \ - rlwinm r0,r0,45-31,0xf; /* extract wake reason field (P8) */ \ + rlwinm r0,r12,45-31,0xf; /* extract wake reason field (P8) */ \ FTR_SECTION_ELSE_NESTED(66); \ - rlwinm r0,r0,45-31,0xe; /* P7 wake reason field is 3 bits */ \ + rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ bne 20f; \ @@ -388,17 +382,17 @@ pnv_powersave_wakeup_mce: /* * Now put the original SRR1 with SRR1_WAKEMCE_RESVD as the wake - * reason into SRR1, which allows reuse of the system reset wakeup + * reason into r12, which allows reuse of the system reset wakeup * code without being mistaken for another type of wakeup. */ - oris r3,r3,SRR1_WAKEMCE_RESVD@h - mtspr SPRN_SRR1,r3 + oris r12,r3,SRR1_WAKEMCE_RESVD@h b pnv_powersave_wakeup /* * Called from reset vector for powersave wakeups. * cr3 - set to gt if waking up with partial/complete hypervisor state loss + * r12 - SRR1 */ .global pnv_powersave_wakeup pnv_powersave_wakeup: @@ -416,6 +410,8 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) li r0,PNV_THREAD_RUNNING stb r0,PACA_THREAD_IDLE_STATE(r13) /* Clear thread state */ + mr r3,r12 + #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE li r0,KVM_HWTHREAD_IN_KERNEL stb r0,HSTATE_HWTHREAD_STATE(r13) @@ -429,7 +425,6 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) #endif /* Return SRR1 from power7_nap() */ - mfspr r3,SPRN_SRR1 blt cr3,pnv_wakeup_noloss b pnv_wakeup_loss @@ -529,9 +524,9 @@ pnv_wakeup_tb_loss: * is required to return back to reset vector after hypervisor state * restore is complete. */ + mr r19,r12 mr r18,r4 mflr r17 - mfspr r16,SPRN_SRR1 BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) @@ -781,7 +776,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: - mtspr SPRN_SRR1,r16 + mr r12,r19 mtlr r17 blr /* return to pnv_powersave_wakeup */ @@ -794,6 +789,7 @@ fastsleep_workaround_at_exit: /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. */ .global pnv_wakeup_loss pnv_wakeup_loss: @@ -803,32 +799,33 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) REST_NVGPRS(r1) REST_GPR(2, r1) + ld r4,PACAKMSR(r13) + ld r5,_LINK(r1) ld r6,_CCR(r1) - ld r4,_MSR(r1) - ld r5,_NIP(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr /* * R3 here contains the value that will be returned to the caller * of power7_nap. + * R12 contains SRR1 for CHECK_HMI_INTERRUPT. 
*/ pnv_wakeup_noloss: lbz r0,PACA_NAPSTATELOST(r13) cmpwi r0,0 bne pnv_wakeup_loss + ld r1,PACAR1(r13) BEGIN_FTR_SECTION CHECK_HMI_INTERRUPT END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) - ld r1,PACAR1(r13) - ld r6,_CCR(r1) - ld r4,_MSR(r1) + ld r4,PACAKMSR(r13) ld r5,_NIP(r1) + ld r6,_CCR(r1) addi r1,r1,INT_FRAME_SIZE + mtlr r5 mtcr r6 - mtspr SPRN_SRR1,r4 - mtspr SPRN_SRR0,r5 - rfid + mtmsrd r4 + blr diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index bdb3f76ceb6b..ecb69c4ee943 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -329,15 +329,21 @@ kvm_novcpu_exit: * We come in here when wakened from nap mode. * Relocation is off and most register values are lost. * r13 points to the PACA. + * r3 contains the SRR1 wakeup value, SRR1 is trashed. */ .globl kvm_start_guest kvm_start_guest: - /* Set runlatch bit the minute you wake up from nap */ mfspr r0, SPRN_CTRLF ori r0, r0, 1 mtspr SPRN_CTRLT, r0 + /* + * Could avoid this and pass it through in r3. For now, + * code expects it to be in SRR1. + */ + mtspr SPRN_SRR1,r3 + ld r2,PACATOC(r13) li r0,KVM_HWTHREAD_IN_KVM @@ -456,13 +462,15 @@ kvm_no_guest: /* * We jump to pnv_wakeup_loss, which will return to the caller * of power7_nap in the powernv cpu offline loop. The value we - * put in r3 becomes the return value for power7_nap. + * put in r3 becomes the return value for power7_nap. pnv_wakeup_loss + * requires SRR1 in r12. */ li r3, LPCR_PECE0 mfspr r4, SPRN_LPCR rlwimi r4, r3, 0, LPCR_PECE0 | LPCR_PECE1 mtspr SPRN_LPCR, r4 li r3, 0 + mfspr r12,SPRN_SRR1 b pnv_wakeup_loss 53: HMT_LOW -- cgit v1.2.3 From 95acdc07124f329ef3088a9bc68af905804b2e6b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 13 Jun 2017 23:05:52 +1000 Subject: powerpc/64s/idle: Predict HMI wakeup as unlikely In a busy system, idle wakeups can be expected from IPIs and device interrupts. Reviewed-by: Gautham R. Shenoy Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index ebe80b5d5ce4..1ea14b96f126 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -306,7 +306,7 @@ FTR_SECTION_ELSE_NESTED(66); \ rlwinm r0,r12,45-31,0xe; /* P7 wake reason field is 3 bits */ \ ALT_FTR_SECTION_END_NESTED_IFSET(CPU_FTR_ARCH_207S, 66); \ cmpwi r0,0xa; /* Hypervisor maintenance ? */ \ - bne 20f; \ + bne+ 20f; \ /* Invoke opal call to handle hmi */ \ ld r2,PACATOC(r13); \ ld r1,PACAR1(r13); \ -- cgit v1.2.3 From 7c28f04828dc8321cb234b2ad57266b9f902add0 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:43 +1000 Subject: powerpc/64s: Avoid saving faulting address into EX_DAR in SLB miss The EX_DAR save area is only used in exceptional cases. With r3 no longer clobbered by slb_allocate_realmode, saving faulting address to EX_DAR can be deferred to those cases. 
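In C terms the transformation is the following (all names illustrative,
the actual change is in the SLB miss assembly below):

	/* Before: the fast path always paid for the EX_DAR store. */
	save_ex_dar(ea);
	if (unlikely(bad_address || unrecoverable))
		make_full_stack_frame();

	/* After: the store is deferred to the exceptional paths. */
	if (unlikely(bad_address || unrecoverable)) {
		save_ex_dar(ea);
		make_full_stack_frame();
	}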
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 64365907cddc..fe3bc52aadf8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -631,7 +631,6 @@ EXC_COMMON_BEGIN(slb_miss_realmode) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ - std r3,PACA_EXSLB+EX_DAR(r13) crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 @@ -641,11 +640,10 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif ld r10,PACA_EXSLB+EX_LR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) lwz r9,PACA_EXSLB+EX_CCR(r13) /* get saved CR */ mtlr r10 - beq 8f /* if bad address, make full stack frame */ + beq- 8f /* if bad address, make full stack frame */ andi. r10,r12,MSR_RI /* check for unrecoverable exception */ beq- 2f @@ -660,6 +658,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) .machine pop RESTORE_PPR_PACA(PACA_EXSLB, r9) + ld r3,PACA_EXSLB+EX_R3(r13) ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -668,7 +667,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . /* prevent speculative execution */ -2: mfspr r11,SPRN_SRR0 +2: std r3,PACA_EXSLB+EX_DAR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + mfspr r11,SPRN_SRR0 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -676,7 +677,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) rfid b . -8: mfspr r11,SPRN_SRR0 +8: std r3,PACA_EXSLB+EX_DAR(r13) + ld r3,PACA_EXSLB+EX_R3(r13) + mfspr r11,SPRN_SRR0 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) -- cgit v1.2.3 From fe5482c04312791bb19202e47f8a7751d476251e Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:44 +1000 Subject: powerpc/64s: SLB miss already has CTR saved for relocatable kernel The EXCEPTION_PROLOG_1 used by SLB miss already saves CTR when the kernel is built with CONFIG_RELOCATABLE. So it does not have to be saved and reloaded when branching to slb_miss_realmode. It can be restored from the PACA as usual. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fe3bc52aadf8..059b3a356250 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -522,7 +522,6 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) * because the distance from here to there depends on where * the kernel ends up being put. */ - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -545,7 +544,6 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) * because the distance from here to there depends on where * the kernel ends up being put. 
*/ - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -585,7 +583,6 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) #ifndef CONFIG_RELOCATABLE b slb_miss_realmode #else - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -603,7 +600,6 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) #ifndef CONFIG_RELOCATABLE b slb_miss_realmode #else - mfctr r11 LOAD_HANDLER(r10, slb_miss_realmode) mtctr r10 bctr @@ -625,10 +621,6 @@ EXC_COMMON_BEGIN(slb_miss_realmode) * procedure. */ mflr r10 -#ifdef CONFIG_RELOCATABLE - mtctr r11 -#endif - stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ @@ -657,6 +649,7 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop + RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) ld r3,PACA_EXSLB+EX_R3(r13) ld r9,PACA_EXSLB+EX_R9(r13) -- cgit v1.2.3 From 4d7cd3b956713d3dfbc3028ad1251b3f6b416a53 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Sun, 21 May 2017 23:15:45 +1000 Subject: powerpc/64s: Avoid r3 save/restore in SLB miss handler The SLB miss handler uses r3 for the faulting address but r12 is mostly able to be freed up to save r3 in. It just requires SRR1 be reloaded again on error. It would be more conventional to use r12 for SRR1 (and use r11 to save r3), but slb_allocate_realmode clobbers r11 and not r12. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 059b3a356250..ed8628c6f0f4 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -510,9 +510,9 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -532,9 +532,9 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x380) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_DAR - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crset 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -576,9 +576,9 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, KVMTEST_PR, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -593,9 +593,9 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) SET_SCRATCH0(r13) EXCEPTION_PROLOG_0(PACA_EXSLB) EXCEPTION_PROLOG_1(PACA_EXSLB, NOTEST, 0x480) - std r3,PACA_EXSLB+EX_R3(r13) + mr r12,r3 /* save r3 */ mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ - mfspr r12,SPRN_SRR1 + mfspr r11,SPRN_SRR1 crclr 4*cr6+eq #ifndef CONFIG_RELOCATABLE b slb_miss_realmode @@ -612,10 +612,10 @@ TRAMP_KVM(PACA_EXSLB, 0x480) EXC_COMMON_BEGIN(slb_miss_realmode) /* * r13 points to the PACA, r9 contains the saved CR, - * r12 contain the saved SRR1, SRR0 is still ready for return + * r12 contains the saved 
r3, + * r11 contain the saved SRR1, SRR0 is still ready for return * r3 has the faulting address * r9 - r13 are saved in paca->exslb. - * r3 is saved in paca->slb_r3 * cr6.eq is set for a D-SLB miss, clear for a I-SLB miss * We assume we aren't going to take any exceptions during this * procedure. @@ -624,6 +624,15 @@ EXC_COMMON_BEGIN(slb_miss_realmode) stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + /* + * Test MSR_RI before calling slb_allocate_realmode, because the + * MSR in r11 gets clobbered. However we still want to allocate + * SLB in case MSR_RI=0, to minimise the risk of getting stuck in + * recursive SLB faults. So use cr5 for this, which is preserved. + */ + andi. r11,r11,MSR_RI /* check for unrecoverable exception */ + cmpdi cr5,r11,MSR_RI + crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION @@ -637,21 +646,21 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) beq- 8f /* if bad address, make full stack frame */ - andi. r10,r12,MSR_RI /* check for unrecoverable exception */ - beq- 2f + bne- cr5,2f /* if unrecoverable exception, oops */ /* All done -- return from exception. */ .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ .machine pop RESTORE_CTR(r9, PACA_EXSLB) RESTORE_PPR_PACA(PACA_EXSLB, r9) - ld r3,PACA_EXSLB+EX_R3(r13) + mr r3,r12 ld r9,PACA_EXSLB+EX_R9(r13) ld r10,PACA_EXSLB+EX_R10(r13) ld r11,PACA_EXSLB+EX_R11(r13) @@ -661,8 +670,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) b . /* prevent speculative execution */ 2: std r3,PACA_EXSLB+EX_DAR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) + mr r3,r12 mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,unrecov_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) @@ -671,8 +681,9 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) b . 8: std r3,PACA_EXSLB+EX_DAR(r13) - ld r3,PACA_EXSLB+EX_R3(r13) + mr r3,r12 mfspr r11,SPRN_SRR0 + mfspr r12,SPRN_SRR1 LOAD_HANDLER(r10,bad_addr_slb) mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) -- cgit v1.2.3 From b102063b47d59752e113c5588422279c75eadd4d Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jun 2017 21:47:11 +1000 Subject: powerpc/64s: Use BRANCH_TO_COMMON() for slb_miss_realmode All the callers of slb_miss_realmode currently open code the #ifndef CONFIG_RELOCATABLE check and the branch via CTR in the RELOCATABLE case. We have a macro to do this, BRANCH_TO_COMMON(), so use it. Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 42 ++++-------------------------------- 1 file changed, 4 insertions(+), 38 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index ed8628c6f0f4..7bdfddbe0328 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -514,18 +514,7 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. 
- */ - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) @@ -536,18 +525,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - /* - * We can't just use a direct branch to slb_miss_realmode - * because the distance from here to there depends on where - * the kernel ends up being put. - */ - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -580,13 +558,7 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) @@ -597,13 +569,7 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq -#ifndef CONFIG_RELOCATABLE - b slb_miss_realmode -#else - LOAD_HANDLER(r10, slb_miss_realmode) - mtctr r10 - bctr -#endif + BRANCH_TO_COMMON(r10, slb_miss_realmode) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) -- cgit v1.2.3 From 442b6e8e03e44dc3a33142ab3b8d0b3395053274 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jun 2017 21:52:03 +1000 Subject: powerpc/64s: Rename slb_miss_realmode() to slb_miss_common() slb_miss_realmode() doesn't always runs in real mode, which is what the name implies. So rename it to avoid confusing people. 
Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 7bdfddbe0328..6ad755e0cb29 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -514,7 +514,7 @@ EXC_REAL_BEGIN(data_access_slb, 0x380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(data_access_slb, 0x380, 0x80) EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) @@ -525,7 +525,7 @@ EXC_VIRT_BEGIN(data_access_slb, 0x4380, 0x80) mfspr r3,SPRN_DAR mfspr r11,SPRN_SRR1 crset 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(data_access_slb, 0x4380, 0x80) TRAMP_KVM_SKIP(PACA_EXSLB, 0x380) @@ -558,7 +558,7 @@ EXC_REAL_BEGIN(instruction_access_slb, 0x480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_REAL_END(instruction_access_slb, 0x480, 0x80) EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) @@ -569,13 +569,16 @@ EXC_VIRT_BEGIN(instruction_access_slb, 0x4480, 0x80) mfspr r3,SPRN_SRR0 /* SRR0 is faulting address */ mfspr r11,SPRN_SRR1 crclr 4*cr6+eq - BRANCH_TO_COMMON(r10, slb_miss_realmode) + BRANCH_TO_COMMON(r10, slb_miss_common) EXC_VIRT_END(instruction_access_slb, 0x4480, 0x80) TRAMP_KVM(PACA_EXSLB, 0x480) -/* This handler is used by both 0x380 and 0x480 slb miss interrupts */ -EXC_COMMON_BEGIN(slb_miss_realmode) +/* + * This handler is used by the 0x380 and 0x480 SLB miss interrupts, as well as + * the virtual mode 0x4380 and 0x4480 interrupts if AIL is enabled. + */ +EXC_COMMON_BEGIN(slb_miss_common) /* * r13 points to the PACA, r9 contains the saved CR, * r12 contains the saved r3, -- cgit v1.2.3 From fd88b945c18866962645d09dfc401e91e13b1909 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 19 Jun 2017 21:57:33 +1000 Subject: powerpc/64s: Rename slb_allocate_realmode() to slb_allocate() As for slb_miss_realmode(), rename slb_allocate_realmode() to avoid confusion over whether it runs in real or virtual mode - it runs in both. 
Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 2 +- arch/powerpc/mm/slb.c | 10 +--------- arch/powerpc/mm/slb_low.S | 6 +++--- 3 files changed, 5 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 6ad755e0cb29..07b79c2c70f8 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -605,7 +605,7 @@ EXC_COMMON_BEGIN(slb_miss_common) crset 4*cr0+eq #ifdef CONFIG_PPC_STD_MMU_64 BEGIN_MMU_FTR_SECTION - bl slb_allocate_realmode + bl slb_allocate END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX) #endif diff --git a/arch/powerpc/mm/slb.c b/arch/powerpc/mm/slb.c index 654a0d7ba0e7..13cfe413b40d 100644 --- a/arch/powerpc/mm/slb.c +++ b/arch/powerpc/mm/slb.c @@ -33,15 +33,7 @@ enum slb_index { KSTACK_INDEX = 2, /* Kernel stack map */ }; -extern void slb_allocate_realmode(unsigned long ea); - -static void slb_allocate(unsigned long ea) -{ - /* Currently, we do real mode for all SLBs including user, but - * that will change if we bring back dynamic VSIDs - */ - slb_allocate_realmode(ea); -} +extern void slb_allocate(unsigned long ea); #define slb_esid_mask(ssize) \ (((ssize) == MMU_SEGSIZE_256M)? ESID_MASK: ESID_MASK_1T) diff --git a/arch/powerpc/mm/slb_low.S b/arch/powerpc/mm/slb_low.S index 9869b44a04dc..bde378559d01 100644 --- a/arch/powerpc/mm/slb_low.S +++ b/arch/powerpc/mm/slb_low.S @@ -65,7 +65,7 @@ MMU_FTR_SECTION_ELSE \ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) -/* void slb_allocate_realmode(unsigned long ea); +/* void slb_allocate(unsigned long ea); * * Create an SLB entry for the given EA (user or kernel). * r3 = faulting address, r13 = PACA @@ -73,7 +73,7 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_68_BIT_VA) * r3 is preserved. * No other registers are examined or changed. */ -_GLOBAL(slb_allocate_realmode) +_GLOBAL(slb_allocate) /* * check for bad kernel/user address * (ea & ~REGION_MASK) >= PGTABLE_RANGE @@ -309,7 +309,7 @@ slb_compare_rr_to_size: b 7b -_ASM_NOKPROBE_SYMBOL(slb_allocate_realmode) +_ASM_NOKPROBE_SYMBOL(slb_allocate) _ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_linear) _ASM_NOKPROBE_SYMBOL(slb_miss_kernel_load_io) _ASM_NOKPROBE_SYMBOL(slb_compare_rr_to_size) -- cgit v1.2.3 From 6b847d795cf4ab3e574f4fcf7193fe245908a195 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 20 Jun 2017 13:14:47 +0530 Subject: powerpc/time: Fix tracing in time.c Since trace_clock is in a different file and already marked with notrace, enable tracing in time.c by removing it from the disabled list in Makefile. Also annotate clocksource read functions and sched_clock with notrace. Testing: Timer and ftrace selftests run with different trace clocks. Acked-by: Naveen N. 
Rao Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/Makefile | 2 -- arch/powerpc/kernel/time.c | 6 +++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index e132902e1f14..0845eebc5af3 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -25,8 +25,6 @@ CFLAGS_REMOVE_cputable.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom_init.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_btext.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_prom.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) -# timers used by tracing -CFLAGS_REMOVE_time.o = -mno-sched-epilog $(CC_FLAGS_FTRACE) endif obj-y := cputable.o ptrace.o syscalls.o \ diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 60714b8c9a2f..476a527b220d 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -675,7 +675,7 @@ EXPORT_SYMBOL_GPL(tb_to_ns); * the high 64 bits of a * b, i.e. (a * b) >> 64, where a and b * are 64-bit unsigned numbers. */ -unsigned long long sched_clock(void) +notrace unsigned long long sched_clock(void) { if (__USE_RTC()) return get_rtc(); @@ -831,12 +831,12 @@ void read_persistent_clock(struct timespec *ts) } /* clocksource code */ -static u64 rtc_read(struct clocksource *cs) +static notrace u64 rtc_read(struct clocksource *cs) { return (u64)get_rtc(); } -static u64 timebase_read(struct clocksource *cs) +static notrace u64 timebase_read(struct clocksource *cs) { return (u64)get_tb(); } -- cgit v1.2.3 From d4cfb11387ee29ba4626546c676fd25c7abbbbb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 27 May 2017 18:04:52 +1000 Subject: powerpc: Convert VDSO update function to use new update_vsyscall interface This converts the powerpc VDSO time update function to use the new interface introduced in commit 576094b7f0aa ("time: Introduce new GENERIC_TIME_VSYSCALL", 2012-09-11). Where the old interface gave us the time as of the last update in seconds and whole nanoseconds, with the new interface we get the nanoseconds part effectively in a binary fixed-point format with tk->tkr_mono.shift bits to the right of the binary point. With the old interface, the fractional nanoseconds got truncated, meaning that the value returned by the VDSO clock_gettime function would have about 1ns of jitter in it compared to the value computed by the generic timekeeping code in the kernel. The powerpc VDSO time functions (clock_gettime and gettimeofday) already work in units of 2^-32 seconds, or 0.23283 ns, because that makes it simple to split the result into seconds and fractional seconds, and represent the fractional seconds in either microseconds or nanoseconds. This is good enough accuracy for now, so this patch avoids changing how the VDSO works or the interface in the VDSO data page. This patch converts the powerpc update_vsyscall_old to be called update_vsyscall and use the new interface. We convert the fractional second to units of 2^-32 seconds without truncating to whole nanoseconds. (There is still a conversion to whole nanoseconds for any legacy users of the vdso_data/systemcfg stamp_xtime field.) In addition, this improves the accuracy of the computation of tb_to_xs for those systems with high-frequency timebase clocks (>= 268.5 MHz) by doing the right shift in two parts, one before the multiplication and one after, rather than doing the right shift before the multiplication. 
(We can't do all of the right shift after the multiplication unless we use 128-bit arithmetic.) Signed-off-by: Paul Mackerras Acked-by: John Stultz Signed-off-by: Michael Ellerman --- arch/powerpc/Kconfig | 2 +- arch/powerpc/kernel/time.c | 68 +++++++++++++++++++++++++++++++++++----------- 2 files changed, 53 insertions(+), 17 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b50d46d214f6..7ee79e0ac88e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -155,7 +155,7 @@ config PPC select GENERIC_SMP_IDLE_THREAD select GENERIC_STRNCPY_FROM_USER select GENERIC_STRNLEN_USER - select GENERIC_TIME_VSYSCALL_OLD + select GENERIC_TIME_VSYSCALL select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL select HAVE_ARCH_KGDB diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 476a527b220d..0cc0dad905d5 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -841,28 +841,66 @@ static notrace u64 timebase_read(struct clocksource *cs) return (u64)get_tb(); } -void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, - struct clocksource *clock, u32 mult, u64 cycle_last) + +void update_vsyscall(struct timekeeper *tk) { + struct timespec xt; + struct clocksource *clock = tk->tkr_mono.clock; + u32 mult = tk->tkr_mono.mult; + u32 shift = tk->tkr_mono.shift; + u64 cycle_last = tk->tkr_mono.cycle_last; u64 new_tb_to_xs, new_stamp_xsec; - u32 frac_sec; + u64 frac_sec; if (clock != &clocksource_timebase) return; + xt.tv_sec = tk->xtime_sec; + xt.tv_nsec = (long)(tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift); + /* Make userspace gettimeofday spin until we're done. */ ++vdso_data->tb_update_count; smp_mb(); - /* 19342813113834067 ~= 2^(20+64) / 1e9 */ - new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); - new_stamp_xsec = (u64) wall_time->tv_nsec * XSEC_PER_SEC; - do_div(new_stamp_xsec, 1000000000); - new_stamp_xsec += (u64) wall_time->tv_sec * XSEC_PER_SEC; + /* + * This computes ((2^20 / 1e9) * mult) >> shift as a + * 0.64 fixed-point fraction. + * The computation in the else clause below won't overflow + * (as long as the timebase frequency is >= 1.049 MHz) + * but loses precision because we lose the low bits of the constant + * in the shift. Note that 19342813113834067 ~= 2^(20+64) / 1e9. + * For a shift of 24 the error is about 0.5e-9, or about 0.5ns + * over a second. (Shift values are usually 22, 23 or 24.) + * For high frequency clocks such as the 512MHz timebase clock + * on POWER[6789], the mult value is small (e.g. 32768000) + * and so we can shift the constant by 16 initially + * (295147905179 ~= 2^(20+64-16) / 1e9) and then do the + * remaining shifts after the multiplication, which gives a + * more accurate result (e.g. with mult = 32768000, shift = 24, + * the error is only about 1.2e-12, or 0.7ns over 10 minutes). + */ + if (mult <= 62500000 && clock->shift >= 16) + new_tb_to_xs = ((u64) mult * 295147905179ULL) >> (clock->shift - 16); + else + new_tb_to_xs = (u64) mult * (19342813113834067ULL >> clock->shift); + + /* + * Compute the fractional second in units of 2^-32 seconds. + * The fractional second is tk->tkr_mono.xtime_nsec >> tk->tkr_mono.shift + * in nanoseconds, so multiplying that by 2^32 / 1e9 gives + * it in units of 2^-32 seconds. + * We assume shift <= 32 because clocks_calc_mult_shift() + * generates shift values in the range 0 - 32. 
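+ *
+ * Worked example (illustrative, not from the original comment): a
+ * fractional second of 0.5s gives frac_sec = 0.5 * 2^32 = 0x80000000,
+ * and the legacy stamp_xsec contribution below is then
+ * frac_sec >> 12 = 0x80000, i.e. 0.5s in 2^-20 second units.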
+ */ + frac_sec = tk->tkr_mono.xtime_nsec << (32 - shift); + do_div(frac_sec, NSEC_PER_SEC); - BUG_ON(wall_time->tv_nsec >= NSEC_PER_SEC); - /* this is tv_nsec / 1e9 as a 0.32 fraction */ - frac_sec = ((u64) wall_time->tv_nsec * 18446744073ULL) >> 32; + /* + * Work out new stamp_xsec value for any legacy users of systemcfg. + * stamp_xsec is in units of 2^-20 seconds. + */ + new_stamp_xsec = frac_sec >> 12; + new_stamp_xsec += tk->xtime_sec * XSEC_PER_SEC; /* * tb_update_count is used to allow the userspace gettimeofday code @@ -872,15 +910,13 @@ void update_vsyscall_old(struct timespec *wall_time, struct timespec *wtm, * the two values of tb_update_count match and are even then the * tb_to_xs and stamp_xsec values are consistent. If not, then it * loops back and reads them again until this criteria is met. - * We expect the caller to have done the first increment of - * vdso_data->tb_update_count already. */ vdso_data->tb_orig_stamp = cycle_last; vdso_data->stamp_xsec = new_stamp_xsec; vdso_data->tb_to_xs = new_tb_to_xs; - vdso_data->wtom_clock_sec = wtm->tv_sec; - vdso_data->wtom_clock_nsec = wtm->tv_nsec; - vdso_data->stamp_xtime = *wall_time; + vdso_data->wtom_clock_sec = tk->wall_to_monotonic.tv_sec; + vdso_data->wtom_clock_nsec = tk->wall_to_monotonic.tv_nsec; + vdso_data->stamp_xtime = xt; vdso_data->stamp_sec_fraction = frac_sec; smp_wmb(); ++(vdso_data->tb_update_count); -- cgit v1.2.3 From 64ebb9a208c6e66316329a6d9101815d1ee06fa9 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Thu, 15 Jun 2017 11:53:16 +1000 Subject: powerpc: Fix /proc/cpuinfo revision for POWER9 DD2 The P9 PVR bits 12-15 don't indicate a revision but instead different chip configurations. From BookIV we have: Bits Configuration 0 : Scale out 12 cores 1 : Scale out 24 cores 2 : Scale up 12 cores 3 : Scale up 24 cores DD1 doesn't use this but DD2 does. Linux will mostly use the "Scale out 24 core" configuration (ie. SMT4 not SMT8) which results in a PVR of 0x004e1200. The reported revision in /proc/cpuinfo is hence reported incorrectly as "18.0". This patch fixes this to mask off only the relevant bits for the major revision (ie. bits 8-11) for POWER9. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/setup-common.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/setup-common.c b/arch/powerpc/kernel/setup-common.c index 71dcda91755d..096dee58b9c1 100644 --- a/arch/powerpc/kernel/setup-common.c +++ b/arch/powerpc/kernel/setup-common.c @@ -335,6 +335,10 @@ static int show_cpuinfo(struct seq_file *m, void *v) maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; + case 0x004e: /* POWER9 bits 12-15 give chip type */ + maj = (pvr >> 8) & 0x0F; + min = pvr & 0xFF; + break; default: maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; -- cgit v1.2.3 From aa9a95163638bd9acb3e1f61f48cd5a000e79f03 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 8 May 2017 16:23:31 +1000 Subject: powerpc: Fix asm offsets to point to actual FP and VMX regs The asm code assumes the FP regs are at the start of fp_state. While this is true now, it may not always be the case and there is nothing enforcing it. This fixes the asm-offsets to point to the actual FP registers inside the fp_state. Similarly for VMX. 
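The OFFSET() entries above reduce to offsetof() on the nested member, so the generated assembler constants track the register arrays themselves rather than the enclosing structs. Roughly (simplified from the asm-offsets machinery):

    /* OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr) is in effect: */
    DEFINE(THREAD_FPSTATE, offsetof(struct thread_struct, fp_state.fpr));
    /* which stays correct even if fields are later added ahead of fpr[]
     * inside struct thread_fp_state */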
Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/asm-offsets.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 9624851ca276..a7b5af32b89e 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -100,12 +100,12 @@ int main(void) OFFSET(THREAD_NORMSAVES, thread_struct, normsave[0]); #endif OFFSET(THREAD_FPEXC_MODE, thread_struct, fpexc_mode); - OFFSET(THREAD_FPSTATE, thread_struct, fp_state); + OFFSET(THREAD_FPSTATE, thread_struct, fp_state.fpr); OFFSET(THREAD_FPSAVEAREA, thread_struct, fp_save_area); OFFSET(FPSTATE_FPSCR, thread_fp_state, fpscr); OFFSET(THREAD_LOAD_FP, thread_struct, load_fp); #ifdef CONFIG_ALTIVEC - OFFSET(THREAD_VRSTATE, thread_struct, vr_state); + OFFSET(THREAD_VRSTATE, thread_struct, vr_state.vr); OFFSET(THREAD_VRSAVEAREA, thread_struct, vr_save_area); OFFSET(THREAD_VRSAVE, thread_struct, vrsave); OFFSET(THREAD_USED_VR, thread_struct, used_vr); @@ -145,9 +145,9 @@ int main(void) OFFSET(THREAD_TM_PPR, thread_struct, tm_ppr); OFFSET(THREAD_TM_DSCR, thread_struct, tm_dscr); OFFSET(PT_CKPT_REGS, thread_struct, ckpt_regs); - OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state); + OFFSET(THREAD_CKVRSTATE, thread_struct, ckvr_state.vr); OFFSET(THREAD_CKVRSAVE, thread_struct, ckvrsave); - OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state); + OFFSET(THREAD_CKFPSTATE, thread_struct, ckfp_state.fpr); /* Local pt_regs on stack for Transactional Memory funcs. */ DEFINE(TM_FRAME_SIZE, STACK_FRAME_OVERHEAD + sizeof(struct pt_regs) + 16); -- cgit v1.2.3 From 2bafb7ffa3e0908ad2e69b94c436a0326ef2e7e1 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 8 May 2017 16:18:31 +1000 Subject: powerpc/tm: Fix comment Update to real function name. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/tm.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/tm.S b/arch/powerpc/kernel/tm.S index 3a2d04134da9..c4ba37822ba0 100644 --- a/arch/powerpc/kernel/tm.S +++ b/arch/powerpc/kernel/tm.S @@ -313,8 +313,8 @@ dont_backup_fp: blr - /* void tm_recheckpoint(struct thread_struct *thread, - * unsigned long orig_msr) + /* void __tm_recheckpoint(struct thread_struct *thread, + * unsigned long orig_msr) * - Restore the checkpointed register state saved by tm_reclaim * when we switch_to a process. * -- cgit v1.2.3 From ba6d334ac230065243a92bb7cb3fd6a5f6a7f8ac Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Sat, 24 Jun 2017 12:29:01 -0500 Subject: powerpc/64s: Invalidate ERAT on powersave wakeup for POWER9 On POWER9 the ERAT may be incorrect on wakeup from some stop states that lose state. This causes random segvs and illegal instructions when these stop states are enabled. This patch invalidates the ERAT on wakeup on POWER9 to prevent this from causing a problem. 
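For context, the PPC_INVALIDATE_ERAT used below comes from ppc-opcode.h (quoted from memory, not part of this diff) and is built on slbia with an IH hint that POWER9 implements as an ERAT-only invalidate:

    /* slbia IH=7: POWER9 treats this as "invalidate the ERAT" */
    #define PPC_INVALIDATE_ERAT	PPC_SLBIA(7)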
Signed-off-by: Michael Neuling Signed-off-by: Benjamin Herrenschmidt Reviewed-by: Nicholas Piggin [mpe: Merge comment change with upstream changes] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 8 +++++--- arch/powerpc/kernel/idle_book3s.S | 7 +++++++ 2 files changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 07b79c2c70f8..02a82777fd5b 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -99,9 +99,11 @@ EXC_VIRT_NONE(0x4000, 0x100) #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. The idle wakeup - * handler initially runs in real mode, but we branch to the 0xc000... - * address so we can turn on relocation with mtmsr. + * from nap/sleep/winkle, and branch to idle handler. This tests SRR1 + * bits 46:47. A non-0 value indicates that we are coming from a power + * saving state. The idle wakeup handler initially runs in real mode, + * but we branch to the 0xc000... address so we can turn on relocation + * with mtmsr. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index 1ea14b96f126..f6518c768d2a 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -435,6 +435,13 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300) * cr3 - set to gt if waking up with partial/complete hypervisor state loss */ pnv_restore_hyp_resource_arch300: + /* + * Workaround for POWER9, if we lost resources, the ERAT + * might have been mixed up and needs flushing. + */ + blt cr3,1f + PPC_INVALIDATE_ERAT +1: /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state -- cgit v1.2.3 From a77af552ccc9d4d54459a39f9e5f7ad307aeb4f9 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 1 Jun 2017 22:50:38 +0530 Subject: powerpc/fadump: avoid duplicates in crash memory ranges fadump sets up crash memory ranges to be used for creating PT_LOAD program headers in elfcore header. Memory chunk RMA_START through boot memory area size is added as the first memory range because firmware, at the time of crash, moves this memory chunk to different location specified during fadump registration making it necessary to create a separate program header for it with the correct offset. This memory chunk is skipped while setting up the remaining memory ranges. But currently, there is possibility that some of this memory may have duplicate entries like when it is hot-removed and added again. Ensure that no two memory ranges represent the same memory. 
When 5 lmbs are hot-removed and then hot-plugged before registering fadump, here is how the program headers in /proc/vmcore exported by fadump look like without this change: Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align NOTE 0x0000000000010000 0x0000000000000000 0x0000000000000000 0x0000000000001894 0x0000000000001894 0 LOAD 0x0000000000021020 0xc000000000000000 0x0000000000000000 0x0000000040000000 0x0000000040000000 RWE 0 LOAD 0x0000000040031020 0xc000000000000000 0x0000000000000000 0x0000000010000000 0x0000000010000000 RWE 0 LOAD 0x0000000050040000 0xc000000010000000 0x0000000010000000 0x0000000050000000 0x0000000050000000 RWE 0 LOAD 0x00000000a0040000 0xc000000060000000 0x0000000060000000 0x000000019ffe0000 0x000000019ffe0000 RWE 0 and with this change: Program Headers: Type Offset VirtAddr PhysAddr FileSiz MemSiz Flags Align NOTE 0x0000000000010000 0x0000000000000000 0x0000000000000000 0x0000000000001894 0x0000000000001894 0 LOAD 0x0000000000021020 0xc000000000000000 0x0000000000000000 0x0000000040000000 0x0000000040000000 RWE 0 LOAD 0x0000000040030000 0xc000000040000000 0x0000000040000000 0x0000000020000000 0x0000000020000000 RWE 0 LOAD 0x0000000060030000 0xc000000060000000 0x0000000060000000 0x000000019ffe0000 0x000000019ffe0000 RWE 0 Signed-off-by: Hari Bathini Reviewed-by: Mahesh J Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 12837d52e84a..08e00448a355 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -878,8 +878,19 @@ static void fadump_setup_crash_memory_ranges(void) for_each_memblock(memory, reg) { start = (unsigned long long)reg->base; end = start + (unsigned long long)reg->size; - if (start == RMA_START && end >= fw_dump.boot_memory_size) - start = fw_dump.boot_memory_size; + + /* + * skip the first memory chunk that is already added (RMA_START + * through boot_memory_size). This logic needs a relook if and + * when RMA_START changes to a non-zero value. + */ + BUILD_BUG_ON(RMA_START != 0); + if (start < fw_dump.boot_memory_size) { + if (end > fw_dump.boot_memory_size) + start = fw_dump.boot_memory_size; + else + continue; + } /* add this range excluding the reserved dump area. */ fadump_exclude_reserved_area(start, end); -- cgit v1.2.3 From eae0dfcc44320c79a05637534d59af4643b2ee7b Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 1 Jun 2017 22:51:26 +0530 Subject: powerpc/fadump: avoid holes in boot memory area when fadump is registered To register fadump, boot memory area - the size of low memory chunk that is required for a kernel to boot successfully when booted with restricted memory, is assumed to have no holes. But this memory area is currently not protected from hot-remove operations. So, fadump could fail to re-register after a memory hot-remove operation, if memory is removed from boot memory area. To avoid this, ensure that memory from boot memory area is not hot-removed when fadump is registered. 
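The new helper is in effect an interval-overlap test, and the hotplug path consults it before tearing an LMB down, as the hunks below show. In rough pseudo-C (boundary handling follows the patch):

    /* while a dump is registered, refuse to remove [addr, addr + size)
     * if it intersects [RMA_START, RMA_START + fw_dump.boot_memory_size) */
    if (is_fadump_boot_memory_area(phys_addr, block_sz))
        return false;	/* lmb_is_removable(): keep this LMB */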
Signed-off-by: Hari Bathini Reviewed-by: Mahesh J Salgaonkar Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/fadump.h | 1 + arch/powerpc/kernel/fadump.c | 12 ++++++++++++ arch/powerpc/platforms/pseries/hotplug-memory.c | 7 +++++++ 3 files changed, 20 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/fadump.h b/arch/powerpc/include/asm/fadump.h index a3de219073af..ce88bbe1d809 100644 --- a/arch/powerpc/include/asm/fadump.h +++ b/arch/powerpc/include/asm/fadump.h @@ -203,6 +203,7 @@ struct fad_crash_memory_ranges { unsigned long long size; }; +extern int is_fadump_boot_memory_area(u64 addr, ulong size); extern int early_init_dt_scan_fw_dump(unsigned long node, const char *uname, int depth, void *data); extern int fadump_reserve_mem(void); diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 08e00448a355..750bff3b4af3 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -113,6 +113,18 @@ int __init early_init_dt_scan_fw_dump(unsigned long node, return 1; } +/* + * If fadump is registered, check if the memory provided + * falls within boot memory area. + */ +int is_fadump_boot_memory_area(u64 addr, ulong size) +{ + if (!fw_dump.dump_registered) + return 0; + + return (addr + size) > RMA_START && addr <= fw_dump.boot_memory_size; +} + int is_fadump_active(void) { return fw_dump.dump_active; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index e104c71ea44a..a186b8e8019a 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "pseries.h" static bool rtas_hp_event; @@ -406,6 +407,12 @@ static bool lmb_is_removable(struct of_drconf_cell *lmb) scns_per_block = block_sz / MIN_MEMORY_BLOCK_SIZE; phys_addr = lmb->base_addr; +#ifdef CONFIG_FA_DUMP + /* Don't hot-remove memory that falls in fadump boot memory area */ + if (is_fadump_boot_memory_area(phys_addr, block_sz)) + return false; +#endif + for (i = 0; i < scns_per_block; i++) { pfn = PFN_DOWN(phys_addr); if (!pfn_present(pfn)) -- cgit v1.2.3 From a5a05b91c7f36c180c32e27fa41890957c31bad1 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Thu, 1 Jun 2017 22:52:10 +0530 Subject: powerpc/fadump: provide a helpful error message fadump fails to register when there are holes in boot memory area. Provide a helpful error message to the user in such case. Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 750bff3b4af3..a3568136ab28 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -130,6 +130,38 @@ int is_fadump_active(void) return fw_dump.dump_active; } +/* + * Returns 1, if there are no holes in boot memory area, + * 0 otherwise. 
+ */ +static int is_boot_memory_area_contiguous(void) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(RMA_START); + unsigned long end_pfn = PHYS_PFN(RMA_START + fw_dump.boot_memory_size); + unsigned int ret = 0; + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + /* Memory hole from start_pfn to tstart */ + if (tstart > start_pfn) + break; + + if (tend == end_pfn) { + ret = 1; + break; + } + + start_pfn = tend + 1; + } + } + + return ret; +} + /* Print firmware assisted dump configurations for debugging purpose. */ static void fadump_show_config(void) { @@ -457,6 +489,10 @@ static int register_fw_dump(struct fadump_mem_struct *fdm) " dump. Hardware Error(%d).\n", rc); break; case -3: + if (!is_boot_memory_area_contiguous()) + pr_err("Can't have holes in boot memory area while " + "registering fadump\n"); + printk(KERN_ERR "Failed to register firmware-assisted kernel" " dump. Parameter Error(%d).\n", rc); err = -EINVAL; -- cgit v1.2.3 From 68fa6478e3b1fab7077d390070ed455aed93905c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 2 Jun 2017 01:10:10 +0530 Subject: powerpc/fadump: add reschedule point while releasing memory Around 95% of memory is reserved by fadump/capture kernel. All this memory is freed, one page at a time, on writing '1' to the node /sys/kernel/fadump_release_mem. On systems with large memory, this can take a long time to complete, leading to soft lockup warning messages. To avoid this, add reschedule points at regular intervals. Also, while memblock_reserve() implicitly takes care of holes in the given memory range while reserving memory, those holes need to be taken care of while releasing memory as memory is freed one page at a time. Add support to skip holes while releasing memory. Suggested-by: Michael Ellerman Signed-off-by: Hari Bathini Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/fadump.c | 65 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 11 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index a3568136ab28..3079518f2245 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -1152,28 +1152,71 @@ void fadump_cleanup(void) } } +static void fadump_free_reserved_memory(unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long pfn; + unsigned long time_limit = jiffies + HZ; + + pr_info("freeing reserved memory (0x%llx - 0x%llx)\n", + PFN_PHYS(start_pfn), PFN_PHYS(end_pfn)); + + for (pfn = start_pfn; pfn < end_pfn; pfn++) { + free_reserved_page(pfn_to_page(pfn)); + + if (time_after(jiffies, time_limit)) { + cond_resched(); + time_limit = jiffies + HZ; + } + } +} + +/* + * Skip memory holes and free memory that was actually reserved. 
+ */ +static void fadump_release_reserved_area(unsigned long start, unsigned long end) +{ + struct memblock_region *reg; + unsigned long tstart, tend; + unsigned long start_pfn = PHYS_PFN(start); + unsigned long end_pfn = PHYS_PFN(end); + + for_each_memblock(memory, reg) { + tstart = max(start_pfn, memblock_region_memory_base_pfn(reg)); + tend = min(end_pfn, memblock_region_memory_end_pfn(reg)); + if (tstart < tend) { + fadump_free_reserved_memory(tstart, tend); + + if (tend == end_pfn) + break; + + start_pfn = tend + 1; + } + } +} + /* * Release the memory that was reserved in early boot to preserve the memory * contents. The released memory will be available for general use. */ static void fadump_release_memory(unsigned long begin, unsigned long end) { - unsigned long addr; unsigned long ra_start, ra_end; ra_start = fw_dump.reserve_dump_area_start; ra_end = ra_start + fw_dump.reserve_dump_area_size; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - /* - * exclude the dump reserve area. Will reuse it for next - * fadump registration. - */ - if (addr <= ra_end && ((addr + PAGE_SIZE) > ra_start)) - continue; - - free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); - } + /* + * exclude the dump reserve area. Will reuse it for next + * fadump registration. + */ + if (begin < ra_end && end > ra_start) { + if (begin < ra_start) + fadump_release_reserved_area(begin, ra_start); + if (end > ra_end) + fadump_release_reserved_area(ra_end, end); + } else + fadump_release_reserved_area(begin, end); } static void fadump_invalidate_release_mem(void) -- cgit v1.2.3 From f8d0d5dc641cd405ad40cb2498b04df9716baee6 Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 27 Jun 2017 12:30:05 +0530 Subject: powerpc/smp: Do not BUG_ON if invalid CPU during kick During secondary start, we do not need to BUG_ON if an invalid CPU number is passed. We already print an error if secondary cannot be started, so just return an error instead. 
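On the caller side this turns a guaranteed crash into an ordinary error path; a sketch of the intended use (the caller shown is illustrative, not part of this diff):

    int rc = smp_ops->kick_cpu(nr);
    if (rc)			/* now -EINVAL for an out-of-range CPU */
        return rc;		/* instead of BUG_ON() taking the box down */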
Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 3 ++- arch/powerpc/platforms/cell/smp.c | 3 ++- arch/powerpc/platforms/powernv/smp.c | 3 ++- arch/powerpc/platforms/pseries/smp.c | 3 ++- 4 files changed, 8 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index df2a41647d8e..05bf5836107c 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -112,7 +112,8 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; /* * The processor is currently spinning, waiting for the diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index 895560f4be69..ee8c535cf4d3 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -115,7 +115,8 @@ static void smp_cell_setup_cpu(int cpu) static int smp_cell_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; if (!smp_startup_cpu(nr)) return -ENOENT; diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index c04c87adad94..292825f25ffd 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -63,7 +63,8 @@ static int pnv_smp_kick_cpu(int nr) long rc; uint8_t status; - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; /* * If we already started or OPAL is not supported, we just diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index 52ca6b311d44..c82182ac40af 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -151,7 +151,8 @@ static void smp_setup_cpu(int cpu) static int smp_pSeries_kick_cpu(int nr) { - BUG_ON(nr < 0 || nr >= NR_CPUS); + if (nr < 0 || nr >= NR_CPUS) + return -EINVAL; if (!smp_startup_cpu(nr)) return -ENOENT; -- cgit v1.2.3 From c642af9c41f09296997519499d16ff30e700816a Mon Sep 17 00:00:00 2001 From: Santosh Sivaraj Date: Tue, 27 Jun 2017 12:30:06 +0530 Subject: powerpc/smp: Convert NR_CPUS to nr_cpu_ids nr_cpu_ids can be limited by nr_cpus boot parameter, whereas NR_CPUS is a compile time constant, which shouldn't be compared against during cpu kick. 
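The difference shows up when the possible CPU count is clamped at boot; with hypothetical numbers:

    /* hypothetical: NR_CPUS = 2048, kernel booted with nr_cpus=4,
     * so nr_cpu_ids == 4 */
    smp_generic_kick_cpu(8);	/* old check (8 < NR_CPUS) passed;
				 * now correctly returns -EINVAL */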
Signed-off-by: Santosh Sivaraj Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/smp.c | 2 +- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 05bf5836107c..418019728efa 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -112,7 +112,7 @@ int smp_generic_cpu_bootable(unsigned int nr) #ifdef CONFIG_PPC64 int smp_generic_kick_cpu(int nr) { - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; /* diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index ee8c535cf4d3..f84d52a2db40 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -115,7 +115,7 @@ static void smp_cell_setup_cpu(int cpu) static int smp_cell_kick_cpu(int nr) { - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; if (!smp_startup_cpu(nr)) diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 292825f25ffd..40dae96f7e20 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -63,7 +63,7 @@ static int pnv_smp_kick_cpu(int nr) long rc; uint8_t status; - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; /* diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index c82182ac40af..24785f63fb40 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -151,7 +151,7 @@ static void smp_setup_cpu(int cpu) static int smp_pSeries_kick_cpu(int nr) { - if (nr < 0 || nr >= NR_CPUS) + if (nr < 0 || nr >= nr_cpu_ids) return -EINVAL; if (!smp_startup_cpu(nr)) -- cgit v1.2.3 From 4d0d7c02df680740da41f5f92a238c35796ca5be Mon Sep 17 00:00:00 2001 From: Akshay Adiga Date: Wed, 28 Jun 2017 06:46:49 +0530 Subject: powerpc/powernv/idle: Clear r12 on wakeup from stop lite pnv_wakeup_noloss() expects r12 to contain SRR1 value to determine if the wakeup reason is an HMI in CHECK_HMI_INTERRUPT. When we wakeup with ESL=0, SRR1 will not contain the wakeup reason, so there is no point setting r12 to SRR1. However, we don't set r12 at all so r12 contains garbage (likely a kernel pointer), and is still used to check HMI assuming that it contained SRR1. This causes the OPAL msglog to be filled with the following print: HMI: Received HMI interrupt: HMER = 0x0040000000000000 This patch clears r12 after waking up from stop with ESL=EC=0, so that we don't accidentally enter the HMI handler in pnv_wakeup_noloss() if the value of r12[42:45] corresponds to HMI as wakeup reason. Prior to commit 9d29250136f6 ("powerpc/64s/idle: Avoid SRR usage in idle sleep/wake paths") this bug existed, in that we would incorrectly look at SRR1 to check for a HMI when SRR1 didn't contain a wakeup reason. However the SRR1 value would just happen to never have bits 42:45 set. 
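In C terms, the test CHECK_HMI_INTERRUPT applies to r12 amounts to the following (mask and value quoted from reg.h from memory; treat as illustrative):

    /* SRR1 bits 42:45 carry the wakeup reason; 0xa means HMI */
    if ((r12 & SRR1_WAKEMASK_P8) == SRR1_WAKEHMI)	/* 0x003c0000, 0x00280000 */
        /* branch to the HMI handler */;
    /* clearing r12 makes this comparison fail deterministically when
     * SRR1 was never loaded */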
Fixes: 9d29250136f6 ("powerpc/64s/idle: Avoid SRR usage in idle sleep/wake paths") Signed-off-by: Akshay Adiga Reviewed-by: Nicholas Piggin [mpe: Change log and comment massaging] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/idle_book3s.S | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/idle_book3s.S b/arch/powerpc/kernel/idle_book3s.S index f6518c768d2a..5adb390e773b 100644 --- a/arch/powerpc/kernel/idle_book3s.S +++ b/arch/powerpc/kernel/idle_book3s.S @@ -256,6 +256,19 @@ power_enter_stop: bne .Lhandle_esl_ec_set IDLE_STATE_ENTER_SEQ(PPC_STOP) li r3,0 /* Since we didn't lose state, return 0 */ + + /* + * pnv_wakeup_noloss() expects r12 to contain the SRR1 value so + * it can determine if the wakeup reason is an HMI in + * CHECK_HMI_INTERRUPT. + * + * However, when we wakeup with ESL=0, SRR1 will not contain the wakeup + * reason, so there is no point setting r12 to SRR1. + * + * Further, we clear r12 here, so that we don't accidentally enter the + * HMI in pnv_wakeup_noloss() if the value of r12[42:45] == WAKE_HMI. + */ + li r12, 0 b pnv_wakeup_noloss .Lhandle_esl_ec_set: -- cgit v1.2.3 From 4e287e655e108cbbd6e3e7dcc49d591c8aa5a8a4 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Tue, 6 Jun 2017 23:08:32 +1000 Subject: powerpc: use spin loop primitives in some functions Use the different spin loop primitives in some simple powerpc spin loops, including those which will spin as a common case. This will help to test the spin loop primitives before more conversions are done. Signed-off-by: Nicholas Piggin [mpe: Add some includes of ] Signed-off-by: Michael Ellerman --- arch/powerpc/include/asm/delay.h | 16 ++++++++++++---- arch/powerpc/kernel/smp.c | 4 ++-- arch/powerpc/kernel/time.c | 8 +++++--- arch/powerpc/mm/hash_native_64.c | 5 ++++- 4 files changed, 23 insertions(+), 10 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/include/asm/delay.h b/arch/powerpc/include/asm/delay.h index 52e4d54da2a9..3df4417dd9c8 100644 --- a/arch/powerpc/include/asm/delay.h +++ b/arch/powerpc/include/asm/delay.h @@ -2,6 +2,7 @@ #define _ASM_POWERPC_DELAY_H #ifdef __KERNEL__ +#include #include /* @@ -58,11 +59,18 @@ extern void udelay(unsigned long usecs); typeof(condition) __ret; \ unsigned long __loops = tb_ticks_per_usec * timeout; \ unsigned long __start = get_tbl(); \ - while (!(__ret = (condition)) && (tb_ticks_since(__start) <= __loops)) \ - if (delay) \ + \ + if (delay) { \ + while (!(__ret = (condition)) && \ + (tb_ticks_since(__start) <= __loops)) \ udelay(delay); \ - else \ - cpu_relax(); \ + } else { \ + spin_begin(); \ + while (!(__ret = (condition)) && \ + (tb_ticks_since(__start) <= __loops)) \ + spin_cpu_relax(); \ + spin_end(); \ + } \ if (!__ret) \ __ret = (condition); \ __ret; \ diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c index 418019728efa..a975ddf77200 100644 --- a/arch/powerpc/kernel/smp.c +++ b/arch/powerpc/kernel/smp.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -767,8 +768,7 @@ int __cpu_up(unsigned int cpu, struct task_struct *tidle) smp_ops->give_timebase(); /* Wait until cpu puts itself in the online & active maps */ - while (!cpu_online(cpu)) - cpu_relax(); + spin_until_cond(cpu_online(cpu)); return 0; } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 0cc0dad905d5..fe6f3a285455 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -59,10 +59,10 @@ #include #include 
#include +#include #include #include -#include #include #include #include @@ -442,6 +442,7 @@ void __delay(unsigned long loops) unsigned long start; int diff; + spin_begin(); if (__USE_RTC()) { start = get_rtcl(); do { @@ -449,13 +450,14 @@ void __delay(unsigned long loops) diff = get_rtcl() - start; if (diff < 0) diff += 1000000000; + spin_cpu_relax(); } while (diff < loops); } else { start = get_tbl(); while (get_tbl() - start < loops) - HMT_low(); - HMT_medium(); + spin_cpu_relax(); } + spin_end(); } EXPORT_SYMBOL(__delay); diff --git a/arch/powerpc/mm/hash_native_64.c b/arch/powerpc/mm/hash_native_64.c index bdaac28193f7..fbd1acc3b8d9 100644 --- a/arch/powerpc/mm/hash_native_64.c +++ b/arch/powerpc/mm/hash_native_64.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -184,8 +185,10 @@ static inline void native_lock_hpte(struct hash_pte *hptep) while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; + spin_begin(); while(test_bit(HPTE_LOCK_BIT, word)) - cpu_relax(); + spin_cpu_relax(); + spin_end(); } } -- cgit v1.2.3 From 9d6c452352d6535745e734f296335e6695b6df0b Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:15 +0530 Subject: powerpc/64s: Convert .L__replay_interrupt_return to a local label Commit b48bbb82e2b835 ("powerpc/64s: Don't unbalance the return branch predictor in __replay_interrupt()") introduced __replay_interrupt_return symbol with '.L' prefix in hopes of keeping it private. However, due to the use of LOAD_REG_ADDR(), the assembler kept this symbol visible. Fix the same by instead using the local label '1'. Fixes: Commit b48bbb82e2b835 ("powerpc/64s: Don't unbalance the return branch predictor in __replay_interrupt()") Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/exceptions-64s.S | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index fb4ffc773082..3a7a456ab95c 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1634,7 +1634,7 @@ _GLOBAL(__replay_interrupt) * we don't give a damn about, so we don't bother storing them. */ mfmsr r12 - LOAD_REG_ADDR(r11, .L__replay_interrupt_return) + LOAD_REG_ADDR(r11, 1f) mfcr r9 ori r12,r12,MSR_EE cmpwi r3,0x900 @@ -1652,6 +1652,6 @@ FTR_SECTION_ELSE cmpwi r3,0xa00 beq doorbell_super_common_msgclr ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) -.L__replay_interrupt_return: +1: blr -- cgit v1.2.3 From cf7d6fb0672668690ba235280698c9aa6a81010a Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:16 +0530 Subject: powerpc/64s: Blacklist system_call() and system_call_common() from kprobes Convert some of the symbols into private symbols and blacklist system_call_common() and system_call() from kprobes. We can't take a trap at parts of these functions as either MSR_RI is unset or the kernel stack pointer is not yet setup. Reviewed-by: Masami Hiramatsu Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. 
Rao [mpe: Don't convert system_call_common to _GLOBAL()] Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index da9486e2fd89..410e19295259 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -57,7 +57,7 @@ system_call_common: #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION extrdi. r10, r12, 1, (63-MSR_TS_T_LG) /* transaction active? */ - bne tabort_syscall + bne .Ltabort_syscall END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif andi. r10,r12,MSR_PR @@ -152,9 +152,9 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) CURRENT_THREAD_INFO(r11, r1) ld r10,TI_FLAGS(r11) andi. r11,r10,_TIF_SYSCALL_DOTRACE - bne syscall_dotrace /* does not return */ + bne .Lsyscall_dotrace /* does not return */ cmpldi 0,r0,NR_syscalls - bge- syscall_enosys + bge- .Lsyscall_enosys system_call: /* label this so stack traces look sane */ /* @@ -208,7 +208,7 @@ system_call: /* label this so stack traces look sane */ ld r9,TI_FLAGS(r12) li r11,-MAX_ERRNO andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) - bne- syscall_exit_work + bne- .Lsyscall_exit_work /* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */ li r7,MSR_FP @@ -217,12 +217,12 @@ system_call: /* label this so stack traces look sane */ #endif and r0,r8,r7 cmpd r0,r7 - bne syscall_restore_math + bne .Lsyscall_restore_math .Lsyscall_restore_math_cont: cmpld r3,r11 ld r5,_CCR(r1) - bge- syscall_error + bge- .Lsyscall_error .Lsyscall_error_cont: ld r7,_NIP(r1) BEGIN_FTR_SECTION @@ -248,13 +248,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) RFI b . /* prevent speculative execution */ -syscall_error: +.Lsyscall_error: oris r5,r5,0x1000 /* Set SO bit in CR */ neg r3,r3 std r5,_CCR(r1) b .Lsyscall_error_cont -syscall_restore_math: +.Lsyscall_restore_math: /* * Some initial tests from restore_math to avoid the heavyweight * C code entry and MSR manipulations. @@ -289,7 +289,7 @@ syscall_restore_math: b .Lsyscall_restore_math_cont /* Traced system call support */ -syscall_dotrace: +.Lsyscall_dotrace: bl save_nvgprs addi r3,r1,STACK_FRAME_OVERHEAD bl do_syscall_trace_enter @@ -322,11 +322,11 @@ syscall_dotrace: b .Lsyscall_exit -syscall_enosys: +.Lsyscall_enosys: li r3,-ENOSYS b .Lsyscall_exit -syscall_exit_work: +.Lsyscall_exit_work: #ifdef CONFIG_PPC_BOOK3S li r10,MSR_RI mtmsrd r10,1 /* Restore RI */ @@ -386,7 +386,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b ret_from_except #ifdef CONFIG_PPC_TRANSACTIONAL_MEM -tabort_syscall: +.Ltabort_syscall: /* Firstly we need to enable TM in the kernel */ mfmsr r10 li r9, 1 @@ -412,6 +412,8 @@ tabort_syscall: rfid b . /* prevent speculative execution */ #endif +_ASM_NOKPROBE_SYMBOL(system_call_common); +_ASM_NOKPROBE_SYMBOL(system_call); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) -- cgit v1.2.3 From 266de3a842d70b481ef1dac487acfad1240deb7e Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:17 +0530 Subject: powerpc/64s: Move system_call() symbol to just after setting MSR_EE It is common to get a PMU interrupt right after the mtmsr instruction that enables interrupts. Due to this, the stack trace profile gets needlessly split across system_call_common() and system_call(). 
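(Profilers attribute a sample to the nearest preceding symbol, so hits on the instructions right after the mtmsr were credited like this, sketching the symbol layout:

    [system_call_common .. system_call)	-> "system_call_common"
    [system_call .. next symbol)	-> "system_call"

keeping the hot post-mtmsr instructions under a single name requires the label to sit right after the mtmsr.)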
Previously, system_call() symbol was at the current place to hide a few earlier symbols which have since been made private or removed entirely. So, let's move system_call() slightly higher up, right after the mtmsr instruction that enables interrupts. Convert existing references to system_call to a local syscall symbol. Suggested-by: Nicholas Piggin Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 410e19295259..e5cf98d755db 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -143,6 +143,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) mtmsrd r11,1 #endif /* CONFIG_PPC_BOOK3E */ +system_call: /* label this so stack traces look sane */ /* We do need to set SOFTE in the stack frame or the return * from interrupt will be painful */ @@ -156,7 +157,7 @@ END_FW_FTR_SECTION_IFSET(FW_FEATURE_SPLPAR) cmpldi 0,r0,NR_syscalls bge- .Lsyscall_enosys -system_call: /* label this so stack traces look sane */ +.Lsyscall: /* * Need to vector to 32 Bit or default sys_call_table here, * based on caller's run-mode / personality. @@ -310,13 +311,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r7,GPR7(r1) ld r8,GPR8(r1) - /* Repopulate r9 and r10 for the system_call path */ + /* Repopulate r9 and r10 for the syscall path */ addi r9,r1,STACK_FRAME_OVERHEAD CURRENT_THREAD_INFO(r10, r1) ld r10,TI_FLAGS(r10) cmpldi r0,NR_syscalls - blt+ system_call + blt+ .Lsyscall /* Return code is already in r3 thanks to do_syscall_trace_enter() */ b .Lsyscall_exit -- cgit v1.2.3 From 3639d6619c249acf3b8ecc4fd4552486f217dec0 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:18 +0530 Subject: powerpc/64s: Un-blacklist system_call() from kprobes It is actually safe to probe system_call() in entry_64.S, but only till we unset MSR_RI. To allow this, add a new symbol system_call_exit() after the mtmsrd and blacklist that. Suggested-by: Michael Ellerman Reviewed-by: Nicholas Piggin Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index e5cf98d755db..ed45e86673c0 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -188,6 +188,18 @@ system_call: /* label this so stack traces look sane */ andi. r10,r8,MSR_RI beq- unrecov_restore #endif + +/* + * This is a few instructions into the actual syscall exit path (which actually + * starts at .Lsyscall_exit) to cater to kprobe blacklisting and to reduce the + * number of visible symbols for profiling purposes. + * + * We can probe from system_call until this point as MSR_RI is set. But once it + * is cleared below, we won't be able to take a trap. + * + * This is blacklisted from kprobes further below with _ASM_NOKPROBE_SYMBOL(). + */ +system_call_exit: /* * Disable interrupts so current_thread_info()->flags can't change, * and so that we don't get interrupted after loading SRR0/1. @@ -414,7 +426,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) b . 
/* prevent speculative execution */ #endif _ASM_NOKPROBE_SYMBOL(system_call_common); -_ASM_NOKPROBE_SYMBOL(system_call); +_ASM_NOKPROBE_SYMBOL(system_call_exit); /* Save non-volatile GPRs, if not already saved. */ _GLOBAL(save_nvgprs) -- cgit v1.2.3 From 15770a13bebc7fae2fbc06bc4cc708a3f724c586 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:19 +0530 Subject: powerpc/64s: Blacklist functions invoked on a trap Blacklist all functions involved while handling a trap. We: - convert some of the symbols into private symbols, and - blacklist most functions involved while handling a trap. Reviewed-by: Masami Hiramatsu Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 35 ++++++++++++++++++++++------------- arch/powerpc/kernel/exceptions-64s.S | 2 ++ arch/powerpc/kernel/traps.c | 3 +++ 3 files changed, 27 insertions(+), 13 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index ed45e86673c0..d5be463b2fa4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -186,7 +186,7 @@ system_call: /* label this so stack traces look sane */ #ifdef CONFIG_PPC_BOOK3S /* No MSR:RI on BookE */ andi. r10,r8,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore #endif /* @@ -437,6 +437,7 @@ _GLOBAL(save_nvgprs) clrrdi r0,r11,1 std r0,_TRAP(r1) blr +_ASM_NOKPROBE_SYMBOL(save_nvgprs); /* @@ -807,11 +808,11 @@ restore: ld r5,SOFTE(r1) lbz r6,PACASOFTIRQEN(r13) cmpwi cr0,r5,0 - beq restore_irq_off + beq .Lrestore_irq_off /* We are enabling, were we already enabled ? Yes, just return */ cmpwi cr0,r6,1 - beq cr0,do_restore + beq cr0,.Ldo_restore /* * We are about to soft-enable interrupts (we are hard disabled @@ -820,14 +821,14 @@ restore: */ lbz r0,PACAIRQHAPPENED(r13) cmpwi cr0,r0,0 - bne- restore_check_irq_replay + bne- .Lrestore_check_irq_replay /* * Get here when nothing happened while soft-disabled, just * soft-enable and move-on. We will hard-enable as a side * effect of rfi */ -restore_no_replay: +.Lrestore_no_replay: TRACE_ENABLE_INTS li r0,1 stb r0,PACASOFTIRQEN(r13); @@ -835,7 +836,7 @@ restore_no_replay: /* * Final return path. BookE is handled in a different file */ -do_restore: +.Ldo_restore: #ifdef CONFIG_PPC_BOOK3E b exception_return_book3e #else @@ -869,7 +870,7 @@ fast_exception_return: REST_8GPRS(5, r1) andi. r0,r3,MSR_RI - beq- unrecov_restore + beq- .Lunrecov_restore /* Load PPR from thread struct before we clear MSR:RI */ BEGIN_FTR_SECTION @@ -927,7 +928,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) * make sure that in this case, we also clear PACA_IRQ_HARD_DIS * or that bit can get out of sync and bad things will happen */ -restore_irq_off: +.Lrestore_irq_off: ld r3,_MSR(r1) lbz r7,PACAIRQHAPPENED(r13) andi. r0,r3,MSR_EE @@ -937,13 +938,13 @@ restore_irq_off: 1: li r0,0 stb r0,PACASOFTIRQEN(r13); TRACE_DISABLE_INTS - b do_restore + b .Ldo_restore /* * Something did happen, check if a re-emit is needed * (this also clears paca->irq_happened) */ -restore_check_irq_replay: +.Lrestore_check_irq_replay: /* XXX: We could implement a fast path here where we check * for irq_happened being just 0x01, in which case we can * clear it and return. That means that we would potentially @@ -953,7 +954,7 @@ restore_check_irq_replay: */ bl __check_irq_replay cmpwi cr0,r3,0 - beq restore_no_replay + beq .Lrestore_no_replay /* * We need to re-emit an interrupt. 
We do so by re-using our @@ -1002,10 +1003,18 @@ restore_check_irq_replay: #endif /* CONFIG_PPC_DOORBELL */ 1: b ret_from_except /* What else to do here ? */ -unrecov_restore: +.Lunrecov_restore: addi r3,r1,STACK_FRAME_OVERHEAD bl unrecoverable_exception - b unrecov_restore + b .Lunrecov_restore + +_ASM_NOKPROBE_SYMBOL(ret_from_except); +_ASM_NOKPROBE_SYMBOL(ret_from_except_lite); +_ASM_NOKPROBE_SYMBOL(resume_kernel); +_ASM_NOKPROBE_SYMBOL(fast_exc_return_irq); +_ASM_NOKPROBE_SYMBOL(restore); +_ASM_NOKPROBE_SYMBOL(fast_exception_return); + #ifdef CONFIG_PPC_RTAS /* diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 3a7a456ab95c..4c18a5fbb4bb 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1594,6 +1594,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) 1: addi r3,r1,STACK_FRAME_OVERHEAD bl kernel_bad_stack b 1b +_ASM_NOKPROBE_SYMBOL(bad_stack); /* * When doorbell is triggered from system reset wakeup, the message is @@ -1655,3 +1656,4 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE) 1: blr +_ASM_NOKPROBE_SYMBOL(__replay_interrupt) diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index d4e545d27ef9..bfcfd9ef09f2 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -237,6 +237,7 @@ void die(const char *str, struct pt_regs *regs, long err) err = 0; oops_end(flags, regs, err); } +NOKPROBE_SYMBOL(die); void user_single_step_siginfo(struct task_struct *tsk, struct pt_regs *regs, siginfo_t *info) @@ -1968,6 +1969,7 @@ void unrecoverable_exception(struct pt_regs *regs) regs->trap, regs->nip); die("Unrecoverable exception", regs, SIGABRT); } +NOKPROBE_SYMBOL(unrecoverable_exception); #if defined(CONFIG_BOOKE_WDT) || defined(CONFIG_40x) /* @@ -1998,6 +2000,7 @@ void kernel_bad_stack(struct pt_regs *regs) regs->gpr[1], regs->nip); die("Bad kernel stack pointer", regs, SIGABRT); } +NOKPROBE_SYMBOL(kernel_bad_stack); void __init trap_init(void) { -- cgit v1.2.3 From 90653a84052cfbbc1f46427f851dad14083e36df Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Thu, 29 Jun 2017 23:19:20 +0530 Subject: powerpc/64s: Blacklist rtas entry/exit from kprobes We can't take traps with relocation off, so blacklist enter_rtas() and rtas_return_loc(). However, instead of blacklisting all of enter_rtas(), introduce a new symbol __enter_rtas from where on we can't take a trap and blacklist that. Signed-off-by: Naveen N. Rao Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/entry_64.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index d5be463b2fa4..49d8422767b4 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -1091,6 +1091,8 @@ _GLOBAL(enter_rtas) rldicr r9,r9,MSR_SF_LG,(63-MSR_SF_LG) ori r9,r9,MSR_IR|MSR_DR|MSR_FE0|MSR_FE1|MSR_FP|MSR_RI|MSR_LE andc r6,r0,r9 + +__enter_rtas: sync /* disable interrupts so SRR0/1 */ mtmsrd r0 /* don't get trashed */ @@ -1127,6 +1129,8 @@ rtas_return_loc: mtspr SPRN_SRR1,r4 rfid b . 
/* prevent speculative execution */ +_ASM_NOKPROBE_SYMBOL(__enter_rtas) +_ASM_NOKPROBE_SYMBOL(rtas_return_loc) .align 3 1: .llong rtas_restore_regs -- cgit v1.2.3 From d07df82c43be82ab6972662180e89e6ba2a828ad Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 6 Jun 2017 14:29:38 +1000 Subject: powerpc/kprobes: Move kprobes over to patch_instruction() arch_arm/disarm_probe() use direct assignment for copying instructions, replace them with patch_instruction(). We don't need to call flush_icache_range() because patch_instruction() does it for us. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/kprobes.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index 01addfb0ed0a..45f1ff721c32 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -164,17 +164,13 @@ NOKPROBE_SYMBOL(arch_prepare_kprobe); void arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, BREAKPOINT_INSTRUCTION); } NOKPROBE_SYMBOL(arch_arm_kprobe); void arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + patch_instruction(p->addr, p->opcode); } NOKPROBE_SYMBOL(arch_disarm_kprobe); -- cgit v1.2.3 From f3eca956389316acd1a132fad1ad0b6f2ca78a61 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 6 Jun 2017 14:29:39 +1000 Subject: powerpc/kprobes/optprobes: Use patch_instruction() So that we can implement STRICT_RWX, use patch_instruction() in optprobes. 
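The conversion matters once kernel text is mapped read-only under STRICT_KERNEL_RWX; conceptually (a sketch, the actual aliasing strategy lives in the code-patching code, not this diff):

    *addr = instr;			/* direct store: faults on RO text */
    flush_icache_range(start, end);
    patch_instruction(addr, instr);	/* writes via a writable alias and
					 * does the icache flush itself */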
Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/optprobes.c | 53 +++++++++++++++++++++++++---------------- 1 file changed, 32 insertions(+), 21 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index ec60ed0d4aad..6f8273f5e988 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -158,12 +158,13 @@ void arch_remove_optimized_kprobe(struct optimized_kprobe *op) void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) { /* addis r4,0,(insn)@h */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(4) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(4) | + ((val >> 16) & 0xffff)); + addr++; /* ori r4,r4,(insn)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(4) | ___PPC_RS(4) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(4) | + ___PPC_RS(4) | (val & 0xffff)); } /* @@ -173,24 +174,28 @@ void patch_imm32_load_insns(unsigned int val, kprobe_opcode_t *addr) void patch_imm64_load_insns(unsigned long val, kprobe_opcode_t *addr) { /* lis r3,(op)@highest */ - *addr++ = PPC_INST_ADDIS | ___PPC_RT(3) | - ((val >> 48) & 0xffff); + patch_instruction(addr, PPC_INST_ADDIS | ___PPC_RT(3) | + ((val >> 48) & 0xffff)); + addr++; /* ori r3,r3,(op)@higher */ - *addr++ = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 32) & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 32) & 0xffff)); + addr++; /* rldicr r3,r3,32,31 */ - *addr++ = PPC_INST_RLDICR | ___PPC_RA(3) | ___PPC_RS(3) | - __PPC_SH64(32) | __PPC_ME64(31); + patch_instruction(addr, PPC_INST_RLDICR | ___PPC_RA(3) | + ___PPC_RS(3) | __PPC_SH64(32) | __PPC_ME64(31)); + addr++; /* oris r3,r3,(op)@h */ - *addr++ = PPC_INST_ORIS | ___PPC_RA(3) | ___PPC_RS(3) | - ((val >> 16) & 0xffff); + patch_instruction(addr, PPC_INST_ORIS | ___PPC_RA(3) | + ___PPC_RS(3) | ((val >> 16) & 0xffff)); + addr++; /* ori r3,r3,(op)@l */ - *addr = PPC_INST_ORI | ___PPC_RA(3) | ___PPC_RS(3) | - (val & 0xffff); + patch_instruction(addr, PPC_INST_ORI | ___PPC_RA(3) | + ___PPC_RS(3) | (val & 0xffff)); } int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) @@ -198,7 +203,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) kprobe_opcode_t *buff, branch_op_callback, branch_emulate_step; kprobe_opcode_t *op_callback_addr, *emulate_step_addr; long b_offset; - unsigned long nip; + unsigned long nip, size; + int rc, i; kprobe_ppc_optinsn_slots.insn_size = MAX_OPTINSN_SIZE; @@ -231,8 +237,14 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) goto error; /* Setup template */ - memcpy(buff, optprobe_template_entry, - TMPL_END_IDX * sizeof(kprobe_opcode_t)); + /* We can optimize this via patch_instruction_window later */ + size = (TMPL_END_IDX * sizeof(kprobe_opcode_t)) / sizeof(int); + pr_devel("Copying template to %p, size %lu\n", buff, size); + for (i = 0; i < size; i++) { + rc = patch_instruction(buff + i, *(optprobe_template_entry + i)); + if (rc < 0) + goto error; + } /* * Fixup the template with instructions to: @@ -261,8 +273,8 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) if (!branch_op_callback || !branch_emulate_step) goto error; - buff[TMPL_CALL_HDLR_IDX] = branch_op_callback; - buff[TMPL_EMULATE_IDX] = branch_emulate_step; + patch_instruction(buff + TMPL_CALL_HDLR_IDX, branch_op_callback); + patch_instruction(buff + 
TMPL_EMULATE_IDX, branch_emulate_step); /* * 3. load instruction to be emulated into relevant register, and @@ -272,8 +284,7 @@ int arch_prepare_optimized_kprobe(struct optimized_kprobe *op, struct kprobe *p) /* * 4. branch back from trampoline */ - buff[TMPL_RET_IDX] = create_branch((unsigned int *)buff + TMPL_RET_IDX, - (unsigned long)nip, 0); + patch_branch(buff + TMPL_RET_IDX, (unsigned long)nip, 0); flush_icache_range((unsigned long)buff, (unsigned long)(&buff[TMPL_END_IDX])); -- cgit v1.2.3 From d924cc3feda9c2bea8164930899f367ce249cbbf Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Thu, 29 Jun 2017 03:04:06 +1000 Subject: powerpc/vmlinux.lds: Align __init_begin to 16M For CONFIG_STRICT_KERNEL_RWX align __init_begin to 16M. We use 16M since its the larger of 2M on radix and 16M on hash for our linear mapping. The plan is to have .text, .rodata and everything upto __init_begin marked as RX. Note we still have executable read only data. We could further align rodata to another 16M boundary. I've used keeping text plus rodata as read-only-executable as a trade-off to doing read-only-executable for text and read-only for rodata. We don't use multi PT_LOAD in PHDRS because we are not sure if all bootloaders support them. This patch keeps PHDRS in vmlinux.lds.S as the same they are with just one PT_LOAD for all of the kernel marked as RWX (7). mpe: What this means is the added alignment bloats the resulting binary on disk, a powernv kernel goes from 17M to 22M. Signed-off-by: Balbir Singh Signed-off-by: Michael Ellerman --- arch/powerpc/kernel/vmlinux.lds.S | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/powerpc/kernel') diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index ace6b6579961..b1a250560198 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -8,6 +8,12 @@ #include #include +#ifdef CONFIG_STRICT_KERNEL_RWX +#define STRICT_ALIGN_SIZE (1 << 24) +#else +#define STRICT_ALIGN_SIZE PAGE_SIZE +#endif + ENTRY(_stext) PHDRS { @@ -123,7 +129,7 @@ SECTIONS PROVIDE32 (etext = .); /* Read-only data */ - RODATA + RO_DATA(PAGE_SIZE) EXCEPTION_TABLE(0) @@ -140,7 +146,7 @@ SECTIONS /* * Init sections discarded at runtime */ - . = ALIGN(PAGE_SIZE); + . = ALIGN(STRICT_ALIGN_SIZE); __init_begin = .; INIT_TEXT_SECTION(PAGE_SIZE) :kernel -- cgit v1.2.3
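For the 16M figure: the linker's ALIGN() rounds the location counter up to the next boundary, which is where the on-disk growth noted by mpe comes from. With made-up numbers:

    . = ALIGN(STRICT_ALIGN_SIZE);	/* . = (. + 0xffffff) & ~0xffffff */
    /* e.g. an __init_begin at 0xc000000001234000 rounds up to
     * 0xc000000002000000, i.e. up to 16M - 1 bytes of padding */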