From 35b69a420bfb56b7b74cb635ea903db05e357bec Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 4 Nov 2018 03:48:54 +0000 Subject: clockevents/drivers/i8253: Add support for PIT shutdown quirk Add support for platforms where pit_shutdown() doesn't work because of a quirk in the PIT emulation. On these platforms setting the counter register to zero causes the PIT to start running again, negating the shutdown. Provide a global variable that controls whether the counter register is zero'ed, which platform specific code can override. Signed-off-by: Michael Kelley Signed-off-by: Thomas Gleixner Cc: "gregkh@linuxfoundation.org" Cc: "devel@linuxdriverproject.org" Cc: "daniel.lezcano@linaro.org" Cc: "virtualization@lists.linux-foundation.org" Cc: "jgross@suse.com" Cc: "akataria@vmware.com" Cc: "olaf@aepfle.de" Cc: "apw@canonical.com" Cc: vkuznets Cc: "jasowang@redhat.com" Cc: "marcelo.cerri@canonical.com" Cc: KY Srinivasan Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1541303219-11142-2-git-send-email-mikelley@microsoft.com --- drivers/clocksource/i8253.c | 14 ++++++++++++-- include/linux/i8253.h | 1 + 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index 9c38895542f4..d4350bb10b83 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -20,6 +20,13 @@ DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); +/* + * Handle PIT quirk in pit_shutdown() where zeroing the counter register + * restarts the PIT, negating the shutdown. On platforms with the quirk, + * platform specific code can set this to false. + */ +bool i8253_clear_counter_on_shutdown __ro_after_init = true; + #ifdef CONFIG_CLKSRC_I8253 /* * Since the PIT overflows every tick, its not very useful @@ -109,8 +116,11 @@ static int pit_shutdown(struct clock_event_device *evt) raw_spin_lock(&i8253_lock); outb_p(0x30, PIT_MODE); - outb_p(0, PIT_CH0); - outb_p(0, PIT_CH0); + + if (i8253_clear_counter_on_shutdown) { + outb_p(0, PIT_CH0); + outb_p(0, PIT_CH0); + } raw_spin_unlock(&i8253_lock); return 0; diff --git a/include/linux/i8253.h b/include/linux/i8253.h index e6bb36a97519..8336b2f6f834 100644 --- a/include/linux/i8253.h +++ b/include/linux/i8253.h @@ -21,6 +21,7 @@ #define PIT_LATCH ((PIT_TICK_RATE + HZ/2) / HZ) extern raw_spinlock_t i8253_lock; +extern bool i8253_clear_counter_on_shutdown; extern struct clock_event_device i8253_clockevent; extern void clockevent_i8253_init(bool oneshot); -- cgit v1.2.3 From 1de72c706488b7be664a601cf3843bd01e327e58 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Sun, 4 Nov 2018 03:48:57 +0000 Subject: x86/hyper-v: Enable PIT shutdown quirk Hyper-V emulation of the PIT has a quirk such that the normal PIT shutdown path doesn't work, because clearing the counter register restarts the timer. Disable the counter clearing on PIT shutdown. Signed-off-by: Michael Kelley Signed-off-by: Thomas Gleixner Cc: "gregkh@linuxfoundation.org" Cc: "devel@linuxdriverproject.org" Cc: "daniel.lezcano@linaro.org" Cc: "virtualization@lists.linux-foundation.org" Cc: "jgross@suse.com" Cc: "akataria@vmware.com" Cc: "olaf@aepfle.de" Cc: "apw@canonical.com" Cc: vkuznets Cc: "jasowang@redhat.com" Cc: "marcelo.cerri@canonical.com" Cc: KY Srinivasan Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1541303219-11142-3-git-send-email-mikelley@microsoft.com --- arch/x86/kernel/cpu/mshyperv.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 1c72f3819eb1..e81a2db42df7 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -295,6 +296,16 @@ static void __init ms_hyperv_init_platform(void) if (efi_enabled(EFI_BOOT)) x86_platform.get_nmi_reason = hv_get_nmi_reason; + /* + * Hyper-V VMs have a PIT emulation quirk such that zeroing the + * counter register during PIT shutdown restarts the PIT. So it + * continues to interrupt @18.2 HZ. Setting i8253_clear_counter + * to false tells pit_shutdown() not to zero the counter so that + * the PIT really is shutdown. Generation 2 VMs don't have a PIT, + * and setting this value has no effect. + */ + i8253_clear_counter_on_shutdown = false; + #if IS_ENABLED(CONFIG_HYPERV) /* * Setup the hook to get control post apic initialization. -- cgit v1.2.3 From b068621a53f92f42400581d69ad0e84c56620b0a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 4 Nov 2018 14:03:56 -0800 Subject: Documentation/x86: Fix typo in zero-page.txt Signed-off-by: Randy Dunlap Cc: Jonathan Corbet Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-doc@vger.kernel.org Link: http://lkml.kernel.org/r/f259b21b-1f2b-f215-00d2-23388bed2530@infradead.org Signed-off-by: Ingo Molnar --- Documentation/x86/zero-page.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/x86/zero-page.txt b/Documentation/x86/zero-page.txt index 97b7adbceda4..68aed077f7b6 100644 --- a/Documentation/x86/zero-page.txt +++ b/Documentation/x86/zero-page.txt @@ -25,7 +25,7 @@ Offset Proto Name Meaning 0C8/004 ALL ext_cmd_line_ptr cmd_line_ptr high 32bits 140/080 ALL edid_info Video mode setup (struct edid_info) 1C0/020 ALL efi_info EFI 32 information (struct efi_info) -1E0/004 ALL alk_mem_k Alternative mem check, in KB +1E0/004 ALL alt_mem_k Alternative mem check, in KB 1E4/004 ALL scratch Scratch field for the kernel setup code 1E8/001 ALL e820_entries Number of entries in e820_table (below) 1E9/001 ALL eddbuf_entries Number of entries in eddbuf (below) -- cgit v1.2.3 From b42967dcac1d4f5b059ec25568136462bcb051fe Mon Sep 17 00:00:00 2001 From: Yi Wang Date: Mon, 29 Oct 2018 15:17:31 +0800 Subject: x86/hyper-v: Fix indentation in hv_do_fast_hypercall16() Remove the surplus TAB in hv_do_fast_hypercall16(). Signed-off-by: Yi Wang Signed-off-by: Thomas Gleixner Cc: kys@microsoft.com Cc: haiyangz@microsoft.com Cc: sthemmin@microsoft.com Cc: bp@alien8.de Cc: hpa@zytor.com Cc: devel@linuxdriverproject.org Cc: zhong.weidong@zte.com.cn Link: https://lkml.kernel.org/r/1540797451-2792-1-git-send-email-wang.yi59@zte.com.cn --- arch/x86/include/asm/mshyperv.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mshyperv.h b/arch/x86/include/asm/mshyperv.h index 0d6271cce198..1d0a7778e163 100644 --- a/arch/x86/include/asm/mshyperv.h +++ b/arch/x86/include/asm/mshyperv.h @@ -232,7 +232,7 @@ static inline u64 hv_do_fast_hypercall16(u16 code, u64 input1, u64 input2) : "cc"); } #endif - return hv_status; + return hv_status; } /* -- cgit v1.2.3 From 437e88ab8f9e2ad90576ab74c4cf8f527bbf51cd Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 23 Oct 2018 16:11:25 -0700 Subject: x86/build: Remove -pipe from KBUILD_CFLAGS Commit 77b0bf55bc67 ("kbuild/Makefile: Prepare for using macros in inline assembly code to work around asm() related GCC inlining bugs") added -Wa,- to KBUILD_CFLAGS, which breaks compiling with Clang (hangs indefinitely at compiling init/main.o). This happens because while Clang accepts -pipe (and has it documented in its list of supported flags), it silently ignores it after this 2010 commit (thanks to Nick Desaulniers for tracking this down), meaning that gas just infinitely waits for stdin and never receives it. https://github.com/llvm-mirror/clang/commit/c19a12dc3d441bec62eed55e312b76c12d6d9022 Initially, I had suggested just add -Wa,- to KBUILD_CFLAGS when GCC was being used but that was before realizing it is because Clang doesn't do anything with -pipe. H. Peter Anvin suggested checking to see if -pipe gives us any gains out of GCC. Turns out it might actually be hurting: With -pipe: real 3m40.813s real 3m44.449s real 3m39.648s Without -pipe: real 3m38.492s real 3m38.335s real 3m38.975s The issue of -Wa,- being passed along to gas without -pipe being supported should still probably be fixed on the LLVM side (open issue: https://bugs.llvm.org/show_bug.cgi?id=39410) but this is not as much of a workaround anymore since it helps both GCC and Clang. Suggested-by: H. Peter Anvin Signed-off-by: Nathan Chancellor Signed-off-by: Thomas Gleixner Tested-by: Nick Desaulniers Reviewed-by: Nadav Amit Reviewed-by: Nick Desaulniers Cc: Borislav Petkov Cc: Kees Cook Cc: Masahiro Yamada Link: https://github.com/ClangBuiltLinux/linux/issues/213 Link: https://lkml.kernel.org/r/20181023231125.27976-1-natechancellor@gmail.com --- arch/x86/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 5b562e464009..88398fdf8129 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -213,8 +213,6 @@ ifdef CONFIG_X86_64 KBUILD_LDFLAGS += $(call ld-option, -z max-page-size=0x200000) endif -# Speed up the build -KBUILD_CFLAGS += -pipe # Workaround for a gcc prelease that unfortunately was shipped in a suse release KBUILD_CFLAGS += -Wno-sign-compare # @@ -239,7 +237,7 @@ archheaders: archmacros: $(Q)$(MAKE) $(build)=arch/x86/kernel arch/x86/kernel/macros.s -ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s -Wa,- +ASM_MACRO_FLAGS = -Wa,arch/x86/kernel/macros.s export ASM_MACRO_FLAGS KBUILD_CFLAGS += $(ASM_MACRO_FLAGS) -- cgit v1.2.3 From 5d96c9342c23ee1d084802dcf064caa67ecaa45b Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Oct 2018 18:37:28 -0600 Subject: acpi/nfit, x86/mce: Handle only uncorrectable machine checks The MCE handler for nfit devices is called for memory errors on a Non-Volatile DIMM and adds the error location to a 'badblocks' list. This list is used by the various NVDIMM drivers to avoid consuming known poison locations during IO. The MCE handler gets called for both corrected and uncorrectable errors. Until now, both kinds of errors have been added to the badblocks list. However, corrected memory errors indicate that the problem has already been fixed by hardware, and the resulting interrupt is merely a notification to Linux. As far as future accesses to that location are concerned, it is perfectly fine to use, and thus doesn't need to be included in the above badblocks list. Add a check in the nfit MCE handler to filter out corrected mce events, and only process uncorrectable errors. Fixes: 6839a6d96f4e ("nfit: do an ARS scrub on hitting a latent media error") Reported-by: Omar Avelar Signed-off-by: Vishal Verma Signed-off-by: Borislav Petkov CC: Arnd Bergmann CC: Dan Williams CC: Dave Jiang CC: elliott@hpe.com CC: "H. Peter Anvin" CC: Ingo Molnar CC: Len Brown CC: linux-acpi@vger.kernel.org CC: linux-edac CC: linux-nvdimm@lists.01.org CC: Qiuxu Zhuo CC: "Rafael J. Wysocki" CC: Ross Zwisler CC: stable CC: Thomas Gleixner CC: Tony Luck CC: x86-ml CC: Yazen Ghannam Link: http://lkml.kernel.org/r/20181026003729.8420-1-vishal.l.verma@intel.com --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- drivers/acpi/nfit/mce.c | 4 ++-- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 4da9b1c58d28..dbd9fe2f6163 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -221,6 +221,7 @@ static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_am int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); +bool mce_is_correctable(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8c66d2fc8f81..77527b8ea982 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -534,7 +534,7 @@ bool mce_is_memory_error(struct mce *m) } EXPORT_SYMBOL_GPL(mce_is_memory_error); -static bool mce_is_correctable(struct mce *m) +bool mce_is_correctable(struct mce *m) { if (m->cpuvendor == X86_VENDOR_AMD && m->status & MCI_STATUS_DEFERRED) return false; @@ -547,6 +547,7 @@ static bool mce_is_correctable(struct mce *m) return true; } +EXPORT_SYMBOL_GPL(mce_is_correctable); static bool cec_add_mce(struct mce *m) { diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index e9626bf6ca29..7a51707f87e9 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -25,8 +25,8 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, struct acpi_nfit_desc *acpi_desc; struct nfit_spa *nfit_spa; - /* We only care about memory errors */ - if (!mce_is_memory_error(mce)) + /* We only care about uncorrectable memory errors */ + if (!mce_is_memory_error(mce) || mce_is_correctable(mce)) return NOTIFY_DONE; /* -- cgit v1.2.3 From e8a308e5f47e545e0d41d0686c00f5f5217c5f61 Mon Sep 17 00:00:00 2001 From: Vishal Verma Date: Thu, 25 Oct 2018 18:37:29 -0600 Subject: acpi/nfit, x86/mce: Validate a MCE's address before using it The NFIT machine check handler uses the physical address from the mce structure, and compares it against information in the ACPI NFIT table to determine whether that location lies on an NVDIMM. The mce->addr field however may not always be valid, and this is indicated by the MCI_STATUS_ADDRV bit in the status field. Export mce_usable_address() which already performs validation for the address, and use it in the NFIT handler. Fixes: 6839a6d96f4e ("nfit: do an ARS scrub on hitting a latent media error") Reported-by: Robert Elliott Signed-off-by: Vishal Verma Signed-off-by: Borislav Petkov CC: Arnd Bergmann Cc: Dan Williams CC: Dave Jiang CC: elliott@hpe.com CC: "H. Peter Anvin" CC: Ingo Molnar CC: Len Brown CC: linux-acpi@vger.kernel.org CC: linux-edac CC: linux-nvdimm@lists.01.org CC: Qiuxu Zhuo CC: "Rafael J. Wysocki" CC: Ross Zwisler CC: stable CC: Thomas Gleixner CC: Tony Luck CC: x86-ml CC: Yazen Ghannam Link: http://lkml.kernel.org/r/20181026003729.8420-2-vishal.l.verma@intel.com --- arch/x86/include/asm/mce.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 3 ++- drivers/acpi/nfit/mce.c | 4 ++++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index dbd9fe2f6163..c1a812bd5a27 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -222,6 +222,7 @@ static inline void mce_hygon_feature_init(struct cpuinfo_x86 *c) { return mce_am int mce_available(struct cpuinfo_x86 *c); bool mce_is_memory_error(struct mce *m); bool mce_is_correctable(struct mce *m); +int mce_usable_address(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 77527b8ea982..36d2696c9563 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -485,7 +485,7 @@ static void mce_report_event(struct pt_regs *regs) * be somewhat complicated (e.g. segment offset would require an instruction * parser). So only support physical addresses up to page granuality for now. */ -static int mce_usable_address(struct mce *m) +int mce_usable_address(struct mce *m) { if (!(m->status & MCI_STATUS_ADDRV)) return 0; @@ -505,6 +505,7 @@ static int mce_usable_address(struct mce *m) return 1; } +EXPORT_SYMBOL_GPL(mce_usable_address); bool mce_is_memory_error(struct mce *m) { diff --git a/drivers/acpi/nfit/mce.c b/drivers/acpi/nfit/mce.c index 7a51707f87e9..d6c1b10f6c25 100644 --- a/drivers/acpi/nfit/mce.c +++ b/drivers/acpi/nfit/mce.c @@ -29,6 +29,10 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val, if (!mce_is_memory_error(mce) || mce_is_correctable(mce)) return NOTIFY_DONE; + /* Verify the address reported in the MCE is valid. */ + if (!mce_usable_address(mce)) + return NOTIFY_DONE; + /* * mce->addr contains the physical addr accessed that caused the * machine check. We need to walk through the list of NFITs, and see -- cgit v1.2.3 From d52888aa2753e3063a9d3a0c9f72f94aa9809c15 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Oct 2018 15:28:54 +0300 Subject: x86/mm: Move LDT remap out of KASLR region on 5-level paging On 5-level paging the LDT remap area is placed in the middle of the KASLR randomization region and it can overlap with the direct mapping, the vmalloc or the vmap area. The LDT mapping is per mm, so it cannot be moved into the P4D page table next to the CPU_ENTRY_AREA without complicating PGD table allocation for 5-level paging. The 4 PGD slot gap just before the direct mapping is reserved for hypervisors, so it cannot be used. Move the direct mapping one slot deeper and use the resulting gap for the LDT remap area. The resulting layout is the same for 4 and 5 level paging. [ tglx: Massaged changelog ] Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: bp@alien8.de Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: peterz@infradead.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: bhe@redhat.com Cc: willy@infradead.org Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181026122856.66224-2-kirill.shutemov@linux.intel.com --- Documentation/x86/x86_64/mm.txt | 34 +++++++++++++++++---------------- arch/x86/include/asm/page_64_types.h | 12 +++++++----- arch/x86/include/asm/pgtable_64_types.h | 4 +--- arch/x86/xen/mmu_pv.c | 6 +++--- 4 files changed, 29 insertions(+), 27 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 73aaaa3da436..804f9426ed17 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -34,23 +34,24 @@ __________________|____________|__________________|_________|___________________ ____________________________________________________________|___________________________________________________________ | | | | ffff800000000000 | -128 TB | ffff87ffffffffff | 8 TB | ... guard hole, also reserved for hypervisor - ffff880000000000 | -120 TB | ffffc7ffffffffff | 64 TB | direct mapping of all physical memory (page_offset_base) - ffffc80000000000 | -56 TB | ffffc8ffffffffff | 1 TB | ... unused hole + ffff880000000000 | -120 TB | ffff887fffffffff | 0.5 TB | LDT remap for PTI + ffff888000000000 | -119.5 TB | ffffc87fffffffff | 64 TB | direct mapping of all physical memory (page_offset_base) + ffffc88000000000 | -55.5 TB | ffffc8ffffffffff | 0.5 TB | ... unused hole ffffc90000000000 | -55 TB | ffffe8ffffffffff | 32 TB | vmalloc/ioremap space (vmalloc_base) ffffe90000000000 | -23 TB | ffffe9ffffffffff | 1 TB | ... unused hole ffffea0000000000 | -22 TB | ffffeaffffffffff | 1 TB | virtual memory map (vmemmap_base) ffffeb0000000000 | -21 TB | ffffebffffffffff | 1 TB | ... unused hole ffffec0000000000 | -20 TB | fffffbffffffffff | 16 TB | KASAN shadow memory - fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole - | | | | vaddr_end for KASLR - fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping - fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | LDT remap for PTI - ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks __________________|____________|__________________|_________|____________________________________________________________ | - | Identical layout to the 47-bit one from here on: + | Identical layout to the 56-bit one from here on: ____________________________________________________________|____________________________________________________________ | | | | + fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole + | | | | vaddr_end for KASLR + fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping + fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole + ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole @@ -83,7 +84,7 @@ Notes: __________________|____________|__________________|_________|___________________________________________________________ | | | | 0000800000000000 | +64 PB | ffff7fffffffffff | ~16K PB | ... huge, still almost 64 bits wide hole of non-canonical - | | | | virtual memory addresses up to the -128 TB + | | | | virtual memory addresses up to the -64 PB | | | | starting offset of kernel mappings. __________________|____________|__________________|_________|___________________________________________________________ | @@ -91,23 +92,24 @@ __________________|____________|__________________|_________|___________________ ____________________________________________________________|___________________________________________________________ | | | | ff00000000000000 | -64 PB | ff0fffffffffffff | 4 PB | ... guard hole, also reserved for hypervisor - ff10000000000000 | -60 PB | ff8fffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base) - ff90000000000000 | -28 PB | ff9fffffffffffff | 4 PB | LDT remap for PTI + ff10000000000000 | -60 PB | ff10ffffffffffff | 0.25 PB | LDT remap for PTI + ff11000000000000 | -59.75 PB | ff90ffffffffffff | 32 PB | direct mapping of all physical memory (page_offset_base) + ff91000000000000 | -27.75 PB | ff9fffffffffffff | 3.75 PB | ... unused hole ffa0000000000000 | -24 PB | ffd1ffffffffffff | 12.5 PB | vmalloc/ioremap space (vmalloc_base) ffd2000000000000 | -11.5 PB | ffd3ffffffffffff | 0.5 PB | ... unused hole ffd4000000000000 | -11 PB | ffd5ffffffffffff | 0.5 PB | virtual memory map (vmemmap_base) ffd6000000000000 | -10.5 PB | ffdeffffffffffff | 2.25 PB | ... unused hole ffdf000000000000 | -8.25 PB | fffffdffffffffff | ~8 PB | KASAN shadow memory - fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole - | | | | vaddr_end for KASLR - fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping - fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole - ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks __________________|____________|__________________|_________|____________________________________________________________ | | Identical layout to the 47-bit one from here on: ____________________________________________________________|____________________________________________________________ | | | | + fffffc0000000000 | -4 TB | fffffdffffffffff | 2 TB | ... unused hole + | | | | vaddr_end for KASLR + fffffe0000000000 | -2 TB | fffffe7fffffffff | 0.5 TB | cpu_entry_area mapping + fffffe8000000000 | -1.5 TB | fffffeffffffffff | 0.5 TB | ... unused hole + ffffff0000000000 | -1 TB | ffffff7fffffffff | 0.5 TB | %esp fixup stacks ffffff8000000000 | -512 GB | ffffffeeffffffff | 444 GB | ... unused hole ffffffef00000000 | -68 GB | fffffffeffffffff | 64 GB | EFI region mapping space ffffffff00000000 | -4 GB | ffffffff7fffffff | 2 GB | ... unused hole diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index cd0cf1c568b4..8f657286d599 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -33,12 +33,14 @@ /* * Set __PAGE_OFFSET to the most negative possible address + - * PGDIR_SIZE*16 (pgd slot 272). The gap is to allow a space for a - * hypervisor to fit. Choosing 16 slots here is arbitrary, but it's - * what Xen requires. + * PGDIR_SIZE*17 (pgd slot 273). + * + * The gap is to allow a space for LDT remap for PTI (1 pgd slot) and space for + * a hypervisor (16 slots). Choosing 16 slots for a hypervisor is arbitrary, + * but it's what Xen requires. */ -#define __PAGE_OFFSET_BASE_L5 _AC(0xff10000000000000, UL) -#define __PAGE_OFFSET_BASE_L4 _AC(0xffff880000000000, UL) +#define __PAGE_OFFSET_BASE_L5 _AC(0xff11000000000000, UL) +#define __PAGE_OFFSET_BASE_L4 _AC(0xffff888000000000, UL) #ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT #define __PAGE_OFFSET page_offset_base diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 04edd2d58211..84bd9bdc1987 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -111,9 +111,7 @@ extern unsigned int ptrs_per_p4d; */ #define MAXMEM (1UL << MAX_PHYSMEM_BITS) -#define LDT_PGD_ENTRY_L4 -3UL -#define LDT_PGD_ENTRY_L5 -112UL -#define LDT_PGD_ENTRY (pgtable_l5_enabled() ? LDT_PGD_ENTRY_L5 : LDT_PGD_ENTRY_L4) +#define LDT_PGD_ENTRY -240UL #define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #define LDT_END_ADDR (LDT_BASE_ADDR + PGDIR_SIZE) diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index 0d7b3ae4960b..a5d7ed125337 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1905,7 +1905,7 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) init_top_pgt[0] = __pgd(0); /* Pre-constructed entries are in pfn, so convert to mfn */ - /* L4[272] -> level3_ident_pgt */ + /* L4[273] -> level3_ident_pgt */ /* L4[511] -> level3_kernel_pgt */ convert_pfn_mfn(init_top_pgt); @@ -1925,8 +1925,8 @@ void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn) addr[0] = (unsigned long)pgd; addr[1] = (unsigned long)l3; addr[2] = (unsigned long)l2; - /* Graft it onto L4[272][0]. Note that we creating an aliasing problem: - * Both L4[272][0] and L4[511][510] have entries that point to the same + /* Graft it onto L4[273][0]. Note that we creating an aliasing problem: + * Both L4[273][0] and L4[511][510] have entries that point to the same * L2 (PMD) tables. Meaning that if you modify it in __va space * it will be also modified in the __ka space! (But if you just * modify the PMD table to point to other PTE's or none, then you -- cgit v1.2.3 From a0e6e0831c516860fc7f9be1db6c081fe902ebcf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Oct 2018 15:28:55 +0300 Subject: x86/ldt: Unmap PTEs for the slot before freeing LDT pages modify_ldt(2) leaves the old LDT mapped after switching over to the new one. The old LDT gets freed and the pages can be re-used. Leaving the mapping in place can have security implications. The mapping is present in the userspace page tables and Meltdown-like attacks can read these freed and possibly reused pages. It's relatively simple to fix: unmap the old LDT and flush TLB before freeing the old LDT memory. This further allows to avoid flushing the TLB in map_ldt_struct() as the slot is unmapped and flushed by unmap_ldt_struct() or has never been mapped at all. [ tglx: Massaged changelog and removed the needless line breaks ] Fixes: f55f0501cbf6 ("x86/pti: Put the LDT in its own PGD if PTI is on") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Cc: bp@alien8.de Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: luto@kernel.org Cc: peterz@infradead.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: bhe@redhat.com Cc: willy@infradead.org Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181026122856.66224-3-kirill.shutemov@linux.intel.com --- arch/x86/kernel/ldt.c | 51 ++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 38 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ab18e0884dc6..18e4525c5933 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -199,14 +199,6 @@ static void sanity_check_ldt_mapping(struct mm_struct *mm) /* * If PTI is enabled, this maps the LDT into the kernelmode and * usermode tables for the given mm. - * - * There is no corresponding unmap function. Even if the LDT is freed, we - * leave the PTEs around until the slot is reused or the mm is destroyed. - * This is harmless: the LDT is always in ordinary memory, and no one will - * access the freed slot. - * - * If we wanted to unmap freed LDTs, we'd also need to do a flush to make - * it useful, and the flush would slow down modify_ldt(). */ static int map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) @@ -214,8 +206,8 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) unsigned long va; bool is_vmalloc; spinlock_t *ptl; + int i, nr_pages; pgd_t *pgd; - int i; if (!static_cpu_has(X86_FEATURE_PTI)) return 0; @@ -238,7 +230,9 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) is_vmalloc = is_vmalloc_addr(ldt->entries); - for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { unsigned long offset = i << PAGE_SHIFT; const void *src = (char *)ldt->entries + offset; unsigned long pfn; @@ -272,13 +266,39 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) /* Propagate LDT mapping to the user page-table */ map_ldt_struct_to_user(mm); - va = (unsigned long)ldt_slot_va(slot); - flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, PAGE_SHIFT, false); - ldt->slot = slot; return 0; } +static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) +{ + unsigned long va; + int i, nr_pages; + + if (!ldt) + return; + + /* LDT map/unmap is only required for PTI */ + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); + + for (i = 0; i < nr_pages; i++) { + unsigned long offset = i << PAGE_SHIFT; + spinlock_t *ptl; + pte_t *ptep; + + va = (unsigned long)ldt_slot_va(ldt->slot) + offset; + ptep = get_locked_pte(mm, va, &ptl); + pte_clear(mm, va, ptep); + pte_unmap_unlock(ptep, ptl); + } + + va = (unsigned long)ldt_slot_va(ldt->slot); + flush_tlb_mm_range(mm, va, va + nr_pages * PAGE_SIZE, PAGE_SHIFT, false); +} + #else /* !CONFIG_PAGE_TABLE_ISOLATION */ static int @@ -286,6 +306,10 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) { return 0; } + +static void unmap_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt) +{ +} #endif /* CONFIG_PAGE_TABLE_ISOLATION */ static void free_ldt_pgtables(struct mm_struct *mm) @@ -524,6 +548,7 @@ static int write_ldt(void __user *ptr, unsigned long bytecount, int oldmode) } install_ldt(mm, new_ldt); + unmap_ldt_struct(mm, old_ldt); free_ldt_struct(old_ldt); error = 0; -- cgit v1.2.3 From b082f2dd80612015cd6d9d84e52099734ec9a0e1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Fri, 26 Oct 2018 15:28:56 +0300 Subject: x86/ldt: Remove unused variable in map_ldt_struct() Splitting out the sanity check in map_ldt_struct() moved page table syncing into a separate function, which made the pgd variable unused. Remove it. [ tglx: Massaged changelog ] Fixes: 9bae3197e15d ("x86/ldt: Split out sanity check in map_ldt_struct()") Signed-off-by: Kirill A. Shutemov Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: bp@alien8.de Cc: hpa@zytor.com Cc: dave.hansen@linux.intel.com Cc: peterz@infradead.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: bhe@redhat.com Cc: willy@infradead.org Cc: linux-mm@kvack.org Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20181026122856.66224-4-kirill.shutemov@linux.intel.com --- arch/x86/kernel/ldt.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 18e4525c5933..6135ae8ce036 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -207,7 +207,6 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) bool is_vmalloc; spinlock_t *ptl; int i, nr_pages; - pgd_t *pgd; if (!static_cpu_has(X86_FEATURE_PTI)) return 0; @@ -221,13 +220,6 @@ map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) /* Check if the current mappings are sane */ sanity_check_ldt_mapping(mm); - /* - * Did we already have the top level entry allocated? We can't - * use pgd_none() for this because it doens't do anything on - * 4-level page table kernels. - */ - pgd = pgd_offset(mm, LDT_BASE_ADDR); - is_vmalloc = is_vmalloc_addr(ldt->entries); nr_pages = DIV_ROUND_UP(ldt->nr_entries * LDT_ENTRY_SIZE, PAGE_SIZE); -- cgit v1.2.3 From a48777fdda7d13179979a889e1fb87655a783cc0 Mon Sep 17 00:00:00 2001 From: Eial Czerwacki Date: Mon, 5 Nov 2018 19:31:54 +0200 Subject: x86/vsmp: Remove dependency on pv_irq_ops vSMP dependency on pv_irq_ops has been removed some years ago, but the code still deals with pv_irq_ops. In short, "cap & ctl & (1 << 4)" is always returning 0, so all PARAVIRT/PARAVIRT_XXL code related to that can be removed. However, the rest of the code depends on CONFIG_PCI, so fix it accordingly. Rename set_vsmp_pv_ops to set_vsmp_ctl as the original name does not make sense anymore. Signed-off-by: Eial Czerwacki Signed-off-by: Thomas Gleixner Acked-by: Shai Fultheim Cc: Juergen Gross Link: https://lkml.kernel.org/r/1541439114-28297-1-git-send-email-eial@scalemp.com --- arch/x86/Kconfig | 1 - arch/x86/kernel/vsmp_64.c | 84 ++++------------------------------------------- 2 files changed, 7 insertions(+), 78 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index ba7e3464ee92..9d734f3c8234 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -525,7 +525,6 @@ config X86_VSMP bool "ScaleMP vSMP" select HYPERVISOR_GUEST select PARAVIRT - select PARAVIRT_XXL depends on X86_64 && PCI depends on X86_EXTENDED_PLATFORM depends on SMP diff --git a/arch/x86/kernel/vsmp_64.c b/arch/x86/kernel/vsmp_64.c index 1eae5af491c2..891a75dbc131 100644 --- a/arch/x86/kernel/vsmp_64.c +++ b/arch/x86/kernel/vsmp_64.c @@ -26,65 +26,8 @@ #define TOPOLOGY_REGISTER_OFFSET 0x10 -#if defined CONFIG_PCI && defined CONFIG_PARAVIRT_XXL -/* - * Interrupt control on vSMPowered systems: - * ~AC is a shadow of IF. If IF is 'on' AC should be 'off' - * and vice versa. - */ - -asmlinkage __visible unsigned long vsmp_save_fl(void) -{ - unsigned long flags = native_save_fl(); - - if (!(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC)) - flags &= ~X86_EFLAGS_IF; - return flags; -} -PV_CALLEE_SAVE_REGS_THUNK(vsmp_save_fl); - -__visible void vsmp_restore_fl(unsigned long flags) -{ - if (flags & X86_EFLAGS_IF) - flags &= ~X86_EFLAGS_AC; - else - flags |= X86_EFLAGS_AC; - native_restore_fl(flags); -} -PV_CALLEE_SAVE_REGS_THUNK(vsmp_restore_fl); - -asmlinkage __visible void vsmp_irq_disable(void) -{ - unsigned long flags = native_save_fl(); - - native_restore_fl((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); -} -PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_disable); - -asmlinkage __visible void vsmp_irq_enable(void) -{ - unsigned long flags = native_save_fl(); - - native_restore_fl((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); -} -PV_CALLEE_SAVE_REGS_THUNK(vsmp_irq_enable); - -static unsigned __init vsmp_patch(u8 type, void *ibuf, - unsigned long addr, unsigned len) -{ - switch (type) { - case PARAVIRT_PATCH(irq.irq_enable): - case PARAVIRT_PATCH(irq.irq_disable): - case PARAVIRT_PATCH(irq.save_fl): - case PARAVIRT_PATCH(irq.restore_fl): - return paravirt_patch_default(type, ibuf, addr, len); - default: - return native_patch(type, ibuf, addr, len); - } - -} - -static void __init set_vsmp_pv_ops(void) +#ifdef CONFIG_PCI +static void __init set_vsmp_ctl(void) { void __iomem *address; unsigned int cap, ctl, cfg; @@ -109,28 +52,12 @@ static void __init set_vsmp_pv_ops(void) } #endif - if (cap & ctl & (1 << 4)) { - /* Setup irq ops and turn on vSMP IRQ fastpath handling */ - pv_ops.irq.irq_disable = PV_CALLEE_SAVE(vsmp_irq_disable); - pv_ops.irq.irq_enable = PV_CALLEE_SAVE(vsmp_irq_enable); - pv_ops.irq.save_fl = PV_CALLEE_SAVE(vsmp_save_fl); - pv_ops.irq.restore_fl = PV_CALLEE_SAVE(vsmp_restore_fl); - pv_ops.init.patch = vsmp_patch; - ctl &= ~(1 << 4); - } writel(ctl, address + 4); ctl = readl(address + 4); pr_info("vSMP CTL: control set to:0x%08x\n", ctl); early_iounmap(address, 8); } -#else -static void __init set_vsmp_pv_ops(void) -{ -} -#endif - -#ifdef CONFIG_PCI static int is_vsmp = -1; static void __init detect_vsmp_box(void) @@ -164,11 +91,14 @@ static int is_vsmp_box(void) { return 0; } +static void __init set_vsmp_ctl(void) +{ +} #endif static void __init vsmp_cap_cpus(void) { -#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) +#if !defined(CONFIG_X86_VSMP) && defined(CONFIG_SMP) && defined(CONFIG_PCI) void __iomem *address; unsigned int cfg, topology, node_shift, maxcpus; @@ -221,6 +151,6 @@ void __init vsmp_init(void) vsmp_cap_cpus(); - set_vsmp_pv_ops(); + set_vsmp_ctl(); return; } -- cgit v1.2.3 From 15035388439f892017d38b05214d3cda6578af64 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 9 Nov 2018 15:22:07 -0500 Subject: x86/cpu/vmware: Do not trace vmware_sched_clock() When running function tracing on a Linux guest running on VMware Workstation, the guest would crash. This is due to tracing of the sched_clock internal call of the VMware vmware_sched_clock(), which causes an infinite recursion within the tracing code (clock calls must not be traced). Make vmware_sched_clock() not traced by ftrace. Fixes: 80e9a4f21fd7c ("x86/vmware: Add paravirt sched clock") Reported-by: GwanYeong Kim Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Borislav Petkov CC: Alok Kataria CC: GwanYeong Kim CC: "H. Peter Anvin" CC: Ingo Molnar Cc: stable@vger.kernel.org CC: Thomas Gleixner CC: virtualization@lists.linux-foundation.org CC: x86-ml Link: http://lkml.kernel.org/r/20181109152207.4d3e7d70@gandalf.local.home --- arch/x86/kernel/cpu/vmware.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index d9ab49bed8af..0eda91f8eeac 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -77,7 +77,7 @@ static __init int setup_vmw_sched_clock(char *s) } early_param("no-vmw-sched-clock", setup_vmw_sched_clock); -static unsigned long long vmware_sched_clock(void) +static unsigned long long notrace vmware_sched_clock(void) { unsigned long long ns; -- cgit v1.2.3