From 81cf09edc793688cbf53c3082802571e2018f3ac Mon Sep 17 00:00:00 2001
From: Alexander Kuleshov
Date: Fri, 4 Sep 2015 15:43:35 -0700
Subject: sh: use PFN_DOWN macro

Replace ((x) >> PAGE_SHIFT) with the predefined PFN_DOWN macro.

Signed-off-by: Alexander Kuleshov
Acked-by: Geert Uytterhoeven
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 arch/sh/mm/init.c | 4 ++--
 arch/sh/mm/numa.c | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)
(limited to 'arch')

diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c
index 2790b6a64157..17f486233db0 100644
--- a/arch/sh/mm/init.c
+++ b/arch/sh/mm/init.c
@@ -488,7 +488,7 @@ void free_initrd_mem(unsigned long start, unsigned long end)
 int arch_add_memory(int nid, u64 start, u64 size)
 {
 	pg_data_t *pgdat;
-	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	int ret;
@@ -517,7 +517,7 @@ EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
 #ifdef CONFIG_MEMORY_HOTREMOVE
 int arch_remove_memory(u64 start, u64 size)
 {
-	unsigned long start_pfn = start >> PAGE_SHIFT;
+	unsigned long start_pfn = PFN_DOWN(start);
 	unsigned long nr_pages = size >> PAGE_SHIFT;
 	struct zone *zone;
 	int ret;

diff --git a/arch/sh/mm/numa.c b/arch/sh/mm/numa.c
index bce52ba66206..05713d190247 100644
--- a/arch/sh/mm/numa.c
+++ b/arch/sh/mm/numa.c
@@ -33,8 +33,8 @@ void __init setup_bootmem_node(int nid, unsigned long start, unsigned long end)
 	/* Don't allow bogus node assignment */
 	BUG_ON(nid >= MAX_NUMNODES || nid <= 0);

-	start_pfn = start >> PAGE_SHIFT;
-	end_pfn = end >> PAGE_SHIFT;
+	start_pfn = PFN_DOWN(start);
+	end_pfn = PFN_DOWN(end);

 	pmb_bolt_mapping((unsigned long)__va(start), start, end - start,
 			 PAGE_KERNEL);
-- cgit v1.2.3

From aacfbe6a9724bb6d66a656a5abcc681d5649ed92 Mon Sep 17 00:00:00 2001
From: Guenter Roeck
Date: Fri, 4 Sep 2015 15:45:12 -0700
Subject: kernel/watchdog: move NMI function header declarations from watchdog.h to nmi.h

The kernel's NMI watchdog has nothing to do with the watchdog subsystem.
Its header declarations should be in linux/nmi.h, not linux/watchdog.h.

The code provided two sets of dummy functions if HARDLOCKUP_DETECTOR is
not configured, one in the include file and one in kernel/watchdog.c.
Remove the dummy functions from kernel/watchdog.c and use those from the
include file.
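
The convention being consolidated here is the usual config-gated stub: the header declares the real functions when the option is enabled and supplies empty static inlines otherwise, so callers never need their own #ifdefs. A minimal sketch of the pattern as the patch below leaves it in linux/nmi.h (the caller function is hypothetical, added only for illustration):

	#if defined(CONFIG_HARDLOCKUP_DETECTOR)
	extern void hardlockup_detector_disable(void);
	void watchdog_nmi_disable_all(void);
	void watchdog_nmi_enable_all(void);
	#else
	static inline void hardlockup_detector_disable(void) {}
	static inline void watchdog_nmi_disable_all(void) {}
	static inline void watchdog_nmi_enable_all(void) {}
	#endif

	/* A caller can then use the interface unconditionally; with the
	 * hardlockup detector configured out, these calls compile to nothing. */
	static void quiesce_nmi_watchdog_example(void)
	{
		watchdog_nmi_disable_all();
		/* ... reprogram the hardware the NMI watchdog was using ... */
		watchdog_nmi_enable_all();
	}

Keeping a single set of stubs in the header removes the risk of the two copies drifting apart, which is the point of the cleanup.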
Signed-off-by: Guenter Roeck Cc: Stephane Eranian Cc: Peter Zijlstra (Intel) Cc: Ingo Molnar Cc: Don Zickus Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- include/linux/nmi.h | 8 +++++--- include/linux/watchdog.h | 8 -------- kernel/watchdog.c | 2 -- 4 files changed, 6 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 3f124d553c5a..36bd8250934b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/nmi.h b/include/linux/nmi.h index f94da0e65dea..088714537d10 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -26,10 +26,12 @@ static inline void touch_nmi_watchdog(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); +void watchdog_nmi_disable_all(void); +void watchdog_nmi_enable_all(void); #else -static inline void hardlockup_detector_disable(void) -{ -} +static inline void hardlockup_detector_disable(void) {} +static inline void watchdog_nmi_disable_all(void) {} +static inline void watchdog_nmi_enable_all(void) {} #endif /* diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index f47feada5b42..d74a0e907b9e 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -140,12 +140,4 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -void watchdog_nmi_disable_all(void); -void watchdog_nmi_enable_all(void); -#else -static inline void watchdog_nmi_disable_all(void) {} -static inline void watchdog_nmi_enable_all(void) {} -#endif - #endif /* ifndef _LINUX_WATCHDOG_H */ diff --git a/kernel/watchdog.c b/kernel/watchdog.c index d18330fa4776..e74d48bc3e61 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -651,8 +651,6 @@ unlock: #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -void watchdog_nmi_enable_all(void) {} -void watchdog_nmi_disable_all(void) {} #endif /* CONFIG_HARDLOCKUP_DETECTOR */ static struct smp_hotplug_thread watchdog_threads = { -- cgit v1.2.3 From 999bbe49ea0118b70ddf3f5d679f51dc7a97ae55 Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:25 -0700 Subject: watchdog: use suspend/resume interface in fixup_ht_bug() Remove watchdog_nmi_disable_all() and watchdog_nmi_enable_all() since these functions are no longer needed. If a subsystem has a need to deactivate the watchdog temporarily, it should utilize the watchdog_suspend() and watchdog_resume() functions. 
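
The intended usage is a straightforward bracket around the critical section: suspend the detector, do the work that would otherwise disturb it, then resume. A minimal sketch of a caller (the function name is hypothetical; it mirrors the fixup_ht_bug() change in the patch that follows):

	/* Temporarily quiesce the hard/soft lockup detector while the perf
	 * counters it occupies are reprogrammed. */
	static int reprogram_counters_example(void)
	{
		if (watchdog_suspend() != 0) {
			/* Watchdog threads could not be parked; do nothing. */
			return -EBUSY;
		}

		/* ... the lockup detector is parked here, so its perf events
		 * can be torn down or reconfigured safely ... */

		watchdog_resume();
		return 0;
	}

Suspend requests are counted, so nested or concurrent users are fine as long as every successful watchdog_suspend() is paired with a watchdog_resume().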
[akpm@linux-foundation.org: fix build with CONFIG_LOCKUP_DETECTOR=m] Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_intel.c | 7 +++++-- include/linux/nmi.h | 13 +++++++++---- kernel/watchdog.c | 35 ---------------------------------- 3 files changed, 14 insertions(+), 41 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 36bd8250934b..144ab91951a7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3627,7 +3627,10 @@ static __init int fixup_ht_bug(void) return 0; } - watchdog_nmi_disable_all(); + if (watchdog_suspend() != 0) { + pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); + return 0; + } x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); @@ -3635,7 +3638,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_nmi_enable_all(); + watchdog_resume(); get_online_cpus(); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e9f213c337bb..e5afe8bae202 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -26,12 +26,8 @@ static inline void touch_nmi_watchdog(void) #if defined(CONFIG_HARDLOCKUP_DETECTOR) extern void hardlockup_detector_disable(void); -void watchdog_nmi_disable_all(void); -void watchdog_nmi_enable_all(void); #else static inline void hardlockup_detector_disable(void) {} -static inline void watchdog_nmi_disable_all(void) {} -static inline void watchdog_nmi_enable_all(void) {} #endif /* @@ -84,6 +80,15 @@ extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); extern int watchdog_suspend(void); extern void watchdog_resume(void); +#else +static inline int watchdog_suspend(void) +{ + return 0; +} + +static inline void watchdog_resume(void) +{ +} #endif #ifdef CONFIG_HAVE_ACPI_APEI_NMI diff --git a/kernel/watchdog.c b/kernel/watchdog.c index eb8f94b50101..69666f4b8e8f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -615,41 +615,6 @@ static void watchdog_nmi_disable(unsigned int cpu) } } -void watchdog_nmi_enable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED)) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_enable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} - -void watchdog_nmi_disable_all(void) -{ - int cpu; - - mutex_lock(&watchdog_proc_mutex); - - if (!watchdog_running) - goto unlock; - - get_online_cpus(); - for_each_watchdog_cpu(cpu) - watchdog_nmi_disable(cpu); - put_online_cpus(); - -unlock: - mutex_unlock(&watchdog_proc_mutex); -} #else static int watchdog_nmi_enable(unsigned int cpu) { return 0; } static void watchdog_nmi_disable(unsigned int cpu) { return; } -- cgit v1.2.3 From ec6a90661a0d6ce1461d05c7a58a0a151154e14a Mon Sep 17 00:00:00 2001 From: Ulrich Obergfell Date: Fri, 4 Sep 2015 15:45:28 -0700 Subject: watchdog: rename watchdog_suspend() and watchdog_resume() Rename watchdog_suspend() to lockup_detector_suspend() and watchdog_resume() to lockup_detector_resume() to avoid confusion with the watchdog subsystem and to be consistent with the existing name 
lockup_detector_init(). Also provide comment blocks to explain the watchdog_running and watchdog_suspended variables and their relationship. Signed-off-by: Ulrich Obergfell Reviewed-by: Aaron Tomlin Cc: Guenter Roeck Cc: Don Zickus Cc: Ulrich Obergfell Cc: Jiri Olsa Cc: Michal Hocko Cc: Stephane Eranian Cc: Chris Metcalf Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++-- include/linux/nmi.h | 8 ++++---- kernel/watchdog.c | 26 ++++++++++++++++++++++---- 3 files changed, 28 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 144ab91951a7..cd9b6d0b10bf 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -3627,7 +3627,7 @@ static __init int fixup_ht_bug(void) return 0; } - if (watchdog_suspend() != 0) { + if (lockup_detector_suspend() != 0) { pr_debug("failed to disable PMU erratum BJ122, BV98, HSD29 workaround\n"); return 0; } @@ -3638,7 +3638,7 @@ static __init int fixup_ht_bug(void) x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; - watchdog_resume(); + lockup_detector_resume(); get_online_cpus(); diff --git a/include/linux/nmi.h b/include/linux/nmi.h index e5afe8bae202..a91adf6e02f2 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -78,15 +78,15 @@ extern int proc_watchdog_thresh(struct ctl_table *, int , void __user *, size_t *, loff_t *); extern int proc_watchdog_cpumask(struct ctl_table *, int, void __user *, size_t *, loff_t *); -extern int watchdog_suspend(void); -extern void watchdog_resume(void); +extern int lockup_detector_suspend(void); +extern void lockup_detector_resume(void); #else -static inline int watchdog_suspend(void) +static inline int lockup_detector_suspend(void) { return 0; } -static inline void watchdog_resume(void) +static inline void lockup_detector_resume(void) { } #endif diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 69666f4b8e8f..64ed1c37bd1f 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -67,8 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #define for_each_watchdog_cpu(cpu) \ for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask) -static int __read_mostly watchdog_suspended; +/* + * The 'watchdog_running' variable is set to 1 when the watchdog threads + * are registered/started and is set to 0 when the watchdog threads are + * unregistered/stopped, so it is an indicator whether the threads exist. + */ static int __read_mostly watchdog_running; +/* + * If a subsystem has a need to deactivate the watchdog temporarily, it + * can use the suspend/resume interface to achieve this. The content of + * the 'watchdog_suspended' variable reflects this state. Existing threads + * are parked/unparked by the lockup_detector_{suspend|resume} functions + * (see comment blocks pertaining to those functions for further details). + * + * 'watchdog_suspended' also prevents threads from being registered/started + * or unregistered/stopped via parameters in /proc/sys/kernel, so the state + * of 'watchdog_running' cannot change while the watchdog is deactivated + * temporarily (see related code in 'proc' handlers). 
+ */ +static int __read_mostly watchdog_suspended; + static u64 __read_mostly sample_period; static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts); @@ -669,7 +687,7 @@ static void watchdog_unpark_threads(void) /* * Suspend the hard and soft lockup detector by parking the watchdog threads. */ -int watchdog_suspend(void) +int lockup_detector_suspend(void) { int ret = 0; @@ -679,7 +697,7 @@ int watchdog_suspend(void) * the 'watchdog_suspended' variable). If the watchdog threads are * running, the first caller takes care that they will be parked. * The state of 'watchdog_running' cannot change while a suspend - * request is active (see related changes in 'proc' handlers). + * request is active (see related code in 'proc' handlers). */ if (watchdog_running && !watchdog_suspended) ret = watchdog_park_threads(); @@ -695,7 +713,7 @@ int watchdog_suspend(void) /* * Resume the hard and soft lockup detector by unparking the watchdog threads. */ -void watchdog_resume(void) +void lockup_detector_resume(void) { mutex_lock(&watchdog_proc_mutex); -- cgit v1.2.3 From 1380fca084743fef8d17e59b273473393944ce58 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Fri, 4 Sep 2015 15:46:58 -0700 Subject: userfaultfd: activate syscall This activates the userfaultfd syscall. [sfr@canb.auug.org.au: activate syscall fix] [akpm@linux-foundation.org: don't enable userfaultfd on powerpc] Signed-off-by: Andrea Arcangeli Acked-by: Pavel Emelyanov Cc: Sanidhya Kashyap Cc: zhang.zhanghailiang@huawei.com Cc: "Kirill A. Shutemov" Cc: Andres Lagar-Cavilla Cc: Dave Hansen Cc: Paolo Bonzini Cc: Rik van Riel Cc: Mel Gorman Cc: Andy Lutomirski Cc: Hugh Dickins Cc: Peter Feiner Cc: "Dr. David Alan Gilbert" Cc: Johannes Weiner Cc: "Huangpeng (Peter)" Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/entry/syscalls/syscall_32.tbl | 1 + arch/x86/entry/syscalls/syscall_64.tbl | 1 + include/linux/syscalls.h | 1 + kernel/sys_ni.c | 1 + 4 files changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 25e3cf1cd8fd..477bfa6db370 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -380,3 +380,4 @@ 371 i386 recvfrom sys_recvfrom compat_sys_recvfrom 372 i386 recvmsg sys_recvmsg compat_sys_recvmsg 373 i386 shutdown sys_shutdown +374 i386 userfaultfd sys_userfaultfd diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 9ef32d5f1b19..81c490634db9 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -329,6 +329,7 @@ 320 common kexec_file_load sys_kexec_file_load 321 common bpf sys_bpf 322 64 execveat stub_execveat +323 common userfaultfd sys_userfaultfd # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index b45c45b8c829..08001317aee7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -810,6 +810,7 @@ asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr); asmlinkage long sys_eventfd(unsigned int count); asmlinkage long sys_eventfd2(unsigned int count, int flags); asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags); +asmlinkage long sys_userfaultfd(int flags); asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user 
*, unsigned int); asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *, diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index ca7d84f438f1..03c3875d9958 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); cond_syscall(sys_memfd_create); +cond_syscall(sys_userfaultfd); /* performance counters: */ cond_syscall(sys_perf_event_open); -- cgit v1.2.3 From 5b74283ab251b9db55cbbe31d19ca72482103290 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 4 Sep 2015 15:47:29 -0700 Subject: x86, mm: trace when an IPI is about to be sent When unmapping pages it is necessary to flush the TLB. If that page was accessed by another CPU then an IPI is used to flush the remote CPU. That is a lot of IPIs if kswapd is scanning and unmapping >100K pages per second. There already is a window between when a page is unmapped and when it is TLB flushed. This series increases the window so multiple pages can be flushed using a single IPI. This should be safe or the kernel is hosed already. Patch 1 simply made the rest of the series easier to write as ftrace could identify all the senders of TLB flush IPIS. Patch 2 tracks what CPUs potentially map a PFN and then sends an IPI to flush the entire TLB. Patch 3 tracks when there potentially are writable TLB entries that need to be batched differently Patch 4 increases SWAP_CLUSTER_MAX to further batch flushes The performance impact is documented in the changelogs but in the optimistic case on a 4-socket machine the full series reduces interrupts from 900K interrupts/second to 60K interrupts/second. This patch (of 4): It is easy to trace when an IPI is received to flush a TLB but harder to detect what event sent it. This patch makes it easy to identify the source of IPIs being transmitted for TLB flushes on x86. 
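
Concretely, adding a traceable flush reason touches three places that must stay in sync, roughly as follows (condensed from the hunks below; only the shape matters here):

	/* 1) The new reason goes into the shared enum (include/linux/mm_types.h). */
	enum tlb_flush_reason {
		TLB_FLUSH_ON_TASK_SWITCH,
		TLB_REMOTE_SHOOTDOWN,
		TLB_LOCAL_SHOOTDOWN,
		TLB_LOCAL_MM_SHOOTDOWN,
		TLB_REMOTE_SEND_IPI,	/* new: a flush IPI is about to be sent */
		NR_TLB_FLUSH_REASONS,
	};

	/* 2) A matching human-readable string is added to the EM()/EMe() list in
	 *    include/trace/events/tlb.h; the previous last entry moves from
	 *    EMe() to EM(), since EMe() marks the terminator.
	 *
	 * 3) The send site (native_flush_tlb_others() in arch/x86/mm/tlb.c)
	 *    emits the event just before the IPI goes out, tagged with the
	 *    size of the range being flushed:
	 *
	 *        trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start);
	 */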
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Reviewed-by: Dave Hansen Acked-by: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/tlb.c | 1 + include/linux/mm_types.h | 1 + include/trace/events/tlb.h | 3 ++- 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 90b924acd982..8ddb5d0d66fb 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -140,6 +140,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_end = end; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); + trace_tlb_flush(TLB_REMOTE_SEND_IPI, end - start); if (is_uv_system()) { unsigned int cpu; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26a30c3566f0..c8d0a73d64c4 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -554,6 +554,7 @@ enum tlb_flush_reason { TLB_REMOTE_SHOOTDOWN, TLB_LOCAL_SHOOTDOWN, TLB_LOCAL_MM_SHOOTDOWN, + TLB_REMOTE_SEND_IPI, NR_TLB_FLUSH_REASONS, }; diff --git a/include/trace/events/tlb.h b/include/trace/events/tlb.h index 4250f364a6ca..bc8815f45f3b 100644 --- a/include/trace/events/tlb.h +++ b/include/trace/events/tlb.h @@ -11,7 +11,8 @@ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ - EMe( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) + EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ + EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace -- cgit v1.2.3 From 72b252aed506b8f1a03f7abd29caef4cdf6a043b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Fri, 4 Sep 2015 15:47:32 -0700 Subject: mm: send one IPI per CPU to TLB flush all entries after unmapping pages An IPI is sent to flush remote TLBs when a page is unmapped that was potentially accesssed by other CPUs. There are many circumstances where this happens but the obvious one is kswapd reclaiming pages belonging to a running process as kswapd and the task are likely running on separate CPUs. On small machines, this is not a significant problem but as machine gets larger with more cores and more memory, the cost of these IPIs can be high. This patch uses a simple structure that tracks CPUs that potentially have TLB entries for pages being unmapped. When the unmapping is complete, the full TLB is flushed on the assumption that a refill cost is lower than flushing individual entries. Architectures wishing to do this must give the following guarantee. If a clean page is unmapped and not immediately flushed, the architecture must guarantee that a write to that linear address from a CPU with a cached TLB entry will trap a page fault. This is essentially what the kernel already depends on but the window is much larger with this patch applied and is worth highlighting. The architecture should consider whether the cost of the full TLB flush is higher than sending an IPI to flush each individual entry. An additional architecture helper called flush_tlb_local is required. It's a trivial wrapper with some accounting in the x86 case. The impact of this patch depends on the workload as measuring any benefit requires both mapped pages co-located on the LRU and memory pressure. The case with the biggest impact is multiple processes reading mapped pages taken from the vm-scalability test suite. The test case uses NR_CPU readers of mapped files that consume 10*RAM. 
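
Stripped of the accounting and tracing, the deferred-flush scheme described above reduces to an "accumulate, then drain" pattern. A condensed sketch of the rmap.c code added below (local-CPU handling, tracing and the unused page argument omitted):

	/* Runs on each target CPU: flush everything rather than single pages. */
	static void percpu_flush_tlb_batch_pages(void *data)
	{
		flush_tlb_local();
	}

	/* Accumulate: while unmapping, only record which CPUs might hold a
	 * stale TLB entry for this mm; no flush is issued yet. */
	static void set_tlb_ubc_flush_pending(struct mm_struct *mm)
	{
		struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

		cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm));
		tlb_ubc->flush_required = true;
	}

	/* Drain: once the batch of pages is unmapped, send a single IPI to
	 * each recorded CPU and have it flush its whole TLB, then reset. */
	void try_to_unmap_flush(void)
	{
		struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;

		if (!tlb_ubc->flush_required)
			return;

		smp_call_function_many(&tlb_ubc->cpumask,
				       percpu_flush_tlb_batch_pages, tlb_ubc, true);
		cpumask_clear(&tlb_ubc->cpumask);
		tlb_ubc->flush_required = false;
	}

The trade is explicit: a full flush (and later refill) on each CPU in the mask, in exchange for one IPI per CPU per batch instead of one IPI per unmapped page.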
Linear mapped reader on a 4-node machine with 64G RAM and 48 CPUs 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 Ops lru-file-mmap-read-elapsed 159.62 ( 0.00%) 120.68 ( 24.40%) Ops lru-file-mmap-read-time_range 30.59 ( 0.00%) 2.80 ( 90.85%) Ops lru-file-mmap-read-time_stddv 6.70 ( 0.00%) 0.64 ( 90.38%) 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 User 581.00 611.43 System 5804.93 4111.76 Elapsed 161.03 122.12 This is showing that the readers completed 24.40% faster with 29% less system CPU time. From vmstats, it is known that the vanilla kernel was interrupted roughly 900K times per second during the steady phase of the test and the patched kernel was interrupts 180K times per second. The impact is lower on a single socket machine. 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 Ops lru-file-mmap-read-elapsed 25.33 ( 0.00%) 20.38 ( 19.54%) Ops lru-file-mmap-read-time_range 0.91 ( 0.00%) 1.44 (-58.24%) Ops lru-file-mmap-read-time_stddv 0.28 ( 0.00%) 0.47 (-65.34%) 4.2.0-rc1 4.2.0-rc1 vanilla flushfull-v7 User 58.09 57.64 System 111.82 76.56 Elapsed 27.29 22.55 It's still a noticeable improvement with vmstat showing interrupts went from roughly 500K per second to 45K per second. The patch will have no impact on workloads with no memory pressure or have relatively few mapped pages. It will have an unpredictable impact on the workload running on the CPU being flushed as it'll depend on how many TLB entries need to be refilled and how long that takes. Worst case, the TLB will be completely cleared of active entries when the target PFNs were not resident at all. [sasha.levin@oracle.com: trace tlb flush after disabling preemption in try_to_unmap_flush] Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Dave Hansen Acked-by: Ingo Molnar Cc: Linus Torvalds Signed-off-by: Sasha Levin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/Kconfig | 1 + arch/x86/include/asm/tlbflush.h | 6 +++ include/linux/rmap.h | 3 ++ include/linux/sched.h | 16 +++++++ init/Kconfig | 10 ++++ mm/internal.h | 11 +++++ mm/rmap.c | 104 +++++++++++++++++++++++++++++++++++++++- mm/vmscan.c | 23 ++++++++- 8 files changed, 172 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 48f7433dac6f..117e2f373e50 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,6 +41,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index cd791948b286..6df2029405a3 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -261,6 +261,12 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ +/* Not inlined due to inc_irq_stat not being defined yet */ +#define flush_tlb_local() { \ + inc_irq_stat(irq_tlb_count); \ + local_flush_tlb(); \ +} + #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index c89c53a113a8..29446aeef36e 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -89,6 +89,9 @@ enum ttu_flags { TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ TTU_IGNORE_HWPOISON = (1 << 10),/* corrupted page is recoverable */ + TTU_BATCH_FLUSH = 
(1 << 11), /* Batch TLB flushes where possible + * and caller guarantees they will + * do a final flush if necessary */ }; #ifdef CONFIG_MMU diff --git a/include/linux/sched.h b/include/linux/sched.h index 119823decc46..3c602c20c717 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1344,6 +1344,18 @@ enum perf_event_task_context { perf_nr_task_contexts, }; +/* Track pages that require TLB flushes */ +struct tlbflush_unmap_batch { + /* + * Each bit set is a CPU that potentially has a TLB entry for one of + * the PFNs being flushed. See set_tlb_ubc_flush_pending(). + */ + struct cpumask cpumask; + + /* True if any bit in cpumask is set */ + bool flush_required; +}; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -1700,6 +1712,10 @@ struct task_struct { unsigned long numa_pages_migrated; #endif /* CONFIG_NUMA_BALANCING */ +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + struct tlbflush_unmap_batch tlb_ubc; +#endif + struct rcu_head rcu; /* diff --git a/init/Kconfig b/init/Kconfig index 161acd8bc56f..cf7e4824c8d0 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -882,6 +882,16 @@ config GENERIC_SCHED_CLOCK config ARCH_SUPPORTS_NUMA_BALANCING bool +# +# For architectures that prefer to flush all TLBs after a number of pages +# are unmapped instead of sending one IPI per page to flush. The architecture +# must provide guarantees on what happens if a clean TLB cache entry is +# written after the unmap. Details are in mm/rmap.c near the check for +# should_defer_flush. The architecture should also consider if the full flush +# and the refill costs are offset by the savings of sending fewer IPIs. +config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH + bool + # # For architectures that know their GCC __int128 support is sound # diff --git a/mm/internal.h b/mm/internal.h index 36b23f1e2ca6..bd6372ac5f7f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -426,4 +426,15 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, #define ALLOC_CMA 0x80 /* allow allocations from CMA areas */ #define ALLOC_FAIR 0x100 /* fair zone allocation */ +enum ttu_flags; +struct tlbflush_unmap_batch; + +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +void try_to_unmap_flush(void); +#else +static inline void try_to_unmap_flush(void) +{ +} + +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ #endif /* __MM_INTERNAL_H */ diff --git a/mm/rmap.c b/mm/rmap.c index 171b68768df1..326d5d89e45c 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -62,6 +62,8 @@ #include +#include + #include "internal.h" static struct kmem_cache *anon_vma_cachep; @@ -583,6 +585,89 @@ vma_address(struct page *page, struct vm_area_struct *vma) return address; } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void percpu_flush_tlb_batch_pages(void *data) +{ + /* + * All TLB entries are flushed on the assumption that it is + * cheaper to flush all TLBs and let them be refilled than + * flushing individual PFNs. Note that we do not track mm's + * to flush as that might simply be multiple full TLB flushes + * for no gain. + */ + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + flush_tlb_local(); +} + +/* + * Flush TLB entries for recently unmapped pages from remote CPUs. It is + * important if a PTE was dirty when it was unmapped that it's flushed + * before any IO is initiated on the page to prevent lost writes. Similarly, + * it must be flushed before freeing to prevent data leakage. 
+ */ +void try_to_unmap_flush(void) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int cpu; + + if (!tlb_ubc->flush_required) + return; + + cpu = get_cpu(); + + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, -1UL); + + if (cpumask_test_cpu(cpu, &tlb_ubc->cpumask)) + percpu_flush_tlb_batch_pages(&tlb_ubc->cpumask); + + if (cpumask_any_but(&tlb_ubc->cpumask, cpu) < nr_cpu_ids) { + smp_call_function_many(&tlb_ubc->cpumask, + percpu_flush_tlb_batch_pages, (void *)tlb_ubc, true); + } + cpumask_clear(&tlb_ubc->cpumask); + tlb_ubc->flush_required = false; + put_cpu(); +} + +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page) +{ + struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + + cpumask_or(&tlb_ubc->cpumask, &tlb_ubc->cpumask, mm_cpumask(mm)); + tlb_ubc->flush_required = true; +} + +/* + * Returns true if the TLB flush should be deferred to the end of a batch of + * unmap operations to reduce IPIs. + */ +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + bool should_defer = false; + + if (!(flags & TTU_BATCH_FLUSH)) + return false; + + /* If remote CPUs need to be flushed then defer batch the flush */ + if (cpumask_any_but(mm_cpumask(mm), get_cpu()) < nr_cpu_ids) + should_defer = true; + put_cpu(); + + return should_defer; +} +#else +static void set_tlb_ubc_flush_pending(struct mm_struct *mm, + struct page *page) +{ +} + +static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) +{ + return false; +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * At what user virtual address is page expected in vma? * Caller should check the page is actually part of the vma. @@ -1220,7 +1305,24 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); - pteval = ptep_clear_flush(vma, address, pte); + if (should_defer_flush(mm, flags)) { + /* + * We clear the PTE but do not flush so potentially a remote + * CPU could still be writing to the page. If the entry was + * previously clean then the architecture must guarantee that + * a clear->dirty transition on a cached TLB entry is written + * through and traps if the PTE is unmapped. + */ + pteval = ptep_get_and_clear(mm, address, pte); + + /* Potentially writable TLBs must be flushed before IO */ + if (pte_dirty(pteval)) + flush_tlb_page(vma, address); + else + set_tlb_ubc_flush_pending(mm, page); + } else { + pteval = ptep_clear_flush(vma, address, pte); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8286938c70de..99ec00d6a5dd 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1057,7 +1057,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, ttu_flags)) { + switch (try_to_unmap(page, + ttu_flags|TTU_BATCH_FLUSH)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1208,6 +1209,7 @@ keep: } mem_cgroup_uncharge_list(&free_pages); + try_to_unmap_flush(); free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); @@ -2151,6 +2153,23 @@ out: } } +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH +static void init_tlb_ubc(void) +{ + /* + * This deliberately does not clear the cpumask as it's expensive + * and unnecessary. 
If there happens to be data in there then the + * first SWAP_CLUSTER_MAX pages will send an unnecessary IPI and + * then will be cleared. + */ + current->tlb_ubc.flush_required = false; +} +#else +static inline void init_tlb_ubc(void) +{ +} +#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -2185,6 +2204,8 @@ static void shrink_lruvec(struct lruvec *lruvec, int swappiness, scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && sc->priority == DEF_PRIORITY); + init_tlb_ubc(); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { -- cgit v1.2.3 From 73858173593c31cb94bce63fe1c24eb803bb04e6 Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 4 Sep 2015 15:47:43 -0700 Subject: genalloc: add name arg to gen_pool_get() and devm_gen_pool_create() This change modifies gen_pool_get() and devm_gen_pool_create() client interfaces adding one more argument "name" of a gen_pool object. Due to implementation gen_pool_get() is capable to retrieve only one gen_pool associated with a device even if multiple gen_pools are created, fortunately right at the moment it is sufficient for the clients, hence provide NULL as a valid argument on both producer devm_gen_pool_create() and consumer gen_pool_get() sides. Because only one created gen_pool per device is addressable, explicitly add a restriction to devm_gen_pool_create() to create only one gen_pool per device, this implies two possible error codes returned by the function, account it on client side (only misc/sram). This completes client side changes related to genalloc updates. [akpm@linux-foundation.org: gen_pool_get() cleanup] Signed-off-by: Vladimir Zapolskiy Cc: Philipp Zabel Cc: Greg Kroah-Hartman Cc: Russell King Cc: Nicolas Ferre Cc: Alexandre Belloni Cc: Jean-Christophe Plagniol-Villard Cc: Shawn Guo Cc: Sascha Hauer Cc: Mauro Carvalho Chehab Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-imx/pm-imx5.c | 2 +- arch/arm/mach-imx/pm-imx6.c | 2 +- arch/arm/mach-socfpga/pm.c | 2 +- drivers/media/platform/coda/coda-common.c | 2 +- drivers/misc/sram.c | 8 ++--- include/linux/genalloc.h | 4 +-- lib/genalloc.c | 49 ++++++++++++++++++------------- 8 files changed, 39 insertions(+), 32 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 265ffeb2037e..80e277cfcc8b 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -369,7 +369,7 @@ static void __init at91_pm_sram_init(void) return; } - sram_pool = gen_pool_get(&pdev->dev); + sram_pool = gen_pool_get(&pdev->dev, NULL); if (!sram_pool) { pr_warn("%s: sram pool unavailable!\n", __func__); return; diff --git a/arch/arm/mach-imx/pm-imx5.c b/arch/arm/mach-imx/pm-imx5.c index 1885676c23c0..532d4b08276d 100644 --- a/arch/arm/mach-imx/pm-imx5.c +++ b/arch/arm/mach-imx/pm-imx5.c @@ -297,7 +297,7 @@ static int __init imx_suspend_alloc_ocram( goto put_node; } - ocram_pool = gen_pool_get(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev, NULL); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/arch/arm/mach-imx/pm-imx6.c b/arch/arm/mach-imx/pm-imx6.c index 93ecf559d06d..8ff8fc0b261c 100644 --- a/arch/arm/mach-imx/pm-imx6.c +++ b/arch/arm/mach-imx/pm-imx6.c @@ -451,7 +451,7 @@ static int __init imx6q_suspend_init(const struct imx6_pm_socdata *socdata) goto put_node; } - 
ocram_pool = gen_pool_get(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev, NULL); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/arch/arm/mach-socfpga/pm.c b/arch/arm/mach-socfpga/pm.c index 6a4199f2bffb..c378ab0c2431 100644 --- a/arch/arm/mach-socfpga/pm.c +++ b/arch/arm/mach-socfpga/pm.c @@ -56,7 +56,7 @@ static int socfpga_setup_ocram_self_refresh(void) goto put_node; } - ocram_pool = gen_pool_get(&pdev->dev); + ocram_pool = gen_pool_get(&pdev->dev, NULL); if (!ocram_pool) { pr_warn("%s: ocram pool unavailable!\n", __func__); ret = -ENODEV; diff --git a/drivers/media/platform/coda/coda-common.c b/drivers/media/platform/coda/coda-common.c index 58f65486de33..284ac4c934ba 100644 --- a/drivers/media/platform/coda/coda-common.c +++ b/drivers/media/platform/coda/coda-common.c @@ -2157,7 +2157,7 @@ static int coda_probe(struct platform_device *pdev) /* Get IRAM pool from device tree or platform data */ pool = of_gen_pool_get(np, "iram", 0); if (!pool && pdata) - pool = gen_pool_get(pdata->iram_dev); + pool = gen_pool_get(pdata->iram_dev, NULL); if (!pool) { dev_err(&pdev->dev, "iram pool not available\n"); return -ENOMEM; diff --git a/drivers/misc/sram.c b/drivers/misc/sram.c index 15c33cc34a80..431e1dd528bc 100644 --- a/drivers/misc/sram.c +++ b/drivers/misc/sram.c @@ -186,10 +186,10 @@ static int sram_probe(struct platform_device *pdev) if (IS_ERR(sram->virt_base)) return PTR_ERR(sram->virt_base); - sram->pool = devm_gen_pool_create(sram->dev, - ilog2(SRAM_GRANULARITY), -1); - if (!sram->pool) - return -ENOMEM; + sram->pool = devm_gen_pool_create(sram->dev, ilog2(SRAM_GRANULARITY), + NUMA_NO_NODE, NULL); + if (IS_ERR(sram->pool)) + return PTR_ERR(sram->pool); ret = sram_reserve_regions(sram, res); if (ret) diff --git a/include/linux/genalloc.h b/include/linux/genalloc.h index 5383bb1394a1..6afa65e6cdb7 100644 --- a/include/linux/genalloc.h +++ b/include/linux/genalloc.h @@ -118,8 +118,8 @@ extern unsigned long gen_pool_best_fit(unsigned long *map, unsigned long size, unsigned long start, unsigned int nr, void *data); extern struct gen_pool *devm_gen_pool_create(struct device *dev, - int min_alloc_order, int nid); -extern struct gen_pool *gen_pool_get(struct device *dev); + int min_alloc_order, int nid, const char *name); +extern struct gen_pool *gen_pool_get(struct device *dev, const char *name); bool addr_in_gen_pool(struct gen_pool *pool, unsigned long start, size_t size); diff --git a/lib/genalloc.c b/lib/genalloc.c index daf0afb6d979..b13cfd1a366e 100644 --- a/lib/genalloc.c +++ b/lib/genalloc.c @@ -570,24 +570,47 @@ static void devm_gen_pool_release(struct device *dev, void *res) gen_pool_destroy(*(struct gen_pool **)res); } +/** + * gen_pool_get - Obtain the gen_pool (if any) for a device + * @dev: device to retrieve the gen_pool from + * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device + * + * Returns the gen_pool for the device if one is present, or NULL. 
+ */ +struct gen_pool *gen_pool_get(struct device *dev, const char *name) +{ + struct gen_pool **p; + + p = devres_find(dev, devm_gen_pool_release, NULL, NULL); + if (!p) + return NULL; + return *p; +} +EXPORT_SYMBOL_GPL(gen_pool_get); + /** * devm_gen_pool_create - managed gen_pool_create * @dev: device that provides the gen_pool * @min_alloc_order: log base 2 of number of bytes each bitmap bit represents - * @nid: node id of the node the pool structure should be allocated on, or -1 + * @nid: node selector for allocated gen_pool, %NUMA_NO_NODE for all nodes + * @name: name of a gen_pool or NULL, identifies a particular gen_pool on device * * Create a new special memory pool that can be used to manage special purpose * memory not managed by the regular kmalloc/kfree interface. The pool will be * automatically destroyed by the device management code. */ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, - int nid) + int nid, const char *name) { struct gen_pool **ptr, *pool; + /* Check that genpool to be created is uniquely addressed on device */ + if (gen_pool_get(dev, name)) + return ERR_PTR(-EINVAL); + ptr = devres_alloc(devm_gen_pool_release, sizeof(*ptr), GFP_KERNEL); if (!ptr) - return NULL; + return ERR_PTR(-ENOMEM); pool = gen_pool_create(min_alloc_order, nid); if (pool) { @@ -595,29 +618,13 @@ struct gen_pool *devm_gen_pool_create(struct device *dev, int min_alloc_order, devres_add(dev, ptr); } else { devres_free(ptr); + return ERR_PTR(-ENOMEM); } return pool; } EXPORT_SYMBOL(devm_gen_pool_create); -/** - * gen_pool_get - Obtain the gen_pool (if any) for a device - * @dev: device to retrieve the gen_pool from - * - * Returns the gen_pool for the device if one is present, or NULL. - */ -struct gen_pool *gen_pool_get(struct device *dev) -{ - struct gen_pool **p = devres_find(dev, devm_gen_pool_release, NULL, - NULL); - - if (!p) - return NULL; - return *p; -} -EXPORT_SYMBOL_GPL(gen_pool_get); - #ifdef CONFIG_OF /** * of_gen_pool_get - find a pool by phandle property @@ -642,7 +649,7 @@ struct gen_pool *of_gen_pool_get(struct device_node *np, of_node_put(np_pool); if (!pdev) return NULL; - return gen_pool_get(&pdev->dev); + return gen_pool_get(&pdev->dev, NULL); } EXPORT_SYMBOL_GPL(of_gen_pool_get); #endif /* CONFIG_OF */ -- cgit v1.2.3
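
For driver code, the producer/consumer pairing after this change looks as follows. This is a hypothetical sketch (example_probe(), example_lookup() and the granularity value are made up); the one behavioural change worth noting is that devm_gen_pool_create() now reports failure through ERR_PTR() instead of returning NULL:

	#include <linux/err.h>
	#include <linux/genalloc.h>
	#include <linux/log2.h>
	#include <linux/numa.h>
	#include <linux/platform_device.h>

	/* Producer: the device that owns the memory creates the managed pool.
	 * A NULL name is valid as long as the device has only one pool. */
	static int example_probe(struct platform_device *pdev)
	{
		struct gen_pool *pool;

		pool = devm_gen_pool_create(&pdev->dev,
					    ilog2(256), /* 256-byte granularity */
					    NUMA_NO_NODE, NULL);
		if (IS_ERR(pool))
			return PTR_ERR(pool);

		/* ... populate the pool with gen_pool_add() or similar ... */
		return 0;
	}

	/* Consumer: look the pool up by provider device (and optionally by name). */
	static struct gen_pool *example_lookup(struct device *provider)
	{
		/* NULL still works while only one pool exists per device. */
		return gen_pool_get(provider, NULL);
	}

A second devm_gen_pool_create() call for the same device now fails with ERR_PTR(-EINVAL), which is the one-pool-per-device restriction the changelog above describes.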