From 097963959594c5eccaba42510f7033f703211bda Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:21 -0800 Subject: mm: add follow_pte_pmd() Patch series "Write protect DAX PMDs in *sync path". Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss, as detailed in patch 2. This series is based on Dan's "libnvdimm-pending" branch, which is the current home for Jan's "dax: Page invalidation fixes" series. You can find a working tree here: https://git.kernel.org/cgit/linux/kernel/git/zwisler/linux.git/log/?h=dax_pmd_clean This patch (of 2): Similar to follow_pte(), follow_pte_pmd() allows either a PTE leaf or a huge page PMD leaf to be found and returned. Link: http://lkml.kernel.org/r/1482272586-21177-2-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Suggested-by: Dave Hansen Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index fe6b4036664a..02793ac64ac6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1212,6 +1212,8 @@ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); +int follow_pte_pmd(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, -- cgit v1.2.3 From f729c8c9b24f0540a6e6b86e68f3888ba90ef7e7 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Tue, 10 Jan 2017 16:57:24 -0800 Subject: dax: wrprotect pmd_t in dax_mapping_entry_mkclean Currently dax_mapping_entry_mkclean() fails to clean and write protect the pmd_t of a DAX PMD entry during an *sync operation. This can result in data loss in the following sequence: 1) mmap write to DAX PMD, dirtying PMD radix tree entry and making the pmd_t dirty and writeable 2) fsync, flushing out PMD data and cleaning the radix tree entry. We currently fail to mark the pmd_t as clean and write protected. 3) more mmap writes to the PMD. These don't cause any page faults since the pmd_t is dirty and writeable. The radix tree entry remains clean. 4) fsync, which fails to flush the dirty PMD data because the radix tree entry was clean. 5) crash - dirty data that should have been fsync'd as part of 4) could still have been in the processor cache, and is lost. Fix this by marking the pmd_t clean and write protected in dax_mapping_entry_mkclean(), which is called as part of the fsync operation 2). This will cause the writes in step 3) above to generate page faults where we'll re-dirty the PMD radix tree entry, resulting in flushes in the fsync that happens in step 4). Fixes: 4b4bb46d00b3 ("dax: clear dirty entry tags on cache flush") Link: http://lkml.kernel.org/r/1482272586-21177-3-git-send-email-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 51 ++++++++++++++++++++++++++++++++++++--------------- include/linux/mm.h | 2 -- mm/memory.c | 4 ++-- 3 files changed, 38 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index 5c74f60d0a50..ddcddfeaa03b 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -691,8 +691,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pgoff_t index, unsigned long pfn) { struct vm_area_struct *vma; - pte_t *ptep; - pte_t pte; + pte_t pte, *ptep = NULL; + pmd_t *pmdp = NULL; spinlock_t *ptl; bool changed; @@ -707,21 +707,42 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, address = pgoff_address(index, vma); changed = false; - if (follow_pte(vma->vm_mm, address, &ptep, &ptl)) + if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) continue; - if (pfn != pte_pfn(*ptep)) - goto unlock; - if (!pte_dirty(*ptep) && !pte_write(*ptep)) - goto unlock; - flush_cache_page(vma, address, pfn); - pte = ptep_clear_flush(vma, address, ptep); - pte = pte_wrprotect(pte); - pte = pte_mkclean(pte); - set_pte_at(vma->vm_mm, address, ptep, pte); - changed = true; -unlock: - pte_unmap_unlock(ptep, ptl); + if (pmdp) { +#ifdef CONFIG_FS_DAX_PMD + pmd_t pmd; + + if (pfn != pmd_pfn(*pmdp)) + goto unlock_pmd; + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + goto unlock_pmd; + + flush_cache_page(vma, address, pfn); + pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + changed = true; +unlock_pmd: + spin_unlock(ptl); +#endif + } else { + if (pfn != pte_pfn(*ptep)) + goto unlock_pte; + if (!pte_dirty(*ptep) && !pte_write(*ptep)) + goto unlock_pte; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock_pte: + pte_unmap_unlock(ptep, ptl); + } if (changed) mmu_notifier_invalidate_page(vma->vm_mm, address); diff --git a/include/linux/mm.h b/include/linux/mm.h index 02793ac64ac6..b84615b0f64c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,8 +1210,6 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index b62f3bc63481..6bf2b471e30c 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3819,8 +3819,8 @@ out: return -EINVAL; } -int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, - spinlock_t **ptlp) +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) { int res; -- cgit v1.2.3 From bb1107f7c6052c863692a41f78c000db792334bf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:27 -0800 Subject: mm, slab: make sure that KMALLOC_MAX_SIZE will fit into MAX_ORDER Andrey Konovalov has reported the following warning triggered by the syzkaller fuzzer. WARNING: CPU: 1 PID: 9935 at mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 9935 Comm: syz-executor0 Not tainted 4.9.0-rc7+ #34 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: __alloc_pages_slowpath mm/page_alloc.c:3511 __alloc_pages_nodemask+0x159c/0x1e20 mm/page_alloc.c:3781 alloc_pages_current+0x1c7/0x6b0 mm/mempolicy.c:2072 alloc_pages include/linux/gfp.h:469 kmalloc_order+0x1f/0x70 mm/slab_common.c:1015 kmalloc_order_trace+0x1f/0x160 mm/slab_common.c:1026 kmalloc_large include/linux/slab.h:422 __kmalloc+0x210/0x2d0 mm/slub.c:3723 kmalloc include/linux/slab.h:495 ep_write_iter+0x167/0xb50 drivers/usb/gadget/legacy/inode.c:664 new_sync_write fs/read_write.c:499 __vfs_write+0x483/0x760 fs/read_write.c:512 vfs_write+0x170/0x4e0 fs/read_write.c:560 SYSC_write fs/read_write.c:607 SyS_write+0xfb/0x230 fs/read_write.c:599 entry_SYSCALL_64_fastpath+0x1f/0xc2 The issue is caused by a lack of size check for the request size in ep_write_iter which should be fixed. It, however, points to another problem, that SLUB defines KMALLOC_MAX_SIZE too large because the its KMALLOC_SHIFT_MAX is (MAX_ORDER + PAGE_SHIFT) which means that the resulting page allocator request might be MAX_ORDER which is too large (see __alloc_pages_slowpath). The same applies to the SLOB allocator which allows even larger sizes. Make sure that they are capped properly and never request more than MAX_ORDER order. Link: http://lkml.kernel.org/r/20161220130659.16461-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Andrey Konovalov Acked-by: Christoph Lameter Cc: Alexei Starovoitov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/slab.h b/include/linux/slab.h index 084b12bad198..4c5363566815 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -226,7 +226,7 @@ static inline const char *__check_heap_object(const void *ptr, * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) -#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT) +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif @@ -239,7 +239,7 @@ static inline const char *__check_heap_object(const void *ptr, * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT -#define KMALLOC_SHIFT_MAX 30 +#define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif -- cgit v1.2.3 From 41b6167e8f746b475668f1da78599fc4284f18db Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:57:42 -0800 Subject: mm: get rid of __GFP_OTHER_NODE The flag was introduced by commit 78afd5612deb ("mm: add __GFP_OTHER_NODE flag") to allow proper accounting of remote node allocations done by kernel daemons on behalf of a process - e.g. khugepaged. After "mm: fix remote numa hits statistics" we do not need and actually use the flag so we can safely remove it because all allocations which are satisfied from their "home" node are accounted properly. [mhocko@suse.com: fix build] Link: http://lkml.kernel.org/r/20170106122225.GK5556@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20170102153057.9451-3-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Michal Hocko Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Taku Izumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 13 +++---------- include/trace/events/mmflags.h | 3 +-- mm/huge_memory.c | 3 +-- mm/khugepaged.c | 5 ++--- mm/page_alloc.c | 5 ++--- tools/perf/builtin-kmem.c | 1 - 6 files changed, 9 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 4175dca4ac39..7806a8f80abc 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -38,9 +38,8 @@ struct vm_area_struct; #define ___GFP_ACCOUNT 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_OTHER_NODE 0x800000u -#define ___GFP_WRITE 0x1000000u -#define ___GFP_KSWAPD_RECLAIM 0x2000000u +#define ___GFP_WRITE 0x800000u +#define ___GFP_KSWAPD_RECLAIM 0x1000000u /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* @@ -172,11 +171,6 @@ struct vm_area_struct; * __GFP_NOTRACK_FALSE_POSITIVE is an alias of __GFP_NOTRACK. It's a means of * distinguishing in the source between false positives and allocations that * cannot be supported (e.g. page tables). - * - * __GFP_OTHER_NODE is for allocations that are on a remote node but that - * should not be accounted for as a remote allocation in vmstat. A - * typical user would be khugepaged collapsing a huge page on a remote - * node. */ #define __GFP_COLD ((__force gfp_t)___GFP_COLD) #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) @@ -184,10 +178,9 @@ struct vm_area_struct; #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT 26 +#define __GFP_BITS_SHIFT 25 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h index 9e687ca9a307..15bf875d0e4a 100644 --- a/include/trace/events/mmflags.h +++ b/include/trace/events/mmflags.h @@ -47,8 +47,7 @@ {(unsigned long)__GFP_WRITE, "__GFP_WRITE"}, \ {(unsigned long)__GFP_RECLAIM, "__GFP_RECLAIM"}, \ {(unsigned long)__GFP_DIRECT_RECLAIM, "__GFP_DIRECT_RECLAIM"},\ - {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"},\ - {(unsigned long)__GFP_OTHER_NODE, "__GFP_OTHER_NODE"} \ + {(unsigned long)__GFP_KSWAPD_RECLAIM, "__GFP_KSWAPD_RECLAIM"}\ #define show_gfp_flags(flags) \ (flags) ? __print_flags(flags, "|", \ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 10eedbf14421..72339a646fb1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -919,8 +919,7 @@ static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, } for (i = 0; i < HPAGE_PMD_NR; i++) { - pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | - __GFP_OTHER_NODE, vma, + pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma, vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b0924a68cc36..77ae3239c3de 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -943,7 +943,7 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; /* * Before allocating the hugepage, release the mmap_sem read lock. @@ -1309,8 +1309,7 @@ static void collapse_shmem(struct mm_struct *mm, VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | - __GFP_OTHER_NODE | __GFP_THISNODE; + gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; new_page = khugepaged_alloc_page(hpage, gfp, node); if (!new_page) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cba2a64792e6..872caae544ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2584,8 +2584,7 @@ int __isolate_free_page(struct page *page, unsigned int order) * * Must be called with interrupts disabled. */ -static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, - gfp_t flags) +static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) { #ifdef CONFIG_NUMA enum zone_stat_item local_stat = NUMA_LOCAL; @@ -2667,7 +2666,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, } __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); - zone_statistics(preferred_zone, zone, gfp_flags); + zone_statistics(preferred_zone, zone); local_irq_restore(flags); VM_BUG_ON_PAGE(bad_range(zone, page), page); diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 35a02f8e5a4a..915869e00d86 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -655,7 +655,6 @@ static const struct { { "__GFP_RECLAIM", "R" }, { "__GFP_DIRECT_RECLAIM", "DR" }, { "__GFP_KSWAPD_RECLAIM", "KR" }, - { "__GFP_OTHER_NODE", "ON" }, }; static size_t max_gfp_len; -- cgit v1.2.3 From 2d39b3cd34e6d323720d4c61bd714f5ae202c022 Mon Sep 17 00:00:00 2001 From: Jamie Iles Date: Tue, 10 Jan 2017 16:57:54 -0800 Subject: signal: protect SIGNAL_UNKILLABLE from unintentional clearing. Since commit 00cd5c37afd5 ("ptrace: permit ptracing of /sbin/init") we can now trace init processes. init is initially protected with SIGNAL_UNKILLABLE which will prevent fatal signals such as SIGSTOP, but there are a number of paths during tracing where SIGNAL_UNKILLABLE can be implicitly cleared. This can result in init becoming stoppable/killable after tracing. For example, running: while true; do kill -STOP 1; done & strace -p 1 and then stopping strace and the kill loop will result in init being left in state TASK_STOPPED. Sending SIGCONT to init will resume it, but init will now respond to future SIGSTOP signals rather than ignoring them. Make sure that when setting SIGNAL_STOP_CONTINUED/SIGNAL_STOP_STOPPED that we don't clear SIGNAL_UNKILLABLE. Link: http://lkml.kernel.org/r/20170104122017.25047-1-jamie.iles@oracle.com Signed-off-by: Jamie Iles Acked-by: Oleg Nesterov Cc: Alexander Viro Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 10 ++++++++++ kernel/signal.c | 4 ++-- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 4d1905245c7a..ad3ec9ec61f7 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -854,6 +854,16 @@ struct signal_struct { #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ +#define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ + SIGNAL_STOP_CONTINUED) + +static inline void signal_set_stop_flags(struct signal_struct *sig, + unsigned int flags) +{ + WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); + sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; +} + /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { diff --git a/kernel/signal.c b/kernel/signal.c index ff046b73ff2d..3603d93a1968 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task) * fresh group stop. Read comment in do_signal_stop() for details. */ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) { - sig->flags = SIGNAL_STOP_STOPPED; + signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED); return true; } return false; @@ -843,7 +843,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) * will take ->siglock, notice SIGNAL_CLD_MASK, and * notify its parent. See get_signal_to_deliver(). */ - signal->flags = why | SIGNAL_STOP_CONTINUED; + signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED); signal->group_stop_count = 0; signal->group_exit_code = 0; } -- cgit v1.2.3 From b4536f0c829c8586544c94735c343f9b5070bd01 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 10 Jan 2017 16:58:04 -0800 Subject: mm, memcg: fix the active list aging for lowmem requests when memcg is enabled Nils Holland and Klaus Ethgen have reported unexpected OOM killer invocations with 32b kernel starting with 4.8 kernels kworker/u4:5 invoked oom-killer: gfp_mask=0x2400840(GFP_NOFS|__GFP_NOFAIL), nodemask=0, order=0, oom_score_adj=0 kworker/u4:5 cpuset=/ mems_allowed=0 CPU: 1 PID: 2603 Comm: kworker/u4:5 Not tainted 4.9.0-gentoo #2 [...] Mem-Info: active_anon:58685 inactive_anon:90 isolated_anon:0 active_file:274324 inactive_file:281962 isolated_file:0 unevictable:0 dirty:649 writeback:0 unstable:0 slab_reclaimable:40662 slab_unreclaimable:17754 mapped:7382 shmem:202 pagetables:351 bounce:0 free:206736 free_pcp:332 free_cma:0 Node 0 active_anon:234740kB inactive_anon:360kB active_file:1097296kB inactive_file:1127848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:29528kB dirty:2596kB writeback:0kB shmem:0kB shmem_thp: 0kB shmem_pmdmapped: 184320kB anon_thp: 808kB writeback_tmp:0kB unstable:0kB pages_scanned:0 all_unreclaimable? no DMA free:3952kB min:788kB low:984kB high:1180kB active_anon:0kB inactive_anon:0kB active_file:7316kB inactive_file:0kB unevictable:0kB writepending:96kB present:15992kB managed:15916kB mlocked:0kB slab_reclaimable:3200kB slab_unreclaimable:1408kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 813 3474 3474 Normal free:41332kB min:41368kB low:51708kB high:62048kB active_anon:0kB inactive_anon:0kB active_file:532748kB inactive_file:44kB unevictable:0kB writepending:24kB present:897016kB managed:836248kB mlocked:0kB slab_reclaimable:159448kB slab_unreclaimable:69608kB kernel_stack:1112kB pagetables:1404kB bounce:0kB free_pcp:528kB local_pcp:340kB free_cma:0kB lowmem_reserve[]: 0 0 21292 21292 HighMem free:781660kB min:512kB low:34356kB high:68200kB active_anon:234740kB inactive_anon:360kB active_file:557232kB inactive_file:1127804kB unevictable:0kB writepending:2592kB present:2725384kB managed:2725384kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:800kB local_pcp:608kB free_cma:0kB the oom killer is clearly pre-mature because there there is still a lot of page cache in the zone Normal which should satisfy this lowmem request. Further debugging has shown that the reclaim cannot make any forward progress because the page cache is hidden in the active list which doesn't get rotated because inactive_list_is_low is not memcg aware. The code simply subtracts per-zone highmem counters from the respective memcg's lru sizes which doesn't make any sense. We can simply end up always seeing the resulting active and inactive counts 0 and return false. This issue is not limited to 32b kernels but in practice the effect on systems without CONFIG_HIGHMEM would be much harder to notice because we do not invoke the OOM killer for allocations requests targeting < ZONE_NORMAL. Fix the issue by tracking per zone lru page counts in mem_cgroup_per_node and subtract per-memcg highmem counts when memcg is enabled. Introduce helper lruvec_zone_lru_size which redirects to either zone counters or mem_cgroup_get_zone_lru_size when appropriate. We are losing empty LRU but non-zero lru size detection introduced by ca707239e8a7 ("mm: update_lru_size warn and reset bad lru_size") because of the inherent zone vs. node discrepancy. Fixes: f8d1a31163fc ("mm: consider whether to decivate based on eligible zones inactive ratio") Link: http://lkml.kernel.org/r/20170104100825.3729-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Nils Holland Tested-by: Nils Holland Reported-by: Klaus Ethgen Acked-by: Minchan Kim Acked-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Vladimir Davydov Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 26 +++++++++++++++++++++++--- include/linux/mm_inline.h | 2 +- mm/memcontrol.c | 18 ++++++++---------- mm/vmscan.c | 27 +++++++++++++++++---------- 4 files changed, 49 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 61d20c17f3b7..254698856b8f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -120,7 +120,7 @@ struct mem_cgroup_reclaim_iter { */ struct mem_cgroup_per_node { struct lruvec lruvec; - unsigned long lru_size[NR_LRU_LISTS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -432,7 +432,7 @@ static inline bool mem_cgroup_online(struct mem_cgroup *memcg) int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages); + int zid, int nr_pages); unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask); @@ -441,9 +441,23 @@ static inline unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { struct mem_cgroup_per_node *mz; + unsigned long nr_pages = 0; + int zid; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - return mz->lru_size[lru]; + for (zid = 0; zid < MAX_NR_ZONES; zid++) + nr_pages += mz->lru_zone_size[zid][lru]; + return nr_pages; +} + +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + struct mem_cgroup_per_node *mz; + + mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + return mz->lru_zone_size[zone_idx][lru]; } void mem_cgroup_handle_over_high(void); @@ -671,6 +685,12 @@ mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru) { return 0; } +static inline +unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, + enum lru_list lru, int zone_idx) +{ + return 0; +} static inline unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 71613e8a720f..41d376e7116d 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -39,7 +39,7 @@ static __always_inline void update_lru_size(struct lruvec *lruvec, { __update_lru_size(lruvec, lru, zid, nr_pages); #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, nr_pages); + mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages); #endif } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4048897e7b01..a63a8f832664 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -625,8 +625,8 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { + struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg); unsigned long nr = 0; - struct mem_cgroup_per_node *mz; enum lru_list lru; VM_BUG_ON((unsigned)nid >= nr_node_ids); @@ -634,8 +634,7 @@ unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, for_each_lru(lru) { if (!(BIT(lru) & lru_mask)) continue; - mz = mem_cgroup_nodeinfo(memcg, nid); - nr += mz->lru_size[lru]; + nr += mem_cgroup_get_lru_size(lruvec, lru); } return nr; } @@ -1002,6 +1001,7 @@ out: * mem_cgroup_update_lru_size - account for adding or removing an lru page * @lruvec: mem_cgroup per zone lru vector * @lru: index of lru list the page is sitting on + * @zid: zone id of the accounted pages * @nr_pages: positive when adding or negative when removing * * This function must be called under lru_lock, just before a page is added @@ -1009,27 +1009,25 @@ out: * so as to allow it to check that lru_size 0 is consistent with list_empty). */ void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, - int nr_pages) + int zid, int nr_pages) { struct mem_cgroup_per_node *mz; unsigned long *lru_size; long size; - bool empty; if (mem_cgroup_disabled()) return; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - lru_size = mz->lru_size + lru; - empty = list_empty(lruvec->lists + lru); + lru_size = &mz->lru_zone_size[zid][lru]; if (nr_pages < 0) *lru_size += nr_pages; size = *lru_size; - if (WARN_ONCE(size < 0 || empty != !size, - "%s(%p, %d, %d): lru_size %ld but %sempty\n", - __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) { + if (WARN_ONCE(size < 0, + "%s(%p, %d, %d): lru_size %ld\n", + __func__, lruvec, lru, nr_pages, size)) { VM_BUG_ON(1); *lru_size = 0; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6aa5b01d3e75..532a2a750952 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -242,6 +242,16 @@ unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru) return node_page_state(lruvec_pgdat(lruvec), NR_LRU_BASE + lru); } +unsigned long lruvec_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, + int zone_idx) +{ + if (!mem_cgroup_disabled()) + return mem_cgroup_get_zone_lru_size(lruvec, lru, zone_idx); + + return zone_page_state(&lruvec_pgdat(lruvec)->node_zones[zone_idx], + NR_ZONE_LRU_BASE + lru); +} + /* * Add a shrinker callback to be called from the vm. */ @@ -1382,8 +1392,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) * be complete before mem_cgroup_update_lru_size due to a santity check. */ static __always_inline void update_lru_sizes(struct lruvec *lruvec, - enum lru_list lru, unsigned long *nr_zone_taken, - unsigned long nr_taken) + enum lru_list lru, unsigned long *nr_zone_taken) { int zid; @@ -1392,11 +1401,11 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, continue; __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); - } - #ifdef CONFIG_MEMCG - mem_cgroup_update_lru_size(lruvec, lru, -nr_taken); + mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]); #endif + } + } /* @@ -1501,7 +1510,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, *nr_scanned = scan; trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, nr_to_scan, scan, nr_taken, mode, is_file_lru(lru)); - update_lru_sizes(lruvec, lru, nr_zone_taken, nr_taken); + update_lru_sizes(lruvec, lru, nr_zone_taken); return nr_taken; } @@ -2047,10 +2056,8 @@ static bool inactive_list_is_low(struct lruvec *lruvec, bool file, if (!managed_zone(zone)) continue; - inactive_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE)); - active_zone = zone_page_state(zone, - NR_ZONE_LRU_BASE + (file * LRU_FILE) + LRU_ACTIVE); + inactive_zone = lruvec_zone_lru_size(lruvec, file * LRU_FILE, zid); + active_zone = lruvec_zone_lru_size(lruvec, (file * LRU_FILE) + LRU_ACTIVE, zid); inactive -= min(inactive, inactive_zone); active -= min(active, active_zone); -- cgit v1.2.3 From 8c2dd3e4a4bae78093c4a5cee6494877651be3c9 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:06 -0800 Subject: mm: rename __alloc_page_frag to page_frag_alloc and __free_page_frag to page_frag_free Patch series "Page fragment updates", v4. This patch series takes care of a few cleanups for the page fragments API. First we do some renames so that things are much more consistent. First we move the page_frag_ portion of the name to the front of the functions names. Secondly we split out the cache specific functions from the other page fragment functions by adding the word "cache" to the name. Finally I added a bit of documentation that will hopefully help to explain some of this. I plan to revisit this later as we get things more ironed out in the near future with the changes planned for the DMA setup to support eXpress Data Path. This patch (of 3): This patch renames the page frag functions to be more consistent with other APIs. Specifically we place the name page_frag first in the name and then have either an alloc or free call name that we append as the suffix. This makes it a bit clearer in terms of naming. In addition we drop the leading double underscores since we are technically no longer a backing interface and instead the front end that is called from the networking APIs. Link: http://lkml.kernel.org/r/20170104023854.13451.67390.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 6 +++--- include/linux/skbuff.h | 2 +- mm/page_alloc.c | 10 +++++----- net/core/skbuff.c | 8 ++++---- 4 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7806a8f80abc..ed77a86fbbb0 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -501,9 +501,9 @@ extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; extern void __page_frag_drain(struct page *page, unsigned int order, unsigned int count); -extern void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask); -extern void __free_page_frag(void *addr); +extern void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask); +extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b53c0cfd417e..a410715bbef8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -2480,7 +2480,7 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, static inline void skb_free_frag(void *addr) { - __free_page_frag(addr); + page_frag_free(addr); } void *napi_alloc_frag(unsigned int fragsz); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 74afdb4177cb..097893ffe194 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3931,8 +3931,8 @@ void __page_frag_drain(struct page *page, unsigned int order, } EXPORT_SYMBOL(__page_frag_drain); -void *__alloc_page_frag(struct page_frag_cache *nc, - unsigned int fragsz, gfp_t gfp_mask) +void *page_frag_alloc(struct page_frag_cache *nc, + unsigned int fragsz, gfp_t gfp_mask) { unsigned int size = PAGE_SIZE; struct page *page; @@ -3983,19 +3983,19 @@ refill: return nc->va + offset; } -EXPORT_SYMBOL(__alloc_page_frag); +EXPORT_SYMBOL(page_frag_alloc); /* * Frees a page fragment allocated out of either a compound or order 0 page. */ -void __free_page_frag(void *addr) +void page_frag_free(void *addr) { struct page *page = virt_to_head_page(addr); if (unlikely(put_page_testzero(page))) __free_pages_ok(page, compound_order(page)); } -EXPORT_SYMBOL(__free_page_frag); +EXPORT_SYMBOL(page_frag_free); static void *make_alloc_exact(unsigned long addr, unsigned int order, size_t size) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 5a03730fbc1a..734c71468b01 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -369,7 +369,7 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, fragsz, gfp_mask); + data = page_frag_alloc(nc, fragsz, gfp_mask); local_irq_restore(flags); return data; } @@ -391,7 +391,7 @@ static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); - return __alloc_page_frag(&nc->page, fragsz, gfp_mask); + return page_frag_alloc(&nc->page, fragsz, gfp_mask); } void *napi_alloc_frag(unsigned int fragsz) @@ -441,7 +441,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len, local_irq_save(flags); nc = this_cpu_ptr(&netdev_alloc_cache); - data = __alloc_page_frag(nc, len, gfp_mask); + data = page_frag_alloc(nc, len, gfp_mask); pfmemalloc = nc->pfmemalloc; local_irq_restore(flags); @@ -505,7 +505,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = __alloc_page_frag(&nc->page, len, gfp_mask); + data = page_frag_alloc(&nc->page, len, gfp_mask); if (unlikely(!data)) return NULL; -- cgit v1.2.3 From 2976db8018532b624c4123ae662fbc0814877abf Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Tue, 10 Jan 2017 16:58:09 -0800 Subject: mm: rename __page_frag functions to __page_frag_cache, drop order from drain This patch does two things. First it goes through and renames the __page_frag prefixed functions to __page_frag_cache so that we can be clear that we are draining or refilling the cache, not the frags themselves. Second we drop the order parameter from __page_frag_cache_drain since we don't actually need to pass it since all fragments are either order 0 or must be a compound page. Link: http://lkml.kernel.org/r/20170104023954.13451.5678.stgit@localhost.localdomain Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/net/ethernet/intel/igb/igb_main.c | 6 +++--- include/linux/gfp.h | 3 +-- mm/page_alloc.c | 13 +++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index a761001308dc..1515abaa5ac9 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -3962,8 +3962,8 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring) PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - __page_frag_drain(buffer_info->page, 0, - buffer_info->pagecnt_bias); + __page_frag_cache_drain(buffer_info->page, + buffer_info->pagecnt_bias); buffer_info->page = NULL; } @@ -6991,7 +6991,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring, dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, PAGE_SIZE, DMA_FROM_DEVICE, DMA_ATTR_SKIP_CPU_SYNC); - __page_frag_drain(page, 0, rx_buffer->pagecnt_bias); + __page_frag_cache_drain(page, rx_buffer->pagecnt_bias); } /* clear contents of rx_buffer */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ed77a86fbbb0..0fe0b6295ab5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -499,8 +499,7 @@ extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; -extern void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count); +extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void page_frag_free(void *addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 097893ffe194..d604d2596b7b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3896,8 +3896,8 @@ EXPORT_SYMBOL(free_pages); * drivers to provide a backing region of memory for use as either an * sk_buff->head, or to be used in the "frags" portion of skb_shared_info. */ -static struct page *__page_frag_refill(struct page_frag_cache *nc, - gfp_t gfp_mask) +static struct page *__page_frag_cache_refill(struct page_frag_cache *nc, + gfp_t gfp_mask) { struct page *page = NULL; gfp_t gfp = gfp_mask; @@ -3917,19 +3917,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } -void __page_frag_drain(struct page *page, unsigned int order, - unsigned int count) +void __page_frag_cache_drain(struct page *page, unsigned int count) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); if (page_ref_sub_and_test(page, count)) { + unsigned int order = compound_order(page); + if (order == 0) free_hot_cold_page(page, false); else __free_pages_ok(page, order); } } -EXPORT_SYMBOL(__page_frag_drain); +EXPORT_SYMBOL(__page_frag_cache_drain); void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) @@ -3940,7 +3941,7 @@ void *page_frag_alloc(struct page_frag_cache *nc, if (unlikely(!nc->va)) { refill: - page = __page_frag_refill(nc, gfp_mask); + page = __page_frag_cache_refill(nc, gfp_mask); if (!page) return NULL; -- cgit v1.2.3 From f05714293a591038304ddae7cb0dd747bb3786cc Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 10 Jan 2017 16:58:15 -0800 Subject: mm: support anonymous stable page During developemnt for zram-swap asynchronous writeback, I found strange corruption of compressed page, resulting in: Modules linked in: zram(E) CPU: 3 PID: 1520 Comm: zramd-1 Tainted: G E 4.8.0-mm1-00320-ge0d4894c9c38-dirty #3274 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 task: ffff88007620b840 task.stack: ffff880078090000 RIP: set_freeobj.part.43+0x1c/0x1f RSP: 0018:ffff880078093ca8 EFLAGS: 00010246 RAX: 0000000000000018 RBX: ffff880076798d88 RCX: ffffffff81c408c8 RDX: 0000000000000018 RSI: 0000000000000000 RDI: 0000000000000246 RBP: ffff880078093cb0 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88005bc43030 R11: 0000000000001df3 R12: ffff880076798d88 R13: 000000000005bc43 R14: ffff88007819d1b8 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88007e380000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fc934048f20 CR3: 0000000077b01000 CR4: 00000000000406e0 Call Trace: obj_malloc+0x22b/0x260 zs_malloc+0x1e4/0x580 zram_bvec_rw+0x4cd/0x830 [zram] page_requests_rw+0x9c/0x130 [zram] zram_thread+0xe6/0x173 [zram] kthread+0xca/0xe0 ret_from_fork+0x25/0x30 With investigation, it reveals currently stable page doesn't support anonymous page. IOW, reuse_swap_page can reuse the page without waiting writeback completion so it can overwrite page zram is compressing. Unfortunately, zram has used per-cpu stream feature from v4.7. It aims for increasing cache hit ratio of scratch buffer for compressing. Downside of that approach is that zram should ask memory space for compressed page in per-cpu context which requires stricted gfp flag which could be failed. If so, it retries to allocate memory space out of per-cpu context so it could get memory this time and compress the data again, copies it to the memory space. In this scenario, zram assumes the data should never be changed but it is not true unless stable page supports. So, If the data is changed under us, zram can make buffer overrun because second compression size could be bigger than one we got in previous trial and blindly, copy bigger size object to smaller buffer which is buffer overrun. The overrun breaks zsmalloc free object chaining so system goes crash like above. I think below is same problem. https://bugzilla.suse.com/show_bug.cgi?id=997574 Unfortunately, reuse_swap_page should be atomic so that we cannot wait on writeback in there so the approach in this patch is simply return false if we found it needs stable page. Although it increases memory footprint temporarily, it happens rarely and it should be reclaimed easily althoug it happened. Also, It would be better than waiting of IO completion, which is critial path for application latency. Fixes: da9556a2367c ("zram: user per-cpu compression streams") Link: http://lkml.kernel.org/r/20161120233015.GA14113@bbox Link: http://lkml.kernel.org/r/1482366980-3782-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Hugh Dickins Cc: Sergey Senozhatsky Cc: Darrick J. Wong Cc: Takashi Iwai Cc: Hyeoncheol Lee Cc: Cc: Sangseok Lee Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 3 ++- mm/swapfile.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index 09f4be179ff3..7f47b7098b1b 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -150,8 +150,9 @@ enum { SWP_FILE = (1 << 7), /* set after swap_activate success */ SWP_AREA_DISCARD = (1 << 8), /* single-time swap area discards */ SWP_PAGE_DISCARD = (1 << 9), /* freed swap page-cluster discards */ + SWP_STABLE_WRITES = (1 << 10), /* no overwrite PG_writeback pages */ /* add others here before... */ - SWP_SCANNING = (1 << 10), /* refcount in scan_swap_map */ + SWP_SCANNING = (1 << 11), /* refcount in scan_swap_map */ }; #define SWAP_CLUSTER_MAX 32UL diff --git a/mm/swapfile.c b/mm/swapfile.c index 1c6e0321205d..4761701d1721 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -943,11 +943,25 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) count = page_trans_huge_mapcount(page, total_mapcount); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); - if (count == 1 && !PageWriteback(page)) { + if (count != 1) + goto out; + if (!PageWriteback(page)) { delete_from_swap_cache(page); SetPageDirty(page); + } else { + swp_entry_t entry; + struct swap_info_struct *p; + + entry.val = page_private(page); + p = swap_info_get(entry); + if (p->flags & SWP_STABLE_WRITES) { + spin_unlock(&p->lock); + return false; + } + spin_unlock(&p->lock); } } +out: return count <= 1; } @@ -2448,6 +2462,10 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } + + if (bdi_cap_stable_pages_required(inode_to_bdi(inode))) + p->flags |= SWP_STABLE_WRITES; + if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { int cpu; -- cgit v1.2.3 From 575b1967e10a1f3038266244d2c7a3ca6b99fed8 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Tue, 10 Jan 2017 16:58:30 -0800 Subject: timerfd: export defines to userspace Since userspace is expected to call timerfd syscalls directly with these flags/ioctls, make sure we export them so they don't have to duplicate the values themselves. Link: http://lkml.kernel.org/r/20161219064052.7196-1-vapier@gentoo.org Signed-off-by: Mike Frysinger Acked-by: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/timerfd.h | 20 +------------------- include/uapi/linux/Kbuild | 1 + include/uapi/linux/timerfd.h | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 19 deletions(-) create mode 100644 include/uapi/linux/timerfd.h (limited to 'include') diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index bd36ce431e32..bab0b1ad0613 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -8,23 +8,7 @@ #ifndef _LINUX_TIMERFD_H #define _LINUX_TIMERFD_H -/* For O_CLOEXEC and O_NONBLOCK */ -#include - -/* For _IO helpers */ -#include - -/* - * CAREFUL: Check include/asm-generic/fcntl.h when defining - * new flags, since they might collide with O_* ones. We want - * to re-use O_* flags that couldn't possibly have a meaning - * from eventfd, in order to leave a free define-space for - * shared O_* flags. - */ -#define TFD_TIMER_ABSTIME (1 << 0) -#define TFD_TIMER_CANCEL_ON_SET (1 << 1) -#define TFD_CLOEXEC O_CLOEXEC -#define TFD_NONBLOCK O_NONBLOCK +#include #define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) /* Flags for timerfd_create. */ @@ -32,6 +16,4 @@ /* Flags for timerfd_settime. */ #define TFD_SETTIME_FLAGS (TFD_TIMER_ABSTIME | TFD_TIMER_CANCEL_ON_SET) -#define TFD_IOC_SET_TICKS _IOW('T', 0, u64) - #endif /* _LINUX_TIMERFD_H */ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index a8b93e685239..f330ba4547cf 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -414,6 +414,7 @@ header-y += telephony.h header-y += termios.h header-y += thermal.h header-y += time.h +header-y += timerfd.h header-y += times.h header-y += timex.h header-y += tiocl.h diff --git a/include/uapi/linux/timerfd.h b/include/uapi/linux/timerfd.h new file mode 100644 index 000000000000..6fcfaa8da173 --- /dev/null +++ b/include/uapi/linux/timerfd.h @@ -0,0 +1,36 @@ +/* + * include/linux/timerfd.h + * + * Copyright (C) 2007 Davide Libenzi + * + */ + +#ifndef _UAPI_LINUX_TIMERFD_H +#define _UAPI_LINUX_TIMERFD_H + +#include + +/* For O_CLOEXEC and O_NONBLOCK */ +#include + +/* For _IO helpers */ +#include + +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + * + * Also make sure to update the masks in include/linux/timerfd.h + * when adding new flags. + */ +#define TFD_TIMER_ABSTIME (1 << 0) +#define TFD_TIMER_CANCEL_ON_SET (1 << 1) +#define TFD_CLOEXEC O_CLOEXEC +#define TFD_NONBLOCK O_NONBLOCK + +#define TFD_IOC_SET_TICKS _IOW('T', 0, __u64) + +#endif /* _UAPI_LINUX_TIMERFD_H */ -- cgit v1.2.3