From 15d9f3d116c02a485441d758d9ca0a2e4f3b30be Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Thu, 15 Feb 2018 10:08:14 -0600 Subject: percpu: match chunk allocator declarations with definitions At some point the function declaration parameters got out of sync with the function definitions in percpu-vm.c and percpu-km.c. This patch makes them match again. Signed-off-by: Dennis Zhou Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 50e7fdf84055..e1ea41002173 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1277,8 +1277,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, + int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(void); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); -- cgit v1.2.3 From 47504ee04b9241548ae2c28be7d0b01cff3b7aa6 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:07:19 -0600 Subject: percpu: add __GFP_NORETRY semantics to the percpu balancing path Percpu memory using the vmalloc area based chunk allocator lazily populates chunks by first requesting the full virtual address space required for the chunk and subsequently adding pages as allocations come through. To ensure atomic allocations can succeed, a workqueue item is used to maintain a minimum number of empty pages. In certain scenarios, such as reported in [1], it is possible that physical memory becomes quite scarce which can result in either a rather long time spent trying to find free pages or worse, a kernel panic. This patch adds support for __GFP_NORETRY and __GFP_NOWARN passing them through to the underlying allocators. This should prevent any unnecessary panics potentially caused by the workqueue item. The passing of gfp around is as additional flags rather than a full set of flags. The next patch will change these to caller passed semantics. V2: Added const modifier to gfp flags in the balance path. Removed an extra whitespace. 
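As an illustrative aside (not part of the patch; struct foo is a hypothetical type): the balance work item matters because atomic percpu allocations cannot populate pages themselves and can only be served from pages the worker has already populated, e.g.:

    /* atomic context: gfp lacks GFP_KERNEL, so pcpu_alloc() must not block */
    struct foo __percpu *p = alloc_percpu_gfp(struct foo, GFP_NOWAIT);
    if (!p)
        return -ENOMEM; /* fails if the worker could not keep enough pages populated */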
[1] https://lkml.org/lkml/2018/2/12/551 Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 8 ++++---- mm/percpu-vm.c | 18 +++++++++++------- mm/percpu.c | 44 +++++++++++++++++++++++++++----------------- 3 files changed, 42 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index d2a76642c4ae..0d88d7bd5706 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -34,7 +34,7 @@ #include static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { return 0; } @@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, /* nada */ } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; struct pcpu_chunk *chunk; struct page *pages; int i; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; - pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 9158e5a81391..0af71eb2fff0 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size); + pages = pcpu_mem_zalloc(pages_size, 0); return pages; } @@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_chunk *chunk, * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() * @page_start: page index of the first page to be allocated * @page_end: page index of the last page to be allocated + 1 + * @gfp: allocation flags passed to the underlying allocator * * Allocate pages [@page_start,@page_end) into @pages for all units. * The allocation is for @chunk. Percpu core doesn't care about the * content of @pages and will pass it verbatim to pcpu_map_pages(). */ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, - struct page **pages, int page_start, int page_end) + struct page **pages, int page_start, int page_end, + gfp_t gfp) { - const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM; unsigned int cpu, tcpu; int i; + gfp |= GFP_KERNEL | __GFP_HIGHMEM; + for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; @@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * @chunk: chunk of interest * @page_start: the start page * @page_end: the end page + * @gfp: allocation flags passed to the underlying memory allocator * * For each cpu, populate and map pages [@page_start,@page_end) into * @chunk. @@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct pcpu_chunk *chunk, * pcpu_alloc_mutex, does GFP_KERNEL allocation. 
*/ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end) + int page_start, int page_end, gfp_t gfp) { struct page **pages; @@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, if (!pages) return -ENOMEM; - if (pcpu_alloc_pages(chunk, pages, page_start, page_end)) + if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp)) return -ENOMEM; if (pcpu_map_pages(chunk, pages, page_start, page_end)) { @@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, pcpu_free_pages(chunk, pages, page_start, page_end); } -static struct pcpu_chunk *pcpu_create_chunk(void) +static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; struct vm_struct **vms; - chunk = pcpu_alloc_chunk(); + chunk = pcpu_alloc_chunk(gfp); if (!chunk) return NULL; diff --git a/mm/percpu.c b/mm/percpu.c index e1ea41002173..f97443d488a8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate + * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, - * kzalloc() is used; otherwise, vzalloc() is used. The returned - * memory is always zeroed. + * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. + * This is to facilitate passing through whitelisted flags. The + * returned memory is always zeroed. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_mem_zalloc(size_t size) +static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, GFP_KERNEL); + return kzalloc(size, gfp | GFP_KERNEL); else - return vzalloc(size); + return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, + PAGE_KERNEL); } /** @@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, return chunk; } -static struct pcpu_chunk *pcpu_alloc_chunk(void) +static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; - chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size); + chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; @@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * - sizeof(chunk->alloc_map[0])); + sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * - sizeof(chunk->bound_map[0])); + sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * - sizeof(chunk->md_blocks[0])); + sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; @@ -1278,10 +1281,10 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, - int page_start, int page_end); + int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); -static struct pcpu_chunk *pcpu_create_chunk(void); +static 
struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); @@ -1423,7 +1426,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(0); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1452,7 +1455,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re); + ret = pcpu_populate_chunk(chunk, rs, re, 0); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1563,10 +1566,17 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * - * Reclaim all fully free chunks except for the first one. + * Reclaim all fully free chunks except for the first one. This is also + * responsible for maintaining the pool of empty populated pages. However, + * it is possible that this is called when physical memory is scarce causing + * OOM killer to be triggered. We should avoid doing so until an actual + * allocation causes the failure as it is possible that requests can be + * serviced from already backed regions. */ static void pcpu_balance_workfn(struct work_struct *work) { + /* gfp flags passed to underlying allocators */ + const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; @@ -1647,7 +1657,7 @@ retry_pop: chunk->nr_pages) { int nr = min(re - rs, nr_to_pop); - ret = pcpu_populate_chunk(chunk, rs, rs + nr); + ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); @@ -1664,7 +1674,7 @@ retry_pop: if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ - chunk = pcpu_create_chunk(); + chunk = pcpu_create_chunk(gfp); if (chunk) { spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); -- cgit v1.2.3 From 554fef1c39ee148623a496e04569dabb11463406 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Fri, 16 Feb 2018 12:09:58 -0600 Subject: percpu: allow select gfp to be passed to underlying allocators The prior patch added support for passing gfp flags through to the underlying allocators. This patch allows users to pass along gfp flags (currently only __GFP_NORETRY and __GFP_NOWARN) to the underlying allocators. This should allow users to decide if they are ok with failing allocations recovering in a more graceful way. Additionally, gfp passing was done as additional flags in the previous patch. Instead, change this to caller passed semantics. GFP_KERNEL is also removed as the default flag. It continues to be used for internally caused underlying percpu allocations. V2: Removed gfp_percpu_mask in favor of doing it inline. Removed GFP_KERNEL as a default flag for __alloc_percpu_gfp. 
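A brief usage sketch (illustration only; struct my_stats is a made-up type): with caller-passed semantics, a caller that prefers a fast failure over aggressive reclaim can request it directly:

    struct my_stats __percpu *stats;

    stats = alloc_percpu_gfp(struct my_stats,
                             GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN);
    if (!stats)
        return -ENOMEM; /* recover gracefully instead of pushing reclaim toward the OOM killer */

    /* ... */
    free_percpu(stats);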
Signed-off-by: Dennis Zhou Suggested-by: Daniel Borkmann Acked-by: Christoph Lameter Signed-off-by: Tejun Heo --- mm/percpu-km.c | 2 +- mm/percpu-vm.c | 4 ++-- mm/percpu.c | 16 +++++++--------- 3 files changed, 10 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/percpu-km.c b/mm/percpu-km.c index 0d88d7bd5706..38de70ab1a0d 100644 --- a/mm/percpu-km.c +++ b/mm/percpu-km.c @@ -56,7 +56,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp) if (!chunk) return NULL; - pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages)); + pages = alloc_pages(gfp, order_base_2(nr_pages)); if (!pages) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 0af71eb2fff0..d8078de912de 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void) lockdep_assert_held(&pcpu_alloc_mutex); if (!pages) - pages = pcpu_mem_zalloc(pages_size, 0); + pages = pcpu_mem_zalloc(pages_size, GFP_KERNEL); return pages; } @@ -86,7 +86,7 @@ static int pcpu_alloc_pages(struct pcpu_chunk *chunk, unsigned int cpu, tcpu; int i; - gfp |= GFP_KERNEL | __GFP_HIGHMEM; + gfp |= __GFP_HIGHMEM; for_each_possible_cpu(cpu) { for (i = page_start; i < page_end; i++) { diff --git a/mm/percpu.c b/mm/percpu.c index f97443d488a8..fa3f854634a1 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -454,9 +454,6 @@ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * - * CONTEXT: - * Does GFP_KERNEL allocation. - * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ @@ -466,10 +463,9 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) return NULL; if (size <= PAGE_SIZE) - return kzalloc(size, gfp | GFP_KERNEL); + return kzalloc(size, gfp); else - return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, - PAGE_KERNEL); + return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL); } /** @@ -1344,6 +1340,8 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { + /* whitelisted flags that can be passed to the backing allocators */ + gfp_t pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; bool do_warn = !(gfp & __GFP_NOWARN); static int warn_limit = 10; @@ -1426,7 +1424,7 @@ restart: } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { - chunk = pcpu_create_chunk(0); + chunk = pcpu_create_chunk(pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; @@ -1455,7 +1453,7 @@ area_found: page_start, page_end) { WARN_ON(chunk->immutable); - ret = pcpu_populate_chunk(chunk, rs, re, 0); + ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { @@ -1576,7 +1574,7 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) static void pcpu_balance_workfn(struct work_struct *work) { /* gfp flags passed to underlying allocators */ - const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN; + const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; -- cgit v1.2.3 From accd4f36a7d11c2d54544007eb65e10604dcf2f5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 23 Feb 2018 08:12:42 -0800 Subject: percpu: add a schedule point in pcpu_balance_workfn() When a large BPF percpu map is destroyed, I have 
seen pcpu_balance_workfn() holding the CPU for hundreds of milliseconds. On a KASAN config with 112 hyperthreads, the average time to destroy a chunk is ~4 ms. [ 2489.841376] destroy chunk 1 in 4148689 ns ... [ 2490.093428] destroy chunk 32 in 4072718 ns Signed-off-by: Eric Dumazet Signed-off-by: Tejun Heo --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index fa3f854634a1..36e7b65ba6cf 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1610,6 +1610,7 @@ static void pcpu_balance_workfn(struct work_struct *work) spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); + cond_resched(); } /* -- cgit v1.2.3 From 3e04040df6d4613a8af5a80882d5f7f298f49810 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 14 Mar 2018 19:29:37 +0000 Subject: Revert "mm/page_alloc: fix memmap_init_zone pageblock alignment" This reverts commit 864b75f9d6b0100bb24fdd9a20d156e7cda9b5ae. Commit 864b75f9d6b0 ("mm/page_alloc: fix memmap_init_zone pageblock alignment") modified the logic in memmap_init_zone() to initialize struct pages associated with invalid PFNs, to appease a VM_BUG_ON() in move_freepages(), which is redundant by its own admission, and dereferences struct page fields to obtain the zone without checking whether the struct pages in question are valid to begin with. Commit 864b75f9d6b0 only makes it worse, since the rounding it does may cause pfn to assume the same value it had in a prior iteration of the loop, resulting in an infinite loop and a hang very early in the boot. Also, since it doesn't perform the same rounding on start_pfn itself but only on intermediate values following an invalid PFN, we may still hit the same VM_BUG_ON() as before. So instead, let's fix this at the core, and ensure that the BUG check doesn't dereference struct page fields of invalid pages. Fixes: 864b75f9d6b0 ("mm/page_alloc: fix memmap_init_zone pageblock alignment") Tested-by: Jan Glauber Tested-by: Shanker Donthineni Cc: Daniel Vacek Cc: Mel Gorman Cc: Michal Hocko Cc: Paul Burton Cc: Pavel Tatashin Cc: Vlastimil Babka Cc: Andrew Morton Signed-off-by: Ard Biesheuvel Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3d974cb2a1a1..635d7dd29d7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1910,7 +1910,9 @@ static int move_freepages(struct zone *zone, * Remove at a later date when no bug reports exist related to * grouping pages by mobility */ - VM_BUG_ON(page_zone(start_page) != page_zone(end_page)); + VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && + pfn_valid(page_to_pfn(end_page)) && + page_zone(start_page) != page_zone(end_page)); #endif if (num_movable) @@ -5359,14 +5361,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, /* * Skip to the pfn preceding the next valid one (or * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. Note that it needs - * to be pageblock aligned even when the region itself - * is not. move_freepages_block() can shift ahead of - * the valid region but still depends on correct page - * metadata. + * on our next iteration of the loop.
*/ - pfn = (memblock_next_valid_pfn(pfn, end_pfn) & - ~(pageblock_nr_pages-1)) - 1; + pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; #endif continue; } -- cgit v1.2.3 From 71546d100422bcc2c543dadeb9328728997cd23a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 14 Mar 2018 08:27:26 -0700 Subject: percpu: include linux/sched.h for cond_resched() The microblaze build broke due to a missing declaration for the recently added cond_resched() invocation. Let's include linux/sched.h explicitly. Signed-off-by: Tejun Heo Reported-by: kbuild test robot --- mm/percpu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 36e7b65ba6cf..15a398c00791 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -80,6 +80,7 @@ #include #include #include +#include <linux/sched.h> #include #include -- cgit v1.2.3 From f52ba1fef7b92e74d58efef8eae7b6f48c6d218d Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Mon, 19 Mar 2018 18:32:10 +0300 Subject: mm: Allow to kill tasks doing pcpu_alloc() and waiting for pcpu_balance_workfn() In case of memory deficit and low percpu memory pages, pcpu_balance_workfn() takes pcpu_alloc_mutex for a long time (as it makes memory allocations itself and waits for memory reclaim). If tasks doing pcpu_alloc() are chosen by the OOM killer, they can't exit, because they are waiting for the mutex. The patch makes pcpu_alloc() care about the killing signal and use mutex_lock_killable() when the GFP flags allow it. This guarantees that a task does not miss SIGKILL from the OOM killer. Signed-off-by: Kirill Tkhai Signed-off-by: Tejun Heo --- mm/percpu.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 15a398c00791..9297098519a6 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1373,8 +1373,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } - if (!is_atomic) - mutex_lock(&pcpu_alloc_mutex); + if (!is_atomic) { + /* + * pcpu_balance_workfn() allocates memory under this mutex, + * and it may wait for memory reclaim. Allow current task + * to become OOM victim, in case of memory pressure. + */ + if (gfp & __GFP_NOFAIL) + mutex_lock(&pcpu_alloc_mutex); + else if (mutex_lock_killable(&pcpu_alloc_mutex)) + return NULL; + } spin_lock_irqsave(&pcpu_lock, flags); -- cgit v1.2.3 From 8970a63e965b43288c4f5f40efbc2bbf80de7f16 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Thu, 22 Mar 2018 16:17:02 -0700 Subject: mm/mempolicy.c: avoid use uninitialized preferred_node Alexander reported a use of uninitialized memory in __mpol_equal(), which is caused by incorrect use of preferred_node. When a mempolicy is in mode MPOL_PREFERRED with the MPOL_F_LOCAL flag, it uses numa_node_id() instead of preferred_node; however, __mpol_equal() uses preferred_node without checking whether MPOL_F_LOCAL is set.
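For illustration only (a userspace sketch, not part of the patch): such a policy typically arises when a task requests MPOL_PREFERRED with an empty nodemask, which the kernel treats as local allocation and marks with MPOL_F_LOCAL, so preferred_node is never written:

    #include <numaif.h>

    /* empty nodemask: local allocation (MPOL_F_LOCAL); preferred_node stays uninitialized */
    set_mempolicy(MPOL_PREFERRED, NULL, 0);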
[akpm@linux-foundation.org: slight comment tweak] Link: http://lkml.kernel.org/r/4ebee1c2-57f6-bcb8-0e2d-1833d1ee0bb7@huawei.com Fixes: fc36b8d3d819 ("mempolicy: use MPOL_F_LOCAL to Indicate Preferred Local Policy") Signed-off-by: Yisheng Xie Reported-by: Alexander Potapenko Tested-by: Alexander Potapenko Reviewed-by: Andrew Morton Cc: Dmitriy Vyukov Cc: Vlastimil Babka Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d879f1d8a44a..32cba0332787 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2124,6 +2124,9 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: + /* a's ->flags is the same as b's */ + if (a->flags & MPOL_F_LOCAL) + return true; return a->v.preferred_node == b->v.preferred_node; default: BUG(); -- cgit v1.2.3 From 2e517d681632326ed98399cb4dd99519efe3e32c Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 22 Mar 2018 16:17:10 -0700 Subject: lockdep: fix fs_reclaim warning Dave Jones reported fs_reclaim lockdep warnings. ============================================ WARNING: possible recursive locking detected 4.15.0-rc9-backup-debug+ #1 Not tainted -------------------------------------------- sshd/24800 is trying to acquire lock: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 but task is already holding lock: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(fs_reclaim); lock(fs_reclaim); *** DEADLOCK *** May be due to missing lock nesting notation 2 locks held by sshd/24800: #0: (sk_lock-AF_INET6){+.+.}, at: [<000000001a069652>] tcp_sendmsg+0x19/0x40 #1: (fs_reclaim){+.+.}, at: [<0000000084f438c2>] fs_reclaim_acquire.part.102+0x5/0x30 stack backtrace: CPU: 3 PID: 24800 Comm: sshd Not tainted 4.15.0-rc9-backup-debug+ #1 Call Trace: dump_stack+0xbc/0x13f __lock_acquire+0xa09/0x2040 lock_acquire+0x12e/0x350 fs_reclaim_acquire.part.102+0x29/0x30 kmem_cache_alloc+0x3d/0x2c0 alloc_extent_state+0xa7/0x410 __clear_extent_bit+0x3ea/0x570 try_release_extent_mapping+0x21a/0x260 __btrfs_releasepage+0xb0/0x1c0 btrfs_releasepage+0x161/0x170 try_to_release_page+0x162/0x1c0 shrink_page_list+0x1d5a/0x2fb0 shrink_inactive_list+0x451/0x940 shrink_node_memcg.constprop.88+0x4c9/0x5e0 shrink_node+0x12d/0x260 try_to_free_pages+0x418/0xaf0 __alloc_pages_slowpath+0x976/0x1790 __alloc_pages_nodemask+0x52c/0x5c0 new_slab+0x374/0x3f0 ___slab_alloc.constprop.81+0x47e/0x5a0 __slab_alloc.constprop.80+0x32/0x60 __kmalloc_track_caller+0x267/0x310 __kmalloc_reserve.isra.40+0x29/0x80 __alloc_skb+0xee/0x390 sk_stream_alloc_skb+0xb8/0x340 tcp_sendmsg_locked+0x8e6/0x1d30 tcp_sendmsg+0x27/0x40 inet_sendmsg+0xd0/0x310 sock_write_iter+0x17a/0x240 __vfs_write+0x2ab/0x380 vfs_write+0xfb/0x260 SyS_write+0xb6/0x140 do_syscall_64+0x1e5/0xc05 entry_SYSCALL64_slow_path+0x25/0x25 This warning is caused by commit d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation") which replaced the use of lockdep_{set,clear}_current_reclaim_state() in __perform_reclaim() and lockdep_trace_alloc() in slab_pre_alloc_hook() with fs_reclaim_acquire()/ fs_reclaim_release(). 
Since __kmalloc_reserve() from __alloc_skb() adds __GFP_NOMEMALLOC | __GFP_NOWARN to gfp_mask, and the whole reclaim path simply propagates __GFP_NOMEMALLOC, fs_reclaim_acquire() in slab_pre_alloc_hook() is trying to grab the 'fake' lock again when __perform_reclaim() already grabbed the 'fake' lock. The /* this guy won't enter reclaim */ if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) return false; test which causes slab_pre_alloc_hook() to try to grab the 'fake' lock was added by commit cf40bd16fdad ("lockdep: annotate reclaim context (__GFP_NOFS)"). But that test is outdated because a PF_MEMALLOC thread won't enter reclaim regardless of __GFP_NOMEMALLOC after commit 341ce06f69ab ("page allocator: calculate the alloc_flags for allocation only once") added the PF_MEMALLOC safeguard ( /* Avoid recursion of direct reclaim */ if (p->flags & PF_MEMALLOC) goto nopage; in __alloc_pages_slowpath()). Thus, let's fix the outdated test by removing the __GFP_NOMEMALLOC check and allowing __need_fs_reclaim() to return false. Link: http://lkml.kernel.org/r/201802280650.FJC73911.FOSOMLJVFFQtHO@I-love.SAKURA.ne.jp Fixes: d92a8cfcb37ecd13 ("locking/lockdep: Rework FS_RECLAIM annotation") Signed-off-by: Tetsuo Handa Reported-by: Dave Jones Tested-by: Dave Jones Cc: Peter Zijlstra Cc: Nick Piggin Cc: Ingo Molnar Cc: Nikolay Borisov Cc: Michal Hocko Cc: [4.14+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 635d7dd29d7f..010dee0f089e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3596,7 +3596,7 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return false; /* this guy won't enter reclaim */ - if ((current->flags & PF_MEMALLOC) && !(gfp_mask & __GFP_NOMEMALLOC)) + if (current->flags & PF_MEMALLOC) return false; /* We're only interested __GFP_FS allocations for now */ -- cgit v1.2.3 From 63489f8e821144000e0bdca7e65a8d1cc23a7ee7 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 22 Mar 2018 16:17:13 -0700 Subject: hugetlbfs: check for pgoff value overflow A vma with vm_pgoff large enough to overflow a loff_t type when converted to a byte offset can be passed via the remap_file_pages system call. The hugetlbfs mmap routine uses the byte offset to calculate reservations and file size. A sequence such as: mmap(0x20a00000, 0x600000, 0, 0x66033, -1, 0); remap_file_pages(0x20a00000, 0x600000, 0, 0x20000000000000, 0); will result in the following when the task exits or the file is closed: kernel BUG at mm/hugetlb.c:749! Call Trace: hugetlbfs_evict_inode+0x2f/0x40 evict+0xcb/0x190 __dentry_kill+0xcb/0x150 __fput+0x164/0x1e0 task_work_run+0x84/0xa0 exit_to_usermode_loop+0x7d/0x80 do_syscall_64+0x18b/0x190 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 The overflowed pgoff value causes hugetlbfs to try to set up a mapping with a negative range (end < start) that leaves invalid state, which causes the BUG. The previous overflow fix to this code was incomplete and did not take the remap_file_pages system call into account. [mike.kravetz@oracle.com: v3] Link: http://lkml.kernel.org/r/20180309002726.7248-1-mike.kravetz@oracle.com [akpm@linux-foundation.org: include mmdebug.h] [akpm@linux-foundation.org: fix -ve left shift count on sh] Link: http://lkml.kernel.org/r/20180308210502.15952-1-mike.kravetz@oracle.com Fixes: 045c7a3f53d9 ("hugetlbfs: fix offset overflow in hugetlbfs mmap") Signed-off-by: Mike Kravetz Reported-by: Nic Losby Acked-by: Michal Hocko Cc: "Kirill A . 
Shutemov" Cc: Yisheng Xie Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 17 ++++++++++++++--- mm/hugetlb.c | 7 +++++++ 2 files changed, 21 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8fe1b0aa2896..b9a254dcc0e7 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,6 +108,16 @@ static void huge_pagevec_release(struct pagevec *pvec) pagevec_reinit(pvec); } +/* + * Mask used when checking the page offset value passed in via system + * calls. This value will be converted to a loff_t which is signed. + * Therefore, we want to check the upper PAGE_SHIFT + 1 bits of the + * value. The extra bit (- 1 in the shift value) is to take the sign + * bit into account. + */ +#define PGOFF_LOFFT_MAX \ + (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) + static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); @@ -127,12 +137,13 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &hugetlb_vm_ops; /* - * Offset passed to mmap (before page shift) could have been - * negative when represented as a (l)off_t. + * page based offset in vm_pgoff could be sufficiently large to + * overflow a (l)off_t when converted to byte offset. */ - if (((loff_t)vma->vm_pgoff << PAGE_SHIFT) < 0) + if (vma->vm_pgoff & PGOFF_LOFFT_MAX) return -EINVAL; + /* must be huge page aligned */ if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a963f2034dfc..976bbc5646fe 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -4374,6 +4375,12 @@ int hugetlb_reserve_pages(struct inode *inode, struct resv_map *resv_map; long gbl_reserve; + /* This should never happen */ + if (from > to) { + VM_WARN(1, "%s called with a negative range\n", __func__); + return -EINVAL; + } + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page -- cgit v1.2.3 From fece2029a9e65b9a990831afe2a2b83290cbbe26 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:28 -0700 Subject: mm/khugepaged.c: convert VM_BUG_ON() to collapse fail khugepaged is not yet able to convert PTE-mapped huge pages back to PMD mapped. We do not collapse such pages. See check khugepaged_scan_pmd(). But if between khugepaged_scan_pmd() and __collapse_huge_page_isolate() somebody managed to instantiate THP in the range and then split the PMD back to PTEs we would have a problem -- VM_BUG_ON_PAGE(PageCompound(page)) will get triggered. It's possible since we drop mmap_sem during collapse to re-take for write. Replace the VM_BUG_ON() with graceful collapse fail. Link: http://lkml.kernel.org/r/20180315152353.27989-1-kirill.shutemov@linux.intel.com Fixes: b1caa957ae6d ("khugepaged: ignore pmd tables with THP mapped with ptes") Signed-off-by: Kirill A. 
Shutemov Cc: Laura Abbott Cc: Jerome Marchand Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b7e2268dfc9a..c15da1ea7e63 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -530,7 +530,12 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } - VM_BUG_ON_PAGE(PageCompound(page), page); + /* TODO: teach khugepaged to collapse THP mapped with pte */ + if (PageCompound(page)) { + result = SCAN_PAGE_COMPOUND; + goto out; + } + VM_BUG_ON_PAGE(!PageAnon(page), page); /* -- cgit v1.2.3 From fa41b900c30b45fab03783724932dc30cd46a6be Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:31 -0700 Subject: mm/thp: do not wait for lock_page() in deferred_split_scan() deferred_split_scan() gets called from the reclaim path. Waiting for the page lock may lead to a deadlock there. Replace lock_page() with trylock_page() and skip the page if we failed to lock it. We will get to the page on the next scan. Link: http://lkml.kernel.org/r/20180315150747.31945-1-kirill.shutemov@linux.intel.com Fixes: 9a982250f773 ("thp: introduce deferred_split_huge_page()") Signed-off-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 87ab9b8f56b5..529cf36b7edb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2783,11 +2783,13 @@ static unsigned long deferred_split_scan(struct shrinker *shrink, list_for_each_safe(pos, next, &list) { page = list_entry((void *)pos, struct page, mapping); - lock_page(page); + if (!trylock_page(page)) + goto next; /* split_huge_page() removes page from list on success */ if (!split_huge_page(page)) split++; unlock_page(page); +next: put_page(page); } -- cgit v1.2.3 From b3cd54b257ad95d344d121dc563d943ca39b0921 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 22 Mar 2018 16:17:35 -0700 Subject: mm/shmem: do not wait for lock_page() in shmem_unused_huge_shrink() shmem_unused_huge_shrink() gets called from the reclaim path. Waiting for the page lock may lead to a deadlock there. There was a bug report that may be attributed to this: http://lkml.kernel.org/r/alpine.LRH.2.11.1801242349220.30642@mail.ewheeler.net Replace lock_page() with trylock_page() and skip the page if we failed to lock it. We will get to the page on the next scan. We can test PageTransHuge() outside the page lock as we only need protection against splitting the page under us. Holding a pin on the page is enough for this. Link: http://lkml.kernel.org/r/20180316210830.43738-1-kirill.shutemov@linux.intel.com Fixes: 779750d20b93 ("shmem: split huge pages beyond i_size under memory pressure") Signed-off-by: Kirill A. 
Shutemov Reported-by: Eric Wheeler Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Tetsuo Handa Cc: Hugh Dickins Cc: [4.8+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 1907688b75ee..b85919243399 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -493,36 +493,45 @@ next: info = list_entry(pos, struct shmem_inode_info, shrinklist); inode = &info->vfs_inode; - if (nr_to_split && split >= nr_to_split) { - iput(inode); - continue; - } + if (nr_to_split && split >= nr_to_split) + goto leave; - page = find_lock_page(inode->i_mapping, + page = find_get_page(inode->i_mapping, (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT); if (!page) goto drop; + /* No huge page at the end of the file: nothing to split */ if (!PageTransHuge(page)) { - unlock_page(page); put_page(page); goto drop; } + /* + * Leave the inode on the list if we failed to lock + * the page at this time. + * + * Waiting for the lock may lead to deadlock in the + * reclaim path. + */ + if (!trylock_page(page)) { + put_page(page); + goto leave; + } + ret = split_huge_page(page); unlock_page(page); put_page(page); - if (ret) { - /* split failed: leave it on the list */ - iput(inode); - continue; - } + /* If split failed leave the inode on the list */ + if (ret) + goto leave; split++; drop: list_del_init(&info->shrinklist); removed++; +leave: iput(inode); } -- cgit v1.2.3 From f59f1caf72ba00d519c793c3deb32cd3be32edc2 Mon Sep 17 00:00:00 2001 From: Daniel Vacek Date: Thu, 22 Mar 2018 16:17:38 -0700 Subject: Revert "mm: page_alloc: skip over regions of invalid pfns where possible" This reverts commit b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible"). The commit is meant to be a boot init speed up skipping the loop in memmap_init_zone() for invalid pfns. But given some specific memory mapping on x86_64 (or more generally theoretically anywhere but on arm with CONFIG_HAVE_ARCH_PFN_VALID) the implementation also skips valid pfns which is plain wrong and causes 'kernel BUG at mm/page_alloc.c:1389!' crash> log | grep -e BUG -e RIP -e Call.Trace -e move_freepages_block -e rmqueue -e freelist -A1 kernel BUG at mm/page_alloc.c:1389! invalid opcode: 0000 [#1] SMP -- RIP: 0010: move_freepages+0x15e/0x160 -- Call Trace: move_freepages_block+0x73/0x80 __rmqueue+0x263/0x460 get_page_from_freelist+0x7e1/0x9e0 __alloc_pages_nodemask+0x176/0x420 -- crash> page_init_bug -v | grep RAM 1000 - 9bfff System RAM (620.00 KiB) 100000 - 430bffff System RAM ( 1.05 GiB = 1071.75 MiB = 1097472.00 KiB) 4b0c8000 - 4bf9cfff System RAM ( 14.83 MiB = 15188.00 KiB) 4bfac000 - 646b1fff System RAM (391.02 MiB = 400408.00 KiB) 7b788000 - 7b7fffff System RAM (480.00 KiB) 100000000 - 67fffffff System RAM ( 22.00 GiB) crash> page_init_bug | head -6 7b788000 - 7b7fffff System RAM (480.00 KiB) 1fffff00000000 0 1 DMA32 4096 1048575 505736 505344 505855 0 0 0 DMA 1 4095 1fffff00000400 0 1 DMA32 4096 1048575 BUG, zones differ! 
crash> kmem -p 77fff000 78000000 7b5ff000 7b600000 7b787000 7b788000 PAGE PHYSICAL MAPPING INDEX CNT FLAGS ffffea0001e00000 78000000 0 0 0 0 ffffea0001ed7fc0 7b5ff000 0 0 0 0 ffffea0001ed8000 7b600000 0 0 0 0 <<<< ffffea0001ede1c0 7b787000 0 0 0 0 ffffea0001ede200 7b788000 0 0 1 1fffff00000000 Link: http://lkml.kernel.org/r/20180316143855.29838-1-neelx@redhat.com Fixes: b92df1de5d28 ("mm: page_alloc: skip over regions of invalid pfns where possible") Signed-off-by: Daniel Vacek Acked-by: Ard Biesheuvel Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Mel Gorman Cc: Pavel Tatashin Cc: Paul Burton Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 1 - mm/memblock.c | 28 ---------------------------- mm/page_alloc.c | 11 +---------- 3 files changed, 1 insertion(+), 39 deletions(-) (limited to 'mm') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 8be5077efb5f..f92ea7783652 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -187,7 +187,6 @@ int memblock_search_pfn_nid(unsigned long pfn, unsigned long *start_pfn, unsigned long *end_pfn); void __next_mem_pfn_range(int *idx, int nid, unsigned long *out_start_pfn, unsigned long *out_end_pfn, int *out_nid); -unsigned long memblock_next_valid_pfn(unsigned long pfn, unsigned long max_pfn); /** * for_each_mem_pfn_range - early memory pfn range iterator diff --git a/mm/memblock.c b/mm/memblock.c index b6ba6b7adadc..48376bd33274 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1101,34 +1101,6 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid, *out_nid = r->nid; } -unsigned long __init_memblock memblock_next_valid_pfn(unsigned long pfn, - unsigned long max_pfn) -{ - struct memblock_type *type = &memblock.memory; - unsigned int right = type->cnt; - unsigned int mid, left = 0; - phys_addr_t addr = PFN_PHYS(++pfn); - - do { - mid = (right + left) / 2; - - if (addr < type->regions[mid].base) - right = mid; - else if (addr >= (type->regions[mid].base + - type->regions[mid].size)) - left = mid + 1; - else { - /* addr is within the region, so pfn is valid */ - return pfn; - } - } while (left < right); - - if (right == type->cnt) - return -1UL; - else - return PHYS_PFN(type->regions[right].base); -} - /** * memblock_set_node - set node ID on memblock regions * @base: base of area to set node ID for diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 010dee0f089e..1741dd23e7c1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5356,17 +5356,8 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, if (context != MEMMAP_EARLY) goto not_early; - if (!early_pfn_valid(pfn)) { -#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP - /* - * Skip to the pfn preceding the next valid one (or - * end_pfn), such that we hit a valid pfn (or end_pfn) - * on our next iteration of the loop. - */ - pfn = memblock_next_valid_pfn(pfn, end_pfn) - 1; -#endif + if (!early_pfn_valid(pfn)) continue; - } if (!early_pfn_in_nid(pfn, nid)) continue; if (!update_defer_init(pgdat, pfn, end_pfn, &nr_initialised)) -- cgit v1.2.3 From 1c610d5f93c709df56787f50b3576704ac271826 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 22 Mar 2018 16:17:42 -0700 Subject: mm/vmscan: wake up flushers for legacy cgroups too Commit 726d061fbd36 ("mm: vmscan: kick flushers when we encounter dirty pages on the LRU") added flusher invocation to shrink_inactive_list() when many dirty pages on the LRU are encountered. 
However, shrink_inactive_list() doesn't wake up flushers for legacy cgroup reclaim, so the next commit bbef938429f5 ("mm: vmscan: remove old flusher wakeup from direct reclaim path") removed the only source of flusher wakeup in the legacy memcg reclaim path. This leads to a premature OOM if there are too many dirty pages in the cgroup: # mkdir /sys/fs/cgroup/memory/test # echo $$ > /sys/fs/cgroup/memory/test/tasks # echo 50M > /sys/fs/cgroup/memory/test/memory.limit_in_bytes # dd if=/dev/zero of=tmp_file bs=1M count=100 Killed dd invoked oom-killer: gfp_mask=0x14000c0(GFP_KERNEL), nodemask=(null), order=0, oom_score_adj=0 Call Trace: dump_stack+0x46/0x65 dump_header+0x6b/0x2ac oom_kill_process+0x21c/0x4a0 out_of_memory+0x2a5/0x4b0 mem_cgroup_out_of_memory+0x3b/0x60 mem_cgroup_oom_synchronize+0x2ed/0x330 pagefault_out_of_memory+0x24/0x54 __do_page_fault+0x521/0x540 page_fault+0x45/0x50 Task in /test killed as a result of limit of /test memory: usage 51200kB, limit 51200kB, failcnt 73 memory+swap: usage 51200kB, limit 9007199254740988kB, failcnt 0 kmem: usage 296kB, limit 9007199254740988kB, failcnt 0 Memory cgroup stats for /test: cache:49632KB rss:1056KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:49500KB writeback:0KB swap:0KB inactive_anon:0KB active_anon:1168KB inactive_file:24760KB active_file:24960KB unevictable:0KB Memory cgroup out of memory: Kill process 3861 (bash) score 88 or sacrifice child Killed process 3876 (dd) total-vm:8484kB, anon-rss:1052kB, file-rss:1720kB, shmem-rss:0kB oom_reaper: reaped process 3876 (dd), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB Wake up flushers in legacy cgroup reclaim too. Link: http://lkml.kernel.org/r/20180315164553.17856-1-aryabinin@virtuozzo.com Fixes: bbef938429f5 ("mm: vmscan: remove old flusher wakeup from direct reclaim path") Signed-off-by: Andrey Ryabinin Tested-by: Shakeel Butt Acked-by: Michal Hocko Cc: Mel Gorman Cc: Tejun Heo Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index bee53495a829..cd5dc3faaa57 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1779,6 +1779,20 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_writeback && stat.nr_writeback == nr_taken) set_bit(PGDAT_WRITEBACK, &pgdat->flags); + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not doing their job. This can + * happen when memory pressure pushes dirty pages to the end of + * the LRU before the dirty limits are breached and the dirty + * data has expired. It can also happen when the proportion of + * dirty pages grows not through writes but through memory + * pressure reclaiming all the clean cache. And in some cases, + * the flushers simply cannot keep up with the allocation + * rate. Nudge the flusher threads in case they are asleep. + */ + if (stat.nr_unqueued_dirty == nr_taken) + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* * Legacy memcg will stall in page writeback so avoid forcibly * stalling here. @@ -1791,22 +1805,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (stat.nr_dirty && stat.nr_dirty == stat.nr_congested) set_bit(PGDAT_CONGESTED, &pgdat->flags); - /* - * If dirty pages are scanned that are not queued for IO, it - * implies that flushers are not doing their job. 
This can - * happen when memory pressure pushes dirty pages to the end of - * the LRU before the dirty limits are breached and the dirty - * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory - * pressure reclaiming all the clean cache. And in some cases, - * the flushers simply cannot keep up with the allocation - * rate. Nudge the flusher threads in case they are asleep, but - * also allow kswapd to start writing pages during reclaim. - */ - if (stat.nr_unqueued_dirty == nr_taken) { - wakeup_flusher_threads(WB_REASON_VMSCAN); + /* Allow kswapd to start writing pages during reclaim. */ + if (stat.nr_unqueued_dirty == nr_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); - } /* * If kswapd scans pages marked marked for immediate -- cgit v1.2.3 From 9d3c3354bb85bab4d865fe95039443f09a4c8394 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 22 Mar 2018 16:17:45 -0700 Subject: mm, thp: do not cause memcg oom for thp Commit 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") changed the page allocator to no longer detect thp allocations based on __GFP_NORETRY. It did not, however, modify the mem cgroup try_charge() path to avoid oom kill for either khugepaged collapsing or thp faulting. It is never expected to oom kill a process to allocate a hugepage for thp; reclaim is governed by the thp defrag mode and MADV_HUGEPAGE, but allocations (and charging) should fallback instead of oom killing processes. Link: http://lkml.kernel.org/r/alpine.DEB.2.20.1803191409420.124411@chino.kir.corp.google.com Fixes: 2516035499b9 ("mm, thp: remove __GFP_NORETRY from khugepaged and madvised allocations") Signed-off-by: David Rientjes Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Vlastimil Babka Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 5 +++-- mm/khugepaged.c | 8 ++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 529cf36b7edb..5a68730eebd6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -555,7 +555,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (mem_cgroup_try_charge(page, vma->vm_mm, gfp, &memcg, true)) { + if (mem_cgroup_try_charge(page, vma->vm_mm, gfp | __GFP_NORETRY, &memcg, + true)) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1316,7 +1317,7 @@ alloc: } if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, - huge_gfp, &memcg, true))) { + huge_gfp | __GFP_NORETRY, &memcg, true))) { put_page(new_page); split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index c15da1ea7e63..e42568284e06 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -965,7 +965,9 @@ static void collapse_huge_page(struct mm_struct *mm, goto out_nolock; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + /* Do not oom kill for khugepaged charges */ + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, + &memcg, true))) { result = SCAN_CGROUP_CHARGE_FAIL; goto out_nolock; } @@ -1324,7 +1326,9 @@ static void collapse_shmem(struct mm_struct *mm, goto out; } - if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) { + /* Do not oom kill for khugepaged charges */ + if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp | __GFP_NORETRY, + &memcg, true))) { result = 
SCAN_CGROUP_CHARGE_FAIL; goto out; } -- cgit v1.2.3
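As a closing illustration (a userspace sketch, not taken from these patches; len is a placeholder): the THP paths touched above all have a base-page fallback, so a task that opts in to huge pages may simply get 4K pages under memcg pressure instead of being OOM killed:

    #include <sys/mman.h>

    void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    madvise(buf, len, MADV_HUGEPAGE); /* request THP; a failed huge charge now falls back */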