From 13583c3d3224508582ec03d881d0b68dd3ee8e10 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 12 Dec 2016 16:41:29 -0800 Subject: mm: memcontrol: use special workqueue for creating per-memcg caches Creating a lot of cgroups at the same time might stall all worker threads with kmem cache creation works, because kmem cache creation is done with the slab_mutex held. The problem was amplified by commits 801faf0db894 ("mm/slab: lockless decision to grow cache") in case of SLAB and 81ae6d03952c ("mm/slub.c: replace kick_all_cpus_sync() with synchronize_sched() in kmem_cache_shrink()") in case of SLUB, which increased the maximal time the slab_mutex can be held. To prevent that from happening, let's use a special ordered single threaded workqueue for kmem cache creation. This shouldn't introduce any functional changes regarding how kmem caches are created, as the work function holds the global slab_mutex during its whole runtime anyway, making it impossible to run more than one work at a time. By using a single threaded workqueue, we just avoid creating a thread per each work. Ordering is required to avoid a situation when a cgroup's work is put off indefinitely because there are other cgroups to serve, in other words to guarantee fairness. Link: https://bugzilla.kernel.org/show_bug.cgi?id=172981 Link: http://lkml.kernel.org/r/20161004131417.GC1862@esperanza Signed-off-by: Vladimir Davydov Reported-by: Doug Smythies Acked-by: Michal Hocko Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f870ba43942..91dfc7c5ce8f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2154,6 +2154,8 @@ struct memcg_kmem_cache_create_work { struct work_struct work; }; +static struct workqueue_struct *memcg_kmem_cache_create_wq; + static void memcg_kmem_cache_create_func(struct work_struct *w) { struct memcg_kmem_cache_create_work *cw = @@ -2185,7 +2187,7 @@ static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, cw->cachep = cachep; INIT_WORK(&cw->work, memcg_kmem_cache_create_func); - schedule_work(&cw->work); + queue_work(memcg_kmem_cache_create_wq, &cw->work); } static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg, @@ -5783,6 +5785,17 @@ static int __init mem_cgroup_init(void) { int cpu, node; +#ifndef CONFIG_SLOB + /* + * Kmem cache creation is mostly done with the slab_mutex held, + * so use a special workqueue to avoid stalling all worker + * threads in case lots of cgroups are created simultaneously. + */ + memcg_kmem_cache_create_wq = + alloc_ordered_workqueue("memcg_kmem_cache_create", 0); + BUG_ON(!memcg_kmem_cache_create_wq); +#endif + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); for_each_possible_cpu(cpu) -- cgit v1.2.3 From 89e364db71fb5e7fc8d93228152abfa67daf35fa Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Mon, 12 Dec 2016 16:41:32 -0800 Subject: slub: move synchronize_sched out of slab_mutex on shrink synchronize_sched() is a heavy operation and calling it per each cache owned by a memory cgroup being destroyed may take quite some time. What is worse, it's currently called under the slab_mutex, stalling all works doing cache creation/destruction. Actually, there isn't much point in calling synchronize_sched() for each cache - it's enough to call it just once - after setting cpu_partial for all caches and before shrinking them. This way, we can also move it out of the slab_mutex, which we have to hold for iterating over the slab cache list. Link: https://bugzilla.kernel.org/show_bug.cgi?id=172991 Link: http://lkml.kernel.org/r/0a10d71ecae3db00fb4421bcd3f82bcc911f4be4.1475329751.git.vdavydov.dev@gmail.com Signed-off-by: Vladimir Davydov Reported-by: Doug Smythies Acked-by: Joonsoo Kim Cc: Christoph Lameter Cc: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ++-- mm/slab.h | 2 +- mm/slab_common.c | 27 +++++++++++++++++++++++++-- mm/slob.c | 2 +- mm/slub.c | 19 ++----------------- 5 files changed, 31 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 0b0550ca85b4..7ea765cd7e93 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2332,7 +2332,7 @@ out: return nr_freed; } -int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0; int node; @@ -2352,7 +2352,7 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate) int __kmem_cache_shutdown(struct kmem_cache *cachep) { - return __kmem_cache_shrink(cachep, false); + return __kmem_cache_shrink(cachep); } void __kmem_cache_release(struct kmem_cache *cachep) diff --git a/mm/slab.h b/mm/slab.h index bc05fdc3edce..ceb7d70cdb76 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -146,7 +146,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); -int __kmem_cache_shrink(struct kmem_cache *, bool); +int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 329b03843863..5d2f24fbafc5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -573,6 +573,29 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) get_online_cpus(); get_online_mems(); +#ifdef CONFIG_SLUB + /* + * In case of SLUB, we need to disable empty slab caching to + * avoid pinning the offline memory cgroup by freeable kmem + * pages charged to it. SLAB doesn't need this, as it + * periodically purges unused slabs. + */ + mutex_lock(&slab_mutex); + list_for_each_entry(s, &slab_caches, list) { + c = is_root_cache(s) ? cache_from_memcg_idx(s, idx) : NULL; + if (c) { + c->cpu_partial = 0; + c->min_partial = 0; + } + } + mutex_unlock(&slab_mutex); + /* + * kmem_cache->cpu_partial is checked locklessly (see + * put_cpu_partial()). Make sure the change is visible. + */ + synchronize_sched(); +#endif + mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) { if (!is_root_cache(s)) @@ -584,7 +607,7 @@ void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg) if (!c) continue; - __kmem_cache_shrink(c, true); + __kmem_cache_shrink(c); arr->entries[idx] = NULL; } mutex_unlock(&slab_mutex); @@ -755,7 +778,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep) get_online_cpus(); get_online_mems(); kasan_cache_shrink(cachep); - ret = __kmem_cache_shrink(cachep, false); + ret = __kmem_cache_shrink(cachep); put_online_mems(); put_online_cpus(); return ret; diff --git a/mm/slob.c b/mm/slob.c index 5ec158054ffe..eac04d4357ec 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -634,7 +634,7 @@ void __kmem_cache_release(struct kmem_cache *c) { } -int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } diff --git a/mm/slub.c b/mm/slub.c index 2b3e740609e9..4a861f265cd7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3883,7 +3883,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3895,21 +3895,6 @@ int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate) unsigned long flags; int ret = 0; - if (deactivate) { - /* - * Disable empty slabs caching. Used to avoid pinning offline - * memory cgroups by kmem pages that can be freed. - */ - s->cpu_partial = 0; - s->min_partial = 0; - - /* - * s->cpu_partial is checked locklessly (see put_cpu_partial), - * so we have to make sure the change is visible. - */ - synchronize_sched(); - } - flush_all(s); for_each_kmem_cache_node(s, node, n) { INIT_LIST_HEAD(&discard); @@ -3966,7 +3951,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - __kmem_cache_shrink(s, false); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; -- cgit v1.2.3 From 84582c8ab9479ffa4532afa95ab8d8f96b5478dc Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Dec 2016 16:41:35 -0800 Subject: slub: avoid false-postive warning The slub allocator gives us some incorrect warnings when CONFIG_PROFILE_ANNOTATED_BRANCHES is set, as the unlikely() macro prevents it from seeing that the return code matches what it was before: mm/slub.c: In function `kmem_cache_free_bulk': mm/slub.c:262:23: error: `df.s' may be used uninitialized in this function [-Werror=maybe-uninitialized] mm/slub.c:2943:3: error: `df.cnt' may be used uninitialized in this function [-Werror=maybe-uninitialized] mm/slub.c:2933:4470: error: `df.freelist' may be used uninitialized in this function [-Werror=maybe-uninitialized] mm/slub.c:2943:3: error: `df.tail' may be used uninitialized in this function [-Werror=maybe-uninitialized] I have not been able to come up with a perfect way for dealing with this, the three options I see are: - add a bogus initialization, which would increase the runtime overhead - replace unlikely() with unlikely_notrace() - remove the unlikely() annotation completely I checked the object code for a typical x86 configuration and the last two cases produce the same result, so I went for the last one, which is the simplest. Link: http://lkml.kernel.org/r/20161024155704.3114445-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Jesper Dangaard Brouer Cc: Arnd Bergmann Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Cc: Johannes Weiner Cc: Laura Abbott Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 4a861f265cd7..067598a00849 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3076,7 +3076,7 @@ void kmem_cache_free_bulk(struct kmem_cache *s, size_t size, void **p) struct detached_freelist df; size = build_detached_freelist(s, size, p, &df); - if (unlikely(!df.page)) + if (!df.page) continue; slab_free(df.s, df.page, df.freelist, df.tail, df.cnt,_RET_IP_); -- cgit v1.2.3 From e70954fd6d4b469517fd906ef1c33310e90ef9f0 Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Mon, 12 Dec 2016 16:41:38 -0800 Subject: mm/slab_common.c: check kmem_create_cache flags are common Verify that kmem_create_cache flags are not allocator specific. It is done before removing flags that are not available with the current configuration. The current kmem_cache_create removes incorrect flags but do not validate the callers are using them right. This change will ensure that callers are not trying to create caches with flags that won't be used because allocator specific. Link: http://lkml.kernel.org/r/1478553075-120242-2-git-send-email-thgarnie@google.com Signed-off-by: Thomas Garnier Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 15 +++++++++++++++ mm/slab_common.c | 6 ++++++ 2 files changed, 21 insertions(+) (limited to 'mm') diff --git a/mm/slab.h b/mm/slab.h index ceb7d70cdb76..699b072dc46e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -142,8 +142,23 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size, #define SLAB_CACHE_FLAGS (0) #endif +/* Common flags available with current configuration */ #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) +/* Common flags permitted for kmem_cache_create */ +#define SLAB_FLAGS_PERMITTED (SLAB_CORE_FLAGS | \ + SLAB_RED_ZONE | \ + SLAB_POISON | \ + SLAB_STORE_USER | \ + SLAB_TRACE | \ + SLAB_CONSISTENCY_CHECKS | \ + SLAB_MEM_SPREAD | \ + SLAB_NOLEAKTRACE | \ + SLAB_RECLAIM_ACCOUNT | \ + SLAB_TEMPORARY | \ + SLAB_NOTRACK | \ + SLAB_ACCOUNT) + int __kmem_cache_shutdown(struct kmem_cache *); void __kmem_cache_release(struct kmem_cache *); int __kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 5d2f24fbafc5..ae323841adb1 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -404,6 +404,12 @@ kmem_cache_create(const char *name, size_t size, size_t align, goto out_unlock; } + /* Refuse requests with allocator specific flags */ + if (flags & ~SLAB_FLAGS_PERMITTED) { + err = -EINVAL; + goto out_unlock; + } + /* * Some allocators will constraint the set of valid flags to a subset * of all flags. We expect them to define CACHE_CREATE_MASK in this -- cgit v1.2.3 From f728b0a5d72ae99c446f933912914a61254c03b6 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Mon, 12 Dec 2016 16:41:41 -0800 Subject: mm, slab: faster active and free stats Reading /proc/slabinfo or monitoring slabtop(1) can become very expensive if there are many slab caches and if there are very lengthy per-node partial and/or free lists. Commit 07a63c41fa1f ("mm/slab: improve performance of gathering slabinfo stats") addressed the per-node full lists which showed a significant improvement when no objects were freed. This patch has the same motivation and optimizes the remainder of the usecases where there are very lengthy partial and free lists. This patch maintains per-node active_slabs (full and partial) and free_slabs rather than iterating the lists at runtime when reading /proc/slabinfo. When allocating 100GB of slab from a test cache where every slab page is on the partial list, reading /proc/slabinfo (includes all other slab caches on the system) takes ~247ms on average with 48 samples. As a result of this patch, the same read takes ~0.856ms on average. [rientjes@google.com: changelog] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1611081505240.13403@chino.kir.corp.google.com Signed-off-by: Greg Thelen Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Pekka Enberg Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 117 +++++++++++++++++++++++++------------------------------------- mm/slab.h | 3 +- 2 files changed, 49 insertions(+), 71 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 7ea765cd7e93..e06da6ceaf73 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -227,13 +227,14 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); + parent->active_slabs = 0; + parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; - parent->num_slabs = 0; } #define MAKE_LIST(cachep, listp, slab, nodeid) \ @@ -1366,7 +1367,6 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { #if DEBUG struct kmem_cache_node *n; - struct page *page; unsigned long flags; int node; static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, @@ -1381,32 +1381,20 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, num_objs = 0, free_objects = 0; - unsigned long active_slabs = 0, num_slabs = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full; + unsigned long active_objs = 0, free_objs = 0; + unsigned long active_slabs, num_slabs; spin_lock_irqsave(&n->list_lock, flags); - num_slabs = n->num_slabs; - list_for_each_entry(page, &n->slabs_partial, lru) { - active_objs += page->active; - num_slabs_partial++; - } - list_for_each_entry(page, &n->slabs_free, lru) - num_slabs_free++; + active_slabs = n->active_slabs; + num_slabs = active_slabs + n->free_slabs; - free_objects += n->free_objects; + active_objs += (num_slabs * cachep->num) - n->free_objects; + free_objs += n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, num_objs, - free_objects); + node, active_slabs, num_slabs, active_objs, + num_slabs * cachep->num, free_objs); } #endif } @@ -2318,7 +2306,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); - n->num_slabs--; + n->free_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2753,12 +2741,14 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); - if (!page->active) + if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); - else + n->free_slabs++; + } else { fixup_slab_list(cachep, n, page, &list); + n->active_slabs++; + } - n->num_slabs++; STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; spin_unlock(&n->list_lock); @@ -2884,7 +2874,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, /* Try to find non-pfmemalloc slab if needed */ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, - struct page *page, bool pfmemalloc) + struct page *page, bool *page_is_free, bool pfmemalloc) { if (!page) return NULL; @@ -2903,9 +2893,11 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (!page->active) + if (*page_is_free) { + WARN_ON(page->active); list_add_tail(&page->lru, &n->slabs_free); - else + *page_is_free = false; + } else list_add_tail(&page->lru, &n->slabs_partial); list_for_each_entry(page, &n->slabs_partial, lru) { @@ -2913,9 +2905,12 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, return page; } + n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { - if (!PageSlabPfmemalloc(page)) + if (!PageSlabPfmemalloc(page)) { + *page_is_free = true; return page; + } } return NULL; @@ -2924,17 +2919,26 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; + bool page_is_free = false; + assert_spin_locked(&n->list_lock); page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; page = list_first_entry_or_null(&n->slabs_free, struct page, lru); + if (page) + page_is_free = true; } if (sk_memalloc_socks()) - return get_valid_first_slab(n, page, pfmemalloc); + page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc); + + if (page && page_is_free) { + n->active_slabs++; + n->free_slabs--; + } return page; } @@ -3434,9 +3438,11 @@ static void free_block(struct kmem_cache *cachep, void **objpp, STATS_DEC_ACTIVE(cachep); /* fixup slab chains */ - if (page->active == 0) + if (page->active == 0) { list_add(&page->lru, &n->slabs_free); - else { + n->free_slabs++; + n->active_slabs--; + } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. @@ -3450,7 +3456,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); - n->num_slabs--; + n->free_slabs--; } } @@ -4102,43 +4108,21 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct page *page; - unsigned long active_objs; - unsigned long num_objs; - unsigned long active_slabs = 0; - unsigned long num_slabs, free_objects = 0, shared_avail = 0; - unsigned long num_slabs_partial = 0, num_slabs_free = 0; - unsigned long num_slabs_full = 0; - const char *name; - char *error = NULL; + unsigned long active_objs, num_objs, active_slabs; + unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long num_slabs_free = 0; int node; struct kmem_cache_node *n; - active_objs = 0; - num_slabs = 0; for_each_kmem_cache_node(cachep, node, n) { - check_irq_on(); spin_lock_irq(&n->list_lock); - num_slabs += n->num_slabs; + num_slabs += n->active_slabs + n->free_slabs; + num_slabs_free += n->free_slabs; - list_for_each_entry(page, &n->slabs_partial, lru) { - if (page->active == cachep->num && !error) - error = "slabs_partial accounting error"; - if (!page->active && !error) - error = "slabs_partial accounting error"; - active_objs += page->active; - num_slabs_partial++; - } + free_objs += n->free_objects; - list_for_each_entry(page, &n->slabs_free, lru) { - if (page->active && !error) - error = "slabs_free accounting error"; - num_slabs_free++; - } - - free_objects += n->free_objects; if (n->shared) shared_avail += n->shared->avail; @@ -4146,15 +4130,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) } num_objs = num_slabs * cachep->num; active_slabs = num_slabs - num_slabs_free; - num_slabs_full = num_slabs - (num_slabs_partial + num_slabs_free); - active_objs += (num_slabs_full * cachep->num); - if (num_objs - active_objs != free_objects && !error) - error = "free_objects accounting error"; - - name = cachep->name; - if (error) - pr_err("slab: cache %s error: %s\n", name, error); + active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; diff --git a/mm/slab.h b/mm/slab.h index 699b072dc46e..26123c512fee 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -447,7 +447,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; - unsigned long num_slabs; + unsigned long active_slabs; /* length of slabs_partial+slabs_full */ + unsigned long free_slabs; /* length of slabs_free */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ -- cgit v1.2.3 From bf00bd3458041c4643a13d80fb349d29cb66eb63 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Dec 2016 16:41:44 -0800 Subject: mm, slab: maintain total slab count instead of active count Rather than tracking the number of active slabs for each node, track the total number of slabs. This is a minor improvement that avoids active slab tracking when a slab goes from free to partial or partial to free. For slab debugging, this also removes an explicit free count since it can easily be inferred by the difference in number of total objects and number of active objects. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1612042020110.115755@chino.kir.corp.google.com Signed-off-by: David Rientjes Suggested-by: Joonsoo Kim Cc: Greg Thelen Cc: Aruna Ramakrishna Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 70 ++++++++++++++++++++++++++------------------------------------- mm/slab.h | 4 ++-- 2 files changed, 31 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index e06da6ceaf73..87b29e76cafd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -227,7 +227,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) INIT_LIST_HEAD(&parent->slabs_full); INIT_LIST_HEAD(&parent->slabs_partial); INIT_LIST_HEAD(&parent->slabs_free); - parent->active_slabs = 0; + parent->total_slabs = 0; parent->free_slabs = 0; parent->shared = NULL; parent->alien = NULL; @@ -1381,20 +1381,18 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) cachep->name, cachep->size, cachep->gfporder); for_each_kmem_cache_node(cachep, node, n) { - unsigned long active_objs = 0, free_objs = 0; - unsigned long active_slabs, num_slabs; + unsigned long total_slabs, free_slabs, free_objs; spin_lock_irqsave(&n->list_lock, flags); - active_slabs = n->active_slabs; - num_slabs = active_slabs + n->free_slabs; - - active_objs += (num_slabs * cachep->num) - n->free_objects; - free_objs += n->free_objects; + total_slabs = n->total_slabs; + free_slabs = n->free_slabs; + free_objs = n->free_objects; spin_unlock_irqrestore(&n->list_lock, flags); - pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", - node, active_slabs, num_slabs, active_objs, - num_slabs * cachep->num, free_objs); + pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", + node, total_slabs - free_slabs, total_slabs, + (total_slabs * cachep->num) - free_objs, + total_slabs * cachep->num); } #endif } @@ -2307,6 +2305,7 @@ static int drain_freelist(struct kmem_cache *cache, page = list_entry(p, struct page, lru); list_del(&page->lru); n->free_slabs--; + n->total_slabs--; /* * Safe to drop the lock. The slab is no longer linked * to the cache. @@ -2741,13 +2740,12 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) n = get_node(cachep, page_to_nid(page)); spin_lock(&n->list_lock); + n->total_slabs++; if (!page->active) { list_add_tail(&page->lru, &(n->slabs_free)); n->free_slabs++; - } else { + } else fixup_slab_list(cachep, n, page, &list); - n->active_slabs++; - } STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; @@ -2874,7 +2872,7 @@ static inline void fixup_slab_list(struct kmem_cache *cachep, /* Try to find non-pfmemalloc slab if needed */ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, - struct page *page, bool *page_is_free, bool pfmemalloc) + struct page *page, bool pfmemalloc) { if (!page) return NULL; @@ -2893,10 +2891,9 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, /* Move pfmemalloc slab to the end of list to speed up next search */ list_del(&page->lru); - if (*page_is_free) { - WARN_ON(page->active); + if (!page->active) { list_add_tail(&page->lru, &n->slabs_free); - *page_is_free = false; + n->free_slabs++; } else list_add_tail(&page->lru, &n->slabs_partial); @@ -2908,7 +2905,7 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, n->free_touched = 1; list_for_each_entry(page, &n->slabs_free, lru) { if (!PageSlabPfmemalloc(page)) { - *page_is_free = true; + n->free_slabs--; return page; } } @@ -2919,26 +2916,19 @@ static noinline struct page *get_valid_first_slab(struct kmem_cache_node *n, static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; - bool page_is_free = false; assert_spin_locked(&n->list_lock); - page = list_first_entry_or_null(&n->slabs_partial, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_partial, struct page, lru); if (!page) { n->free_touched = 1; - page = list_first_entry_or_null(&n->slabs_free, - struct page, lru); + page = list_first_entry_or_null(&n->slabs_free, struct page, + lru); if (page) - page_is_free = true; + n->free_slabs--; } if (sk_memalloc_socks()) - page = get_valid_first_slab(n, page, &page_is_free, pfmemalloc); - - if (page && page_is_free) { - n->active_slabs++; - n->free_slabs--; - } + page = get_valid_first_slab(n, page, pfmemalloc); return page; } @@ -3441,7 +3431,6 @@ static void free_block(struct kmem_cache *cachep, void **objpp, if (page->active == 0) { list_add(&page->lru, &n->slabs_free); n->free_slabs++; - n->active_slabs--; } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the @@ -3457,6 +3446,7 @@ static void free_block(struct kmem_cache *cachep, void **objpp, page = list_last_entry(&n->slabs_free, struct page, lru); list_move(&page->lru, list); n->free_slabs--; + n->total_slabs--; } } @@ -4109,8 +4099,8 @@ out: void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { unsigned long active_objs, num_objs, active_slabs; - unsigned long num_slabs = 0, free_objs = 0, shared_avail = 0; - unsigned long num_slabs_free = 0; + unsigned long total_slabs = 0, free_objs = 0, shared_avail = 0; + unsigned long free_slabs = 0; int node; struct kmem_cache_node *n; @@ -4118,9 +4108,8 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - num_slabs += n->active_slabs + n->free_slabs; - num_slabs_free += n->free_slabs; - + total_slabs += n->total_slabs; + free_slabs += n->free_slabs; free_objs += n->free_objects; if (n->shared) @@ -4128,15 +4117,14 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) spin_unlock_irq(&n->list_lock); } - num_objs = num_slabs * cachep->num; - active_slabs = num_slabs - num_slabs_free; - + num_objs = total_slabs * cachep->num; + active_slabs = total_slabs - free_slabs; active_objs = num_objs - free_objs; sinfo->active_objs = active_objs; sinfo->num_objs = num_objs; sinfo->active_slabs = active_slabs; - sinfo->num_slabs = num_slabs; + sinfo->num_slabs = total_slabs; sinfo->shared_avail = shared_avail; sinfo->limit = cachep->limit; sinfo->batchcount = cachep->batchcount; diff --git a/mm/slab.h b/mm/slab.h index 26123c512fee..de6579dc362c 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -447,8 +447,8 @@ struct kmem_cache_node { struct list_head slabs_partial; /* partial list first, better asm code */ struct list_head slabs_full; struct list_head slabs_free; - unsigned long active_slabs; /* length of slabs_partial+slabs_full */ - unsigned long free_slabs; /* length of slabs_free */ + unsigned long total_slabs; /* length of all slab lists */ + unsigned long free_slabs; /* length of free slab list only */ unsigned long free_objects; unsigned int free_limit; unsigned int colour_next; /* Per-node cache coloring */ -- cgit v1.2.3 From 3e32158767b04db60b83f760e1722fd15a715d7a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 12 Dec 2016 16:41:47 -0800 Subject: mm/mprotect.c: don't touch single threaded PTEs which are on the right node We had some problems with pages getting unmapped in single threaded affinitized processes. It was tracked down to NUMA scanning. In this case it doesn't make any sense to unmap pages if the process is single threaded and the page is already on the node the process is running on. Add a check for this case into the numa protection code, and skip unmapping if true. In theory the process could be migrated later, but we will eventually rescan and unmap and migrate then. In theory this could be made more fancy: remembering this state per process or even whole mm. However that would need extra tracking and be more complicated, and the simple check seems to work fine so far. [ak@linux.intel.com: v3: Minor updates from Mel. Change code layout] Link: http://lkml.kernel.org/r/1476382117-5440-1-git-send-email-andi@firstfloor.org Link: http://lkml.kernel.org/r/1476288949-20970-1-git-send-email-andi@firstfloor.org Signed-off-by: Andi Kleen Acked-by: Mel Gorman Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 11936526b08b..05a02b72c98d 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -69,11 +69,17 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + int target_node = NUMA_NO_NODE; pte = lock_pte_protection(vma, pmd, addr, prot_numa, &ptl); if (!pte) return 0; + /* Get target node for single threaded private VMAs */ + if (prot_numa && !(vma->vm_flags & VM_SHARED) && + atomic_read(&vma->vm_mm->mm_users) == 1) + target_node = numa_node_id(); + arch_enter_lazy_mmu_mode(); do { oldpte = *pte; @@ -95,6 +101,13 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; + + /* + * Don't mess with PTEs if page is already on the node + * a single-threaded process is running on. + */ + if (target_node == page_to_nid(page)) + continue; } ptent = ptep_modify_prot_start(mm, addr, pte); -- cgit v1.2.3 From 5f33a0803bbd781de916f5c7448cbbbbc763d911 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Mon, 12 Dec 2016 16:41:50 -0800 Subject: mm/vmscan.c: set correct defer count for shrinker Our system uses significantly more slab memory with memcg enabled with the latest kernel. With 3.10 kernel, slab uses 2G memory, while with 4.6 kernel, 6G memory is used. The shrinker has problem. Let's see we have two memcg for one shrinker. In do_shrink_slab: 1. Check cg1. nr_deferred = 0, assume total_scan = 700. batch size is 1024, then no memory is freed. nr_deferred = 700 2. Check cg2. nr_deferred = 700. Assume freeable = 20, then total_scan = 10 or 40. Let's assume it's 10. No memory is freed. nr_deferred = 10. The deferred share of cg1 is lost in this case. kswapd will free no memory even run above steps again and again. The fix makes sure one memcg's deferred share isn't lost. Link: http://lkml.kernel.org/r/2414be961b5d25892060315fbb56bb19d81d0c07.1476227351.git.shli@fb.com Signed-off-by: Shaohua Li Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: [4.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index d75cdf360730..c4abf08861d2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -291,6 +291,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, int nid = shrinkctl->nid; long batch_size = shrinker->batch ? shrinker->batch : SHRINK_BATCH; + long scanned = 0, next_deferred; freeable = shrinker->count_objects(shrinker, shrinkctl); if (freeable == 0) @@ -312,7 +313,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", shrinker->scan_objects, total_scan); total_scan = freeable; - } + next_deferred = nr; + } else + next_deferred = total_scan; /* * We need to avoid excessive windup on filesystem shrinkers @@ -369,17 +372,22 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, count_vm_events(SLABS_SCANNED, nr_to_scan); total_scan -= nr_to_scan; + scanned += nr_to_scan; cond_resched(); } + if (next_deferred >= scanned) + next_deferred -= scanned; + else + next_deferred = 0; /* * move the unused scan count back into the shrinker in a * manner that handles concurrent updates. If we exhausted the * scan, there is no need to do an update. */ - if (total_scan > 0) - new_nr = atomic_long_add_return(total_scan, + if (next_deferred > 0) + new_nr = atomic_long_add_return(next_deferred, &shrinker->nr_deferred[nid]); else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); -- cgit v1.2.3 From 771ab4302c592d1de9e6b73f58979e9e5c424f4c Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Mon, 12 Dec 2016 16:41:53 -0800 Subject: mm/gup.c: make unnecessarily global vma_permits_fault() static Make vma_permits_fault() static as it is only used in mm/gup.c This fixes a sparse warning. Link: http://lkml.kernel.org/r/20161017122353.31598-1-tklauser@distanz.ch Signed-off-by: Tobias Klauser Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index ec4f82704b6f..fc04f1c3cf08 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -632,7 +632,8 @@ next_page: return i; } -bool vma_permits_fault(struct vm_area_struct *vma, unsigned int fault_flags) +static bool vma_permits_fault(struct vm_area_struct *vma, + unsigned int fault_flags) { bool write = !!(fault_flags & FAULT_FLAG_WRITE); bool foreign = !!(fault_flags & FAULT_FLAG_REMOTE); -- cgit v1.2.3 From 3999f52e3198e76607446ab1a4610c1ddc406c56 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:41:56 -0800 Subject: mm/hugetlb.c: use the right pte val for compare in hugetlb_cow We cannot use the pte value used in set_pte_at for pte_same comparison, because archs like ppc64, filter/add new pte flag in set_pte_at. Instead fetch the pte value inside hugetlb_cow. We are comparing pte value to make sure the pte didn't change since we dropped the page table lock. hugetlb_cow get called with page table lock held, and we can take a copy of the pte value before we drop the page table lock. With hugetlbfs, we optimize the MAP_PRIVATE write fault path with no previous mapping (huge_pte_none entries), by forcing a cow in the fault path. This avoid take an addition fault to covert a read-only mapping to read/write. Here we were comparing a recently instantiated pte (via set_pte_at) to the pte values from linux page table. As explained above on ppc64 such pte_same check returned wrong result, resulting in us taking an additional fault on ppc64. Fixes: 6a119eae942c ("powerpc/mm: Add a _PAGE_PTE bit") Link: http://lkml.kernel.org/r/20161018154245.18023-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Reported-by: Jan Stancek Acked-by: Hillf Danton Cc: Mike Kravetz Cc: Scott Wood Cc: Michael Ellerman Cc: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 418bf01a50ed..23aec01836aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3450,15 +3450,17 @@ static void unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * Keep the pte_same checks anyway to make transition from the mutex easier. */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page, spinlock_t *ptl) + unsigned long address, pte_t *ptep, + struct page *pagecache_page, spinlock_t *ptl) { + pte_t pte; struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; int ret = 0, outside_reserve = 0; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ + pte = huge_ptep_get(ptep); old_page = pte_page(pte); retry_avoidcopy: @@ -3733,7 +3735,7 @@ retry: hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); } spin_unlock(ptl); @@ -3888,8 +3890,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, + pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); -- cgit v1.2.3 From 8bea805207500068b70778b707299a9b5920ca72 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:41:59 -0800 Subject: mm/hugetlb.c: use huge_pte_lock instead of opencoding the lock No functional change by this patch. Link: http://lkml.kernel.org/r/20161018090234.22574-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 23aec01836aa..c12296f62e8d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3713,8 +3713,7 @@ retry: vma_end_reservation(h, vma, address); } - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + ptl = huge_pte_lock(h, mm, ptep); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -4332,8 +4331,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); - spin_lock(ptl); + ptl = huge_pte_lock(hstate_vma(vma), mm, spte); if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); -- cgit v1.2.3 From 22901c6c9f93058c3803d343db02c14e870e3545 Mon Sep 17 00:00:00 2001 From: Andreas Platschek Date: Mon, 12 Dec 2016 16:42:01 -0800 Subject: kmemleak: fix reference to Documentation Documentation/kmemleak.txt was moved to Documentation/dev-tools/kmemleak.rst, this fixes the reference to the new location. Link: http://lkml.kernel.org/r/1476544946-18804-1-git-send-email-andreas.platschek@opentech.at Signed-off-by: Andreas Platschek Acked-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index d1380ed93fdf..da3436953022 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -19,7 +19,7 @@ * * * For more information on the algorithm and kmemleak usage, please see - * Documentation/kmemleak.txt. + * Documentation/dev-tools/kmemleak.rst. * * Notes on locking * ---------------- -- cgit v1.2.3 From 88ed365ea227aa28841a8d6e196c9a261c76fffd Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:05 -0800 Subject: mm: don't steal highatomic pageblock Patch series "use up highorder free pages before OOM", v3. I got OOM report from production team with v4.4 kernel. It had enough free memory but failed to allocate GFP_KERNEL order-0 page and finally encountered OOM kill. It occured during QA process which launches several apps, switching and so on. It happned rarely. IOW, In normal situation, it was not a problem but if we are unluck so that several apps uses peak memory at the same time, it can happen. If we manage to pass the phase, the system can go working well. I could reproduce it with my test(memory spike easily. Look at below. The reason is free pages(19M) of DMA32 zone are reserved for HIGHORDERATOMIC and doesn't unreserved before the OOM. balloon invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), order=0, oom_score_adj=0 balloon cpuset=/ mems_allowed=0 CPU: 1 PID: 8473 Comm: balloon Tainted: G W OE 4.8.0-rc7-00219-g3f74c9559583-dirty #3161 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 dump_header+0x5c/0x1ce oom_kill_process+0x22e/0x400 out_of_memory+0x1ac/0x210 __alloc_pages_nodemask+0x101e/0x1040 handle_mm_fault+0xa0a/0xbf0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:383949 inactive_anon:106724 isolated_anon:0 active_file:15 inactive_file:44 isolated_file:0 unevictable:0 dirty:0 writeback:24 unstable:0 slab_reclaimable:2483 slab_unreclaimable:3326 mapped:0 shmem:0 pagetables:1906 bounce:0 free:6898 free_pcp:291 free_cma:0 Node 0 active_anon:1535796kB inactive_anon:426896kB active_file:60kB inactive_file:176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:96kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:1418 all_unreclaimable? no DMA free:8188kB min:44kB low:56kB high:68kB active_anon:7648kB inactive_anon:0kB active_file:0kB inactive_file:4kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:20kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:19404kB min:5628kB low:7624kB high:9620kB active_anon:1528148kB inactive_anon:426896kB active_file:60kB inactive_file:420kB unevictable:0kB writepending:96kB present:2080640kB managed:2030092kB mlocked:0kB slab_reclaimable:9932kB slab_unreclaimable:13284kB kernel_stack:2496kB pagetables:7624kB bounce:0kB free_pcp:900kB local_pcp:112kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 2*4096kB (H) = 8192kB DMA32: 7*4kB (H) 8*8kB (H) 30*16kB (H) 31*32kB (H) 14*64kB (H) 9*128kB (H) 2*256kB (H) 2*512kB (H) 4*1024kB (H) 5*2048kB (H) 0*4096kB = 19484kB 51131 total pagecache pages 50795 pages in swap cache Swap cache stats: add 3532405601, delete 3532354806, find 124289150/1822712228 Free swap = 8kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12658 pages reserved 0 pages cma reserved 0 pages hwpoisoned Another example exceeded the limit by the race is in:imklog: page allocation failure: order:0, mode:0x2280020(GFP_ATOMIC|__GFP_NOTRACK) CPU: 0 PID: 476 Comm: in:imklog Tainted: G E 4.8.0-rc7-00217-g266ef83c51e5-dirty #3135 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 warn_alloc_failed+0xdb/0x130 __alloc_pages_nodemask+0x4d6/0xdb0 new_slab+0x339/0x490 ___slab_alloc.constprop.74+0x367/0x480 __slab_alloc.constprop.73+0x20/0x40 __kmalloc+0x1a4/0x1e0 alloc_indirect.isra.14+0x1d/0x50 virtqueue_add_sgs+0x1c4/0x470 __virtblk_add_req+0xae/0x1f0 virtio_queue_rq+0x12d/0x290 __blk_mq_run_hw_queue+0x239/0x370 blk_mq_run_hw_queue+0x8f/0xb0 blk_mq_insert_requests+0x18c/0x1a0 blk_mq_flush_plug_list+0x125/0x140 blk_flush_plug_list+0xc7/0x220 blk_finish_plug+0x2c/0x40 __do_page_cache_readahead+0x196/0x230 filemap_fault+0x448/0x4f0 ext4_filemap_fault+0x36/0x50 __do_fault+0x75/0x140 handle_mm_fault+0x84d/0xbe0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:363826 inactive_anon:121283 isolated_anon:32 active_file:65 inactive_file:152 isolated_file:0 unevictable:0 dirty:0 writeback:46 unstable:0 slab_reclaimable:2778 slab_unreclaimable:3070 mapped:112 shmem:0 pagetables:1822 bounce:0 free:9469 free_pcp:231 free_cma:0 Node 0 active_anon:1455304kB inactive_anon:485132kB active_file:260kB inactive_file:608kB unevictable:0kB isolated(anon):128kB isolated(file):0kB mapped:448kB dirty:0kB writeback:184kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:13641 all_unreclaimable? no DMA free:7748kB min:44kB low:56kB high:68kB active_anon:7944kB inactive_anon:104kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:108kB kernel_stack:0kB pagetables:4kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:30128kB min:5628kB low:7624kB high:9620kB active_anon:1447360kB inactive_anon:485028kB active_file:260kB inactive_file:608kB unevictable:0kB writepending:184kB present:2080640kB managed:2030132kB mlocked:0kB slab_reclaimable:11112kB slab_unreclaimable:12172kB kernel_stack:2400kB pagetables:7284kB bounce:0kB free_pcp:924kB local_pcp:72kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 7*4kB (UE) 3*8kB (UH) 1*16kB (M) 0*32kB 2*64kB (U) 1*128kB (M) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (U) 1*4096kB (H) = 7748kB DMA32: 10*4kB (H) 3*8kB (H) 47*16kB (H) 38*32kB (H) 5*64kB (H) 1*128kB (H) 2*256kB (H) 3*512kB (H) 3*1024kB (H) 3*2048kB (H) 4*4096kB (H) = 30128kB 2775 total pagecache pages 2536 pages in swap cache Swap cache stats: add 206786828, delete 206784292, find 7323106/106686077 Free swap = 108744kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12648 pages reserved 0 pages cma reserved 0 pages hwpoisoned During the investigation, I found some problems with highatomic so this patch aims to solve the problems and the final goal is to unreserve every highatomic free pages before the OOM kill. This patch (of 4): In page freeing path, migratetype is racy so that a highorderatomic page could free into non-highorderatomic free list. If that page is allocated, VM can change the pageblock from higorderatomic to something. In that case, highatomic pageblock accounting is broken so it doesn't work(e.g., VM cannot reserve highorderatomic pageblocks any more although it doesn't reach 1% limit). So, this patch prohibits the changing from highatomic to other type. It's no problem because MIGRATE_HIGHATOMIC is not listed in fallback array so stealing will only happen due to unexpected races which is really rare. Also, such prohibiting keeps highatomic pageblock more longer so it would be better for highorderatomic page allocation. Link: http://lkml.kernel.org/r/1476259429-18279-2-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6de9440e3ae2..97170131f2ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2133,7 +2133,8 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) page = list_first_entry(&area->free_list[fallback_mt], struct page, lru); - if (can_steal) + if (can_steal && + get_pageblock_migratetype(page) != MIGRATE_HIGHATOMIC) steal_suitable_fallback(zone, page, start_migratetype); /* Remove the page from the freelists */ @@ -2534,7 +2535,8 @@ int __isolate_free_page(struct page *page, unsigned int order) struct page *endpage = page + (1 << order) - 1; for (; page < endpage; page += pageblock_nr_pages) { int mt = get_pageblock_migratetype(page); - if (!is_migrate_isolate(mt) && !is_migrate_cma(mt)) + if (!is_migrate_isolate(mt) && !is_migrate_cma(mt) + && mt != MIGRATE_HIGHATOMIC) set_pageblock_migratetype(page, MIGRATE_MOVABLE); } -- cgit v1.2.3 From 4855e4a7f29d6d10b0b9c84e189c770c9a94e91e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:08 -0800 Subject: mm: prevent double decrease of nr_reserved_highatomic There is race between page freeing and unreserved highatomic. CPU 0 CPU 1 free_hot_cold_page mt = get_pfnblock_migratetype set_pcppage_migratetype(page, mt) unreserve_highatomic_pageblock spin_lock_irqsave(&zone->lock) move_freepages_block set_pageblock_migratetype(page) spin_unlock_irqrestore(&zone->lock) free_pcppages_bulk __free_one_page(mt) <- mt is stale By above race, a page on CPU 0 could go non-highorderatomic free list since the pageblock's type is changed. By that, unreserve logic of highorderatomic can decrease reserved count on a same pageblock severak times and then it will make mismatch between nr_reserved_highatomic and the number of reserved pageblock. So, this patch verifies whether the pageblock is highatomic or not and decrease the count only if the pageblock is highatomic. Link: http://lkml.kernel.org/r/1476259429-18279-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 97170131f2ab..8cbc38f923aa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2085,13 +2085,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) continue; /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal -- cgit v1.2.3 From 04c8716f7b0075def05dc05646e2408f318167d2 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:11 -0800 Subject: mm: try to exhaust highatomic reserve before the OOM I got OOM report from production team with v4.4 kernel. It had enough free memory but failed to allocate GFP_KERNEL order-0 page and finally encountered OOM kill. It occured during QA process which launches several apps, switching and so on. It happned rarely. IOW, In normal situation, it was not a problem but if we are unluck so that several apps uses peak memory at the same time, it can happen. If we manage to pass the phase, the system can go working well. I could reproduce it with my test(memory spike easily. Look at below. The reason is free pages(19M) of DMA32 zone are reserved for HIGHORDERATOMIC and doesn't unreserved before the OOM. balloon invoked oom-killer: gfp_mask=0x24280ca(GFP_HIGHUSER_MOVABLE|__GFP_ZERO), order=0, oom_score_adj=0 balloon cpuset=/ mems_allowed=0 CPU: 1 PID: 8473 Comm: balloon Tainted: G W OE 4.8.0-rc7-00219-g3f74c9559583-dirty #3161 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 dump_header+0x5c/0x1ce oom_kill_process+0x22e/0x400 out_of_memory+0x1ac/0x210 __alloc_pages_nodemask+0x101e/0x1040 handle_mm_fault+0xa0a/0xbf0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:383949 inactive_anon:106724 isolated_anon:0 active_file:15 inactive_file:44 isolated_file:0 unevictable:0 dirty:0 writeback:24 unstable:0 slab_reclaimable:2483 slab_unreclaimable:3326 mapped:0 shmem:0 pagetables:1906 bounce:0 free:6898 free_pcp:291 free_cma:0 Node 0 active_anon:1535796kB inactive_anon:426896kB active_file:60kB inactive_file:176kB unevictable:0kB isolated(anon):0kB isolated(file):0kB mapped:0kB dirty:0kB writeback:96kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:1418 all_unreclaimable? no DMA free:8188kB min:44kB low:56kB high:68kB active_anon:7648kB inactive_anon:0kB active_file:0kB inactive_file:4kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:20kB kernel_stack:0kB pagetables:0kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:19404kB min:5628kB low:7624kB high:9620kB active_anon:1528148kB inactive_anon:426896kB active_file:60kB inactive_file:420kB unevictable:0kB writepending:96kB present:2080640kB managed:2030092kB mlocked:0kB slab_reclaimable:9932kB slab_unreclaimable:13284kB kernel_stack:2496kB pagetables:7624kB bounce:0kB free_pcp:900kB local_pcp:112kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 0*256kB 0*512kB 0*1024kB 0*2048kB 2*4096kB (H) = 8192kB DMA32: 7*4kB (H) 8*8kB (H) 30*16kB (H) 31*32kB (H) 14*64kB (H) 9*128kB (H) 2*256kB (H) 2*512kB (H) 4*1024kB (H) 5*2048kB (H) 0*4096kB = 19484kB 51131 total pagecache pages 50795 pages in swap cache Swap cache stats: add 3532405601, delete 3532354806, find 124289150/1822712228 Free swap = 8kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12658 pages reserved 0 pages cma reserved 0 pages hwpoisoned Another example exceeded the limit by the race is in:imklog: page allocation failure: order:0, mode:0x2280020(GFP_ATOMIC|__GFP_NOTRACK) CPU: 0 PID: 476 Comm: in:imklog Tainted: G E 4.8.0-rc7-00217-g266ef83c51e5-dirty #3135 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014 Call Trace: dump_stack+0x63/0x90 warn_alloc_failed+0xdb/0x130 __alloc_pages_nodemask+0x4d6/0xdb0 new_slab+0x339/0x490 ___slab_alloc.constprop.74+0x367/0x480 __slab_alloc.constprop.73+0x20/0x40 __kmalloc+0x1a4/0x1e0 alloc_indirect.isra.14+0x1d/0x50 virtqueue_add_sgs+0x1c4/0x470 __virtblk_add_req+0xae/0x1f0 virtio_queue_rq+0x12d/0x290 __blk_mq_run_hw_queue+0x239/0x370 blk_mq_run_hw_queue+0x8f/0xb0 blk_mq_insert_requests+0x18c/0x1a0 blk_mq_flush_plug_list+0x125/0x140 blk_flush_plug_list+0xc7/0x220 blk_finish_plug+0x2c/0x40 __do_page_cache_readahead+0x196/0x230 filemap_fault+0x448/0x4f0 ext4_filemap_fault+0x36/0x50 __do_fault+0x75/0x140 handle_mm_fault+0x84d/0xbe0 __do_page_fault+0x1dd/0x4d0 trace_do_page_fault+0x43/0x130 do_async_page_fault+0x1a/0xa0 async_page_fault+0x28/0x30 Mem-Info: active_anon:363826 inactive_anon:121283 isolated_anon:32 active_file:65 inactive_file:152 isolated_file:0 unevictable:0 dirty:0 writeback:46 unstable:0 slab_reclaimable:2778 slab_unreclaimable:3070 mapped:112 shmem:0 pagetables:1822 bounce:0 free:9469 free_pcp:231 free_cma:0 Node 0 active_anon:1455304kB inactive_anon:485132kB active_file:260kB inactive_file:608kB unevictable:0kB isolated(anon):128kB isolated(file):0kB mapped:448kB dirty:0kB writeback:184kB shmem:0kB writeback_tmp:0kB unstable:0kB pages_scanned:13641 all_unreclaimable? no DMA free:7748kB min:44kB low:56kB high:68kB active_anon:7944kB inactive_anon:104kB active_file:0kB inactive_file:0kB unevictable:0kB writepending:0kB present:15992kB managed:15908kB mlocked:0kB slab_reclaimable:0kB slab_unreclaimable:108kB kernel_stack:0kB pagetables:4kB bounce:0kB free_pcp:0kB local_pcp:0kB free_cma:0kB lowmem_reserve[]: 0 1952 1952 1952 DMA32 free:30128kB min:5628kB low:7624kB high:9620kB active_anon:1447360kB inactive_anon:485028kB active_file:260kB inactive_file:608kB unevictable:0kB writepending:184kB present:2080640kB managed:2030132kB mlocked:0kB slab_reclaimable:11112kB slab_unreclaimable:12172kB kernel_stack:2400kB pagetables:7284kB bounce:0kB free_pcp:924kB local_pcp:72kB free_cma:0kB lowmem_reserve[]: 0 0 0 0 DMA: 7*4kB (UE) 3*8kB (UH) 1*16kB (M) 0*32kB 2*64kB (U) 1*128kB (M) 1*256kB (U) 0*512kB 1*1024kB (U) 1*2048kB (U) 1*4096kB (H) = 7748kB DMA32: 10*4kB (H) 3*8kB (H) 47*16kB (H) 38*32kB (H) 5*64kB (H) 1*128kB (H) 2*256kB (H) 3*512kB (H) 3*1024kB (H) 3*2048kB (H) 4*4096kB (H) = 30128kB 2775 total pagecache pages 2536 pages in swap cache Swap cache stats: add 206786828, delete 206784292, find 7323106/106686077 Free swap = 108744kB Total swap = 255996kB 524158 pages RAM 0 pages HighMem/MovableOnly 12648 pages reserved 0 pages cma reserved 0 pages hwpoisoned It's weird to show that zone has enough free memory above min watermark but OOMed with 4K GFP_KERNEL allocation due to reserved highatomic pages. As last resort, try to unreserve highatomic pages again and if it has moved pages to non-highatmoc free list, retry reclaim once more. Link: http://lkml.kernel.org/r/1476259429-18279-4-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8cbc38f923aa..085de0442dd4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2059,7 +2059,7 @@ out_unlock: * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. */ -static void unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2067,6 +2067,7 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct zone *zone; struct page *page; int order; + bool ret; for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { @@ -2115,12 +2116,14 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) * may increase. */ set_pageblock_migratetype(page, ac->migratetype); - move_freepages_block(zone, page, ac->migratetype); + ret = move_freepages_block(zone, page, ac->migratetype); spin_unlock_irqrestore(&zone->lock, flags); - return; + return ret; } spin_unlock_irqrestore(&zone->lock, flags); } + + return false; } /* Remove an element from the buddy allocator from the fallback list */ @@ -3436,8 +3439,10 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, * Make sure we converge to OOM if we cannot make any progress * several times in the row. */ - if (*no_progress_loops > MAX_RECLAIM_RETRIES) - return false; + if (*no_progress_loops > MAX_RECLAIM_RETRIES) { + /* Before OOM, exhaust highatomic_reserve */ + return unreserve_highatomic_pageblock(ac); + } /* * Keep reclaiming pages while there is a chance this will lead -- cgit v1.2.3 From 29fac03bef729ef6f9fba5be56f8554093813c39 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:14 -0800 Subject: mm: make unreserve highatomic functions reliable Currently, unreserve_highatomic_pageblock bails out if it found highatomic pageblock regardless of really moving free pages from the one so that it could mitigate unreserve logic's goal which saves OOM of a process. This patch makes unreserve functions bail out only if it moves some pages out of !highatomic free list to avoid such false positive. Another potential problem is that by race between page freeing and reserve highatomic function, pages could be in highatomic free list even though the pageblock is !high atomic migratetype. In that case, unreserve_highatomic_pageblock can be void if count of highatomic reserve is less than pageblock_nr_pages. We could solve it simply via draining all of reserved pages before the OOM. It would have a safeguard role to exhuast reserved pages before converging to OOM. Link: http://lkml.kernel.org/r/1476259429-18279-5-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 085de0442dd4..2b69e28706b1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2058,8 +2058,12 @@ out_unlock: * potentially hurts the reliability of high-order allocations when under * intense memory pressure but failed atomic allocations should be easier * to recover from than an OOM. + * + * If @force is true, try to unreserve a pageblock even though highatomic + * pageblock is exhausted. */ -static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) +static bool unreserve_highatomic_pageblock(const struct alloc_context *ac, + bool force) { struct zonelist *zonelist = ac->zonelist; unsigned long flags; @@ -2071,8 +2075,12 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx, ac->nodemask) { - /* Preserve at least one pageblock */ - if (zone->nr_reserved_highatomic <= pageblock_nr_pages) + /* + * Preserve at least one pageblock unless memory pressure + * is really high. + */ + if (!force && zone->nr_reserved_highatomic <= + pageblock_nr_pages) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2117,8 +2125,10 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac) */ set_pageblock_migratetype(page, ac->migratetype); ret = move_freepages_block(zone, page, ac->migratetype); - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + if (ret) { + spin_unlock_irqrestore(&zone->lock, flags); + return ret; + } } spin_unlock_irqrestore(&zone->lock, flags); } @@ -3322,7 +3332,7 @@ retry: * Shrink them them and try again */ if (!page && !drained) { - unreserve_highatomic_pageblock(ac); + unreserve_highatomic_pageblock(ac, false); drain_all_pages(NULL); drained = true; goto retry; @@ -3441,7 +3451,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order, */ if (*no_progress_loops > MAX_RECLAIM_RETRIES) { /* Before OOM, exhaust highatomic_reserve */ - return unreserve_highatomic_pageblock(ac); + return unreserve_highatomic_pageblock(ac, true); } /* -- cgit v1.2.3 From 3f5000693f80e014fa577b67b93a0de945a4338d Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Mon, 12 Dec 2016 16:42:17 -0800 Subject: mm/vmalloc.c: simplify /proc/vmallocinfo implementation Many seq_file helpers exist for simplifying implementation of virtual files especially, for /proc nodes. however, the helpers for iteration over list_head are available but aren't adopted to implement /proc/vmallocinfo currently. Simplify /proc/vmallocinfo implementation by using existing seq_file helpers. Link: http://lkml.kernel.org/r/57FDF2E5.1000201@zoho.com Signed-off-by: zijun_hu Acked-by: Michal Hocko Cc: David Rientjes Cc: Tejun Heo Cc: Ingo Molnar Cc: Joonsoo Kim Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f2481cb4e6b2..e73948afac70 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2574,32 +2574,13 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) static void *s_start(struct seq_file *m, loff_t *pos) __acquires(&vmap_area_lock) { - loff_t n = *pos; - struct vmap_area *va; - spin_lock(&vmap_area_lock); - va = list_first_entry(&vmap_area_list, typeof(*va), list); - while (n > 0 && &va->list != &vmap_area_list) { - n--; - va = list_next_entry(va, list); - } - if (!n && &va->list != &vmap_area_list) - return va; - - return NULL; - + return seq_list_start(&vmap_area_list, *pos); } static void *s_next(struct seq_file *m, void *p, loff_t *pos) { - struct vmap_area *va = p, *next; - - ++*pos; - next = list_next_entry(va, list); - if (&next->list != &vmap_area_list) - return next; - - return NULL; + return seq_list_next(p, &vmap_area_list, pos); } static void s_stop(struct seq_file *m, void *p) @@ -2634,9 +2615,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) static int s_show(struct seq_file *m, void *p) { - struct vmap_area *va = p; + struct vmap_area *va; struct vm_struct *v; + va = list_entry(p, struct vmap_area, list); + /* * s_show can encounter race with remove_vm_area, !VM_VM_AREA on * behalf of vmap area is being tear down or vm_map_ram allocation. -- cgit v1.2.3 From fd60775aea802beef444881ddfa111a4b73b1bbc Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 12 Dec 2016 16:42:20 -0800 Subject: mm, thp: avoid unlikely branches for split_huge_pmd While doing MADV_DONTNEED on a large area of thp memory, I noticed we encountered many unlikely() branches in profiles for each backing hugepage. This is because zap_pmd_range() would call split_huge_pmd(), which rechecked the conditions that were already validated, but as part of an unlikely() branch. Avoid the unlikely() branch when in a context where pmd is known to be good for __split_huge_pmd() directly. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1610181600300.84525@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ mm/memory.c | 4 ++-- mm/mempolicy.c | 2 +- mm/mprotect.c | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index e35e6de633b9..1f782aa1d8e6 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -189,6 +189,8 @@ static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) +static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} diff --git a/mm/memory.c b/mm/memory.c index 33f45edf8272..d86b7b4afd7d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1240,7 +1240,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, if (next - addr != HPAGE_PMD_SIZE) { VM_BUG_ON_VMA(vma_is_anonymous(vma) && !rwsem_is_locked(&tlb->mm->mmap_sem), vma); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ @@ -3454,7 +3454,7 @@ static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) /* COW handled on pte level: split pmd */ VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - split_huge_pmd(fe->vma, fe->pmd, fe->address); + __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); return VM_FAULT_FALLBACK; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 0b859af06b87..a6a27e5d6b14 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -496,7 +496,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); } else { get_page(page); spin_unlock(ptl); diff --git a/mm/mprotect.c b/mm/mprotect.c index 05a02b72c98d..c5ba2aae0f54 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -176,7 +176,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) { - split_huge_pmd(vma, pmd, addr); + __split_huge_pmd(vma, pmd, addr, false, NULL); if (pmd_trans_unstable(pmd)) continue; } else { -- cgit v1.2.3 From 6d8409580bee356ce418dcb94260b24dda639934 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Mon, 12 Dec 2016 16:42:23 -0800 Subject: mm, mempolicy: clean up __GFP_THISNODE confusion in policy_zonelist __GFP_THISNODE is documented to enforce the allocation to be satisified from the requested node with no fallbacks or placement policy enforcements. policy_zonelist seemingly breaks this semantic if the current policy is MPOL_MBIND and instead of taking the node it will fallback to the first node in the mask if the requested one is not in the mask. This is confusing to say the least because it fact we shouldn't ever go that path. First tasks shouldn't be scheduled on CPUs with nodes outside of their mempolicy binding. And secondly policy_zonelist is called only from 3 places: - huge_zonelist - never should do __GFP_THISNODE when going this path - alloc_pages_vma - which shouldn't depend on __GFP_THISNODE either - alloc_pages_current - which uses default_policy id __GFP_THISNODE is used So we shouldn't even need to care about this possibility and can drop the confusing code. Let's keep a WARN_ON_ONCE in place to catch potential users and fix them up properly (aka use a different allocation function which ignores mempolicy). [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20161013125958.32155-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a6a27e5d6b14..4d58021dba34 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1679,25 +1679,17 @@ static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy, int nd) { - switch (policy->mode) { - case MPOL_PREFERRED: - if (!(policy->flags & MPOL_F_LOCAL)) - nd = policy->v.preferred_node; - break; - case MPOL_BIND: + if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) + nd = policy->v.preferred_node; + else { /* - * Normally, MPOL_BIND allocations are node-local within the - * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node isn't part of the mask, we use the zonelist for - * the first node in the mask instead. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - if (unlikely(gfp & __GFP_THISNODE) && - unlikely(!node_isset(nd, policy->v.nodes))) - nd = first_node(policy->v.nodes); - break; - default: - BUG(); + WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + return node_zonelist(nd, gfp); } -- cgit v1.2.3 From 6afcf8ef0ca0a69d014f8edb613d94821f0ae700 Mon Sep 17 00:00:00 2001 From: Ming Ling Date: Mon, 12 Dec 2016 16:42:26 -0800 Subject: mm, compaction: fix NR_ISOLATED_* stats for pfn based migration Since commit bda807d44454 ("mm: migrate: support non-lru movable page migration") isolate_migratepages_block) can isolate !PageLRU pages which would acct_isolated account as NR_ISOLATED_*. Accounting these non-lru pages NR_ISOLATED_{ANON,FILE} doesn't make any sense and it can misguide heuristics based on those counters such as pgdat_reclaimable_pages resp. too_many_isolated which would lead to unexpected stalls during the direct reclaim without any good reason. Note that __alloc_contig_migrate_range can isolate a lot of pages at once. On mobile devices such as 512M ram android Phone, it may use a big zram swap. In some cases zram(zsmalloc) uses too many non-lru but migratedable pages, such as: MemTotal: 468148 kB Normal free:5620kB Free swap:4736kB Total swap:409596kB ZRAM: 164616kB(zsmalloc non-lru pages) active_anon:60700kB inactive_anon:60744kB active_file:34420kB inactive_file:37532kB Fix this by only accounting lru pages to NR_ISOLATED_* in isolate_migratepages_block right after they were isolated and we still know they were on LRU. Drop acct_isolated because it is called after the fact and we've lost that information. Batching per-cpu counter doesn't make much improvement anyway. Also make sure that we uncharge only LRU pages when putting them back on the LRU in putback_movable_pages resp. when unmap_and_move migrates the page. [mhocko@suse.com: replace acct_isolated() with direct counting] Fixes: bda807d44454 ("mm: migrate: support non-lru movable page migration") Link: http://lkml.kernel.org/r/20161019080240.9682-1-mhocko@kernel.org Signed-off-by: Ming Ling Signed-off-by: Michal Hocko Acked-by: Minchan Kim Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 25 +++---------------------- mm/migrate.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 0409a4ad6ea1..70e6bec46dc2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -634,22 +634,6 @@ isolate_freepages_range(struct compact_control *cc, return pfn; } -/* Update the number of anon and file isolated pages in the zone */ -static void acct_isolated(struct zone *zone, struct compact_control *cc) -{ - struct page *page; - unsigned int count[2] = { 0, }; - - if (list_empty(&cc->migratepages)) - return; - - list_for_each_entry(page, &cc->migratepages, lru) - count[!!page_is_file_cache(page)]++; - - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON, count[0]); - mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, count[1]); -} - /* Similar to reclaim, but different enough that they don't share logic */ static bool too_many_isolated(struct zone *zone) { @@ -866,6 +850,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, /* Successfully isolated */ del_page_from_lru_list(page, lruvec, page_lru(page)); + inc_node_page_state(page, + NR_ISOLATED_ANON + page_is_file_cache(page)); isolate_success: list_add(&page->lru, &cc->migratepages); @@ -902,7 +888,6 @@ isolate_fail: spin_unlock_irqrestore(zone_lru_lock(zone), flags); locked = false; } - acct_isolated(zone, cc); putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; cc->last_migrated_pfn = 0; @@ -988,7 +973,6 @@ isolate_migratepages_range(struct compact_control *cc, unsigned long start_pfn, if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) break; } - acct_isolated(cc->zone, cc); return pfn; } @@ -1258,10 +1242,8 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, low_pfn = isolate_migratepages_block(cc, low_pfn, block_end_pfn, isolate_mode); - if (!low_pfn || cc->contended) { - acct_isolated(zone, cc); + if (!low_pfn || cc->contended) return ISOLATE_ABORT; - } /* * Either we isolated something and proceed with migration. Or @@ -1271,7 +1253,6 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, break; } - acct_isolated(zone, cc); /* Record where migration scanner will be restarted. */ cc->migrate_pfn = low_pfn; diff --git a/mm/migrate.c b/mm/migrate.c index 99250aee1ac1..66ce6b490b13 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -168,8 +168,6 @@ void putback_movable_pages(struct list_head *l) continue; } list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); /* * We isolated non-lru movable page so here we can use * __PageMovable because LRU page's mapping cannot have @@ -186,6 +184,8 @@ void putback_movable_pages(struct list_head *l) put_page(page); } else { putback_lru_page(page); + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -1121,8 +1121,15 @@ out: * restored. */ list_del(&page->lru); - dec_node_page_state(page, NR_ISOLATED_ANON + - page_is_file_cache(page)); + + /* + * Compaction can migrate also non-LRU pages which are + * not accounted to NR_ISOLATED_*. They can be recognized + * as __PageMovable + */ + if (likely(!__PageMovable(page))) + dec_node_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } /* -- cgit v1.2.3 From 23f919d4ad0eb325595f10f55be4301b2965d6d6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 12 Dec 2016 16:42:28 -0800 Subject: shmem: avoid maybe-uninitialized warning After enabling -Wmaybe-uninitialized warnings, we get a false-postive warning for shmem: mm/shmem.c: In function `shmem_getpage_gfp': include/linux/spinlock.h:332:21: error: `info' may be used uninitialized in this function [-Werror=maybe-uninitialized] This can be easily avoided, since the correct 'info' pointer is known at the time we first enter the function, so we can simply move the initialization up. Moving it before the first label avoids the warning and lets us remove two later initializations. Note that the function is so hard to read that it not only confuses the compiler, but also most readers and without this patch it could\ easily break if one of the 'goto's changed. Link: https://www.spinics.net/lists/kernel/msg2368133.html Link: http://lkml.kernel.org/r/20161024205725.786455-1-arnd@arndb.de Signed-off-by: Arnd Bergmann Acked-by: Michal Hocko Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Hugh Dickins Cc: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index 9d32e1cb9f38..ba0d7644ee20 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1539,7 +1539,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, struct mm_struct *fault_mm, int *fault_type) { struct address_space *mapping = inode->i_mapping; - struct shmem_inode_info *info; + struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; struct mm_struct *charge_mm; struct mem_cgroup *memcg; @@ -1589,7 +1589,6 @@ repeat: * Fast cache lookup did not find it: * bring it back from swap or allocate. */ - info = SHMEM_I(inode); sbinfo = SHMEM_SB(inode->i_sb); charge_mm = fault_mm ? : current->mm; @@ -1837,7 +1836,6 @@ unlock: put_page(page); } if (error == -ENOSPC && !once++) { - info = SHMEM_I(inode); spin_lock_irq(&info->lock); shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); -- cgit v1.2.3 From c0f2e176f87bd989835bd098a52779df41a9243c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:31 -0800 Subject: mm: use the correct page size when removing the page We are removing a pmd hugepage here. Use the correct page size. Link: http://lkml.kernel.org/r/20161026084839.27299-2-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f8e35cc66d32..0103728ffa94 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1399,12 +1399,12 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, if (vma_is_dax(vma)) { spin_unlock(ptl); if (is_huge_zero_pmd(orig_pmd)) - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else if (is_huge_zero_pmd(orig_pmd)) { pte_free(tlb->mm, pgtable_trans_huge_withdraw(tlb->mm, pmd)); atomic_long_dec(&tlb->mm->nr_ptes); spin_unlock(ptl); - tlb_remove_page(tlb, pmd_page(orig_pmd)); + tlb_remove_page_size(tlb, pmd_page(orig_pmd), HPAGE_PMD_SIZE); } else { struct page *page = pmd_page(orig_pmd); page_remove_rmap(page, true); -- cgit v1.2.3 From b528e4b6405b9fd656a6a308a7e2aa6afa50e77d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:37 -0800 Subject: mm/hugetlb: add tlb_remove_hugetlb_entry for handling hugetlb pages This add tlb_remove_hugetlb_entry similar to tlb_remove_pmd_tlb_entry. Link: http://lkml.kernel.org/r/20161026084839.27299-4-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 2 ++ arch/ia64/include/asm/tlb.h | 3 +++ arch/s390/include/asm/tlb.h | 2 ++ arch/sh/include/asm/tlb.h | 3 +++ arch/um/include/asm/tlb.h | 3 +++ include/asm-generic/tlb.h | 6 ++++++ mm/hugetlb.c | 2 +- 7 files changed, 20 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 1e25cd80589e..82841ba1f51f 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -186,6 +186,8 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long addr) tlb_add_flush(tlb, addr); } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index 77e541cf0e5d..b3f369ab844d 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -283,6 +283,9 @@ do { \ __tlb_remove_tlb_entry(tlb, ptep, addr); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 15711de10403..094440b59f9e 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -162,5 +162,7 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_remove_tlb_entry(tlb, ptep, addr) do { } while (0) #define tlb_remove_pmd_tlb_entry(tlb, pmdp, addr) do { } while (0) #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) #endif /* _S390_TLB_H */ diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 025cdb1032f6..e7d15e8c75c1 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -65,6 +65,9 @@ tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, unsigned long address) tlb->end = address + PAGE_SIZE; } +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 821ff0acfe17..a4427029c3c8 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -141,6 +141,9 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + tlb_remove_tlb_entry(tlb, ptep, address) + #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index dba727becd5f..38c2b708df6e 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -220,6 +220,12 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa __tlb_remove_tlb_entry(tlb, ptep, address); \ } while (0) +#define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ + do { \ + __tlb_adjust_range(tlb, address, huge_page_size(h)); \ + __tlb_remove_tlb_entry(tlb, ptep, address); \ + } while (0) + /** * tlb_remove_pmd_tlb_entry - remember a pmd mapping for later tlb invalidation * This is a nop so far, because only x86 needs it. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c12296f62e8d..8e519da7242d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3336,7 +3336,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, } pte = huge_ptep_get_and_clear(mm, address, ptep); - tlb_remove_tlb_entry(tlb, ptep, address); + tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); -- cgit v1.2.3 From 07e326610e5634e5038fce32fff370949eb42101 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:40 -0800 Subject: mm: add tlb_remove_check_page_size_change to track page size change With commit e77b0852b551 ("mm/mmu_gather: track page size with mmu gather and force flush if page size change") we added the ability to force a tlb flush when the page size change in a mmu_gather loop. We did that by checking for a page size change every time we added a page to mmu_gather for lazy flush/remove. We can improve that by moving the page size change check early and not doing it every time we add a page. This also helps us to do tlb flush when invalidating a range covering dax mapping. Wrt dax mapping we don't have a backing struct page and hence we don't call tlb_remove_page, which earlier forced the tlb flush on page size change. Moving the page size change check earlier means we will do the same even for dax mapping. We also avoid doing this check on architecture other than powerpc. In a later patch we will remove page size check from tlb_remove_page(). Link: http://lkml.kernel.org/r/20161026084839.27299-5-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 6 ++++++ arch/ia64/include/asm/tlb.h | 6 ++++++ arch/powerpc/include/asm/tlb.h | 16 ++++++++++++++++ arch/s390/include/asm/tlb.h | 6 ++++++ arch/sh/include/asm/tlb.h | 6 ++++++ arch/um/include/asm/tlb.h | 6 ++++++ include/asm-generic/tlb.h | 16 ++++++++++++++++ mm/huge_memory.c | 4 ++++ mm/hugetlb.c | 5 +++++ mm/madvise.c | 1 + mm/memory.c | 7 ++++++- 11 files changed, 78 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index 82841ba1f51f..a9d6de4746ea 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -286,5 +286,11 @@ tlb_remove_pmd_tlb_entry(struct mmu_gather *tlb, pmd_t *pmdp, unsigned long addr #define tlb_migrate_finish(mm) do { } while (0) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #endif /* CONFIG_MMU */ #endif diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index b3f369ab844d..bfe6295aa746 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -286,6 +286,12 @@ do { \ #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, address) \ do { \ tlb->need_flush = 1; \ diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h index 99e1397b71da..609557569f65 100644 --- a/arch/powerpc/include/asm/tlb.h +++ b/arch/powerpc/include/asm/tlb.h @@ -28,6 +28,7 @@ #define tlb_start_vma(tlb, vma) do { } while (0) #define tlb_end_vma(tlb, vma) do { } while (0) #define __tlb_remove_tlb_entry __tlb_remove_tlb_entry +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change extern void tlb_flush(struct mmu_gather *tlb); @@ -46,6 +47,21 @@ static inline void __tlb_remove_tlb_entry(struct mmu_gather *tlb, pte_t *ptep, #endif } +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ + if (!tlb->page_size) + tlb->page_size = page_size; + else if (tlb->page_size != page_size) { + tlb_flush_mmu(tlb); + /* + * update the page size after flush for the new + * mmu_gather. + */ + tlb->page_size = page_size; + } +} + #ifdef CONFIG_SMP static inline int mm_is_core_local(struct mm_struct *mm) { diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 094440b59f9e..28b159c87c38 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -165,4 +165,10 @@ static inline void pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #endif /* _S390_TLB_H */ diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index e7d15e8c75c1..0f988b3e484b 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -130,6 +130,12 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, return tlb_remove_page(tlb, page); } +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) pte_free((tlb)->mm, ptep) #define pmd_free_tlb(tlb, pmdp, addr) pmd_free((tlb)->mm, pmdp) #define pud_free_tlb(tlb, pudp, addr) pud_free((tlb)->mm, pudp) diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index a4427029c3c8..8258dd4bb13c 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -144,6 +144,12 @@ static inline void tlb_remove_page_size(struct mmu_gather *tlb, #define tlb_remove_huge_tlb_entry(h, tlb, ptep, address) \ tlb_remove_tlb_entry(tlb, ptep, address) +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ +} + #define pte_free_tlb(tlb, ptep, addr) __pte_free_tlb(tlb, ptep, addr) #define pud_free_tlb(tlb, pudp, addr) __pud_free_tlb(tlb, pudp, addr) diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 38c2b708df6e..256c9de71fdb 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -182,6 +182,22 @@ static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *pa return __tlb_remove_page(tlb, page); } +#ifndef tlb_remove_check_page_size_change +#define tlb_remove_check_page_size_change tlb_remove_check_page_size_change +static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, + unsigned int page_size) +{ + /* + * We don't care about page size change, just update + * mmu_gather page size here so that debug checks + * doesn't throw false warning. + */ +#ifdef CONFIG_DEBUG_VM + tlb->page_size = page_size; +#endif +} +#endif + /* * In the case of tlb vma handling, we can optimise these away in the * case where we're doing a full MM flush. When we're doing a munmap, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0103728ffa94..26fd1161ca85 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1323,6 +1323,8 @@ bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; bool ret = false; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = pmd_trans_huge_lock(pmd, vma); if (!ptl) goto out_unlocked; @@ -1384,6 +1386,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; + tlb_remove_check_page_size_change(tlb, HPAGE_PMD_SIZE); + ptl = __pmd_trans_huge_lock(pmd, vma); if (!ptl) return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8e519da7242d..3edb759c5c7d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3286,6 +3286,11 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + /* + * This is a hugetlb vma, all the pte entries should point + * to huge page. + */ + tlb_remove_check_page_size_change(tlb, sz); tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); address = start; diff --git a/mm/madvise.c b/mm/madvise.c index 93fb63e88b5e..0e3828eae9f8 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,6 +281,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, if (pmd_trans_unstable(pmd)) return 0; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); for (; addr != end; pte++, addr += PAGE_SIZE) { diff --git a/mm/memory.c b/mm/memory.c index d86b7b4afd7d..eae20eb66bfc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -528,7 +528,11 @@ void free_pgd_range(struct mmu_gather *tlb, end -= PMD_SIZE; if (addr > end - 1) return; - + /* + * We add page table cache pages with PAGE_SIZE, + * (see pte_free_tlb()), flush the tlb if we need + */ + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -1120,6 +1124,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, swp_entry_t entry; struct page *pending_page = NULL; + tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); -- cgit v1.2.3 From 692a68c1544d6be4ba7c6e929e9c7b2ba0447b91 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:42:43 -0800 Subject: mm: remove the page size change check in tlb_remove_page Now that we check for page size change early in the loop, we can partially revert e9d55e157034a ("mm: change the interface for __tlb_remove_page"). This simplies the code much, by removing the need to track the last address with which we adjusted the range. We also go back to the older way of filling the mmu_gather array, ie, we add an entry and then check whether the gather batch is full. Link: http://lkml.kernel.org/r/20161026084839.27299-6-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A. Shutemov" Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/tlb.h | 13 +++---------- arch/ia64/include/asm/tlb.h | 16 ++++------------ arch/s390/include/asm/tlb.h | 6 ------ arch/sh/include/asm/tlb.h | 6 ------ arch/um/include/asm/tlb.h | 6 ------ include/asm-generic/tlb.h | 28 ++-------------------------- mm/memory.c | 21 ++++++--------------- 7 files changed, 15 insertions(+), 81 deletions(-) (limited to 'mm') diff --git a/arch/arm/include/asm/tlb.h b/arch/arm/include/asm/tlb.h index a9d6de4746ea..3f2eb76243e3 100644 --- a/arch/arm/include/asm/tlb.h +++ b/arch/arm/include/asm/tlb.h @@ -213,18 +213,17 @@ tlb_end_vma(struct mmu_gather *tlb, struct vm_area_struct *vma) static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { + tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); if (tlb->nr == tlb->max) return true; - tlb->pages[tlb->nr++] = page; return false; } static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -233,12 +232,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/ia64/include/asm/tlb.h b/arch/ia64/include/asm/tlb.h index bfe6295aa746..fced197b9626 100644 --- a/arch/ia64/include/asm/tlb.h +++ b/arch/ia64/include/asm/tlb.h @@ -207,15 +207,15 @@ tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long end) */ static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (tlb->nr == tlb->max) - return true; - tlb->need_flush = 1; if (!tlb->nr && tlb->pages == tlb->local) __tlb_alloc_page(tlb); tlb->pages[tlb->nr++] = page; + VM_WARN_ON(tlb->nr > tlb->max); + if (tlb->nr == tlb->max) + return true; return false; } @@ -236,10 +236,8 @@ static inline void tlb_flush_mmu(struct mmu_gather *tlb) static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) { - if (__tlb_remove_page(tlb, page)) { + if (__tlb_remove_page(tlb, page)) tlb_flush_mmu(tlb); - __tlb_remove_page(tlb, page); - } } static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, @@ -248,12 +246,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/s390/include/asm/tlb.h b/arch/s390/include/asm/tlb.h index 28b159c87c38..853b2a3d8dee 100644 --- a/arch/s390/include/asm/tlb.h +++ b/arch/s390/include/asm/tlb.h @@ -104,12 +104,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/sh/include/asm/tlb.h b/arch/sh/include/asm/tlb.h index 0f988b3e484b..46e0d635e36f 100644 --- a/arch/sh/include/asm/tlb.h +++ b/arch/sh/include/asm/tlb.h @@ -118,12 +118,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 8258dd4bb13c..600a2e9bfee2 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -116,12 +116,6 @@ static inline bool __tlb_remove_page_size(struct mmu_gather *tlb, return __tlb_remove_page(tlb, page); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, - struct page *page) -{ - return __tlb_remove_page(tlb, page); -} - static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { diff --git a/include/asm-generic/tlb.h b/include/asm-generic/tlb.h index 256c9de71fdb..7eed8cf3130a 100644 --- a/include/asm-generic/tlb.h +++ b/include/asm-generic/tlb.h @@ -107,11 +107,6 @@ struct mmu_gather { struct mmu_gather_batch local; struct page *__pages[MMU_GATHER_BUNDLE]; unsigned int batch_count; - /* - * __tlb_adjust_range will track the new addr here, - * that that we can adjust the range after the flush - */ - unsigned long addr; int page_size; }; @@ -130,12 +125,6 @@ static inline void __tlb_adjust_range(struct mmu_gather *tlb, { tlb->start = min(tlb->start, address); tlb->end = max(tlb->end, address + range_size); - /* - * Track the last address with which we adjusted the range. This - * will be used later to adjust again after a mmu_flush due to - * failed __tlb_remove_page - */ - tlb->addr = address; } static inline void __tlb_reset_range(struct mmu_gather *tlb) @@ -151,15 +140,11 @@ static inline void __tlb_reset_range(struct mmu_gather *tlb) static inline void tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size) { - if (__tlb_remove_page_size(tlb, page, page_size)) { + if (__tlb_remove_page_size(tlb, page, page_size)) tlb_flush_mmu(tlb); - tlb->page_size = page_size; - __tlb_adjust_range(tlb, tlb->addr, page_size); - __tlb_remove_page_size(tlb, page, page_size); - } } -static bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) +static inline bool __tlb_remove_page(struct mmu_gather *tlb, struct page *page) { return __tlb_remove_page_size(tlb, page, PAGE_SIZE); } @@ -173,15 +158,6 @@ static inline void tlb_remove_page(struct mmu_gather *tlb, struct page *page) return tlb_remove_page_size(tlb, page, PAGE_SIZE); } -static inline bool __tlb_remove_pte_page(struct mmu_gather *tlb, struct page *page) -{ - /* active->nr should be zero when we call this */ - VM_BUG_ON_PAGE(tlb->active->nr, page); - tlb->page_size = PAGE_SIZE; - __tlb_adjust_range(tlb, tlb->addr, PAGE_SIZE); - return __tlb_remove_page(tlb, page); -} - #ifndef tlb_remove_check_page_size_change #define tlb_remove_check_page_size_change tlb_remove_check_page_size_change static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb, diff --git a/mm/memory.c b/mm/memory.c index eae20eb66bfc..0a72f821ccdc 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -300,15 +300,14 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ struct mmu_gather_batch *batch; VM_BUG_ON(!tlb->end); - - if (!tlb->page_size) - tlb->page_size = page_size; - else { - if (page_size != tlb->page_size) - return true; - } + VM_WARN_ON(tlb->page_size != page_size); batch = tlb->active; + /* + * Add the page and check if we are full. If so + * force a flush. + */ + batch->pages[batch->nr++] = page; if (batch->nr == batch->max) { if (!tlb_next_batch(tlb)) return true; @@ -316,7 +315,6 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_ } VM_BUG_ON_PAGE(batch->nr > batch->max, page); - batch->pages[batch->nr++] = page; return false; } @@ -1122,7 +1120,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, pte_t *start_pte; pte_t *pte; swp_entry_t entry; - struct page *pending_page = NULL; tlb_remove_check_page_size_change(tlb, PAGE_SIZE); again: @@ -1177,7 +1174,6 @@ again: print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; - pending_page = page; addr += PAGE_SIZE; break; } @@ -1218,11 +1214,6 @@ again: if (force_flush) { force_flush = 0; tlb_flush_mmu_free(tlb); - if (pending_page) { - /* remove the page with new size */ - __tlb_remove_pte_page(tlb, pending_page); - pending_page = NULL; - } if (addr != end) goto again; } -- cgit v1.2.3 From 80a7951627712180ed2575a20e1e442b851fc27c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 12 Dec 2016 16:42:46 -0800 Subject: mm: fix up get_user_pages* comments In the previous round of get_user_pages* changes comments attached to __get_user_pages_unlocked() and get_user_pages_unlocked() were rendered incorrect, this patch corrects them. In addition the get_user_pages_unlocked() comment seems to have already been outdated as it referred to tsk, mm parameters which were removed in c12d2da5 ("mm/gup: Remove the macro overload API migration helpers from the get_user*() APIs"), this patch fixes this also. Link: http://lkml.kernel.org/r/20161025233435.5338-1-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/gup.c b/mm/gup.c index fc04f1c3cf08..e50178c58b97 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -858,14 +858,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, EXPORT_SYMBOL(get_user_pages_locked); /* - * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to - * pass additional gup_flags as last parameter (like FOLL_HWPOISON). + * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows for + * tsk, mm to be specified. * * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the - * caller if required (just like with __get_user_pages). "FOLL_GET", - * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed - * according to the parameters "pages", "write", "force" - * respectively. + * caller if required (just like with __get_user_pages). "FOLL_GET" + * is set implicitly if "pages" is non-NULL. */ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -895,10 +893,8 @@ EXPORT_SYMBOL(__get_user_pages_unlocked); * get_user_pages_unlocked(tsk, mm, ..., pages); * * It is functionally equivalent to get_user_pages_fast so - * get_user_pages_fast should be used instead, if the two parameters - * "tsk" and "mm" are respectively equal to current and current->mm, - * or if "force" shall be set to 1 (get_user_pages_fast misses the - * "force" parameter). + * get_user_pages_fast should be used instead if specific gup_flags + * (e.g. FOLL_FORCE) are not required. */ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) -- cgit v1.2.3 From 8d303e44e99c2ae5cad31f3dded10a572b0fd4d7 Mon Sep 17 00:00:00 2001 From: Piotr Kwapulinski Date: Mon, 12 Dec 2016 16:42:49 -0800 Subject: mm/mempolicy.c: forbid static or relative flags for local NUMA mode The MPOL_F_STATIC_NODES and MPOL_F_RELATIVE_NODES flags are irrelevant when setting them for MPOL_LOCAL NUMA memory policy via set_mempolicy or mbind. Return the "invalid argument" from set_mempolicy and mbind whenever any of these flags is passed along with MPOL_LOCAL. It is consistent with MPOL_PREFERRED passed with empty nodemask. It slightly shortens the execution time in paths where these flags are used e.g. when trying to rebind the NUMA nodes for changes in cgroups cpuset mems (mpol_rebind_preferred()) or when just printing the mempolicy structure (/proc/PID/numa_maps). Isolated tests done. Link: http://lkml.kernel.org/r/20161027163037.4089-1-kwapulinski.piotr@gmail.com Signed-off-by: Piotr Kwapulinski Acked-by: David Rientjes Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Michal Hocko Cc: Liang Chen Cc: Mel Gorman Cc: Dave Hansen Cc: Nathan Zimmer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4d58021dba34..6d3639e1f254 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -276,7 +276,9 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { - if (!nodes_empty(*nodes)) + if (!nodes_empty(*nodes) || + (flags & MPOL_F_STATIC_NODES) || + (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) -- cgit v1.2.3 From 39fa104d5b87655c1c19d4b1990ea63d190c4817 Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:42:55 -0800 Subject: mm: remove x86-only restriction of movable_node In commit c5320926e370 ("mem-hotplug: introduce movable_node boot option"), the memblock allocation direction is changed to bottom-up and then back to top-down like this: 1. memblock_set_bottom_up(true), called by cmdline_parse_movable_node(). 2. memblock_set_bottom_up(false), called by x86's numa_init(). Even though (1) occurs in generic mm code, it is wrapped by #ifdef CONFIG_MOVABLE_NODE, which depends on X86_64. This means that when we extend CONFIG_MOVABLE_NODE to non-x86 arches, things will be unbalanced. (1) will happen for them, but (2) will not. This toggle was added in the first place because x86 has a delay between adding memblocks and marking them as hotpluggable. Since other arches do this marking either immediately or not at all, they do not require the bottom-up toggle. So, resolve things by moving (1) from cmdline_parse_movable_node() to x86's setup_arch(), immediately after the movable_node parameter has been parsed. Link: http://lkml.kernel.org/r/1479160961-25840-3-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Acked-by: Balbir Singh Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 2 +- arch/x86/kernel/setup.c | 24 ++++++++++++++++++++++++ mm/memory_hotplug.c | 20 -------------------- 3 files changed, 25 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 86a31dfc036e..26f0f92206d7 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -2406,7 +2406,7 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. - movable_node [KNL,X86] Boot-time switch to enable the effects + movable_node [KNL] Boot-time switch to enable the effects of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. MTD_Partition= [MTD] diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9c337b0e8ba7..4cfba947d774 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -985,6 +985,30 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); +#ifdef CONFIG_MEMORY_HOTPLUG + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + if (movable_node_is_enabled()) + memblock_set_bottom_up(true); +#endif + x86_report_nx(); /* after early param, so could get panic from serial */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cad4b9125695..e43142c15631 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1727,26 +1727,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) static int __init cmdline_parse_movable_node(char *p) { #ifdef CONFIG_MOVABLE_NODE - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - memblock_set_bottom_up(true); movable_node_enabled = true; #else pr_warn("movable_node option not supported\n"); -- cgit v1.2.3 From 114cf3cc55ec00465a59bb89e06b4e4fdcd6412e Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:42:59 -0800 Subject: mm: enable CONFIG_MOVABLE_NODE on non-x86 arches To support movable memory nodes (CONFIG_MOVABLE_NODE), at least one of the following must be true: 1. This config has the capability to identify movable nodes at boot. Right now, only x86 can do this. 2. Our config supports memory hotplug, which means that a movable node can be created by hotplugging all of its memory into ZONE_MOVABLE. Fix the Kconfig definition of CONFIG_MOVABLE_NODE, which currently recognizes (1), but not (2). Link: http://lkml.kernel.org/r/1479160961-25840-4-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Reviewed-by: Aneesh Kumar K.V Acked-by: Balbir Singh Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 86e3e0e74d20..061b46b18029 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 + depends on X86_64 || MEMORY_HOTPLUG depends on NUMA default n help -- cgit v1.2.3 From 41a9ada3e6b4253f1a3ce42699c6aaeb8584306c Mon Sep 17 00:00:00 2001 From: Reza Arbab Date: Mon, 12 Dec 2016 16:43:02 -0800 Subject: of/fdt: mark hotpluggable memory When movable nodes are enabled, any node containing only hotpluggable memory is made movable at boot time. On x86, hotpluggable memory is discovered by parsing the ACPI SRAT, making corresponding calls to memblock_mark_hotplug(). If we introduce a dt property to describe memory as hotpluggable, configs supporting early fdt may then also do this marking and use movable nodes. Link: http://lkml.kernel.org/r/1479160961-25840-5-git-send-email-arbab@linux.vnet.ibm.com Signed-off-by: Reza Arbab Tested-by: Balbir Singh Acked-by: Balbir Singh Cc: "Aneesh Kumar K.V" Cc: "H. Peter Anvin" Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Benjamin Herrenschmidt Cc: Bharata B Rao Cc: Frank Rowand Cc: Ingo Molnar Cc: Michael Ellerman Cc: Nathan Fontenot Cc: Paul Mackerras Cc: Rob Herring Cc: Stewart Smith Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/of/fdt.c | 19 +++++++++++++++++++ include/linux/of_fdt.h | 1 + mm/Kconfig | 2 +- 3 files changed, 21 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/drivers/of/fdt.c b/drivers/of/fdt.c index c89d5d231a0e..c9b5cac03b36 100644 --- a/drivers/of/fdt.c +++ b/drivers/of/fdt.c @@ -1015,6 +1015,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, const char *type = of_get_flat_dt_prop(node, "device_type", NULL); const __be32 *reg, *endp; int l; + bool hotpluggable; /* We are scanning "memory" nodes only */ if (type == NULL) { @@ -1034,6 +1035,7 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, return 0; endp = reg + (l / sizeof(__be32)); + hotpluggable = of_get_flat_dt_prop(node, "hotpluggable", NULL); pr_debug("memory scan node %s, reg size %d,\n", uname, l); @@ -1049,6 +1051,13 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname, (unsigned long long)size); early_init_dt_add_memory_arch(base, size); + + if (!hotpluggable) + continue; + + if (early_init_dt_mark_hotplug_memory_arch(base, size)) + pr_warn("failed to mark hotplug range 0x%llx - 0x%llx\n", + base, base + size); } return 0; @@ -1146,6 +1155,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) memblock_add(base, size); } +int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size) +{ + return memblock_mark_hotplug(base, size); +} + int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { @@ -1168,6 +1182,11 @@ void __init __weak early_init_dt_add_memory_arch(u64 base, u64 size) WARN_ON(1); } +int __init __weak early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size) +{ + return -ENOSYS; +} + int __init __weak early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool nomap) { diff --git a/include/linux/of_fdt.h b/include/linux/of_fdt.h index 4341f32516d8..271b3fdf0070 100644 --- a/include/linux/of_fdt.h +++ b/include/linux/of_fdt.h @@ -71,6 +71,7 @@ extern int early_init_dt_scan_chosen_stdout(void); extern void early_init_fdt_scan_reserved_mem(void); extern void early_init_fdt_reserve_self(void); extern void early_init_dt_add_memory_arch(u64 base, u64 size); +extern int early_init_dt_mark_hotplug_memory_arch(u64 base, u64 size); extern int early_init_dt_reserve_memory_arch(phys_addr_t base, phys_addr_t size, bool no_map); extern void * early_init_dt_alloc_memory_arch(u64 size, u64 align); diff --git a/mm/Kconfig b/mm/Kconfig index 061b46b18029..33a9b06ec618 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,7 +153,7 @@ config MOVABLE_NODE bool "Enable to assign a node which has only movable memory" depends on HAVE_MEMBLOCK depends on NO_BOOTMEM - depends on X86_64 || MEMORY_HOTPLUG + depends on X86_64 || OF_EARLY_FLATTREE || MEMORY_HOTPLUG depends on NUMA default n help -- cgit v1.2.3 From c7142aead87aa5026e4b57671c7dbb1706b02606 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Mon, 12 Dec 2016 16:43:09 -0800 Subject: mm/pkeys: generate pkey system call code only if ARCH_HAS_PKEYS is selected Having code for the pkey_mprotect, pkey_alloc and pkey_free system calls makes only sense if ARCH_HAS_PKEYS is selected. If not selected these system calls will always return -ENOSPC or -EINVAL. To simplify things and have less code generate the pkey system call code only if ARCH_HAS_PKEYS is selected. For architectures which have already wired up the system calls, but do not select ARCH_HAS_PKEYS this will result in less generated code and a different return code: the three system calls will now always return -ENOSYS, using the cond_syscall mechanism. For architectures which have not wired up the system calls less unreachable code will be generated. Link: http://lkml.kernel.org/r/20161114111251.70084-1-heiko.carstens@de.ibm.com Signed-off-by: Heiko Carstens Acked-by: Dave Hansen Cc: Mark Rutland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index c5ba2aae0f54..cc2459c57f60 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -497,6 +497,8 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, return do_mprotect_pkey(start, len, prot, -1); } +#ifdef CONFIG_ARCH_HAS_PKEYS + SYSCALL_DEFINE4(pkey_mprotect, unsigned long, start, size_t, len, unsigned long, prot, int, pkey) { @@ -547,3 +549,5 @@ SYSCALL_DEFINE1(pkey_free, int, pkey) */ return ret; } + +#endif /* CONFIG_ARCH_HAS_PKEYS */ -- cgit v1.2.3 From c70b647d381cba1899c953b0016b7dc185892f90 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Mon, 12 Dec 2016 16:43:17 -0800 Subject: mm/filemap.c: add comment for confusing logic in page_cache_tree_insert() Unlike THP, hugetlb pages are represented by one entry in the radix-tree. [akpm@linux-foundation.org: tweak comment] Link: http://lkml.kernel.org/r/20161110163640.126124-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Acked-by: Johannes Weiner Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index 50b52fe51937..caa779f8797f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -169,7 +169,10 @@ static int page_cache_tree_insert(struct address_space *mapping, static void page_cache_tree_delete(struct address_space *mapping, struct page *page, void *shadow) { - int i, nr = PageHuge(page) ? 1 : hpage_nr_pages(page); + int i, nr; + + /* hugetlb pages are represented by one entry in the radix tree */ + nr = PageHuge(page) ? 1 : hpage_nr_pages(page); VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); -- cgit v1.2.3 From f1f5929cd9715c1cdfe07a890f12ac7d2c5304ec Mon Sep 17 00:00:00 2001 From: Jérémy Lefaure Date: Mon, 12 Dec 2016 16:43:23 -0800 Subject: shmem: fix compilation warnings on unused functions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling shmem.c with SHMEM and TRANSAPRENT_HUGE_PAGECACHE enabled raises warnings on two unused functions when CONFIG_TMPFS and CONFIG_SYSFS are both disabled: mm/shmem.c:390:20: warning: `shmem_format_huge' defined but not used [-Wunused-function] static const char *shmem_format_huge(int huge) ^~~~~~~~~~~~~~~~~ mm/shmem.c:373:12: warning: `shmem_parse_huge' defined but not used [-Wunused-function] static int shmem_parse_huge(const char *str) ^~~~~~~~~~~~~~~~ A conditional compilation on tmpfs or sysfs removes the warnings. Link: http://lkml.kernel.org/r/20161118055749.11313-1-jeremy.lefaure@lse.epita.fr Signed-off-by: Jérémy Lefaure Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index ba0d7644ee20..ec7aa562343e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -370,6 +370,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, int shmem_huge __read_mostly; +#if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) { if (!strcmp(str, "never")) @@ -407,6 +408,7 @@ static const char *shmem_format_huge(int huge) return "bad_val"; } } +#endif static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, struct shrink_control *sc, unsigned long nr_to_split) -- cgit v1.2.3 From 9491ae4aade6814afcfa67f4eb3e3342c2b39750 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 12 Dec 2016 16:43:26 -0800 Subject: mm: don't cap request size based on read-ahead setting We ran into a funky issue, where someone doing 256K buffered reads saw 128K requests at the device level. Turns out it is read-ahead capping the request size, since we use 128K as the default setting. This doesn't make a lot of sense - if someone is issuing 256K reads, they should see 256K reads, regardless of the read-ahead setting, if the underlying device can support a 256K read in a single command. This patch introduces a bdi hint, io_pages. This is the soft max IO size for the lower level, I've hooked it up to the bdev settings here. Read-ahead is modified to issue the maximum of the user request size, and the read-ahead max size, but capped to the max request size on the device side. The latter is done to avoid reading ahead too much, if the application asks for a huge read. With this patch, the kernel behaves like the application expects. Link: http://lkml.kernel.org/r/1479498073-8657-1-git-send-email-axboe@fb.com Signed-off-by: Jens Axboe Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- block/blk-settings.c | 1 + block/blk-sysfs.c | 1 + include/linux/backing-dev-defs.h | 1 + mm/readahead.c | 39 ++++++++++++++++++++++++++++----------- 4 files changed, 31 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/block/blk-settings.c b/block/blk-settings.c index f679ae122843..65f16cf4f850 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -249,6 +249,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); limits->max_sectors = max_sectors; + q->backing_dev_info.io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 9cc8d7c5439a..ea374e820775 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -212,6 +212,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) spin_lock_irq(q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; + q->backing_dev_info.io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(q->queue_lock); return ret; diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index c357f27d5483..b8144b2d59ce 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -136,6 +136,7 @@ struct bdi_writeback { struct backing_dev_info { struct list_head bdi_list; unsigned long ra_pages; /* max readahead in PAGE_SIZE units */ + unsigned long io_pages; /* max allowed IO size */ unsigned int capabilities; /* Device capabilities */ congested_fn *congested_fn; /* Function pointer if device is md/dm */ void *congested_data; /* Pointer to aux data for congested func */ diff --git a/mm/readahead.c b/mm/readahead.c index c8a955b1297e..c4ca70239233 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -207,12 +207,21 @@ out: * memory at once. */ int force_page_cache_readahead(struct address_space *mapping, struct file *filp, - pgoff_t offset, unsigned long nr_to_read) + pgoff_t offset, unsigned long nr_to_read) { + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + struct file_ra_state *ra = &filp->f_ra; + unsigned long max_pages; + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) return -EINVAL; - nr_to_read = min(nr_to_read, inode_to_bdi(mapping->host)->ra_pages); + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + max_pages = max_t(unsigned long, bdi->io_pages, ra->ra_pages); + nr_to_read = min(nr_to_read, max_pages); while (nr_to_read) { int err; @@ -369,9 +378,17 @@ ondemand_readahead(struct address_space *mapping, bool hit_readahead_marker, pgoff_t offset, unsigned long req_size) { - unsigned long max = ra->ra_pages; + struct backing_dev_info *bdi = inode_to_bdi(mapping->host); + unsigned long max_pages = ra->ra_pages; pgoff_t prev_offset; + /* + * If the request exceeds the readahead window, allow the read to + * be up to the optimal hardware IO size + */ + if (req_size > max_pages && bdi->io_pages > max_pages) + max_pages = min(req_size, bdi->io_pages); + /* * start of file */ @@ -385,7 +402,7 @@ ondemand_readahead(struct address_space *mapping, if ((offset == (ra->start + ra->size - ra->async_size) || offset == (ra->start + ra->size))) { ra->start += ra->size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -400,16 +417,16 @@ ondemand_readahead(struct address_space *mapping, pgoff_t start; rcu_read_lock(); - start = page_cache_next_hole(mapping, offset + 1, max); + start = page_cache_next_hole(mapping, offset + 1, max_pages); rcu_read_unlock(); - if (!start || start - offset > max) + if (!start || start - offset > max_pages) return 0; ra->start = start; ra->size = start - offset; /* old async_size */ ra->size += req_size; - ra->size = get_next_ra_size(ra, max); + ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; goto readit; } @@ -417,7 +434,7 @@ ondemand_readahead(struct address_space *mapping, /* * oversize read */ - if (req_size > max) + if (req_size > max_pages) goto initial_readahead; /* @@ -433,7 +450,7 @@ ondemand_readahead(struct address_space *mapping, * Query the page cache and look for the traces(cached history pages) * that a sequential stream would leave behind. */ - if (try_context_readahead(mapping, ra, offset, req_size, max)) + if (try_context_readahead(mapping, ra, offset, req_size, max_pages)) goto readit; /* @@ -444,7 +461,7 @@ ondemand_readahead(struct address_space *mapping, initial_readahead: ra->start = offset; - ra->size = get_init_ra_size(req_size, max); + ra->size = get_init_ra_size(req_size, max_pages); ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size; readit: @@ -454,7 +471,7 @@ readit: * the resulted next readahead window into the current one. */ if (offset == ra->start && ra->size == ra->async_size) { - ra->async_size = get_next_ra_size(ra, max); + ra->async_size = get_next_ra_size(ra, max_pages); ra->size += ra->async_size; } -- cgit v1.2.3 From 91a45f71078a6569ec3ca5bef74e1ab58121d80e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:32 -0800 Subject: mm: khugepaged: close use-after-free race during shmem collapsing Patch series "mm: workingset: radix tree subtleties & single-page file refaults", v3. This is another revision of the radix tree / workingset patches based on feedback from Jan and Kirill. This is a follow-up to d3798ae8c6f3 ("mm: filemap: don't plant shadow entries without radix tree node"). That patch fixed an issue that was caused mainly by the page cache sneaking special shadow page entries into the radix tree and relying on subtleties in the radix tree code to make that work. The fix also had to stop tracking refaults for single-page files because shadow pages stored as direct pointers in radix_tree_root->rnode weren't properly handled during tree extension. These patches make the radix tree code explicitely support and track such special entries, to eliminate the subtleties and to restore the thrash detection for single-page files. This patch (of 9): When a radix tree iteration drops the tree lock, another thread might swoop in and free the node holding the current slot. The iteration needs to do another tree lookup from the current index to continue. [kirill.shutemov@linux.intel.com: re-lookup for replacement] Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages") Link: http://lkml.kernel.org/r/20161117191138.22769-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 87e1a7ca3846..2779c63bdea0 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1403,6 +1403,9 @@ static void collapse_shmem(struct mm_struct *mm, spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_lookup_slot(&mapping->page_tree, index); + VM_BUG_ON_PAGE(page != radix_tree_deref_slot_protected(slot, + &mapping->tree_lock), page); VM_BUG_ON_PAGE(page_mapped(page), page); /* @@ -1426,6 +1429,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(slot, new_page + (index % HPAGE_PMD_NR)); + slot = radix_tree_iter_next(&iter); index++; continue; out_lru: @@ -1537,6 +1541,7 @@ tree_unlocked: putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); + slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); -- cgit v1.2.3 From 59749e6ce53735d8b696763742225f126e94603f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:35 -0800 Subject: mm: khugepaged: fix radix tree node leak in shmem collapse error path The radix tree counts valid entries in each tree node. Entries stored in the tree cannot be removed by simpling storing NULL in the slot or the internal counters will be off and the node never gets freed again. When collapsing a shmem page fails, restore the holes that were filled with radix_tree_insert() with a proper radix tree deletion. Fixes: f3f0e1d2150b ("khugepaged: add support of collapse for tmpfs/shmem pages") Link: http://lkml.kernel.org/r/20161117191138.22769-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reported-by: Jan Kara Acked-by: Kirill A. Shutemov Reviewed-by: Jan Kara Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2779c63bdea0..5d7c006373d3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1525,9 +1525,11 @@ tree_unlocked: if (!page || iter.index < page->index) { if (!nr_none) break; - /* Put holes back where they were */ - radix_tree_replace_slot(slot, NULL); nr_none--; + /* Put holes back where they were */ + radix_tree_delete(&mapping->page_tree, + iter.index); + slot = radix_tree_iter_next(&iter); continue; } -- cgit v1.2.3 From b936887e8739d3fa83f87d899f68d136735d9816 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:38 -0800 Subject: mm: workingset: turn shadow node shrinker bugs into warnings When the shadow page shrinker tries to reclaim a radix tree node but finds it in an unexpected state - it should contain no pages, and non-zero shadow entries - there is no need to kill the executing task or even the entire system. Warn about the invalid state, then leave that tree node be. Simply don't put it back on the shadow LRU for future reclaim and move on. Link: http://lkml.kernel.org/r/20161117191138.22769-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index fb1f9183d89a..98f830897b1b 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -418,23 +418,27 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - BUG_ON(!workingset_node_shadows(node)); - BUG_ON(workingset_node_pages(node)); - + if (WARN_ON_ONCE(!workingset_node_shadows(node))) + goto out_invalid; + if (WARN_ON_ONCE(workingset_node_pages(node))) + goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { - BUG_ON(!radix_tree_exceptional_entry(node->slots[i])); + if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) + goto out_invalid; + if (WARN_ON_ONCE(!mapping->nrexceptional)) + goto out_invalid; node->slots[i] = NULL; workingset_node_shadows_dec(node); - BUG_ON(!mapping->nrexceptional); mapping->nrexceptional--; } } - BUG_ON(workingset_node_shadows(node)); + if (WARN_ON_ONCE(workingset_node_shadows(node))) + goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); - if (!__radix_tree_delete_node(&mapping->page_tree, node)) - BUG(); + __radix_tree_delete_node(&mapping->page_tree, node); +out_invalid: spin_unlock(&mapping->tree_lock); ret = LRU_REMOVED_RETRY; out: -- cgit v1.2.3 From f7942430e40f14c6d2ca48a1875add509938c07d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:41 -0800 Subject: lib: radix-tree: native accounting of exceptional entries The way the page cache is sneaking shadow entries of evicted pages into the radix tree past the node entry accounting and tracking them manually in the upper bits of node->count is fraught with problems. These shadow entries are marked in the tree as exceptional entries, which are a native concept to the radix tree. Maintain an explicit counter of exceptional entries in the radix tree node. Subsequent patches will switch shadow entry tracking over to that counter. DAX and shmem are the other users of exceptional entries. Since slot replacements that change the entry type from regular to exceptional must now be accounted, introduce a __radix_tree_replace() function that does replacement and accounting, and switch DAX and shmem over. The increase in radix tree node size is temporary. A followup patch switches the shadow tracking to this new scheme and we'll no longer need the upper bits in node->count and shrink that back to one byte. Link: http://lkml.kernel.org/r/20161117192945.GA23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 5 +++-- include/linux/radix-tree.h | 10 +++++++--- lib/radix-tree.c | 46 +++++++++++++++++++++++++++++++++++++++++++--- mm/shmem.c | 8 ++++---- 4 files changed, 57 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index 014defd2e744..db78bae0dc0f 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -643,12 +643,13 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, } mapping->nrexceptional++; } else { + struct radix_tree_node *node; void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - radix_tree_replace_slot(slot, new_entry); + __radix_tree_replace(page_tree, node, slot, new_entry); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index af3581b8a451..7ced8a70cc8b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -85,9 +85,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) struct radix_tree_node { - unsigned char shift; /* Bits remaining in each slot */ - unsigned char offset; /* Slot offset in parent */ - unsigned int count; + unsigned char shift; /* Bits remaining in each slot */ + unsigned char offset; /* Slot offset in parent */ + unsigned int count; /* Total entry count */ + unsigned char exceptional; /* Exceptional entry count */ union { struct { /* Used when ascending tree */ @@ -276,6 +277,9 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 8e6d552c40dd..7885796d35ae 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -220,10 +220,10 @@ static void dump_node(struct radix_tree_node *node, unsigned long index) { unsigned long i; - pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d parent %p\n", + pr_debug("radix node: %p offset %d tags %lx %lx %lx shift %d count %d exceptional %d parent %p\n", node, node->offset, node->tags[0][0], node->tags[1][0], node->tags[2][0], - node->shift, node->count, node->parent); + node->shift, node->count, node->exceptional, node->parent); for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { unsigned long first = index | (i << node->shift); @@ -522,8 +522,13 @@ static int radix_tree_extend(struct radix_tree_root *root, node->offset = 0; node->count = 1; node->parent = NULL; - if (radix_tree_is_internal_node(slot)) + if (radix_tree_is_internal_node(slot)) { entry_to_node(slot)->parent = node; + } else { + /* Moving an exceptional root->rnode to a node */ + if (radix_tree_exceptional_entry(slot)) + node->exceptional = 1; + } node->slots[0] = slot; slot = node_to_entry(node); rcu_assign_pointer(root->rnode, slot); @@ -649,6 +654,8 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, if (node) { unsigned offset = get_slot_offset(node, slot); node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -746,6 +753,37 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + void *old = rcu_dereference_raw(*slot); + int exceptional; + + WARN_ON_ONCE(radix_tree_is_internal_node(item)); + WARN_ON_ONCE(!!item - !!old); + + exceptional = !!radix_tree_exceptional_entry(item) - + !!radix_tree_exceptional_entry(old); + + WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + + if (node) + node->exceptional += exceptional; + + rcu_assign_pointer(*slot, item); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root @@ -1561,6 +1599,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root, delete_sibling_entries(node, node_to_entry(slot), offset); node->slots[offset] = NULL; node->count--; + if (radix_tree_exceptional_entry(entry)) + node->exceptional--; __radix_tree_delete_node(root, node); diff --git a/mm/shmem.c b/mm/shmem.c index ec7aa562343e..3149ddee8f55 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -300,18 +300,18 @@ void shmem_uncharge(struct inode *inode, long pages) static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { + struct radix_tree_node *node; void **pslot; void *item; VM_BUG_ON(!expected); VM_BUG_ON(!replacement); - pslot = radix_tree_lookup_slot(&mapping->page_tree, index); - if (!pslot) + item = __radix_tree_lookup(&mapping->page_tree, index, &node, &pslot); + if (!item) return -ENOENT; - item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock); if (item != expected) return -ENOENT; - radix_tree_replace_slot(pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, replacement); return 0; } -- cgit v1.2.3 From 6d75f366b9242f9b17ed7d0b0604d7460f818f21 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:43 -0800 Subject: lib: radix-tree: check accounting of existing slot replacement users The bug in khugepaged fixed earlier in this series shows that radix tree slot replacement is fragile; and it will become more so when not only NULL<->!NULL transitions need to be caught but transitions from and to exceptional entries as well. We need checks. Re-implement radix_tree_replace_slot() on top of the sanity-checked __radix_tree_replace(). This requires existing callers to also pass the radix tree root, but it'll warn us when somebody replaces slots with contents that need proper accounting (transitions between NULL entries, real entries, exceptional entries) and where a replacement through the slot pointer would corrupt the radix tree node counts. Link: http://lkml.kernel.org/r/20161117193021.GB23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/gmap.c | 2 +- drivers/sh/intc/virq.c | 2 +- fs/dax.c | 4 +-- include/linux/radix-tree.h | 16 ++------- lib/radix-tree.c | 63 +++++++++++++++++++++++++++-------- mm/filemap.c | 4 +-- mm/khugepaged.c | 5 +-- mm/migrate.c | 4 +-- mm/truncate.c | 2 +- tools/testing/radix-tree/multiorder.c | 2 +- 10 files changed, 64 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 3ba622702ce4..ec1f0dedb948 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -1015,7 +1015,7 @@ static inline void gmap_insert_rmap(struct gmap *sg, unsigned long vmaddr, if (slot) { rmap->next = radix_tree_deref_slot_protected(slot, &sg->guest_table_lock); - radix_tree_replace_slot(slot, rmap); + radix_tree_replace_slot(&sg->host_to_rmap, slot, rmap); } else { rmap->next = NULL; radix_tree_insert(&sg->host_to_rmap, vmaddr >> PAGE_SHIFT, diff --git a/drivers/sh/intc/virq.c b/drivers/sh/intc/virq.c index e7899624aa0b..35bbe288ddb4 100644 --- a/drivers/sh/intc/virq.c +++ b/drivers/sh/intc/virq.c @@ -254,7 +254,7 @@ restart: radix_tree_tag_clear(&d->tree, entry->enum_id, INTC_TAG_VIRQ_NEEDS_ALLOC); - radix_tree_replace_slot((void **)entries[i], + radix_tree_replace_slot(&d->tree, (void **)entries[i], &intc_irq_xlate[irq]); } diff --git a/fs/dax.c b/fs/dax.c index db78bae0dc0f..85930c2a2749 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -342,7 +342,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +356,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7ced8a70cc8b..2d1b9b8be983 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -249,20 +249,6 @@ static inline int radix_tree_exception(void *arg) return unlikely((unsigned long)arg & RADIX_TREE_ENTRY_MASK); } -/** - * radix_tree_replace_slot - replace item in a slot - * @pslot: pointer to slot, returned by radix_tree_lookup_slot - * @item: new item to store in the slot. - * - * For use with radix_tree_lookup_slot(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -static inline void radix_tree_replace_slot(void **pslot, void *item) -{ - BUG_ON(radix_tree_is_internal_node(item)); - rcu_assign_pointer(*pslot, item); -} - int __radix_tree_create(struct radix_tree_root *root, unsigned long index, unsigned order, struct radix_tree_node **nodep, void ***slotp); @@ -280,6 +266,8 @@ void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item); +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 7885796d35ae..f91d5b0af654 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -753,19 +753,10 @@ void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index) } EXPORT_SYMBOL(radix_tree_lookup); -/** - * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. - * - * For use with __radix_tree_lookup(). Caller must hold tree write locked - * across slot lookup and replacement. - */ -void __radix_tree_replace(struct radix_tree_root *root, - struct radix_tree_node *node, - void **slot, void *item) +static void replace_slot(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item, + bool warn_typeswitch) { void *old = rcu_dereference_raw(*slot); int exceptional; @@ -776,7 +767,7 @@ void __radix_tree_replace(struct radix_tree_root *root, exceptional = !!radix_tree_exceptional_entry(item) - !!radix_tree_exceptional_entry(old); - WARN_ON_ONCE(exceptional && !node && slot != (void **)&root->rnode); + WARN_ON_ONCE(warn_typeswitch && exceptional); if (node) node->exceptional += exceptional; @@ -784,6 +775,50 @@ void __radix_tree_replace(struct radix_tree_root *root, rcu_assign_pointer(*slot, item); } +/** + * __radix_tree_replace - replace item in a slot + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * + * For use with __radix_tree_lookup(). Caller must hold tree write locked + * across slot lookup and replacement. + */ +void __radix_tree_replace(struct radix_tree_root *root, + struct radix_tree_node *node, + void **slot, void *item) +{ + /* + * This function supports replacing exceptional entries, but + * that needs accounting against the node unless the slot is + * root->rnode. + */ + replace_slot(root, node, slot, item, + !node && slot != (void **)&root->rnode); +} + +/** + * radix_tree_replace_slot - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_lookup_slot(), radix_tree_gang_lookup_slot(), + * radix_tree_gang_lookup_tag_slot(). Caller must hold tree write locked + * across slot lookup and replacement. + * + * NOTE: This cannot be used to switch between non-entries (empty slots), + * regular entries, and exceptional entries, as that requires accounting + * inside the radix tree node. When switching from one type of entry to + * another, use __radix_tree_lookup() and __radix_tree_replace(). + */ +void radix_tree_replace_slot(struct radix_tree_root *root, + void **slot, void *item) +{ + replace_slot(root, NULL, slot, item, true); +} + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root diff --git a/mm/filemap.c b/mm/filemap.c index caa779f8797f..1ba726aef708 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -147,7 +147,7 @@ static int page_cache_tree_insert(struct address_space *mapping, false); } } - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, slot, page); mapping->nrpages++; if (node) { workingset_node_pages_inc(node); @@ -196,7 +196,7 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(slot, shadow); + radix_tree_replace_slot(&mapping->page_tree, slot, shadow); if (!node) break; diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 5d7c006373d3..7a50c726c5ae 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1426,7 +1426,7 @@ static void collapse_shmem(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - radix_tree_replace_slot(slot, + radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); slot = radix_tree_iter_next(&iter); @@ -1538,7 +1538,8 @@ tree_unlocked: /* Unfreeze the page. */ list_del(&page->lru); page_ref_unfreeze(page, 2); - radix_tree_replace_slot(slot, page); + radix_tree_replace_slot(&mapping->page_tree, + slot, page); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); diff --git a/mm/migrate.c b/mm/migrate.c index 66ce6b490b13..0ed24b1fa77b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -482,7 +482,7 @@ int migrate_page_move_mapping(struct address_space *mapping, SetPageDirty(newpage); } - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); /* * Drop cache reference from old page by unfreezing @@ -556,7 +556,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, get_page(newpage); - radix_tree_replace_slot(pslot, newpage); + radix_tree_replace_slot(&mapping->page_tree, pslot, newpage); page_ref_unfreeze(page, expected_count - 1); diff --git a/mm/truncate.c b/mm/truncate.c index 8d8c62d89e6d..3c631c357873 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -49,7 +49,7 @@ static void clear_exceptional_entry(struct address_space *mapping, goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(slot, NULL); + radix_tree_replace_slot(&mapping->page_tree, slot, NULL); mapping->nrexceptional--; if (!node) goto unlock; diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 05d7bc488971..d1be94667a30 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -146,7 +146,7 @@ static void multiorder_check(unsigned long index, int order) slot = radix_tree_lookup_slot(&tree, index); free(*slot); - radix_tree_replace_slot(slot, item2); + radix_tree_replace_slot(&tree, slot, item2); for (i = min; i < max; i++) { struct item *item = item_lookup(&tree, i); assert(item != 0); -- cgit v1.2.3 From 4d693d08607ab319095ec8942909df4b4aebdf66 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:49 -0800 Subject: lib: radix-tree: update callback for changing leaf nodes Support handing __radix_tree_replace() a callback that gets invoked for all leaf nodes that change or get freed as a result of the slot replacement, to assist users tracking nodes with node->private_list. This prepares for putting page cache shadow entries into the radix tree root again and drastically simplifying the shadow tracking. Link: http://lkml.kernel.org/r/20161117193134.GD23430@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Jan Kara Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 ++- include/linux/radix-tree.h | 4 +++- lib/radix-tree.c | 42 +++++++++++++++++++++++++++++------------- mm/shmem.c | 3 ++- 4 files changed, 36 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/fs/dax.c b/fs/dax.c index 85930c2a2749..6916ed37d463 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -649,7 +649,8 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - __radix_tree_replace(page_tree, node, slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 2d1b9b8be983..15c972ea9510 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -263,9 +263,11 @@ void *__radix_tree_lookup(struct radix_tree_root *root, unsigned long index, struct radix_tree_node **nodep, void ***slotp); void *radix_tree_lookup(struct radix_tree_root *, unsigned long); void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long); +typedef void (*radix_tree_update_node_t)(struct radix_tree_node *, void *); void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item); + void **slot, void *item, + radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); bool __radix_tree_delete_node(struct radix_tree_root *root, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 5d8930f3b3d8..df4ff18dd63c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,7 +325,6 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; - node->count = 0; kmem_cache_free(radix_tree_node_cachep, node); } @@ -542,7 +541,9 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root) +static inline bool radix_tree_shrink(struct radix_tree_root *root, + radix_tree_update_node_t update_node, + void *private) { bool shrunk = false; @@ -597,8 +598,12 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) * also results in a stale slot). So tag the slot as indirect * to force callers to retry. */ - if (!radix_tree_is_internal_node(child)) + node->count = 0; + if (!radix_tree_is_internal_node(child)) { node->slots[0] = RADIX_TREE_RETRY; + if (update_node) + update_node(node, private); + } radix_tree_node_free(node); shrunk = true; @@ -608,7 +613,8 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root) } static bool delete_node(struct radix_tree_root *root, - struct radix_tree_node *node) + struct radix_tree_node *node, + radix_tree_update_node_t update_node, void *private) { bool deleted = false; @@ -617,7 +623,8 @@ static bool delete_node(struct radix_tree_root *root, if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root); + deleted |= radix_tree_shrink(root, update_node, + private); return deleted; } @@ -880,17 +887,20 @@ static void replace_slot(struct radix_tree_root *root, /** * __radix_tree_replace - replace item in a slot - * @root: radix tree root - * @node: pointer to tree node - * @slot: pointer to slot in @node - * @item: new item to store in the slot. + * @root: radix tree root + * @node: pointer to tree node + * @slot: pointer to slot in @node + * @item: new item to store in the slot. + * @update_node: callback for changing leaf nodes + * @private: private data to pass to @update_node * * For use with __radix_tree_lookup(). Caller must hold tree write locked * across slot lookup and replacement. */ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, - void **slot, void *item) + void **slot, void *item, + radix_tree_update_node_t update_node, void *private) { /* * This function supports replacing exceptional entries and @@ -900,7 +910,13 @@ void __radix_tree_replace(struct radix_tree_root *root, replace_slot(root, node, slot, item, !node && slot != (void **)&root->rnode); - delete_node(root, node); + if (!node) + return; + + if (update_node) + update_node(node, private); + + delete_node(root, node, update_node, private); } /** @@ -1585,7 +1601,7 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) bool __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node); + return delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, @@ -1642,7 +1658,7 @@ void *radix_tree_delete_item(struct radix_tree_root *root, node_tag_clear(root, node, tag, offset); delete_sibling_entries(node, node_to_entry(slot), offset); - __radix_tree_replace(root, node, slot, NULL); + __radix_tree_replace(root, node, slot, NULL, NULL, NULL); return entry; } diff --git a/mm/shmem.c b/mm/shmem.c index 3149ddee8f55..abd7403aba41 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -311,7 +311,8 @@ static int shmem_radix_tree_replace(struct address_space *mapping, return -ENOENT; if (item != expected) return -ENOENT; - __radix_tree_replace(&mapping->page_tree, node, pslot, replacement); + __radix_tree_replace(&mapping->page_tree, node, pslot, + replacement, NULL, NULL); return 0; } -- cgit v1.2.3 From 14b468791fa955d442f962fdf5207dfd39a131c8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:52 -0800 Subject: mm: workingset: move shadow entry tracking to radix tree exceptional tracking Currently, we track the shadow entries in the page cache in the upper bits of the radix_tree_node->count, behind the back of the radix tree implementation. Because the radix tree code has no awareness of them, we rely on random subtleties throughout the implementation (such as the node->count != 1 check in the shrinking code, which is meant to exclude multi-entry nodes but also happens to skip nodes with only one shadow entry, as that's accounted in the upper bits). This is error prone and has, in fact, caused the bug fixed in d3798ae8c6f3 ("mm: filemap: don't plant shadow entries without radix tree node"). To remove these subtleties, this patch moves shadow entry tracking from the upper bits of node->count to the existing counter for exceptional entries. node->count goes back to being a simple counter of valid entries in the tree node and can be shrunk to a single byte. This vastly simplifies the page cache code. All accounting happens natively inside the radix tree implementation, and maintaining the LRU linkage of shadow nodes is consolidated into a single function in the workingset code that is called for leaf nodes affected by a change in the page cache tree. This also removes the last user of the __radix_delete_node() return value. Eliminate it. Link: http://lkml.kernel.org/r/20161117193211.GE23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 8 ++----- include/linux/swap.h | 34 +--------------------------- lib/radix-tree.c | 25 +++++---------------- mm/filemap.c | 54 +++++--------------------------------------- mm/truncate.c | 21 +++-------------- mm/workingset.c | 56 +++++++++++++++++++++++++++++++++++----------- 6 files changed, 60 insertions(+), 138 deletions(-) (limited to 'mm') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 15c972ea9510..744486057e9e 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,14 +80,10 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) -/* Internally used bits of node->count */ -#define RADIX_TREE_COUNT_SHIFT (RADIX_TREE_MAP_SHIFT + 1) -#define RADIX_TREE_COUNT_MASK ((1UL << RADIX_TREE_COUNT_SHIFT) - 1) - struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ - unsigned int count; /* Total entry count */ + unsigned char count; /* Total entry count */ unsigned char exceptional; /* Exceptional entry count */ union { struct { @@ -270,7 +266,7 @@ void __radix_tree_replace(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node); void *radix_tree_delete_item(struct radix_tree_root *, unsigned long, void *); void *radix_tree_delete(struct radix_tree_root *, unsigned long); diff --git a/include/linux/swap.h b/include/linux/swap.h index a56523cefb9b..09b212d37f1d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -246,39 +246,7 @@ struct swap_info_struct { void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); void workingset_activation(struct page *page); -extern struct list_lru workingset_shadow_nodes; - -static inline unsigned int workingset_node_pages(struct radix_tree_node *node) -{ - return node->count & RADIX_TREE_COUNT_MASK; -} - -static inline void workingset_node_pages_inc(struct radix_tree_node *node) -{ - node->count++; -} - -static inline void workingset_node_pages_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_pages(node)); - node->count--; -} - -static inline unsigned int workingset_node_shadows(struct radix_tree_node *node) -{ - return node->count >> RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_inc(struct radix_tree_node *node) -{ - node->count += 1U << RADIX_TREE_COUNT_SHIFT; -} - -static inline void workingset_node_shadows_dec(struct radix_tree_node *node) -{ - VM_WARN_ON_ONCE(!workingset_node_shadows(node)); - node->count -= 1U << RADIX_TREE_COUNT_SHIFT; -} +void workingset_update_node(struct radix_tree_node *node, void *private); /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index df4ff18dd63c..9dbfaac05e6c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -541,12 +541,10 @@ out: * radix_tree_shrink - shrink radix tree to minimum height * @root radix tree root */ -static inline bool radix_tree_shrink(struct radix_tree_root *root, +static inline void radix_tree_shrink(struct radix_tree_root *root, radix_tree_update_node_t update_node, void *private) { - bool shrunk = false; - for (;;) { struct radix_tree_node *node = root->rnode; struct radix_tree_node *child; @@ -606,26 +604,20 @@ static inline bool radix_tree_shrink(struct radix_tree_root *root, } radix_tree_node_free(node); - shrunk = true; } - - return shrunk; } -static bool delete_node(struct radix_tree_root *root, +static void delete_node(struct radix_tree_root *root, struct radix_tree_node *node, radix_tree_update_node_t update_node, void *private) { - bool deleted = false; - do { struct radix_tree_node *parent; if (node->count) { if (node == entry_to_node(root->rnode)) - deleted |= radix_tree_shrink(root, update_node, - private); - return deleted; + radix_tree_shrink(root, update_node, private); + return; } parent = node->parent; @@ -638,12 +630,9 @@ static bool delete_node(struct radix_tree_root *root, } radix_tree_node_free(node); - deleted = true; node = parent; } while (node); - - return deleted; } /** @@ -1595,13 +1584,11 @@ unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) * After clearing the slot at @index in @node from radix tree * rooted at @root, call this function to attempt freeing the * node and shrinking the tree. - * - * Returns %true if @node was freed, %false otherwise. */ -bool __radix_tree_delete_node(struct radix_tree_root *root, +void __radix_tree_delete_node(struct radix_tree_root *root, struct radix_tree_node *node) { - return delete_node(root, node, NULL, NULL); + delete_node(root, node, NULL, NULL); } static inline void delete_sibling_entries(struct radix_tree_node *node, diff --git a/mm/filemap.c b/mm/filemap.c index 1ba726aef708..dc3e5fce0b7b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -132,37 +132,19 @@ static int page_cache_tree_insert(struct address_space *mapping, if (!dax_mapping(mapping)) { if (shadowp) *shadowp = p; - if (node) - workingset_node_shadows_dec(node); } else { /* DAX can replace empty locked entry with a hole */ WARN_ON_ONCE(p != (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | RADIX_DAX_ENTRY_LOCK)); - /* DAX accounts exceptional entries as normal pages */ - if (node) - workingset_node_pages_dec(node); /* Wakeup waiters for exceptional entry lock */ dax_wake_mapping_entry_waiter(mapping, page->index, false); } } - radix_tree_replace_slot(&mapping->page_tree, slot, page); + __radix_tree_replace(&mapping->page_tree, node, slot, page, + workingset_update_node, mapping); mapping->nrpages++; - if (node) { - workingset_node_pages_inc(node); - /* - * Don't track node that contains actual pages. - * - * Avoid acquiring the list_lru lock if already - * untracked. The list_empty() test is safe as - * node->private_list is protected by - * mapping->tree_lock. - */ - if (!list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - } return 0; } @@ -185,8 +167,6 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - radix_tree_clear_tags(&mapping->page_tree, node, slot); - if (!node) { VM_BUG_ON_PAGE(nr != 1, page); /* @@ -196,33 +176,9 @@ static void page_cache_tree_delete(struct address_space *mapping, shadow = NULL; } - radix_tree_replace_slot(&mapping->page_tree, slot, shadow); - - if (!node) - break; - - workingset_node_pages_dec(node); - if (shadow) - workingset_node_shadows_inc(node); - else - if (__radix_tree_delete_node(&mapping->page_tree, node)) - continue; - - /* - * Track node that only contains shadow entries. DAX mappings - * contain no shadow entries and may contain other exceptional - * entries so skip those. - * - * Avoid acquiring the list_lru lock if already tracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!dax_mapping(mapping) && !workingset_node_pages(node) && - list_empty(&node->private_list)) { - node->private_data = mapping; - list_lru_add(&workingset_shadow_nodes, - &node->private_list); - } + radix_tree_clear_tags(&mapping->page_tree, node, slot); + __radix_tree_replace(&mapping->page_tree, node, slot, shadow, + workingset_update_node, mapping); } if (shadow) { diff --git a/mm/truncate.c b/mm/truncate.c index 3c631c357873..fd97f1dbce29 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -44,28 +44,13 @@ static void clear_exceptional_entry(struct address_space *mapping, * without the tree itself locked. These unlocked entries * need verification under the tree lock. */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, - &slot)) + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) goto unlock; if (*slot != entry) goto unlock; - radix_tree_replace_slot(&mapping->page_tree, slot, NULL); + __radix_tree_replace(&mapping->page_tree, node, slot, NULL, + workingset_update_node, mapping); mapping->nrexceptional--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, - &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); unlock: spin_unlock_irq(&mapping->tree_lock); } diff --git a/mm/workingset.c b/mm/workingset.c index 98f830897b1b..ef556bf1323d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -334,18 +335,45 @@ out: * point where they would still be useful. */ -struct list_lru workingset_shadow_nodes; +static struct list_lru shadow_nodes; + +void workingset_update_node(struct radix_tree_node *node, void *private) +{ + struct address_space *mapping = private; + + /* Only regular page cache has shadow entries */ + if (dax_mapping(mapping) || shmem_mapping(mapping)) + return; + + /* + * Track non-empty nodes that contain only shadow entries; + * unlink those that contain pages or are being freed. + * + * Avoid acquiring the list_lru lock when the nodes are + * already where they should be. The list_empty() test is safe + * as node->private_list is protected by &mapping->tree_lock. + */ + if (node->count && node->count == node->exceptional) { + if (list_empty(&node->private_list)) { + node->private_data = mapping; + list_lru_add(&shadow_nodes, &node->private_list); + } + } else { + if (!list_empty(&node->private_list)) + list_lru_del(&shadow_nodes, &node->private_list); + } +} static unsigned long count_shadow_nodes(struct shrinker *shrinker, struct shrink_control *sc) { - unsigned long shadow_nodes; unsigned long max_nodes; + unsigned long nodes; unsigned long pages; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc); + nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); if (sc->memcg) { @@ -372,10 +400,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, */ max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); - if (shadow_nodes <= max_nodes) + if (nodes <= max_nodes) return 0; - return shadow_nodes - max_nodes; + return nodes - max_nodes; } static enum lru_status shadow_lru_isolate(struct list_head *item, @@ -418,22 +446,25 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, * no pages, so we expect to be able to remove them all and * delete and free the empty node afterwards. */ - if (WARN_ON_ONCE(!workingset_node_shadows(node))) + if (WARN_ON_ONCE(!node->exceptional)) goto out_invalid; - if (WARN_ON_ONCE(workingset_node_pages(node))) + if (WARN_ON_ONCE(node->count != node->exceptional)) goto out_invalid; for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) { if (node->slots[i]) { if (WARN_ON_ONCE(!radix_tree_exceptional_entry(node->slots[i]))) goto out_invalid; + if (WARN_ON_ONCE(!node->exceptional)) + goto out_invalid; if (WARN_ON_ONCE(!mapping->nrexceptional)) goto out_invalid; node->slots[i] = NULL; - workingset_node_shadows_dec(node); + node->exceptional--; + node->count--; mapping->nrexceptional--; } } - if (WARN_ON_ONCE(workingset_node_shadows(node))) + if (WARN_ON_ONCE(node->exceptional)) goto out_invalid; inc_node_state(page_pgdat(virt_to_page(node)), WORKINGSET_NODERECLAIM); __radix_tree_delete_node(&mapping->page_tree, node); @@ -456,8 +487,7 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker, /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); - ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc, - shadow_lru_isolate, NULL); + ret = list_lru_shrink_walk(&shadow_nodes, sc, shadow_lru_isolate, NULL); local_irq_enable(); return ret; } @@ -496,7 +526,7 @@ static int __init workingset_init(void) pr_info("workingset: timestamp_bits=%d max_order=%d bucket_order=%u\n", timestamp_bits, max_order, bucket_order); - ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key); + ret = list_lru_init_key(&shadow_nodes, &shadow_nodes_key); if (ret) goto err; ret = register_shrinker(&workingset_shadow_shrinker); @@ -504,7 +534,7 @@ static int __init workingset_init(void) goto err_list_lru; return 0; err_list_lru: - list_lru_destroy(&workingset_shadow_nodes); + list_lru_destroy(&shadow_nodes); err: return ret; } -- cgit v1.2.3 From dbc446b88e7041cb1d076e51726ccee497cb6ee3 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:55 -0800 Subject: mm: workingset: restore refault tracking for single-page files Shadow entries in the page cache used to be accounted behind the radix tree implementation's back in the upper bits of node->count, and the radix tree code extending a single-entry tree with a shadow entry in root->rnode would corrupt that counter. As a result, we could not put shadow entries at index 0 if the tree didn't have any other entries, and that means no refault detection for any single-page file. Now that the shadow entries are tracked natively in the radix tree's exceptional counter, this is no longer necessary. Extending and shrinking the tree from and to single entries in root->rnode now does the right thing when the entry is exceptional, remove that limitation. Link: http://lkml.kernel.org/r/20161117193244.GF23430@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Jan Kara Cc: Kirill A. Shutemov Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index dc3e5fce0b7b..5b4dd03130da 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -167,14 +167,7 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index + i, &node, &slot); - if (!node) { - VM_BUG_ON_PAGE(nr != 1, page); - /* - * We need a node to properly account shadow - * entries. Don't plant any without. XXX - */ - shadow = NULL; - } + VM_BUG_ON_PAGE(!node && nr != 1, page); radix_tree_clear_tags(&mapping->page_tree, node, slot); __radix_tree_replace(&mapping->page_tree, node, slot, shadow, -- cgit v1.2.3 From b53889987862b69179560e50992f7ea8a5eb61ed Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 12 Dec 2016 16:43:58 -0800 Subject: mm: workingset: update shadow limit to reflect bigger active list Since commit 59dc76b0d4df ("mm: vmscan: reduce size of inactive file list") the size of the active file list is no longer limited to half of memory. Increase the shadow node limit accordingly to avoid throwing out shadow entries that might still result in eligible refaults. The exact size of the active list now depends on the overall size of the page cache, but converges toward taking up most of the space: In mm/vmscan.c::inactive_list_is_low(), * total target max * memory ratio inactive * ------------------------------------- * 10MB 1 5MB * 100MB 1 50MB * 1GB 3 250MB * 10GB 10 0.9GB * 100GB 31 3GB * 1TB 101 10GB * 10TB 320 32GB It would be possible to apply the same precise ratios when determining the limit for radix tree nodes containing shadow entries, but since it is merely an approximation of the oldest refault distances in the wild and the code also makes assumptions about the node population density, keep it simple and always target the full cache size. While at it, clarify the comment and the formula for memory footprint. Link: http://lkml.kernel.org/r/20161117214701.29000-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/workingset.c | 44 +++++++++++++++++++++++++------------------- 1 file changed, 25 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/workingset.c b/mm/workingset.c index ef556bf1323d..241fa5d6b3b2 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -369,40 +369,46 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, { unsigned long max_nodes; unsigned long nodes; - unsigned long pages; + unsigned long cache; /* list_lru lock nests inside IRQ-safe mapping->tree_lock */ local_irq_disable(); nodes = list_lru_shrink_count(&shadow_nodes, sc); local_irq_enable(); - if (sc->memcg) { - pages = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, - LRU_ALL_FILE); - } else { - pages = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + - node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); - } - /* - * Active cache pages are limited to 50% of memory, and shadow - * entries that represent a refault distance bigger than that - * do not have any effect. Limit the number of shadow nodes - * such that shadow entries do not exceed the number of active - * cache pages, assuming a worst-case node population density - * of 1/8th on average. + * Approximate a reasonable limit for the radix tree nodes + * containing shadow entries. We don't need to keep more + * shadow entries than possible pages on the active list, + * since refault distances bigger than that are dismissed. + * + * The size of the active list converges toward 100% of + * overall page cache as memory grows, with only a tiny + * inactive list. Assume the total cache size for that. + * + * Nodes might be sparsely populated, with only one shadow + * entry in the extreme case. Obviously, we cannot keep one + * node for every eligible shadow entry, so compromise on a + * worst-case density of 1/8th. Below that, not all eligible + * refaults can be detected anymore. * * On 64-bit with 7 radix_tree_nodes per page and 64 slots * each, this will reclaim shadow entries when they consume - * ~2% of available memory: + * ~1.8% of available memory: * - * PAGE_SIZE / radix_tree_nodes / node_entries / PAGE_SIZE + * PAGE_SIZE / radix_tree_nodes / node_entries * 8 / PAGE_SIZE */ - max_nodes = pages >> (1 + RADIX_TREE_MAP_SHIFT - 3); + if (sc->memcg) { + cache = mem_cgroup_node_nr_lru_pages(sc->memcg, sc->nid, + LRU_ALL_FILE); + } else { + cache = node_page_state(NODE_DATA(sc->nid), NR_ACTIVE_FILE) + + node_page_state(NODE_DATA(sc->nid), NR_INACTIVE_FILE); + } + max_nodes = cache >> (RADIX_TREE_MAP_SHIFT - 3); if (nodes <= max_nodes) return 0; - return nodes - max_nodes; } -- cgit v1.2.3 From c8eef01e2f98e09a6733f2acdc675b4cf87a22a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:01 -0800 Subject: mm: remove free_unmap_vmap_area_noflush() Patch series "reduce latency in __purge_vmap_area_lazy", v2. This patch (of 10): Sort out the long lock hold times in __purge_vmap_area_lazy. It is based on a patch from Joel. Inline free_unmap_vmap_area_noflush() it into the only caller. Link: http://lkml.kernel.org/r/1479474236-4139-2-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e73948afac70..c3261143a0af 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -710,23 +710,14 @@ static void free_vmap_area_noflush(struct vmap_area *va) try_purge_vmap_area_lazy(); } -/* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. - */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) -{ - unmap_vmap_area(va); - free_vmap_area_noflush(va); -} - /* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); - free_unmap_vmap_area_noflush(va); + unmap_vmap_area(va); + free_vmap_area_noflush(va); } static struct vmap_area *find_vmap_area(unsigned long addr) -- cgit v1.2.3 From 9c3acf6043ac437ae0a45de4657ee700c3dc8850 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:04 -0800 Subject: mm: remove free_unmap_vmap_area_addr() Just inline it into the only caller. Link: http://lkml.kernel.org/r/1479474236-4139-3-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c3261143a0af..842ea986adcd 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -731,16 +731,6 @@ static struct vmap_area *find_vmap_area(unsigned long addr) return va; } -static void free_unmap_vmap_area_addr(unsigned long addr) -{ - struct vmap_area *va; - - va = find_vmap_area(addr); - BUG_ON(!va); - free_unmap_vmap_area(va); -} - - /*** Per cpu kva allocator ***/ /* @@ -1098,6 +1088,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) { unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; + struct vmap_area *va; BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); @@ -1107,10 +1098,14 @@ void vm_unmap_ram(const void *mem, unsigned int count) debug_check_no_locks_freed(mem, size); vmap_debug_free_range(addr, addr+size); - if (likely(count <= VMAP_MAX_ALLOC)) + if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); - else - free_unmap_vmap_area_addr(addr); + return; + } + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); -- cgit v1.2.3 From 0574ecd141df28d573d4364adec59766ddf5f38d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:07 -0800 Subject: mm: refactor __purge_vmap_area_lazy() Move the purge_lock synchronization to the callers, move the call to purge_fragmented_blocks_allcpus at the beginning of the function to the callers that need it, move the force_flush behavior to the caller that needs it, and pass start and end by value instead of by reference. No change in behavior. Link: http://lkml.kernel.org/r/1479474236-4139-4-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 80 ++++++++++++++++++++++++++---------------------------------- 1 file changed, 35 insertions(+), 45 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 842ea986adcd..1f5501b43026 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -601,6 +601,13 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* + * Serialize vmap purging. There is no actual criticial section protected + * by this look, but we want to avoid concurrent calls for performance + * reasons and to make the pcpu_get_vm_areas more deterministic. + */ +static DEFINE_SPINLOCK(vmap_purge_lock); + /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -615,59 +622,36 @@ void set_iounmap_nonlazy(void) /* * Purges all lazily-freed vmap areas. - * - * If sync is 0 then don't purge if there is already a purge in progress. - * If force_flush is 1, then flush kernel TLBs between *start and *end even - * if we found no lazy vmap areas to unmap (callers can use this to optimise - * their own TLB flushing). - * Returns with *start = min(*start, lowest purged address) - * *end = max(*end, highest purged address) */ -static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, - int sync, int force_flush) +static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) { - static DEFINE_SPINLOCK(purge_lock); struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; int nr = 0; - /* - * If sync is 0 but force_flush is 1, we'll go sync anyway but callers - * should not expect such behaviour. This just simplifies locking for - * the case that isn't actually used at the moment anyway. - */ - if (!sync && !force_flush) { - if (!spin_trylock(&purge_lock)) - return; - } else - spin_lock(&purge_lock); - - if (sync) - purge_fragmented_blocks_allcpus(); + lockdep_assert_held(&vmap_purge_lock); valist = llist_del_all(&vmap_purge_list); llist_for_each_entry(va, valist, purge_list) { - if (va->va_start < *start) - *start = va->va_start; - if (va->va_end > *end) - *end = va->va_end; + if (va->va_start < start) + start = va->va_start; + if (va->va_end > end) + end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; } - if (nr) - atomic_sub(nr, &vmap_lazy_nr); + if (!nr) + return false; - if (nr || force_flush) - flush_tlb_kernel_range(*start, *end); + atomic_sub(nr, &vmap_lazy_nr); + flush_tlb_kernel_range(start, end); - if (nr) { - spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) - __free_vmap_area(va); - spin_unlock(&vmap_area_lock); - } - spin_unlock(&purge_lock); + spin_lock(&vmap_area_lock); + llist_for_each_entry_safe(va, n_va, valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + return true; } /* @@ -676,9 +660,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, */ static void try_purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 0, 0); + if (spin_trylock(&vmap_purge_lock)) { + __purge_vmap_area_lazy(ULONG_MAX, 0); + spin_unlock(&vmap_purge_lock); + } } /* @@ -686,9 +671,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - unsigned long start = ULONG_MAX, end = 0; - - __purge_vmap_area_lazy(&start, &end, 1, 0); + spin_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + __purge_vmap_area_lazy(ULONG_MAX, 0); + spin_unlock(&vmap_purge_lock); } /* @@ -1075,7 +1061,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - __purge_vmap_area_lazy(&start, &end, 1, flush); + spin_lock(&vmap_purge_lock); + purge_fragmented_blocks_allcpus(); + if (!__purge_vmap_area_lazy(start, end) && flush) + flush_tlb_kernel_range(start, end); + spin_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -- cgit v1.2.3 From bf22e37a641327e34681b7b6959d9646e3886770 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 12 Dec 2016 16:44:10 -0800 Subject: mm: add vfree_atomic() We are going to use sleeping lock for freeing vmap. However some vfree() users want to free memory from atomic (but not from interrupt) context. For this we add vfree_atomic() - deferred variation of vfree() which can be used in any atomic context (except NMIs). [akpm@linux-foundation.org: tweak comment grammar] [aryabinin@virtuozzo.com: use raw_cpu_ptr() instead of this_cpu_ptr()] Link: http://lkml.kernel.org/r/1481553981-3856-1-git-send-email-aryabinin@virtuozzo.com Link: http://lkml.kernel.org/r/1479474236-4139-5-git-send-email-hch@lst.de Signed-off-by: Andrey Ryabinin Signed-off-by: Christoph Hellwig Cc: Joel Fernandes Cc: Jisheng Zhang Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 1 + mm/vmalloc.c | 42 ++++++++++++++++++++++++++++++++++++------ 2 files changed, 37 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 3d9d786a943c..d68edffbf142 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -82,6 +82,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, const void *caller); extern void vfree(const void *addr); +extern void vfree_atomic(const void *addr); extern void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 1f5501b43026..4ac776f10ad1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1486,7 +1486,39 @@ static void __vunmap(const void *addr, int deallocate_pages) kfree(area); return; } - + +static inline void __vfree_deferred(const void *addr) +{ + /* + * Use raw_cpu_ptr() because this can be called from preemptible + * context. Preemption is absolutely fine here, because the llist_add() + * implementation is lockless, so it works even if we are adding to + * nother cpu's list. schedule_work() should be fine with this too. + */ + struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred); + + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); +} + +/** + * vfree_atomic - release memory allocated by vmalloc() + * @addr: memory base address + * + * This one is just like vfree() but can be called in any atomic context + * except NMIs. + */ +void vfree_atomic(const void *addr) +{ + BUG_ON(in_nmi()); + + kmemleak_free(addr); + + if (!addr) + return; + __vfree_deferred(addr); +} + /** * vfree - release memory allocated by vmalloc() * @addr: memory base address @@ -1509,11 +1541,9 @@ void vfree(const void *addr) if (!addr) return; - if (unlikely(in_interrupt())) { - struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); - if (llist_add((struct llist_node *)addr, &p->list)) - schedule_work(&p->wq); - } else + if (unlikely(in_interrupt())) + __vfree_deferred(addr); + else __vunmap(addr, 1); } EXPORT_SYMBOL(vfree); -- cgit v1.2.3 From 5803ed292e63a1bf00722d6655d0229794607183 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:20 -0800 Subject: mm: mark all calls into the vmalloc subsystem as potentially sleeping We will take a sleeping lock in later in this series, so this adds the proper safeguards. Link: http://lkml.kernel.org/r/1479474236-4139-9-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4ac776f10ad1..3308007d8427 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -365,7 +365,7 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, BUG_ON(offset_in_page(size)); BUG_ON(!is_power_of_2(align)); - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); + might_sleep(); va = kmalloc_node(sizeof(struct vmap_area), gfp_mask & GFP_RECLAIM_MASK, node); @@ -1037,6 +1037,8 @@ void vm_unmap_aliases(void) if (unlikely(!vmap_initialized)) return; + might_sleep(); + for_each_possible_cpu(cpu) { struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); struct vmap_block *vb; @@ -1080,6 +1082,7 @@ void vm_unmap_ram(const void *mem, unsigned int count) unsigned long addr = (unsigned long)mem; struct vmap_area *va; + might_sleep(); BUG_ON(!addr); BUG_ON(addr < VMALLOC_START); BUG_ON(addr > VMALLOC_END); @@ -1431,6 +1434,8 @@ struct vm_struct *remove_vm_area(const void *addr) { struct vmap_area *va; + might_sleep(); + va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->vm; -- cgit v1.2.3 From f9e09977671b618aeb25ddc0d4c9a84d5b5cde9d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 12 Dec 2016 16:44:23 -0800 Subject: mm: turn vmap_purge_lock into a mutex The purge_lock spinlock causes high latencies with non RT kernel. This has been reported multiple times on lkml [1] [2] and affects applications like audio. This patch replaces it with a mutex to allow preemption while holding the lock. Thanks to Joel Fernandes for the detailed report and analysis as well as an earlier attempt at fixing this issue. [1] http://lists.openwall.net/linux-kernel/2016/03/23/29 [2] https://lkml.org/lkml/2016/10/9/59 Link: http://lkml.kernel.org/r/1479474236-4139-10-git-send-email-hch@lst.de Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Joel Fernandes Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 3308007d8427..d3c1f5ee48b4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -606,7 +606,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); * by this look, but we want to avoid concurrent calls for performance * reasons and to make the pcpu_get_vm_areas more deterministic. */ -static DEFINE_SPINLOCK(vmap_purge_lock); +static DEFINE_MUTEX(vmap_purge_lock); /* for per-CPU blocks */ static void purge_fragmented_blocks_allcpus(void); @@ -660,9 +660,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) */ static void try_purge_vmap_area_lazy(void) { - if (spin_trylock(&vmap_purge_lock)) { + if (mutex_trylock(&vmap_purge_lock)) { __purge_vmap_area_lazy(ULONG_MAX, 0); - spin_unlock(&vmap_purge_lock); + mutex_unlock(&vmap_purge_lock); } } @@ -671,10 +671,10 @@ static void try_purge_vmap_area_lazy(void) */ static void purge_vmap_area_lazy(void) { - spin_lock(&vmap_purge_lock); + mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); __purge_vmap_area_lazy(ULONG_MAX, 0); - spin_unlock(&vmap_purge_lock); + mutex_unlock(&vmap_purge_lock); } /* @@ -1063,11 +1063,11 @@ void vm_unmap_aliases(void) rcu_read_unlock(); } - spin_lock(&vmap_purge_lock); + mutex_lock(&vmap_purge_lock); purge_fragmented_blocks_allcpus(); if (!__purge_vmap_area_lazy(start, end) && flush) flush_tlb_kernel_range(start, end); - spin_unlock(&vmap_purge_lock); + mutex_unlock(&vmap_purge_lock); } EXPORT_SYMBOL_GPL(vm_unmap_aliases); -- cgit v1.2.3 From 763b218ddfaf56761c19923beb7e16656f66ec62 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 12 Dec 2016 16:44:26 -0800 Subject: mm: add preempt points into __purge_vmap_area_lazy() Use cond_resched_lock to avoid holding the vmap_area_lock for a potentially long time and thus creating bad latencies for various workloads. [hch: split from a larger patch by Joel, wrote the crappy changelog] Link: http://lkml.kernel.org/r/1479474236-4139-11-git-send-email-hch@lst.de Signed-off-by: Joel Fernandes Signed-off-by: Christoph Hellwig Tested-by: Jisheng Zhang Cc: Andrey Ryabinin Cc: Chris Wilson Cc: John Dias Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d3c1f5ee48b4..a5584384eabc 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -628,7 +628,7 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) struct llist_node *valist; struct vmap_area *va; struct vmap_area *n_va; - int nr = 0; + bool do_free = false; lockdep_assert_held(&vmap_purge_lock); @@ -638,18 +638,22 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end) start = va->va_start; if (va->va_end > end) end = va->va_end; - nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + do_free = true; } - if (!nr) + if (!do_free) return false; - atomic_sub(nr, &vmap_lazy_nr); flush_tlb_kernel_range(start, end); spin_lock(&vmap_area_lock); - llist_for_each_entry_safe(va, n_va, valist, purge_list) + llist_for_each_entry_safe(va, n_va, valist, purge_list) { + int nr = (va->va_end - va->va_start) >> PAGE_SHIFT; + __free_vmap_area(va); + atomic_sub(nr, &vmap_lazy_nr); + cond_resched_lock(&vmap_area_lock); + } spin_unlock(&vmap_area_lock); return true; } -- cgit v1.2.3 From 1dd38b6c27d59414e89c08dd1ae9677a8e12cbc4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:44:29 -0800 Subject: mm: move vma_is_anonymous check within pmd_move_must_withdraw Independent of whether the vma is for anonymous memory, some arches like ppc64 would like to override pmd_move_must_withdraw(). One option is to encapsulate the vma_is_anonymous() check for general architectures inside pmd_move_must_withdraw() so that is always called and architectures that need unconditional overriding can override this function. ppc64 needs to override the function when the MMU is configured to use hash PTE's. [bsingharora@gmail.com: reworked changelog] Link: http://lkml.kernel.org/r/20161113150025.17942-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Acked-by: Michael Ellerman (powerpc) Cc: Benjamin Herrenschmidt Cc: Michael Neuling Cc: Paul Mackerras Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 3 ++- include/asm-generic/pgtable.h | 12 ------------ mm/huge_memory.c | 18 ++++++++++++++++-- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 9fd77f8794a0..700301bc5190 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1009,7 +1009,8 @@ static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, - struct spinlock *old_pmd_ptl) + struct spinlock *old_pmd_ptl, + struct vm_area_struct *vma) { if (radix_enabled()) return false; diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 41b95d82a185..2065e81701fc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -652,18 +652,6 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif -#ifndef pmd_move_must_withdraw -static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, - spinlock_t *old_pmd_ptl) -{ - /* - * With split pmd lock we also need to move preallocated - * PTE page table if new_pmd is on different PMD page table. - */ - return new_pmd_ptl != old_pmd_ptl; -} -#endif - /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 26fd1161ca85..b54044c21076 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1429,6 +1429,21 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return 1; } +#ifndef pmd_move_must_withdraw +static inline int pmd_move_must_withdraw(spinlock_t *new_pmd_ptl, + spinlock_t *old_pmd_ptl, + struct vm_area_struct *vma) +{ + /* + * With split pmd lock we also need to move preallocated + * PTE page table if new_pmd is on different PMD page table. + * + * We also don't deposit and withdraw tables for file pages. + */ + return (new_pmd_ptl != old_pmd_ptl) && vma_is_anonymous(vma); +} +#endif + bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd, bool *need_flush) @@ -1466,8 +1481,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, force_flush = true; VM_BUG_ON(!pmd_none(*new_pmd)); - if (pmd_move_must_withdraw(new_ptl, old_ptl) && - vma_is_anonymous(vma)) { + if (pmd_move_must_withdraw(new_ptl, old_ptl, vma)) { pgtable_t pgtable; pgtable = pgtable_trans_huge_withdraw(mm, old_pmd); pgtable_trans_huge_deposit(mm, new_pmd, pgtable); -- cgit v1.2.3 From 953c66c2b22a304dbc3c3d7fc8e8c25cd97a03d8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 12 Dec 2016 16:44:32 -0800 Subject: mm: THP page cache support for ppc64 Add arch specific callback in the generic THP page cache code that will deposit and withdarw preallocated page table. Archs like ppc64 use this preallocated table to store the hash pte slot information. Testing: kernel build of the patch series on tmpfs mounted with option huge=always The related thp stat: thp_fault_alloc 72939 thp_fault_fallback 60547 thp_collapse_alloc 603 thp_collapse_alloc_failed 0 thp_file_alloc 253763 thp_file_mapped 4251 thp_split_page 51518 thp_split_page_failed 1 thp_deferred_split_page 73566 thp_split_pmd 665 thp_zero_page_alloc 3 thp_zero_page_alloc_failed 0 [akpm@linux-foundation.org: remove unneeded parentheses, per Kirill] Link: http://lkml.kernel.org/r/20161113150025.17942-2-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Acked-by: Kirill A. Shutemov Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Michael Neuling Cc: Paul Mackerras Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 +++++ include/asm-generic/pgtable.h | 3 ++ mm/Kconfig | 6 +-- mm/huge_memory.c | 17 ++++++++ mm/khugepaged.c | 21 +++++++++- mm/memory.c | 60 +++++++++++++++++++++++----- 6 files changed, 100 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 700301bc5190..0ebfbc8f0449 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1021,6 +1021,16 @@ static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, */ return true; } + + +#define arch_needs_pgtable_deposit arch_needs_pgtable_deposit +static inline bool arch_needs_pgtable_deposit(void) +{ + if (radix_enabled()) + return false; + return true; +} + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* __ASSEMBLY__ */ #endif /* _ASM_POWERPC_BOOK3S_64_PGTABLE_H_ */ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 2065e81701fc..18af2bcefe6a 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -652,6 +652,9 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) } #endif +#ifndef arch_needs_pgtable_deposit +#define arch_needs_pgtable_deposit() (false) +#endif /* * This function is meant to be used by sites walking pagetables with * the mmap_sem hold in read mode to protect against MADV_DONTNEED and diff --git a/mm/Kconfig b/mm/Kconfig index 33a9b06ec618..9b8fccb969dc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -447,13 +447,9 @@ choice benefit. endchoice -# -# We don't deposit page tables on file THP mapping, -# but Power makes use of them to address MMU quirk. -# config TRANSPARENT_HUGE_PAGECACHE def_bool y - depends on TRANSPARENT_HUGEPAGE && !PPC + depends on TRANSPARENT_HUGEPAGE # # UP and nommu archs use km based percpu allocator diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b54044c21076..2b44ac11178f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1380,6 +1380,15 @@ out_unlocked: return ret; } +static inline void zap_deposited_table(struct mm_struct *mm, pmd_t *pmd) +{ + pgtable_t pgtable; + + pgtable = pgtable_trans_huge_withdraw(mm, pmd); + pte_free(mm, pgtable); + atomic_long_dec(&mm->nr_ptes); +} + int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { @@ -1421,6 +1430,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, atomic_long_dec(&tlb->mm->nr_ptes); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); } else { + if (arch_needs_pgtable_deposit()) + zap_deposited_table(tlb->mm, pmd); add_mm_counter(tlb->mm, MM_FILEPAGES, -HPAGE_PMD_NR); } spin_unlock(ptl); @@ -1607,6 +1618,12 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (!vma_is_anonymous(vma)) { _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + /* + * We are going to unmap this huge page. So + * just go ahead and zap it + */ + if (arch_needs_pgtable_deposit()) + zap_deposited_table(mm, pmd); if (vma_is_dax(vma)) return; page = pmd_page(_pmd); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7a50c726c5ae..09460955e818 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1242,6 +1242,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) struct vm_area_struct *vma; unsigned long addr; pmd_t *pmd, _pmd; + bool deposited = false; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { @@ -1266,10 +1267,26 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd); /* assume page table is clear */ _pmd = pmdp_collapse_flush(vma, addr, pmd); + /* + * now deposit the pgtable for arch that need it + * otherwise free it. + */ + if (arch_needs_pgtable_deposit()) { + /* + * The deposit should be visibile only after + * collapse is seen by others. + */ + smp_wmb(); + pgtable_trans_huge_deposit(vma->vm_mm, pmd, + pmd_pgtable(_pmd)); + deposited = true; + } spin_unlock(ptl); up_write(&vma->vm_mm->mmap_sem); - atomic_long_dec(&vma->vm_mm->nr_ptes); - pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + if (!deposited) { + atomic_long_dec(&vma->vm_mm->nr_ptes); + pte_free(vma->vm_mm, pmd_pgtable(_pmd)); + } } } i_mmap_unlock_write(mapping); diff --git a/mm/memory.c b/mm/memory.c index 0a72f821ccdc..32e9b7aec366 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2935,6 +2935,19 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } +static void deposit_prealloc_pte(struct fault_env *fe) +{ + struct vm_area_struct *vma = fe->vma; + + pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + /* + * We are going to consume the prealloc table, + * count that as nr_ptes. + */ + atomic_long_inc(&vma->vm_mm->nr_ptes); + fe->prealloc_pte = 0; +} + static int do_set_pmd(struct fault_env *fe, struct page *page) { struct vm_area_struct *vma = fe->vma; @@ -2949,6 +2962,17 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = VM_FAULT_FALLBACK; page = compound_head(page); + /* + * Archs like ppc64 need additonal space to store information + * related to pte entry. Use the preallocated table for that. + */ + if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { + fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); + if (!fe->prealloc_pte) + return VM_FAULT_OOM; + smp_wmb(); /* See comment in __pte_alloc() */ + } + fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); if (unlikely(!pmd_none(*fe->pmd))) goto out; @@ -2962,6 +2986,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) add_mm_counter(vma->vm_mm, MM_FILEPAGES, HPAGE_PMD_NR); page_add_file_rmap(page, true); + /* + * deposit and withdraw with pmd lock held + */ + if (arch_needs_pgtable_deposit()) + deposit_prealloc_pte(fe); set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); @@ -2971,6 +3000,13 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) ret = 0; count_vm_event(THP_FILE_MAPPED); out: + /* + * If we are going to fallback to pte mapping, do a + * withdraw with pmd lock held. + */ + if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) + fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + fe->pmd); spin_unlock(fe->ptl); return ret; } @@ -3010,18 +3046,20 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, ret = do_set_pmd(fe, page); if (ret != VM_FAULT_FALLBACK) - return ret; + goto fault_handled; } if (!fe->pte) { ret = pte_alloc_one_map(fe); if (ret) - return ret; + goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) - return VM_FAULT_NOPAGE; + if (unlikely(!pte_none(*fe->pte))) { + ret = VM_FAULT_NOPAGE; + goto fault_handled; + } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); @@ -3041,8 +3079,15 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, fe->address, fe->pte); + ret = 0; - return 0; +fault_handled: + /* preallocated pagetable is unused: free it */ + if (fe->prealloc_pte) { + pte_free(fe->vma->vm_mm, fe->prealloc_pte); + fe->prealloc_pte = 0; + } + return ret; } static unsigned long fault_around_bytes __read_mostly = @@ -3141,11 +3186,6 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); - /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; - } /* Huge page is mapped? Page fault is solved */ if (pmd_trans_huge(*fe->pmd)) { ret = VM_FAULT_NOPAGE; -- cgit v1.2.3 From 46e8a3a08c23d07d0f21fabeed182b671af68c93 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Dec 2016 16:44:35 -0800 Subject: mm, debug: print raw struct page data in __dump_page() __dump_page() is used when a page metadata inconsistency is detected, either by standard runtime checks, or extra checks in CONFIG_DEBUG_VM builds. It prints some of the relevant metadata, but not the whole struct page, which is based on unions and interpretation is dependent on the context. This means that sometimes e.g. a VM_BUG_ON_PAGE() checks certain field, which is however not printed by __dump_page() and the resulting bug report may then lack clues that could help in determining the root cause. This patch solves the problem by simply printing the whole struct page word by word, so no part is missing, but the interpretation of the data is left to developers. This is similar to e.g. x86_64 raw stack dumps. Example output: page:ffffea00000475c0 count:1 mapcount:0 mapping: (null) index:0x0 flags: 0x100000000000400(reserved) raw: 0100000000000400 0000000000000000 0000000000000000 00000001ffffffff raw: ffffea00000475e0 ffffea00000475e0 0000000000000000 0000000000000000 page dumped because: VM_BUG_ON_PAGE(1) [aryabinin@virtuozzo.com: suggested print_hex_dump()] Link: http://lkml.kernel.org/r/2ff83214-70fe-741e-bf05-fe4a4073ec3e@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Acked-by: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/debug.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/debug.c b/mm/debug.c index 9feb699c5d25..db1cd26d8752 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -59,6 +59,10 @@ void __dump_page(struct page *page, const char *reason) pr_emerg("flags: %#lx(%pGp)\n", page->flags, &page->flags); + print_hex_dump(KERN_ALERT, "raw: ", DUMP_PREFIX_NONE, 32, + sizeof(unsigned long), page, + sizeof(struct page), false); + if (reason) pr_alert("page dumped because: %s\n", reason); -- cgit v1.2.3 From d5a187daf5856df9b997f9d208e5a7b64006eb2e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 12 Dec 2016 16:44:38 -0800 Subject: mm, rmap: handle anon_vma_prepare() common case inline anon_vma_prepare() is mostly a large "if (unlikely(...))" block, as the expected common case is that an anon_vma already exists. We could turn the condition around and return 0, but it also makes sense to do it inline and avoid a call for the common case. Bloat-o-meter naturally shows that inlining the check has some code size costs: add/remove: 1/1 grow/shrink: 4/0 up/down: 475/-373 (102) function old new delta __anon_vma_prepare - 359 +359 handle_mm_fault 2744 2796 +52 hugetlb_cow 1146 1170 +24 hugetlb_fault 2123 2145 +22 wp_page_copy 1469 1487 +18 anon_vma_prepare 373 - -373 Checking the asm however confirms that the hot paths now avoid a call, which is moved away. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20161116074005.22768-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Cc: "Kirill A. Shutemov" Cc: Johannes Weiner Cc: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 10 +++++++- mm/rmap.c | 69 ++++++++++++++++++++++++++-------------------------- 2 files changed, 43 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b46bb5620a76..15321fb1df6b 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -137,11 +137,19 @@ static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) * anon_vma helper functions. */ void anon_vma_init(void); /* create anon_vma_cachep */ -int anon_vma_prepare(struct vm_area_struct *); +int __anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); +static inline int anon_vma_prepare(struct vm_area_struct *vma) +{ + if (likely(vma->anon_vma)) + return 0; + + return __anon_vma_prepare(vma); +} + static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { diff --git a/mm/rmap.c b/mm/rmap.c index 1ef36404e7b2..91619fd70939 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -141,14 +141,15 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, } /** - * anon_vma_prepare - attach an anon_vma to a memory region + * __anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question * * This makes sure the memory mapping described by 'vma' has * an 'anon_vma' attached to it, so that we can associate the * anonymous pages mapped into it with that anon_vma. * - * The common case will be that we already have one, but if + * The common case will be that we already have one, which + * is handled inline by anon_vma_prepare(). But if * not we either need to find an adjacent mapping that we * can re-use the anon_vma from (very common when the only * reason for splitting a vma has been mprotect()), or we @@ -167,48 +168,46 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, * * This must be called with the mmap_sem held for reading. */ -int anon_vma_prepare(struct vm_area_struct *vma) +int __anon_vma_prepare(struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct mm_struct *mm = vma->vm_mm; + struct anon_vma *anon_vma, *allocated; struct anon_vma_chain *avc; might_sleep(); - if (unlikely(!anon_vma)) { - struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated; - avc = anon_vma_chain_alloc(GFP_KERNEL); - if (!avc) - goto out_enomem; + avc = anon_vma_chain_alloc(GFP_KERNEL); + if (!avc) + goto out_enomem; + + anon_vma = find_mergeable_anon_vma(vma); + allocated = NULL; + if (!anon_vma) { + anon_vma = anon_vma_alloc(); + if (unlikely(!anon_vma)) + goto out_enomem_free_avc; + allocated = anon_vma; + } - anon_vma = find_mergeable_anon_vma(vma); + anon_vma_lock_write(anon_vma); + /* page_table_lock to protect against threads */ + spin_lock(&mm->page_table_lock); + if (likely(!vma->anon_vma)) { + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + /* vma reference or self-parent link for new root */ + anon_vma->degree++; allocated = NULL; - if (!anon_vma) { - anon_vma = anon_vma_alloc(); - if (unlikely(!anon_vma)) - goto out_enomem_free_avc; - allocated = anon_vma; - } + avc = NULL; + } + spin_unlock(&mm->page_table_lock); + anon_vma_unlock_write(anon_vma); - anon_vma_lock_write(anon_vma); - /* page_table_lock to protect against threads */ - spin_lock(&mm->page_table_lock); - if (likely(!vma->anon_vma)) { - vma->anon_vma = anon_vma; - anon_vma_chain_link(vma, avc, anon_vma); - /* vma reference or self-parent link for new root */ - anon_vma->degree++; - allocated = NULL; - avc = NULL; - } - spin_unlock(&mm->page_table_lock); - anon_vma_unlock_write(anon_vma); + if (unlikely(allocated)) + put_anon_vma(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); - if (unlikely(allocated)) - put_anon_vma(allocated); - if (unlikely(avc)) - anon_vma_chain_free(avc); - } return 0; out_enomem_free_avc: -- cgit v1.2.3 From a6de734bc002fe2027ccc074fbbd87d72957b7a4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Mon, 12 Dec 2016 16:44:41 -0800 Subject: mm, page_alloc: keep pcp count and list contents in sync if struct page is corrupted Vlastimil Babka pointed out that commit 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") will allow the per-cpu list counter to be out of sync with the per-cpu list contents if a struct page is corrupted. The consequence is an infinite loop if the per-cpu lists get fully drained by free_pcppages_bulk because all the lists are empty but the count is positive. The infinite loop occurs here do { batch_free++; if (++migratetype == MIGRATE_PCPTYPES) migratetype = 0; list = &pcp->lists[migratetype]; } while (list_empty(list)); What the user sees is a bad page warning followed by a soft lockup with interrupts disabled in free_pcppages_bulk(). This patch keeps the accounting in sync. Fixes: 479f854a207c ("mm, page_alloc: defer debugging checks of pages allocated from the PCP") Link: http://lkml.kernel.org/r/20161202112951.23346-2-mgorman@techsingularity.net Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Hillf Danton Cc: Christoph Lameter Cc: Johannes Weiner Cc: Jesper Dangaard Brouer Cc: Joonsoo Kim Cc: [4.7+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2b69e28706b1..3f2c9e535f7f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2218,7 +2218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, bool cold) { - int i; + int i, alloced = 0; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -2243,13 +2243,21 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, else list_add_tail(&page->lru, list); list = &page->lru; + alloced++; if (is_migrate_cma(get_pcppage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } + + /* + * i pages were removed from the buddy list even if some leak due + * to check_pcp_refill failing so adjust NR_FREE_PAGES based + * on i. Do not confuse with 'alloced' which is the number of + * pages added to the pcp list. + */ __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); - return i; + return alloced; } #ifdef CONFIG_NUMA -- cgit v1.2.3 From dc644a073769dd8949a75691eb4a5bdeb70a7d51 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 12 Dec 2016 16:44:44 -0800 Subject: mm: add three more cond_resched() in swapoff Add a cond_resched() in the unuse_pmd_range() loop (so as to call it even when pmd none or trans_huge, like zap_pmd_range() does); and in the unuse_mm() loop (since that might skip over many vmas). shmem_unuse() and radix_tree_locate_item() look good enough already. Those were the obvious places, but in fact the stalls came from find_next_to_unuse(), which sometimes scans through many unused entries. Apply scan_swap_map()'s LATENCY_LIMIT of 256 there too; and only go off to test frontswap_map when a used entry is found. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1612052155140.13021@eggly.anvils Signed-off-by: Hugh Dickins Reported-by: Eric Dumazet Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f30438970cd1..1c6e0321205d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1234,6 +1234,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { + cond_resched(); next = pmd_addr_end(addr, end); if (pmd_none_or_trans_huge_or_clear_bad(pmd)) continue; @@ -1313,6 +1314,7 @@ static int unuse_mm(struct mm_struct *mm, for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma && (ret = unuse_vma(vma, entry, page))) break; + cond_resched(); } up_read(&mm->mmap_sem); return (ret < 0)? ret: 0; @@ -1350,15 +1352,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } - if (frontswap) { - if (frontswap_test(si, i)) - break; - else - continue; - } count = READ_ONCE(si->swap_map[i]); if (count && swap_count(count) != SWAP_MAP_BAD) - break; + if (!frontswap || frontswap_test(si, i)) + break; + if ((i % LATENCY_LIMIT) == 0) + cond_resched(); } return i; } -- cgit v1.2.3 From 49920d28781dcced10cd30cb9a938e7d045a1c94 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 12 Dec 2016 16:44:50 -0800 Subject: mm: make transparent hugepage size public Test programs want to know the size of a transparent hugepage. While it is commonly the same as the size of a hugetlbfs page (shown as Hugepagesize in /proc/meminfo), that is not always so: powerpc implements transparent hugepages in a different way from hugetlbfs pages, so it's coincidence when their sizes are the same; and x86 and others can support more than one hugetlbfs page size. Add /sys/kernel/mm/transparent_hugepage/hpage_pmd_size to show the THP size in bytes - it's the same for Anonymous and Shmem hugepages. Call it hpage_pmd_size (after HPAGE_PMD_SIZE) rather than hpage_size, in case some transparent support for pud and pgd pages is added later. Link: http://lkml.kernel.org/r/alpine.LSU.2.11.1612052200290.13021@eggly.anvils Signed-off-by: Hugh Dickins Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Greg Thelen Cc: David Rientjes Cc: "Kirill A. Shutemov" Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Dave Hansen Cc: Dan Williams Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/transhuge.txt | 5 +++++ mm/huge_memory.c | 10 ++++++++++ 2 files changed, 15 insertions(+) (limited to 'mm') diff --git a/Documentation/vm/transhuge.txt b/Documentation/vm/transhuge.txt index 2ec6adb5a4ce..c4171e4519c2 100644 --- a/Documentation/vm/transhuge.txt +++ b/Documentation/vm/transhuge.txt @@ -136,6 +136,11 @@ or enable it back by writing 1: echo 0 >/sys/kernel/mm/transparent_hugepage/use_zero_page echo 1 >/sys/kernel/mm/transparent_hugepage/use_zero_page +Some userspace (such as a test program, or an optimized memory allocation +library) may want to know the size (in bytes) of a transparent hugepage: + +cat /sys/kernel/mm/transparent_hugepage/hpage_pmd_size + khugepaged will be automatically started when transparent_hugepage/enabled is set to "always" or "madvise, and it'll be automatically shutdown if it's set to "never". diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2b44ac11178f..cee42cf05477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -285,6 +285,15 @@ static ssize_t use_zero_page_store(struct kobject *kobj, } static struct kobj_attribute use_zero_page_attr = __ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store); + +static ssize_t hpage_pmd_size_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE); +} +static struct kobj_attribute hpage_pmd_size_attr = + __ATTR_RO(hpage_pmd_size); + #ifdef CONFIG_DEBUG_VM static ssize_t debug_cow_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -307,6 +316,7 @@ static struct attribute *hugepage_attr[] = { &enabled_attr.attr, &defrag_attr.attr, &use_zero_page_attr.attr, + &hpage_pmd_size_attr.attr, #if defined(CONFIG_SHMEM) && defined(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &shmem_enabled_attr.attr, #endif -- cgit v1.2.3 From 5c5c1f36cedfb51ec291181e71817f7fe7e03ee2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 12 Dec 2016 16:44:53 -0800 Subject: kasan: support panic_on_warn If user sets panic_on_warn, he wants kernel to panic if there is anything barely wrong with the kernel. KASAN-detected errors are definitely not less benign than an arbitrary kernel WARNING. Panic after KASAN errors if panic_on_warn is set. We use this for continuous fuzzing where we want kernel to stop and reboot on any error. Link: http://lkml.kernel.org/r/1476694764-31986-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/report.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 073325aedc68..b82b3e215157 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -136,6 +136,8 @@ static void kasan_end_report(unsigned long *flags) pr_err("==================================================================\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); spin_unlock_irqrestore(&report_lock, *flags); + if (panic_on_warn) + panic("panic_on_warn set ...\n"); kasan_enable_current(); } -- cgit v1.2.3 From 64abdcb24351a27bed6e2b6a3c27348fe532c73f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Mon, 12 Dec 2016 16:44:56 -0800 Subject: kasan: eliminate long stalls during quarantine reduction Currently we dedicate 1/32 of RAM for quarantine and then reduce it by 1/4 of total quarantine size. This can be a significant amount of memory. For example, with 4GB of RAM total quarantine size is 128MB and it is reduced by 32MB at a time. With 128GB of RAM total quarantine size is 4GB and it is reduced by 1GB. This leads to several problems: - freeing 1GB can take tens of seconds, causes rcu stall warnings and just introduces unexpected long delays at random places - if kmalloc() is called under a mutex, other threads stall on that mutex while a thread reduces quarantine - threads wait on quarantine_lock while one thread grabs a large batch of objects to evict - we walk the uncached list of object to free twice which makes all of the above worse - when a thread frees objects, they are already not accounted against global_quarantine.bytes; as the result we can have quarantine_size bytes in quarantine + unbounded amount of memory in large batches in threads that are in process of freeing Reduce size of quarantine in smaller batches to reduce the delays. The only reason to reduce it in batches is amortization of overheads, the new batch size of 1MB should be well enough to amortize spinlock lock/unlock and few function calls. Plus organize quarantine as a FIFO array of batches. This allows to not walk the list in quarantine_reduce() under quarantine_lock, which in turn reduces contention and is just faster. This improves performance of heavy load (syzkaller fuzzing) by ~20% with 4 CPUs and 32GB of RAM. Also this eliminates frequent (every 5 sec) drops of CPU consumption from ~400% to ~100% (one thread reduces quarantine while others are waiting on a mutex). Some reference numbers: 1. Machine with 4 CPUs and 4GB of memory. Quarantine size 128MB. Currently we free 32MB at at time. With new code we free 1MB at a time (1024 batches, ~128 are used). 2. Machine with 32 CPUs and 128GB of memory. Quarantine size 4GB. Currently we free 1GB at at time. With new code we free 8MB at a time (1024 batches, ~512 are used). 3. Machine with 4096 CPUs and 1TB of memory. Quarantine size 32GB. Currently we free 8GB at at time. With new code we free 4MB at a time (16K batches, ~8K are used). Link: http://lkml.kernel.org/r/1478756952-18695-1-git-send-email-dvyukov@google.com Signed-off-by: Dmitry Vyukov Cc: Eric Dumazet Cc: Greg Thelen Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andrey Konovalov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kasan/quarantine.c | 94 ++++++++++++++++++++++++++------------------------- 1 file changed, 48 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c index baabaad4a4aa..dae929c02bbb 100644 --- a/mm/kasan/quarantine.c +++ b/mm/kasan/quarantine.c @@ -86,24 +86,9 @@ static void qlist_move_all(struct qlist_head *from, struct qlist_head *to) qlist_init(from); } -static void qlist_move(struct qlist_head *from, struct qlist_node *last, - struct qlist_head *to, size_t size) -{ - if (unlikely(last == from->tail)) { - qlist_move_all(from, to); - return; - } - if (qlist_empty(to)) - to->head = from->head; - else - to->tail->next = from->head; - to->tail = last; - from->head = last->next; - last->next = NULL; - from->bytes -= size; - to->bytes += size; -} - +#define QUARANTINE_PERCPU_SIZE (1 << 20) +#define QUARANTINE_BATCHES \ + (1024 > 4 * CONFIG_NR_CPUS ? 1024 : 4 * CONFIG_NR_CPUS) /* * The object quarantine consists of per-cpu queues and a global queue, @@ -111,11 +96,22 @@ static void qlist_move(struct qlist_head *from, struct qlist_node *last, */ static DEFINE_PER_CPU(struct qlist_head, cpu_quarantine); -static struct qlist_head global_quarantine; +/* Round-robin FIFO array of batches. */ +static struct qlist_head global_quarantine[QUARANTINE_BATCHES]; +static int quarantine_head; +static int quarantine_tail; +/* Total size of all objects in global_quarantine across all batches. */ +static unsigned long quarantine_size; static DEFINE_SPINLOCK(quarantine_lock); /* Maximum size of the global queue. */ -static unsigned long quarantine_size; +static unsigned long quarantine_max_size; + +/* + * Target size of a batch in global_quarantine. + * Usually equal to QUARANTINE_PERCPU_SIZE unless we have too much RAM. + */ +static unsigned long quarantine_batch_size; /* * The fraction of physical memory the quarantine is allowed to occupy. @@ -124,9 +120,6 @@ static unsigned long quarantine_size; */ #define QUARANTINE_FRACTION 32 -#define QUARANTINE_LOW_SIZE (READ_ONCE(quarantine_size) * 3 / 4) -#define QUARANTINE_PERCPU_SIZE (1 << 20) - static struct kmem_cache *qlink_to_cache(struct qlist_node *qlink) { return virt_to_head_page(qlink)->slab_cache; @@ -191,21 +184,30 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache) if (unlikely(!qlist_empty(&temp))) { spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_all(&temp, &global_quarantine); + WRITE_ONCE(quarantine_size, quarantine_size + temp.bytes); + qlist_move_all(&temp, &global_quarantine[quarantine_tail]); + if (global_quarantine[quarantine_tail].bytes >= + READ_ONCE(quarantine_batch_size)) { + int new_tail; + + new_tail = quarantine_tail + 1; + if (new_tail == QUARANTINE_BATCHES) + new_tail = 0; + if (new_tail != quarantine_head) + quarantine_tail = new_tail; + } spin_unlock_irqrestore(&quarantine_lock, flags); } } void quarantine_reduce(void) { - size_t new_quarantine_size, percpu_quarantines; + size_t total_size, new_quarantine_size, percpu_quarantines; unsigned long flags; struct qlist_head to_free = QLIST_INIT; - size_t size_to_free = 0; - struct qlist_node *last; - if (likely(READ_ONCE(global_quarantine.bytes) <= - READ_ONCE(quarantine_size))) + if (likely(READ_ONCE(quarantine_size) <= + READ_ONCE(quarantine_max_size))) return; spin_lock_irqsave(&quarantine_lock, flags); @@ -214,24 +216,23 @@ void quarantine_reduce(void) * Update quarantine size in case of hotplug. Allocate a fraction of * the installed memory to quarantine minus per-cpu queue limits. */ - new_quarantine_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / + total_size = (READ_ONCE(totalram_pages) << PAGE_SHIFT) / QUARANTINE_FRACTION; percpu_quarantines = QUARANTINE_PERCPU_SIZE * num_online_cpus(); - new_quarantine_size = (new_quarantine_size < percpu_quarantines) ? - 0 : new_quarantine_size - percpu_quarantines; - WRITE_ONCE(quarantine_size, new_quarantine_size); - - last = global_quarantine.head; - while (last) { - struct kmem_cache *cache = qlink_to_cache(last); - - size_to_free += cache->size; - if (!last->next || size_to_free > - global_quarantine.bytes - QUARANTINE_LOW_SIZE) - break; - last = last->next; + new_quarantine_size = (total_size < percpu_quarantines) ? + 0 : total_size - percpu_quarantines; + WRITE_ONCE(quarantine_max_size, new_quarantine_size); + /* Aim at consuming at most 1/2 of slots in quarantine. */ + WRITE_ONCE(quarantine_batch_size, max((size_t)QUARANTINE_PERCPU_SIZE, + 2 * total_size / QUARANTINE_BATCHES)); + + if (likely(quarantine_size > quarantine_max_size)) { + qlist_move_all(&global_quarantine[quarantine_head], &to_free); + WRITE_ONCE(quarantine_size, quarantine_size - to_free.bytes); + quarantine_head++; + if (quarantine_head == QUARANTINE_BATCHES) + quarantine_head = 0; } - qlist_move(&global_quarantine, last, &to_free, size_to_free); spin_unlock_irqrestore(&quarantine_lock, flags); @@ -275,13 +276,14 @@ static void per_cpu_remove_cache(void *arg) void quarantine_remove_cache(struct kmem_cache *cache) { - unsigned long flags; + unsigned long flags, i; struct qlist_head to_free = QLIST_INIT; on_each_cpu(per_cpu_remove_cache, cache, 1); spin_lock_irqsave(&quarantine_lock, flags); - qlist_move_cache(&global_quarantine, &to_free, cache); + for (i = 0; i < QUARANTINE_BATCHES; i++) + qlist_move_cache(&global_quarantine[i], &to_free, cache); spin_unlock_irqrestore(&quarantine_lock, flags); qlist_free_all(&to_free, cache); -- cgit v1.2.3 From 8f6066049c54ef0f726869c27d610cef5d15e084 Mon Sep 17 00:00:00 2001 From: zijun_hu Date: Mon, 12 Dec 2016 16:45:02 -0800 Subject: mm/percpu.c: fix panic triggered by BUG_ON() falsely As shown by pcpu_build_alloc_info(), the number of units within a percpu group is deduced by rounding up the number of CPUs within the group to @upa boundary/ Therefore, the number of CPUs isn't equal to the units's if it isn't aligned to @upa normally. However, pcpu_page_first_chunk() uses BUG_ON() to assert that one number is equal to the other roughly, so a panic is maybe triggered by the BUG_ON() incorrectly. In order to fix this issue, the number of CPUs is rounded up then compared with units's and the BUG_ON() is replaced with a warning and return of an error code as well, to keep system alive as much as possible. Link: http://lkml.kernel.org/r/57FCF07C.2020103@zoho.com Signed-off-by: zijun_hu Cc: Tejun Heo Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/percpu.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/percpu.c b/mm/percpu.c index 255714302394..f696385bcc44 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2093,6 +2093,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, size_t pages_size; struct page **pages; int unit, i, j, rc; + int upa; + int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); @@ -2100,7 +2102,12 @@ int __init pcpu_page_first_chunk(size_t reserved_size, if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); - BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + upa = ai->alloc_size/ai->unit_size; + nr_g0_units = roundup(num_possible_cpus(), upa); + if (unlikely(WARN_ON(ai->groups[0].nr_units != nr_g0_units))) { + pcpu_free_alloc_info(ai); + return -EINVAL; + } unit_pages = ai->unit_size >> PAGE_SHIFT; @@ -2111,21 +2118,22 @@ int __init pcpu_page_first_chunk(size_t reserved_size, /* allocate pages */ j = 0; - for (unit = 0; unit < num_possible_cpus(); unit++) + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { - unsigned int cpu = ai->groups[0].cpu_map[unit]; void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", - psize_str, cpu); + psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } + } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; -- cgit v1.2.3