From 63d0f0a3c7e1281fd79268a8d988167eff607fb6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 12 Nov 2013 15:07:09 -0800 Subject: mm/readahead.c:do_readhead(): don't check for ->readpage The callee force_page_cache_readahead() already does this and unlike do_readahead(), force_page_cache_readahead() remembers to check for ->readpages() as well. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index e4ed04149785..50241836fe82 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -569,7 +569,7 @@ static ssize_t do_readahead(struct address_space *mapping, struct file *filp, pgoff_t index, unsigned long nr) { - if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) + if (!mapping || !mapping->a_ops) return -EINVAL; force_page_cache_readahead(mapping, filp, index, nr); -- cgit v1.2.3 From bafe1e14403ad40714ffcfa1fb173b371cc87a1f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 12 Nov 2013 15:07:10 -0800 Subject: ksm: remove redundant __GFP_ZERO from kcalloc kcalloc returns zeroed memory. There's no need to use this flag. Signed-off-by: Joe Perches Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 0bea2b262a47..175fff79dc95 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. */ - buf = kcalloc(nr_node_ids + nr_node_ids, - sizeof(*buf), GFP_KERNEL | __GFP_ZERO); + buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), + GFP_KERNEL); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; -- cgit v1.2.3 From 4b90951c0bd8ca6695837354a253794192f6dfd5 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 12 Nov 2013 15:07:11 -0800 Subject: mm/vmalloc: use NUMA_NO_NODE Use more appropriate "if (node == NUMA_NO_NODE)" instead of "if (node < 0)" Signed-off-by: Jianguo Wu Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 107454312d5e..dea15e6bfc8d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1577,7 +1577,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; - if (node < 0) + if (node == NUMA_NO_NODE) page = alloc_page(tmp_mask); else page = alloc_pages_node(node, tmp_mask, order); -- cgit v1.2.3 From 9e4be4708e9e88da46ae1f0bb1054c3619cc476e Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 12 Nov 2013 15:07:12 -0800 Subject: mm/compaction.c: update comment about zone lock in isolate_freepages_block Since commit f40d1e42bb98 ("mm: compaction: acquire the zone->lock as late as possible"), isolate_freepages_block() takes the zone->lock itself. The function description however still states that the zone->lock must be held. This patch removes this outdated statement. 
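[editor's note: the sketch below is illustrative only and is not part of this patch. It assumes kernel context; looks_like_free_page() and isolate_one_free_page() are hypothetical stand-ins for the real pageblock checks. It shows the locking pattern introduced by f40d1e42bb98 that made the old comment wrong: zone->lock is taken inside the scan, as late as possible, rather than being held by the caller.]

	static unsigned long isolate_freepages_block_sketch(struct zone *zone,
							    unsigned long start_pfn,
							    unsigned long end_pfn)
	{
		unsigned long flags;
		unsigned long nr_isolated = 0;
		unsigned long pfn;
		bool locked = false;

		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
			/* cheap, lockless checks first */
			if (!looks_like_free_page(pfn))
				continue;

			/* take zone->lock only once a candidate is found */
			if (!locked) {
				spin_lock_irqsave(&zone->lock, flags);
				locked = true;
			}

			nr_isolated += isolate_one_free_page(pfn);
		}

		if (locked)
			spin_unlock_irqrestore(&zone->lock, flags);

		return nr_isolated;
	}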
Signed-off-by: Jerome Marchand Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b5326b141a25..805165bcd3dd 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -235,10 +235,9 @@ static bool suitable_migration_target(struct page *page) } /* - * Isolate free pages onto a private freelist. Caller must hold zone->lock. - * If @strict is true, will abort returning 0 on any invalid PFNs or non-free - * pages inside of the pageblock (even though it may still end up isolating - * some pages). + * Isolate free pages onto a private freelist. If @strict is true, will abort + * returning 0 on any invalid PFNs or non-free pages inside of the pageblock + * (even though it may still end up isolating some pages). */ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long blockpfn, -- cgit v1.2.3 From c69ded84a968e8ecc529b4de68522e4a2dcbf92a Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 12 Nov 2013 15:07:15 -0800 Subject: mm: remove obsolete comments about page table lock The callers of free_pgd_range() and hugetlb_free_pgd_range() don't hold page table locks. The comments seems to be obsolete, so let's remove them. Signed-off-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/hugetlbpage.c | 2 -- mm/memory.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index d67db4bd672d..90bb6d9409bf 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -633,8 +633,6 @@ static void hugetlb_free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, /* * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. */ void hugetlb_free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, diff --git a/mm/memory.c b/mm/memory.c index 1f2287eaa88e..15744b2cf919 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -453,8 +453,6 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, /* * This function frees user-level page tables of a process. - * - * Must be called with pagetable lock held. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, -- cgit v1.2.3 From 8bfa3f9a012c4049f3d661f7a772cd9c27a7da99 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 12 Nov 2013 15:07:16 -0800 Subject: mm/huge_memory.c: fix stale comments of transparent_hugepage_flags Since commit 13ece886d99c ("thp: transparent hugepage config choice"), transparent hugepage support is disabled by default, and TRANSPARENT_HUGEPAGE_ALWAYS is configured when TRANSPARENT_HUGEPAGE=y. And since commit d39d33c332c6 ("thp: enable direct defrag"), defrag is enable for all transparent hugepage page faults by default, not only in MADV_HUGEPAGE regions. Signed-off-by: Jianguo Wu Reviewed-by: Wanpeng Li Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2612f60f53ee..4f7e2113646c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -27,11 +27,12 @@ #include "internal.h" /* - * By default transparent hugepage support is enabled for all mappings - * and khugepaged scans all mappings. 
Defrag is only invoked by - khugepaged hugepage allocations and by page faults inside - MADV_HUGEPAGE regions to avoid the risk of slowing down short lived - allocations. + * By default transparent hugepage support is disabled in order that avoid + * to risk increase the memory footprint of applications without a guaranteed + * benefit. When transparent hugepage support is enabled, is for all mappings, + * and khugepaged scans all mappings. + * Defrag is invoked by khugepaged hugepage allocations and by page faults + * for all hugepage allocations. */ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS -- cgit v1.2.3 From 83285c72e08c42848808039ef2d3b67a1bb88832 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 12 Nov 2013 15:07:19 -0800 Subject: mm: use pgdat_end_pfn() to simplify the code in others Use "pgdat_end_pfn()" instead of "pgdat->node_start_pfn + pgdat->node_spanned_pages". Simplify the code, no functional change. Signed-off-by: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/kcore.c | 3 +-- mm/bootmem.c | 2 +- mm/memory_hotplug.c | 9 ++++----- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 06ea155e1a59..5ed0e52d6aa0 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -255,8 +255,7 @@ static int kcore_update_ram(void) end_pfn = 0; for_each_node_state(nid, N_MEMORY) { unsigned long node_end; - node_end = NODE_DATA(nid)->node_start_pfn + - NODE_DATA(nid)->node_spanned_pages; + node_end = node_end_pfn(nid); if (end_pfn < node_end) end_pfn = node_end; } diff --git a/mm/bootmem.c b/mm/bootmem.c index 6ab7744e692e..95b528cd4de7 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end_pfn = pgdat_end_pfn(pgdat); if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ed85fe3870e2..375a42d76b2c 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -365,8 +365,7 @@ out_fail: static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long old_pgdat_end_pfn = - pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; @@ -579,9 +578,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, static void shrink_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pgdat_start_pfn = pgdat->node_start_pfn; - unsigned long pgdat_end_pfn = - pgdat->node_start_pfn + pgdat->node_spanned_pages; + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ + unsigned long pgdat_end_pfn = p; unsigned long pfn; struct mem_section *ms; int nid = pgdat->node_id; -- cgit v1.2.3 From b38a872596dad80bd77d98f5fdbc58cc8f438dbb Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 12 Nov 2013 15:07:20 -0800 Subject: mm: use populated_zone() instead of if(zone->present_pages) Use "if (populated_zone(zone))" instead of "if (zone->present_pages)". 
Simplify the code, no functional change. Signed-off-by: Xishi Qiu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 73d812f16dde..3d1d75a6629f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4266,7 +4266,7 @@ static __meminit void zone_pcp_init(struct zone *zone) */ zone->pageset = &boot_pageset; - if (zone->present_pages) + if (populated_zone(zone)) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", zone->name, zone->present_pages, zone_batchsize(zone)); @@ -5160,7 +5160,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (zone->present_pages) { + if (populated_zone(zone)) { node_set_state(nid, N_HIGH_MEMORY); if (N_NORMAL_MEMORY != N_HIGH_MEMORY && zone_type <= ZONE_NORMAL) -- cgit v1.2.3 From d6de9d5349db61e134ab7fb6b2436a4c7938714c Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 12 Nov 2013 15:07:20 -0800 Subject: mm/memory_hotplug.c: rename the function is_memblock_offlined_cb() is_memblock_offlined() returning 1 means the memory block is offlined, but is_memblock_offlined_cb() returning 1 means the memory block is not offlined; this is confusing, so rename the function. Signed-off-by: Xishi Qiu Acked-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 375a42d76b2c..133a4e132632 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1701,7 +1701,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, } #ifdef CONFIG_MEMORY_HOTREMOVE -static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) +static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1853,7 +1853,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) * if this is not the case. */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - is_memblock_offlined_cb); + check_memblock_offlined_cb); if (ret) { unlock_memory_hotplug(); BUG(); -- cgit v1.2.3 From 9c2606b77d6bffb422928bca66c8dc84d85089be Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 12 Nov 2013 15:07:21 -0800 Subject: mm/memory_hotplug.c: use pfn_to_nid() instead of page_to_nid(pfn_to_page()) Use "pfn_to_nid(pfn)" instead of "page_to_nid(pfn_to_page(pfn))". 
Signed-off-by: Xishi Qiu Acked-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 133a4e132632..5118028468eb 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -934,7 +934,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = page_to_nid(pfn_to_page(pfn)); + nid = pfn_to_nid(pfn); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); -- cgit v1.2.3 From b9921ecdee66984b00c38c00a358ef3f611d2b50 Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Tue, 12 Nov 2013 15:07:22 -0800 Subject: mm: add a helper function to check may oom condition Use helper function to check if we need to deal with oom condition. Signed-off-by: Qiang Huang Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/oom.h | 5 +++++ mm/memcontrol.c | 9 +-------- mm/page_alloc.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/oom.h b/include/linux/oom.h index da60007075b5..4cd62677feb9 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -82,6 +82,11 @@ static inline void oom_killer_enable(void) oom_killer_disabled = false; } +static inline bool oom_gfp_allowed(gfp_t gfp_mask) +{ + return (gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY); +} + extern struct task_struct *find_lock_task_mm(struct task_struct *p); /* sysctls */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 13b9d0f221b8..3427de9897a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2984,21 +2984,14 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) struct res_counter *fail_res; struct mem_cgroup *_memcg; int ret = 0; - bool may_oom; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; - /* - * Conditions under which we can wait for the oom_killer. Those are - * the same conditions tested by the core page allocator - */ - may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); - _memcg = memcg; ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, may_oom); + &_memcg, oom_gfp_allowed(gfp)); if (ret == -EINTR) { /* diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3d1d75a6629f..e0412c026e0d 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2593,7 +2593,7 @@ rebalance: * running out of options and have to consider going OOM */ if (!did_some_progress) { - if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_gfp_allowed(gfp_mask)) { if (oom_killer_disabled) goto nopage; /* Coredumps can quickly deplete all memory reserves */ -- cgit v1.2.3 From 309d0b3917387b48d1fa1a15aa6762de489c9123 Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 12 Nov 2013 15:07:23 -0800 Subject: mm/nobootmem.c: have __free_pages_memory() free in larger chunks. On large memory machines it can take a few minutes to get through free_all_bootmem(). Currently, when free_all_bootmem() calls __free_pages_memory(), the number of contiguous pages that __free_pages_memory() passes to the buddy allocator is limited to BITS_PER_LONG. BITS_PER_LONG was originally chosen to keep things similar to mm/nobootmem.c. But it is more efficient to limit it to MAX_ORDER. 
base new change 8TB 202s 172s 30s 16TB 401s 351s 50s That is around 1%-3% improvement on total boot time. This patch was spun off from the boot time rfc Robin and I had been working on. Signed-off-by: Robin Holt Signed-off-by: Nathan Zimmer Cc: Robin Holt Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Mike Travis Cc: Yinghai Lu Cc: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/nobootmem.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 61107cf55bb3..2c254d374655 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -82,27 +82,18 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static void __init __free_pages_memory(unsigned long start, unsigned long end) { - unsigned long i, start_aligned, end_aligned; - int order = ilog2(BITS_PER_LONG); + int order; - start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); - end_aligned = end & ~(BITS_PER_LONG - 1); + while (start < end) { + order = min(MAX_ORDER - 1UL, __ffs(start)); - if (end_aligned <= start_aligned) { - for (i = start; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + while (start + (1UL << order) > end) + order--; - return; - } - - for (i = start; i < start_aligned; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + __free_pages_bootmem(pfn_to_page(start), order); - for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) - __free_pages_bootmem(pfn_to_page(i), order); - - for (i = end_aligned; i < end; i++) - __free_pages_bootmem(pfn_to_page(i), 0); + start += (1UL << order); + } } static unsigned long __init __free_memory_core(phys_addr_t start, -- cgit v1.2.3 From 01b0f19707c51ef247404e6af1d4a97a11ba34f7 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Nov 2013 15:07:25 -0800 Subject: cpu/mem hotplug: add try_online_node() for cpu_up() cpu_up() has #ifdef CONFIG_MEMORY_HOTPLUG code blocks, which call mem_online_node() to put its node online if offlined and then call build_all_zonelists() to initialize the zone list. These steps are specific to memory hotplug, and should be managed in mm/memory_hotplug.c. lock_memory_hotplug() should also be held for the whole steps. For this reason, this patch replaces mem_online_node() with try_online_node(), which performs the whole steps with lock_memory_hotplug() held. try_online_node() is named after try_offline_node() as they have similar purpose. There is no functional change in this patch. 
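[editor's note: the sketch below is illustrative only and is not part of this patch. The wrapper name bring_up_cpu_node() is hypothetical; try_online_node(), cpu_to_node() and lock_memory_hotplug() are the interfaces discussed above. It shows the caller-side pattern this change enables:]

	/*
	 * Simplified caller-side view: the memory-hotplug details are hidden
	 * behind try_online_node(), which returns 0 immediately when the node
	 * is already online (or when CONFIG_MEMORY_HOTPLUG is disabled), and
	 * otherwise creates the pgdat, registers the node and builds the
	 * zonelists under lock_memory_hotplug().
	 */
	static int bring_up_cpu_node(unsigned int cpu)
	{
		int err;

		err = try_online_node(cpu_to_node(cpu));
		if (err)
			return err;	/* node could not be brought online */

		/* continue with the normal CPU bring-up */
		return 0;
	}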
Signed-off-by: Toshi Kani Reviewed-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 8 +++++++- kernel/cpu.c | 29 +++-------------------------- mm/memory_hotplug.c | 16 ++++++++++++++-- 3 files changed, 24 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index dd38e62b84d2..22203c293f07 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -94,6 +94,8 @@ extern void __online_page_set_limits(struct page *page); extern void __online_page_increment_counters(struct page *page); extern void __online_page_free(struct page *page); +extern int try_online_node(int nid); + #ifdef CONFIG_MEMORY_HOTREMOVE extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size); @@ -225,6 +227,11 @@ static inline void register_page_bootmem_info_node(struct pglist_data *pgdat) { } +static inline int try_online_node(int nid) +{ + return 0; +} + static inline void lock_memory_hotplug(void) {} static inline void unlock_memory_hotplug(void) {} @@ -256,7 +263,6 @@ static inline void remove_memory(int nid, u64 start, u64 size) {} extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); -extern int mem_online_node(int nid); extern int add_memory(int nid, u64 start, u64 size); extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); diff --git a/kernel/cpu.c b/kernel/cpu.c index 63aa50d7ce1e..973d034acf84 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -437,11 +437,6 @@ int cpu_up(unsigned int cpu) { int err = 0; -#ifdef CONFIG_MEMORY_HOTPLUG - int nid; - pg_data_t *pgdat; -#endif - if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -452,27 +447,9 @@ int cpu_up(unsigned int cpu) return -EINVAL; } -#ifdef CONFIG_MEMORY_HOTPLUG - nid = cpu_to_node(cpu); - if (!node_online(nid)) { - err = mem_online_node(nid); - if (err) - return err; - } - - pgdat = NODE_DATA(nid); - if (!pgdat) { - printk(KERN_ERR - "Can't online cpu %d due to NULL pgdat\n", cpu); - return -ENOMEM; - } - - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } -#endif + err = try_online_node(cpu_to_node(cpu)); + if (err) + return err; cpu_maps_update_begin(); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5118028468eb..8285346be663 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1043,17 +1043,23 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -/* +/** + * try_online_node - online a node if offlined + * * called by cpu_up() to online a node without onlined memory. 
*/ -int mem_online_node(int nid) +int try_online_node(int nid) { pg_data_t *pgdat; int ret; + if (node_online(nid)) + return 0; + lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { + pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } @@ -1061,6 +1067,12 @@ int mem_online_node(int nid) ret = register_one_node(nid); BUG_ON(ret); + if (pgdat->node_zonelists->_zonerefs->zone == NULL) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL, NULL); + mutex_unlock(&zonelists_mutex); + } + out: unlock_memory_hotplug(); return ret; -- cgit v1.2.3 From 03b61ff3c324e094944b663cc611a8bab252539c Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 12 Nov 2013 15:07:26 -0800 Subject: mm/memory-failure.c: move set_migratetype_isolate() outside get_any_page() Chen Gong pointed out that set/unset_migratetype_isolate() was done in different functions in mm/memory-failure.c, which makes the code less readable/maintainable. So this patch does it in soft_offline_page(). With this patch, we get to hold lock_memory_hotplug() longer but it's not a problem because races between memory hotplug and soft offline are very rare. Signed-off-by: Naoya Horiguchi Reviewed-by: Chen, Gong Acked-by: Andi Kleen Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index bf3351b5115e..f9d78ec7831f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1422,19 +1422,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) if (flags & MF_COUNT_INCREASED) return 1; - /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); - - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. - */ - if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) - set_migratetype_isolate(p, true); /* * When the target page is a free hugepage, just remove it * from free hugepage list. @@ -1455,7 +1442,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unlock_memory_hotplug(); return ret; } @@ -1654,15 +1640,28 @@ int soft_offline_page(struct page *page, int flags) } } + /* + * The lock_memory_hotplug prevents a race with memory hotplug. + * This is a big hammer, a better would be nicer. + */ + lock_memory_hotplug(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. 
+ */ + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + set_migratetype_isolate(page, true); + ret = get_any_page(page, pfn, flags); - if (ret < 0) - goto unset; - if (ret) { /* for in-use pages */ + unlock_memory_hotplug(); + if (ret > 0) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); else ret = __soft_offline_page(page, flags); - } else { /* for free pages */ + } else if (ret == 0) { /* for free pages */ if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); @@ -1673,7 +1672,6 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } -unset: unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } -- cgit v1.2.3 From 948927ee9e4f35f287e61a79c9f0e85ca2202c7d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 12 Nov 2013 15:07:28 -0800 Subject: mm, mempolicy: make mpol_to_str robust and always succeed mpol_to_str() should not fail. Currently, it either fails because the string buffer is too small or because a string hasn't been defined for a mempolicy mode. If a new mempolicy mode is introduced and no string is defined for it, just warn and return "unknown". If the buffer is too small, just truncate the string and return, the same behavior as snprintf(). This also fixes a bug where there was no NULL-byte termination when doing *p++ = '=' and *p++ ':' and maxlen has been reached. Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Cc: Chen Gang Cc: Rik van Riel Cc: Dave Jones Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 14 ++++++------- include/linux/mempolicy.h | 5 ++--- mm/mempolicy.c | 52 +++++++++++++++-------------------------------- 3 files changed, 24 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 390bdab01c3c..9f1369fe0afb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1387,8 +1387,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; struct mempolicy *pol; - int n; - char buffer[50]; + char buffer[64]; + int nid; if (!mm) return 0; @@ -1404,10 +1404,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.mm = mm; pol = get_vma_policy(task, vma, vma->vm_start); - n = mpol_to_str(buffer, sizeof(buffer), pol); + mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); - if (n < 0) - return n; seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1460,9 +1458,9 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (md->writeback) seq_printf(m, " writeback=%lu", md->writeback); - for_each_node_state(n, N_MEMORY) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + for_each_node_state(nid, N_MEMORY) + if (md->node[nid]) + seq_printf(m, " N%d=%lu", nid, md->node[nid]); out: seq_putc(m, '\n'); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index ea4d2495c646..9fe426b30a41 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -169,7 +169,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif -extern int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); +extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ static inline int vma_migratable(struct vm_area_struct *vma) @@ -307,9 +307,8 @@ static inline int 
mpol_parse_str(char *str, struct mempolicy **mpol) } #endif -static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +static inline void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { - return 0; } static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 71cb253368cb..260b8213a873 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2914,62 +2914,45 @@ out: * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert a mempolicy into a string. - * Returns the number of characters in buffer (if positive) - * or an error (negative) + * Convert @pol into a string. If @buffer is too short, truncate the string. + * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the + * longest flag, "relative", and to display at least a few node ids. */ -int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - int l; - nodemask_t nodes; - unsigned short mode; - unsigned short flags = pol ? pol->flags : 0; - - /* - * Sanity check: room for longest mode, flag and some nodes - */ - VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + nodemask_t nodes = NODE_MASK_NONE; + unsigned short mode = MPOL_DEFAULT; + unsigned short flags = 0; - if (!pol || pol == &default_policy) - mode = MPOL_DEFAULT; - else + if (pol && pol != &default_policy) { mode = pol->mode; + flags = pol->flags; + } switch (mode) { case MPOL_DEFAULT: - nodes_clear(nodes); break; - case MPOL_PREFERRED: - nodes_clear(nodes); if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; - case MPOL_BIND: - /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; - default: - return -EINVAL; + WARN_ON_ONCE(1); + snprintf(p, maxlen, "unknown"); + return; } - l = strlen(policy_modes[mode]); - if (buffer + maxlen < p + l + 1) - return -ENOSPC; - - strcpy(p, policy_modes[mode]); - p += l; + p += snprintf(p, maxlen, policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = '='; + p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive @@ -2981,10 +2964,7 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) } if (!nodes_empty(nodes)) { - if (buffer + maxlen < p + 2) - return -ENOSPC; - *p++ = ':'; + p += snprintf(p, buffer + maxlen - p, ":"); p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } - return p - buffer; } -- cgit v1.2.3 From 3722e13cff361035583f6ecfa784437b824fe659 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Nov 2013 15:07:29 -0800 Subject: mm/vmalloc: don't set area->caller twice The caller address has already been set in set_vmalloc_vm(), there's no need to set it again in __vmalloc_area_node. 
Reviewed-by: Zhang Yanfei Signed-off-by: Wanpeng Li Cc: Joonsoo Kim Cc: KOSAKI Motohiro Cc: Mitsuo Hayasaka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dea15e6bfc8d..285f0e7d28e7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1546,7 +1546,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node, const void *caller) + pgprot_t prot, int node) { const int order = 0; struct page **pages; @@ -1560,13 +1560,12 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, - PAGE_KERNEL, node, caller); + PAGE_KERNEL, node, area->caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; - area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -1634,7 +1633,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); + addr = __vmalloc_area_node(area, gfp_mask, prot, node); if (!addr) goto fail; -- cgit v1.2.3 From c2ce8c142c43c360047e173d2018d94a4d0f7a59 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Nov 2013 15:07:31 -0800 Subject: mm/vmalloc: fix show vmap_area information race with vmap_area tear down There is a race window between vmap_area tear down and show vmap_area information. A B remove_vm_area spin_lock(&vmap_area_lock); va->vm = NULL; va->flags &= ~VM_VM_AREA; spin_unlock(&vmap_area_lock); spin_lock(&vmap_area_lock); if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEZING)) return 0; if (!(va->flags & VM_VM_AREA)) { seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", (void *)va->va_start, (void *)va->va_end, va->va_end - va->va_start); return 0; } free_unmap_vmap_area(va); flush_cache_vunmap free_unmap_vmap_area_noflush unmap_vmap_area free_vmap_area_noflush va->flags |= VM_LAZY_FREE The assumption !VM_VM_AREA represents vm_map_ram allocation is introduced by d4033afdf828 ("mm, vmalloc: iterate vmap_area_list, instead of vmlist, in vmallocinfo()"). However, !VM_VM_AREA also represents vmap_area is being tear down in race window mentioned above. This patch fix it by don't dump any information for !VM_VM_AREA case and also remove (VM_LAZY_FREE | VM_LAZY_FREEING) check since they are not possible for !VM_VM_AREA case. 
Suggested-by: Joonsoo Kim Acked-by: KOSAKI Motohiro Signed-off-by: Wanpeng Li Cc: Mitsuo Hayasaka Cc: Zhang Yanfei Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 285f0e7d28e7..814ce9122709 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2578,15 +2578,12 @@ static int s_show(struct seq_file *m, void *p) struct vmap_area *va = p; struct vm_struct *v; - if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING)) - return 0; - - if (!(va->flags & VM_VM_AREA)) { - seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n", - (void *)va->va_start, (void *)va->va_end, - va->va_end - va->va_start); + /* + * s_show can encounter race with remove_vm_area, !VM_VM_AREA on + * behalf of vmap area is being tear down or vm_map_ram allocation. + */ + if (!(va->flags & VM_VM_AREA)) return 0; - } v = va->vm; -- cgit v1.2.3 From af12346cdacda36f0c35c657088282b8ecd0df72 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Nov 2013 15:07:32 -0800 Subject: mm/vmalloc: revert "mm/vmalloc.c: check VM_UNINITIALIZED flag in s_show instead of show_numa_info" The VM_UNINITIALIZED/VM_UNLIST flag introduced by f5252e009d5b ("mm: avoid null pointer access in vm_struct via /proc/vmallocinfo") is used to avoid accessing the pages field with unallocated page when show_numa_info() is called. This patch moves the check just before show_numa_info in order that some messages still can be dumped via /proc/vmallocinfo. This patch reverts commit d157a55815ff ("mm/vmalloc.c: check VM_UNINITIALIZED flag in s_show instead of show_numa_info"); Reviewed-by: Zhang Yanfei Signed-off-by: Wanpeng Li Cc: Mitsuo Hayasaka Cc: Joonsoo Kim Cc: KOSAKI Motohiro Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 814ce9122709..67535f87846c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2562,6 +2562,11 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return; + memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2587,11 +2592,6 @@ static int s_show(struct seq_file *m, void *p) v = va->vm; - /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ - smp_rmb(); - if (v->flags & VM_UNINITIALIZED) - return 0; - seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); -- cgit v1.2.3 From b82225f3ff5be4c52cb588a4a53686db50aa6eb6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 12 Nov 2013 15:07:33 -0800 Subject: revert mm/vmalloc.c: emit the failure message before return Don't warn twice in __vmalloc_area_node and __vmalloc_node_range if __vmalloc_area_node allocation failure. This patch reverts commit 46c001a2753f ("mm/vmalloc.c: emit the failure message before return"). 
Signed-off-by: Wanpeng Li Reviewed-by: Zhang Yanfei Cc: Joonsoo Kim Cc: KOSAKI Motohiro Cc: Mitsuo Hayasaka Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 67535f87846c..745fa9567475 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1635,7 +1635,7 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, addr = __vmalloc_area_node(area, gfp_mask, prot, node); if (!addr) - goto fail; + return NULL; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED -- cgit v1.2.3 From 10dc4155c7714f508fe2e4667164925ea971fb25 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 12 Nov 2013 15:07:35 -0800 Subject: mm: thp: cleanup: mv alloc_hugepage to better place Move alloc_hugepage() to a better place, no need for a seperate #ifndef CONFIG_NUMA Signed-off-by: Bob Liu Reviewed-by: Yasuaki Ishimatsu Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Andrew Davidoff Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4f7e2113646c..411c4f2c0492 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -759,14 +759,6 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } -#ifndef CONFIG_NUMA -static inline struct page *alloc_hugepage(int defrag) -{ - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); -} -#endif - static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -2251,6 +2243,12 @@ static struct page return *hpage; } #else +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} + static struct page *khugepaged_alloc_hugepage(bool *wait) { struct page *hpage; -- cgit v1.2.3 From 9f1b868a13ac36bd207a571f5ea1193d823ab18d Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 12 Nov 2013 15:07:37 -0800 Subject: mm: thp: khugepaged: add policy for finding target node Khugepaged will scan/free HPAGE_PMD_NR normal pages and replace with a hugepage which is allocated from the node of the first scanned normal page, but this policy is too rough and may end with unexpected result to upper users. The problem is the original page-balancing among all nodes will be broken after hugepaged started. Thinking about the case if the first scanned normal page is allocated from node A, most of other scanned normal pages are allocated from node B or C.. But hugepaged will always allocate hugepage from node A which will cause extra memory pressure on node A which is not the situation before khugepaged started. This patch try to fix this problem by making khugepaged allocate hugepage from the node which have max record of scaned normal pages hit, so that the effect to original page-balancing can be minimized. The other problem is if normal scanned pages are equally allocated from Node A,B and C, after khugepaged started Node A will still suffer extra memory pressure. Andrew Davidoff reported a related issue several days ago. He wanted his application interleaving among all nodes and "numactl --interleave=all ./test" was used to run the testcase, but the result wasn't not as expected. 
cat /proc/2814/numa_maps: 7f50bd440000 interleave:0-3 anon=51403 dirty=51403 N0=435 N1=435 N2=435 N3=50098 The end result showed that most pages are from Node3 instead of interleave among node0-3 which was unreasonable. This patch also fix this issue by allocating hugepage round robin from all nodes have the same record, after this patch the result was as expected: 7f78399c0000 interleave:0-3 anon=51403 dirty=51403 N0=12723 N1=12723 N2=13235 N3=12722 The simple testcase is like this: int main() { char *p; int i; int j; for (i=0; i < 200; i++) { p = (char *)malloc(1048576); printf("malloc done\n"); if (p == 0) { printf("Out of memory\n"); return 1; } for (j=0; j < 1048576; j++) { p[j] = 'A'; } printf("touched memory\n"); sleep(1); } printf("enter sleep\n"); while(1) { sleep(100); } } [akpm@linux-foundation.org: make last_khugepaged_target_node local to khugepaged_find_target_node()] Reported-by: Andrew Davidoff Tested-by: Andrew Davidoff Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Kirill A. Shutemov Cc: Mel Gorman Cc: Yasuaki Ishimatsu Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 53 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 411c4f2c0492..0556c6a44959 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2191,7 +2191,34 @@ static void khugepaged_alloc_sleep(void) msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } +static int khugepaged_node_load[MAX_NUMNODES]; + #ifdef CONFIG_NUMA +static int khugepaged_find_target_node(void) +{ + static int last_khugepaged_target_node = NUMA_NO_NODE; + int nid, target_node = 0, max_value = 0; + + /* find first node with max normal pages hit */ + for (nid = 0; nid < MAX_NUMNODES; nid++) + if (khugepaged_node_load[nid] > max_value) { + max_value = khugepaged_node_load[nid]; + target_node = nid; + } + + /* do some balance if several nodes have the same hit record */ + if (target_node <= last_khugepaged_target_node) + for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == khugepaged_node_load[nid]) { + target_node = nid; + break; + } + + last_khugepaged_target_node = target_node; + return target_node; +} + static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { if (IS_ERR(*hpage)) { @@ -2225,9 +2252,8 @@ static struct page * mmap_sem in read mode is good idea also to allow greater * scalability. */ - *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, - node, __GFP_OTHER_NODE); - + *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( + khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); /* * After allocating the hugepage, release the mmap_sem read lock in * preparation for taking it in write mode. 
@@ -2243,6 +2269,11 @@ static struct page return *hpage; } #else +static int khugepaged_find_target_node(void) +{ + return 0; +} + static inline struct page *alloc_hugepage(int defrag) { return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), @@ -2455,6 +2486,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; + memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -2471,12 +2503,13 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Chose the node of the first page. This could - * be more sophisticated and look at more pages, - * but isn't for now. + * Record which node the original page is from and save this + * information to khugepaged_node_load[]. + * Khupaged will allocate hugepage from the node has the max + * hit record. */ - if (node == NUMA_NO_NODE) - node = page_to_nid(page); + node = page_to_nid(page); + khugepaged_node_load[node]++; VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -2491,9 +2524,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) + if (ret) { + node = khugepaged_find_target_node(); /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); + } out: return ret; } -- cgit v1.2.3 From b76ac7e734608d706bf225be062a7a46d165dda6 Mon Sep 17 00:00:00 2001 From: Jianguo Wu Date: Tue, 12 Nov 2013 15:07:39 -0800 Subject: mm/mempolicy: use NUMA_NO_NODE Use more appropriate NUMA_NO_NODE instead of -1 Signed-off-by: Jianguo Wu Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 260b8213a873..4cc19f6ab6c6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1125,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { int s,d; - int source = -1; + int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { @@ -1160,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (!node_isset(dest, tmp)) break; } - if (source == -1) + if (source == NUMA_NO_NODE) break; node_clear(source, tmp); @@ -1835,7 +1835,7 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int c; - int nid = -1; + int nid = NUMA_NO_NODE; if (!nnodes) return numa_node_id(); @@ -1872,11 +1872,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, /* * Return the bit number of a random bit set in the nodemask. - * (returns -1 if nodemask is empty) + * (returns NUMA_NO_NODE if nodemask is empty) */ int node_random(const nodemask_t *maskp) { - int w, bit = -1; + int w, bit = NUMA_NO_NODE; w = nodes_weight(*maskp); if (w) -- cgit v1.2.3 From 25485de6e90ef1684c55a203988fad5eab7a45d6 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Tue, 12 Nov 2013 15:07:40 -0800 Subject: memcg: refactor mem_control_numa_stat_show() Refactor mem_control_numa_stat_show() to use a new stats structure for smaller and simpler code. This consolidates nearly identical code. 
text data bss dec hex filename 8,137,679 1,703,496 1,896,448 11,737,623 b31a17 vmlinux.before 8,136,911 1,703,496 1,896,448 11,736,855 b31717 vmlinux.after Signed-off-by: Greg Thelen Signed-off-by: Ying Han Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 56 ++++++++++++++++++++++---------------------------------- 1 file changed, 22 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3427de9897a5..91b5d3a62059 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5369,45 +5369,33 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static int memcg_numa_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { + struct numa_stat { + const char *name; + unsigned int lru_mask; + }; + + static const struct numa_stat stats[] = { + { "total", LRU_ALL }, + { "file", LRU_ALL_FILE }, + { "anon", LRU_ALL_ANON }, + { "unevictable", BIT(LRU_UNEVICTABLE) }, + }; + const struct numa_stat *stat; int nid; - unsigned long total_nr, file_nr, anon_nr, unevictable_nr; - unsigned long node_nr; + unsigned long nr; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); - seq_printf(m, "total=%lu", total_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); - seq_printf(m, "file=%lu", file_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_FILE); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); - - anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); - seq_printf(m, "anon=%lu", anon_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - LRU_ALL_ANON); - seq_printf(m, " N%d=%lu", nid, node_nr); + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); + seq_printf(m, "%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); } - seq_putc(m, '\n'); - unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); - seq_printf(m, "unevictable=%lu", unevictable_nr); - for_each_node_state(nid, N_MEMORY) { - node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - BIT(LRU_UNEVICTABLE)); - seq_printf(m, " N%d=%lu", nid, node_nr); - } - seq_putc(m, '\n'); return 0; } #endif /* CONFIG_NUMA */ -- cgit v1.2.3 From 071aee138410210e3764f3ae8d37ef46dc6d3b42 Mon Sep 17 00:00:00 2001 From: Ying Han Date: Tue, 12 Nov 2013 15:07:41 -0800 Subject: memcg: support hierarchical memory.numa_stats The memory.numa_stat file was not hierarchical. Memory charged to the children was not shown in parent's numa_stat. This change adds the "hierarchical_" stats to the existing stats. The new hierarchical stats include the sum of all children's values in addition to the value of the memcg. Tested: Create cgroup a, a/b and run workload under b. The values of b are included in the "hierarchical_*" under a. 
$ cd /sys/fs/cgroup $ echo 1 > memory.use_hierarchy $ mkdir a a/b Run workload in a/b: $ (echo $BASHPID >> a/b/cgroup.procs && cat /some/file && bash) & The hierarchical_ fields in parent (a) show use of workload in a/b: $ cat a/memory.numa_stat total=0 N0=0 N1=0 N2=0 N3=0 file=0 N0=0 N1=0 N2=0 N3=0 anon=0 N0=0 N1=0 N2=0 N3=0 unevictable=0 N0=0 N1=0 N2=0 N3=0 hierarchical_total=908 N0=552 N1=317 N2=39 N3=0 hierarchical_file=850 N0=549 N1=301 N2=0 N3=0 hierarchical_anon=58 N0=3 N1=16 N2=39 N3=0 hierarchical_unevictable=0 N0=0 N1=0 N2=0 N3=0 $ cat a/b/memory.numa_stat total=908 N0=552 N1=317 N2=39 N3=0 file=850 N0=549 N1=301 N2=0 N3=0 anon=58 N0=3 N1=16 N2=39 N3=0 unevictable=0 N0=0 N1=0 N2=0 N3=0 hierarchical_total=908 N0=552 N1=317 N2=39 N3=0 hierarchical_file=850 N0=549 N1=301 N2=0 N3=0 hierarchical_anon=58 N0=3 N1=16 N2=39 N3=0 hierarchical_unevictable=0 N0=0 N1=0 N2=0 N3=0 Signed-off-by: Ying Han Signed-off-by: Greg Thelen Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/cgroups/memory.txt | 10 +++++++--- mm/memcontrol.c | 17 +++++++++++++++++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 8af4ad121828..e2bc132608fd 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -573,15 +573,19 @@ an memcg since the pages are allowed to be allocated from any physical node. One of the use cases is evaluating application performance by combining this information with the application's CPU allocation. -We export "total", "file", "anon" and "unevictable" pages per-node for -each memcg. The ouput format of memory.numa_stat is: +Each memcg's numa_stat file includes "total", "file", "anon" and "unevictable" +per-node page counts including "hierarchical_" which sums up all +hierarchical children's values in addition to the memcg's own value. + +The ouput format of memory.numa_stat is: total= N0= N1= ... file= N0= N1= ... anon= N0= N1= ... unevictable= N0= N1= ... +hierarchical_= N0= N1= ... -And we have total = file + anon + unevictable. +The "total" count is sum of file + anon + unevictable. 6. Hierarchy support diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 91b5d3a62059..c89072443166 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5396,6 +5396,23 @@ static int memcg_numa_stat_show(struct cgroup_subsys_state *css, seq_putc(m, '\n'); } + for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { + struct mem_cgroup *iter; + + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); + seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); + for_each_node_state(nid, N_MEMORY) { + nr = 0; + for_each_mem_cgroup_tree(iter, memcg) + nr += mem_cgroup_node_nr_lru_pages( + iter, nid, stat->lru_mask); + seq_printf(m, " N%d=%lu", nid, nr); + } + seq_putc(m, '\n'); + } + return 0; } #endif /* CONFIG_NUMA */ -- cgit v1.2.3 From 85b35feaecd4d2284505b22708795bc1f03fc897 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Tue, 12 Nov 2013 15:07:42 -0800 Subject: mm/sparsemem: use PAGES_PER_SECTION to remove redundant nr_pages parameter For below functions, - sparse_add_one_section() - kmalloc_section_memmap() - __kmalloc_section_memmap() - __kfree_section_memmap() they are always invoked to operate on one memory section, so it is redundant to always pass a nr_pages parameter, which is the page numbers in one section. 
So we can directly use predefined macro PAGES_PER_SECTION instead of passing the parameter. Signed-off-by: Zhang Yanfei Cc: Wen Congyang Cc: Tang Chen Cc: Toshi Kani Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Yasunori Goto Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 3 +-- mm/memory_hotplug.c | 3 +-- mm/sparse.c | 33 +++++++++++++++------------------ 3 files changed, 17 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 22203c293f07..4ca3d951fe91 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -268,8 +268,7 @@ extern int arch_add_memory(int nid, u64 start, u64 size); extern int offline_pages(unsigned long start_pfn, unsigned long nr_pages); extern bool is_memblock_offlined(struct memory_block *mem); extern void remove_memory(int nid, u64 start, u64 size); -extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages); +extern int sparse_add_one_section(struct zone *zone, unsigned long start_pfn); extern void sparse_remove_one_section(struct zone *zone, struct mem_section *ms); extern struct page *sparse_decode_mem_map(unsigned long coded_mem_map, unsigned long pnum); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 8285346be663..1b6fe8ca71e6 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -401,13 +401,12 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) static int __meminit __add_section(int nid, struct zone *zone, unsigned long phys_start_pfn) { - int nr_pages = PAGES_PER_SECTION; int ret; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); + ret = sparse_add_one_section(zone, phys_start_pfn); if (ret < 0) return ret; diff --git a/mm/sparse.c b/mm/sparse.c index 4ac1d7ef548f..fbb9dbc6aca9 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -590,16 +590,15 @@ void __init sparse_init(void) #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - unsigned long nr_pages) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { /* This will make the necessary allocations eventually. 
*/ return sparse_mem_map_populate(pnum, nid); } -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +static void __kfree_section_memmap(struct page *memmap) { unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + nr_pages); + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); vmemmap_free(start, end); } @@ -613,10 +612,10 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) } #endif /* CONFIG_MEMORY_HOTREMOVE */ #else -static struct page *__kmalloc_section_memmap(unsigned long nr_pages) +static struct page *__kmalloc_section_memmap(void) { struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * nr_pages; + unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) @@ -634,19 +633,18 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, - unsigned long nr_pages) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) { - return __kmalloc_section_memmap(nr_pages); + return __kmalloc_section_memmap(); } -static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) +static void __kfree_section_memmap(struct page *memmap) { if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * nr_pages)); + get_order(sizeof(struct page) * PAGES_PER_SECTION)); } #ifdef CONFIG_MEMORY_HOTREMOVE @@ -684,8 +682,7 @@ static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. */ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, - int nr_pages) +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct pglist_data *pgdat = zone->zone_pgdat; @@ -702,12 +699,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; - memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); if (!memmap) return -ENOMEM; usemap = __kmalloc_section_usemap(); if (!usemap) { - __kfree_section_memmap(memmap, nr_pages); + __kfree_section_memmap(memmap); return -ENOMEM; } @@ -719,7 +716,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, goto out; } - memset(memmap, 0, sizeof(struct page) * nr_pages); + memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); ms->section_mem_map |= SECTION_MARKED_PRESENT; @@ -729,7 +726,7 @@ out: pgdat_resize_unlock(pgdat, &flags); if (ret <= 0) { kfree(usemap); - __kfree_section_memmap(memmap, nr_pages); + __kfree_section_memmap(memmap); } return ret; } @@ -771,7 +768,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) if (PageSlab(usemap_page) || PageCompound(usemap_page)) { kfree(usemap); if (memmap) - __kfree_section_memmap(memmap, PAGES_PER_SECTION); + __kfree_section_memmap(memmap); return; } -- cgit v1.2.3 From 81556b02525181e19ef073a798ba9d48db96f708 Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Tue, 12 Nov 2013 15:07:43 -0800 Subject: mm/sparsemem: fix a bug in free_map_bootmem when CONFIG_SPARSEMEM_VMEMMAP We pass the number of pages 
which hold page structs of a memory section to free_map_bootmem(). This is right when !CONFIG_SPARSEMEM_VMEMMAP but wrong when CONFIG_SPARSEMEM_VMEMMAP. When CONFIG_SPARSEMEM_VMEMMAP, we should pass the number of pages of a memory section to free_map_bootmem. So the fix is removing the nr_pages parameter. When CONFIG_SPARSEMEM_VMEMMAP, we directly use the prefined marco PAGES_PER_SECTION in free_map_bootmem. When !CONFIG_SPARSEMEM_VMEMMAP, we calculate page numbers needed to hold the page structs for a memory section and use the value in free_map_bootmem(). This was found by reading the code. And I have no machine that support memory hot-remove to test the bug now. Signed-off-by: Zhang Yanfei Reviewed-by: Wanpeng Li Cc: Wen Congyang Cc: Tang Chen Cc: Toshi Kani Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Yasunori Goto Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index fbb9dbc6aca9..8cc7be0e9590 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -603,10 +603,10 @@ static void __kfree_section_memmap(struct page *memmap) vmemmap_free(start, end); } #ifdef CONFIG_MEMORY_HOTREMOVE -static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap) { unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + nr_pages); + unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); vmemmap_free(start, end); } @@ -648,12 +648,15 @@ static void __kfree_section_memmap(struct page *memmap) } #ifdef CONFIG_MEMORY_HOTREMOVE -static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) +static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; - unsigned long magic; + unsigned long magic, nr_pages; struct page *page = virt_to_page(memmap); + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) page->lru.next; @@ -756,7 +759,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) static void free_section_usemap(struct page *memmap, unsigned long *usemap) { struct page *usemap_page; - unsigned long nr_pages; if (!usemap) return; @@ -777,12 +779,8 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) * on the section which has pgdat at boot time. Just keep it as is now. */ - if (memmap) { - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - - free_map_bootmem(memmap, nr_pages); - } + if (memmap) + free_map_bootmem(memmap); } void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) -- cgit v1.2.3 From 7f88f88f83ed609650a01b18572e605ea50cd163 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Tue, 12 Nov 2013 15:07:45 -0800 Subject: mm: kmemleak: avoid false negatives on vmalloc'ed objects Commit 248ac0e1943a ("mm/vmalloc: remove guard page from between vmap blocks") had the side effect of making vmap_area.va_end member point to the next vmap_area.va_start. This was creating an artificial reference to vmalloc'ed objects and kmemleak was rarely reporting vmalloc() leaks. This patch marks the vmap_area containing pointers explicitly and reduces the min ref_count to 2 as vm_struct still contains a reference to the vmalloc'ed object. 
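As background for the two kmemleak annotations involved, the sketch below shows how a caller would use them; the helper and the blob_hdr type are hypothetical, and only kmemleak_alloc() and kmemleak_scan_area() are real interfaces. min_count is the number of references kmemleak must find before it stops treating an object as a potential leak, and a scan area restricts pointer scanning to the part of an object that actually holds pointers.

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/list.h>
#include <linux/kmemleak.h>

/* hypothetical header: the only part of the block that contains pointers */
struct blob_hdr {
    struct list_head node;
};

static void *tracked_pages_alloc(unsigned int order)
{
    struct page *page = alloc_pages(GFP_KERNEL, order);
    void *addr;

    if (!page)
        return NULL;
    addr = page_address(page);
    /*
     * Page allocations are not tracked by kmemleak automatically, so
     * register by hand. min_count == 2: report a leak only if fewer
     * than two references are found (same reasoning as above).
     */
    kmemleak_alloc(addr, PAGE_SIZE << order, 2, GFP_KERNEL);
    /* only the header holds pointers, so limit scanning to it */
    kmemleak_scan_area(addr, sizeof(struct blob_hdr), GFP_KERNEL);
    return addr;
}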
The kmemleak add_scan_area() function has been improved to allow a SIZE_MAX argument covering the rest of the object (for simpler calling sites). Signed-off-by: Catalin Marinas Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 4 +++- mm/vmalloc.c | 14 ++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e126b0ef9ad2..31f01c5011e5 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -753,7 +753,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) } spin_lock_irqsave(&object->lock, flags); - if (ptr + size > object->pointer + object->size) { + if (size == SIZE_MAX) { + size = object->pointer + object->size - ptr; + } else if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 745fa9567475..0fdf96803c5b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -359,6 +359,12 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!va)) return ERR_PTR(-ENOMEM); + /* + * Only scan the relevant parts containing pointers to other objects + * to avoid false negatives. + */ + kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); + retry: spin_lock(&vmap_area_lock); /* @@ -1645,11 +1651,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, clear_vm_uninitialized_flag(area); /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. + * A ref_count = 2 is needed because vm_struct allocated in + * __get_vm_area_node() contains a reference to the virtual address of + * the vmalloc'ed block. */ - kmemleak_alloc(addr, real_size, 3, gfp_mask); + kmemleak_alloc(addr, real_size, 2, gfp_mask); return addr; -- cgit v1.2.3 From 2de1a7e40a30bed83f3da60d8cf0937354d9e7d1 Mon Sep 17 00:00:00 2001 From: Seth Jennings Date: Tue, 12 Nov 2013 15:07:46 -0800 Subject: mm/swapfile.c: fix comment typos Signed-off-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index de7c904e52e5..64458e3997bc 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -707,7 +707,7 @@ noswap: return (swp_entry_t) {0}; } -/* The only caller of this function is now susupend routine */ +/* The only caller of this function is now suspend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } /* - * Caller has made sure that the swapdevice corresponding to entry + * Caller has made sure that the swap device corresponding to entry * is still around or has not been recycled. */ void swap_free(swp_entry_t entry) @@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibration suspends storage while it is writing the image + * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) @@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * some architectures (e.g. 
x86_32 with PAE) we might catch a glimpse * of unmatched parts which look like swp_pte, so unuse_pte must * recheck under pte lock. Scanning without pte lock lets it be - * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. */ pte = pte_offset_map(pmd, addr); do { @@ -1934,7 +1934,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(swap_map); vfree(cluster_info); vfree(frontswap_map); - /* Destroy swap account informatin */ + /* Destroy swap account information */ swap_cgroup_swapoff(type); inode = mapping->host; @@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) /* * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel pagetables: so it - * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + * no architecture is using highmem pages for kernel page tables: so it + * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. */ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; -- cgit v1.2.3 From 58e97ba6b1a0c78d0c847998cf3bcfa5344c19aa Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 12 Nov 2013 15:07:47 -0800 Subject: frontswap: enable call to invalidate area on swapoff During swapoff the frontswap_map was NULL-ified before calling frontswap_invalidate_area(). However the frontswap_invalidate_area() exits early if frontswap_map is NULL. Invalidate was never called during swapoff. This patch moves frontswap_map_set() in swapoff just after calling frontswap_invalidate_area() so outside of locks (swap_lock and swap_info_struct->lock). This shouldn't be a problem as during swapon the frontswap_map_set() is called also outside of any locks. 
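The ordering is easy to get wrong, so here is a minimal user-space sketch of the bug and the fix; the names are illustrative only, not the kernel code itself.

#include <stdio.h>

/* stands in for frontswap_map */
static unsigned long *fs_map;

static void invalidate_area(void)
{
    if (!fs_map)            /* early return, like frontswap_invalidate_area() */
        return;
    printf("area invalidated\n");
}

static void swapoff_broken(void)
{
    fs_map = NULL;          /* map cleared first ... */
    invalidate_area();      /* ... so this silently does nothing */
}

static void swapoff_fixed(void)
{
    invalidate_area();      /* invalidate while the map is still visible */
    fs_map = NULL;
}

int main(void)
{
    unsigned long map;

    fs_map = &map;
    swapoff_broken();       /* prints nothing */

    fs_map = &map;
    swapoff_fixed();        /* prints "area invalidated" */
    return 0;
}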
Signed-off-by: Krzysztof Kozlowski Reviewed-by: Seth Jennings Cc: Konrad Rzeszutek Wilk Cc: Shaohua Li Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 64458e3997bc..612a7c9795f6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1924,10 +1924,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->cluster_info = NULL; p->flags = 0; frontswap_map = frontswap_map_get(p); - frontswap_map_set(p, NULL); spin_unlock(&p->lock); spin_unlock(&swap_lock); frontswap_invalidate_area(type); + frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; -- cgit v1.2.3 From bfc4f9d5206739d0612782508a39e93b0793609d Mon Sep 17 00:00:00 2001 From: Zhang Yanfei Date: Tue, 12 Nov 2013 15:07:48 -0800 Subject: mm/page_alloc.c: remove unused marco LONG_ALIGN Signed-off-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e0412c026e0d..770dbb43465f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3881,8 +3881,6 @@ static inline unsigned long wait_table_bits(unsigned long size) return ffz(~size); } -#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) - /* * Check if a pageblock contains reserved pages */ -- cgit v1.2.3 From b349acc76b7f65400b85abd09a5379ddd6fa5a97 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Tue, 12 Nov 2013 15:07:52 -0800 Subject: mm/zswap: avoid unnecessary page scanning Add SetPageReclaim() before __swap_writepage() so that page can be moved to the tail of the inactive list, which can avoid unnecessary page scanning as this page was reclaimed by swap subsystem before. Signed-off-by: Weijie Yang Reviewed-by: Bob Liu Reviewed-by: Minchan Kim Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index d93510c6aa2d..001474c1a594 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -556,6 +556,9 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) SetPageUptodate(page); } + /* move it to the tail of the inactive list after end_writeback */ + SetPageReclaim(page); + /* start writeback */ __swap_writepage(page, &wbc, end_swap_bio_write); page_cache_release(page); -- cgit v1.2.3 From 4e99b02131b280b064d30a5926ef1c4763f3097b Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Tue, 12 Nov 2013 15:07:54 -0800 Subject: mmap: arch_get_unmapped_area(): use proper mmap base for bottom up direction This is more or less the generic variant of commit 41aacc1eea64 ("x86 get_unmapped_area: Access mmap_legacy_base through mm_struct member"). So effectively architectures which use an own arch_pick_mmap_layout() implementation but call the generic arch_get_unmapped_area() now can also randomize their mmap_base. All architectures which have an own arch_pick_mmap_layout() and call the generic arch_get_unmapped_area() (arm64, s390, tile) currently set mmap_base to TASK_UNMAPPED_BASE. This is also true for the generic arch_pick_mmap_layout() function. So this change is a no-op currently. 
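To make the intent concrete, here is a hedged sketch of a hypothetical architecture that benefits from this change; mmap_rnd_stub() stands in for whatever page-aligned random offset the port chooses, and nothing below is quoted from a real architecture.

#include <linux/sched.h>
#include <linux/mm_types.h>

static unsigned long mmap_rnd_stub(void)
{
    return 0;   /* a real port would return a randomized, page-aligned offset */
}

void hypothetical_arch_pick_mmap_layout(struct mm_struct *mm)
{
    /* legacy, bottom-up layout with a per-process offset */
    mm->mmap_base = TASK_UNMAPPED_BASE + mmap_rnd_stub();
    /* the generic helper now uses mm->mmap_base as its low limit */
    mm->get_unmapped_area = arch_get_unmapped_area;
}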
Signed-off-by: Heiko Carstens Cc: Radu Caragea Cc: Michel Lespinasse Cc: Catalin Marinas Cc: Will Deacon Cc: Chris Metcalf Cc: Martin Schwidefsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 9d548512ff8a..fa206abb5c9d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1872,7 +1872,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.flags = 0; info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; + info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; info.align_mask = 0; return vm_unmapped_area(&info); -- cgit v1.2.3 From 1402899e43fda490f08d2c47a7558931f8b9c60c Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:07:57 -0800 Subject: mm/memblock.c: factor out of top-down allocation [Problem] The current Linux cannot migrate pages used by the kernel because of the kernel direct mapping. In Linux kernel space, va = pa + PAGE_OFFSET. When the pa is changed, we cannot simply update the pagetable and keep the va unmodified. So the kernel pages are not migratable. There are also some other issues will cause the kernel pages not migratable. For example, the physical address may be cached somewhere and will be used. It is not to update all the caches. When doing memory hotplug in Linux, we first migrate all the pages in one memory device somewhere else, and then remove the device. But if pages are used by the kernel, they are not migratable. As a result, memory used by the kernel cannot be hot-removed. Modifying the kernel direct mapping mechanism is too difficult to do. And it may cause the kernel performance down and unstable. So we use the following way to do memory hotplug. [What we are doing] In Linux, memory in one numa node is divided into several zones. One of the zones is ZONE_MOVABLE, which the kernel won't use. In order to implement memory hotplug in Linux, we are going to arrange all hotpluggable memory in ZONE_MOVABLE so that the kernel won't use these memory. To do this, we need ACPI's help. In ACPI, SRAT(System Resource Affinity Table) contains NUMA info. The memory affinities in SRAT record every memory range in the system, and also, flags specifying if the memory range is hotpluggable. (Please refer to ACPI spec 5.0 5.2.16) With the help of SRAT, we have to do the following two things to achieve our goal: 1. When doing memory hot-add, allow the users arranging hotpluggable as ZONE_MOVABLE. (This has been done by the MOVABLE_NODE functionality in Linux.) 2. when the system is booting, prevent bootmem allocator from allocating hotpluggable memory for the kernel before the memory initialization finishes. The problem 2 is the key problem we are going to solve. But before solving it, we need some preparation. Please see below. [Preparation] Bootloader has to load the kernel image into memory. And this memory must be unhotpluggable. We cannot prevent this anyway. So in a memory hotplug system, we can assume any node the kernel resides in is not hotpluggable. Before SRAT is parsed, we don't know which memory ranges are hotpluggable. But memblock has already started to work. In the current kernel, memblock allocates the following memory before SRAT is parsed: setup_arch() |->memblock_x86_fill() /* memblock is ready */ |...... 
|->early_reserve_e820_mpc_new() /* allocate memory under 1MB */ |->reserve_real_mode() /* allocate memory under 1MB */ |->init_mem_mapping() /* allocate page tables, about 2MB to map 1GB memory */ |->dma_contiguous_reserve() /* specified by user, should be low */ |->setup_log_buf() /* specified by user, several mega bytes */ |->relocate_initrd() /* could be large, but will be freed after boot, should reorder */ |->acpi_initrd_override() /* several mega bytes */ |->reserve_crashkernel() /* could be large, should reorder */ |...... |->initmem_init() /* Parse SRAT */ According to Tejun's advice, before SRAT is parsed, we should try our best to allocate memory near the kernel image. Since the whole node the kernel resides in won't be hotpluggable, and for a modern server, a node may have at least 16GB memory, allocating several mega bytes memory around the kernel image won't cross to hotpluggable memory. [About this patchset] So this patchset is the preparation for the problem 2 that we want to solve. It does the following: 1. Make memblock be able to allocate memory bottom up. 1) Keep all the memblock APIs' prototype unmodified. 2) When the direction is bottom up, keep the start address greater than the end of kernel image. 2. Improve init_mem_mapping() to support allocate page tables in bottom up direction. 3. Introduce "movable_node" boot option to enable and disable this functionality. This patch (of 6): Create a new function __memblock_find_range_top_down to factor out of top-down allocation from memblock_find_in_range_node. This is a preparation because we will introduce a new bottom-up allocation mode in the following patch. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Acked-by: Tejun Heo Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 47 ++++++++++++++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 0ac412a0a7ee..accff1087137 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -83,33 +83,25 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, } /** - * memblock_find_in_range_node - find free area in given range and node + * __memblock_find_range_top_down - find free area utility, in top-down * @start: start of candidate range * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} * @size: size of free area to find * @align: alignment of free area to find * @nid: nid of the free area to find, %MAX_NUMNODES for any node * - * Find @size free area aligned to @align in the specified range and node. + * Utility called from memblock_find_in_range_node(), find free area top-down. * * RETURNS: * Found address on success, %0 on failure. 
*/ -phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, - phys_addr_t end, phys_addr_t size, - phys_addr_t align, int nid) +static phys_addr_t __init_memblock +__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) { phys_addr_t this_start, this_end, cand; u64 i; - /* pump up @end */ - if (end == MEMBLOCK_ALLOC_ACCESSIBLE) - end = memblock.current_limit; - - /* avoid allocating the first page */ - start = max_t(phys_addr_t, start, PAGE_SIZE); - end = max(start, end); - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { this_start = clamp(this_start, start, end); this_end = clamp(this_end, start, end); @@ -121,9 +113,38 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, if (cand >= this_start) return cand; } + return 0; } +/** + * memblock_find_in_range_node - find free area in given range and node + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * + * Find @size free area aligned to @align in the specified range and node. + * + * RETURNS: + * Found address on success, %0 on failure. + */ +phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, + phys_addr_t end, phys_addr_t size, + phys_addr_t align, int nid) +{ + /* pump up @end */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* avoid allocating the first page */ + start = max_t(phys_addr_t, start, PAGE_SIZE); + end = max(start, end); + + return __memblock_find_range_top_down(start, end, size, align, nid); +} + /** * memblock_find_in_range - find free area in given range * @start: start of candidate range -- cgit v1.2.3 From 79442ed189acb8b949662676e750eda173c06f9b Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:07:59 -0800 Subject: mm/memblock.c: introduce bottom-up allocation mode The Linux kernel cannot migrate pages used by the kernel. As a result, kernel pages cannot be hot-removed. So we cannot allocate hotpluggable memory for the kernel. ACPI SRAT (System Resource Affinity Table) contains the memory hotplug info. But before SRAT is parsed, memblock has already started to allocate memory for the kernel. So we need to prevent memblock from doing this. In a memory hotplug system, any numa node the kernel resides in should be unhotpluggable. And for a modern server, each node could have at least 16GB memory. So memory around the kernel image is highly likely unhotpluggable. So the basic idea is: Allocate memory from the end of the kernel image and to the higher memory. Since memory allocation before SRAT is parsed won't be too much, it could highly likely be in the same node with kernel image. The current memblock can only allocate memory top-down. So this patch introduces a new bottom-up allocation mode to allocate memory bottom-up. And later when we use this allocation direction to allocate memory, we will limit the start address above the kernel. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Tejun Heo Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Cc: Kamezawa Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memblock.h | 24 ++++++++++++++ include/linux/mm.h | 4 +++ mm/memblock.c | 83 ++++++++++++++++++++++++++++++++++++++++++++++-- 3 files changed, 108 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 31e95acddb4d..77c60e52939d 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -35,6 +35,7 @@ struct memblock_type { }; struct memblock { + bool bottom_up; /* is bottom up direction? */ phys_addr_t current_limit; struct memblock_type memory; struct memblock_type reserved; @@ -148,6 +149,29 @@ phys_addr_t memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) phys_addr_t memblock_alloc(phys_addr_t size, phys_addr_t align); +#ifdef CONFIG_MOVABLE_NODE +/* + * Set the allocation direction to bottom-up or top-down. + */ +static inline void memblock_set_bottom_up(bool enable) +{ + memblock.bottom_up = enable; +} + +/* + * Check if the allocation direction is bottom-up or not. + * if this is true, that said, memblock will allocate memory + * in bottom-up direction. + */ +static inline bool memblock_bottom_up(void) +{ + return memblock.bottom_up; +} +#else +static inline void memblock_set_bottom_up(bool enable) {} +static inline bool memblock_bottom_up(void) { return false; } +#endif + /* Flags for memblock_alloc_base() amd __memblock_alloc_base() */ #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 diff --git a/include/linux/mm.h b/include/linux/mm.h index 8aa4006b9636..42a35d94b82c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -50,6 +50,10 @@ extern int sysctl_legacy_va_layout; #include #include +#ifndef __pa_symbol +#define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) +#endif + extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; diff --git a/mm/memblock.c b/mm/memblock.c index accff1087137..53e477bb5558 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,6 +20,8 @@ #include #include +#include + static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -32,6 +34,7 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, + .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; @@ -82,6 +85,38 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } +/* + * __memblock_find_range_bottom_up - find free area utility in bottom-up + * @start: start of candidate range + * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} + * @size: size of free area to find + * @align: alignment of free area to find + * @nid: nid of the free area to find, %MAX_NUMNODES for any node + * + * Utility called from memblock_find_in_range_node(), find free area bottom-up. + * + * RETURNS: + * Found address on success, 0 on failure. 
+ */ +static phys_addr_t __init_memblock +__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t this_start, this_end, cand; + u64 i; + + for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); + + cand = round_up(this_start, align); + if (cand < this_end && this_end - cand >= size) + return cand; + } + + return 0; +} + /** * __memblock_find_range_top_down - find free area utility, in top-down * @start: start of candidate range @@ -93,7 +128,7 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, * Utility called from memblock_find_in_range_node(), find free area top-down. * * RETURNS: - * Found address on success, %0 on failure. + * Found address on success, 0 on failure. */ static phys_addr_t __init_memblock __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, @@ -127,13 +162,24 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * Find @size free area aligned to @align in the specified range and node. * + * When allocation direction is bottom-up, the @start should be greater + * than the end of the kernel image. Otherwise, it will be trimmed. The + * reason is that we want the bottom-up allocation just near the kernel + * image so it is highly likely that the allocated memory and the kernel + * will reside in the same node. + * + * If bottom-up allocation failed, will try to allocate memory top-down. + * * RETURNS: - * Found address on success, %0 on failure. + * Found address on success, 0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid) { + int ret; + phys_addr_t kernel_end; + /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; @@ -141,6 +187,37 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, /* avoid allocating the first page */ start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); + kernel_end = __pa_symbol(_end); + + /* + * try bottom-up allocation only when bottom-up mode + * is set and @end is above the kernel image. + */ + if (memblock_bottom_up() && end > kernel_end) { + phys_addr_t bottom_up_start; + + /* make sure we will allocate above the kernel */ + bottom_up_start = max(start, kernel_end); + + /* ok, try bottom-up allocation first */ + ret = __memblock_find_range_bottom_up(bottom_up_start, end, + size, align, nid); + if (ret) + return ret; + + /* + * we always limit bottom-up allocation above the kernel, + * but top-down allocation doesn't have the limit, so + * retrying top-down allocation may succeed when bottom-up + * allocation failed. + * + * bottom-up allocation is expected to be fail very rarely, + * so we use WARN_ONCE() here to see the stack trace if + * fail happens. + */ + WARN_ONCE(1, "memblock: bottom-up allocation failed, " + "memory hotunplug may be affected\n"); + } return __memblock_find_range_top_down(start, end, size, align, nid); } @@ -155,7 +232,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, * Find @size free area aligned to @align in the specified range. * * RETURNS: - * Found address on success, %0 on failure. + * Found address on success, 0 on failure. 
*/ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, -- cgit v1.2.3 From c5320926e370b4cfb8f10c2169e26f960079cf67 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 12 Nov 2013 15:08:10 -0800 Subject: mem-hotplug: introduce movable_node boot option The hot-Pluggable field in SRAT specifies which memory is hotpluggable. As we mentioned before, if hotpluggable memory is used by the kernel, it cannot be hot-removed. So memory hotplug users may want to set all hotpluggable memory in ZONE_MOVABLE so that the kernel won't use it. Memory hotplug users may also set a node as movable node, which has ZONE_MOVABLE only, so that the whole node can be hot-removed. But the kernel cannot use memory in ZONE_MOVABLE. By doing this, the kernel cannot use memory in movable nodes. This will cause NUMA performance down. And other users may be unhappy. So we need a way to allow users to enable and disable this functionality. In this patch, we introduce movable_node boot option to allow users to choose to not to consume hotpluggable memory at early boot time and later we can set it as ZONE_MOVABLE. To achieve this, the movable_node boot option will control the memblock allocation direction. That said, after memblock is ready, before SRAT is parsed, we should allocate memory near the kernel image as we explained in the previous patches. So if movable_node boot option is set, the kernel does the following: 1. After memblock is ready, make memblock allocate memory bottom up. 2. After SRAT is parsed, make memblock behave as default, allocate memory top down. Users can specify "movable_node" in kernel commandline to enable this functionality. For those who don't use memory hotplug or who don't want to lose their NUMA performance, just don't specify anything. The kernel will work as before. Signed-off-by: Tang Chen Signed-off-by: Zhang Yanfei Suggested-by: Kamezawa Hiroyuki Suggested-by: Ingo Molnar Acked-by: Tejun Heo Acked-by: Toshi Kani Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Wanpeng Li Cc: Thomas Renninger Cc: Yinghai Lu Cc: Jiang Liu Cc: Wen Congyang Cc: Lai Jiangshan Cc: Yasuaki Ishimatsu Cc: Taku Izumi Cc: Mel Gorman Cc: Michal Nazarewicz Cc: Minchan Kim Cc: Rik van Riel Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 3 +++ arch/x86/mm/numa.c | 11 +++++++++++ mm/Kconfig | 17 ++++++++++++----- mm/memory_hotplug.c | 31 +++++++++++++++++++++++++++++++ 4 files changed, 57 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index fd3ecedc084d..882a40d405c8 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1775,6 +1775,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. that the amount of memory usable for all allocations is not too small. + movable_node [KNL,X86] Boot-time switch to enable the effects + of CONFIG_MOVABLE_NODE=y. See mm/Kconfig for details. + MTD_Partition= [MTD] Format: ,,, diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 8bf93bae1f13..24aec58d6afd 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -567,6 +567,17 @@ static int __init numa_init(int (*init_func)(void)) ret = init_func(); if (ret < 0) return ret; + + /* + * We reset memblock back to the top-down direction + * here because if we configured ACPI_NUMA, we have + * parsed SRAT in init_func(). 
It is ok to have the + * reset here even if we did't configure ACPI_NUMA + * or acpi numa init fails and fallbacks to dummy + * numa init. + */ + memblock_set_bottom_up(false); + ret = numa_cleanup_meminfo(&numa_meminfo); if (ret < 0) return ret; diff --git a/mm/Kconfig b/mm/Kconfig index 394838f489eb..3f4ffda152bb 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -153,11 +153,18 @@ config MOVABLE_NODE help Allow a node to have only movable memory. Pages used by the kernel, such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows users to - online all the memory of a node as movable memory so that the whole - node can be hotplugged. Users who don't use the memory hotplug - feature are fine with this option on since they don't online memory - as movable. + memory device cannot be hotplugged. This option allows the following + two things: + - When the system is booting, node full of hotpluggable memory can + be arranged to have only movable memory so that the whole node can + be hot-removed. (need movable_node boot option specified). + - After the system is up, the option allows users to online all the + memory of a node as movable memory so that the whole node can be + hot-removed. + + Users who don't use the memory hotplug feature are fine with this + option on since they don't specify movable_node boot option or they + don't online memory as movable. Say Y here if you want to hotplug a whole node. Say N here if you want kernel to use memory on all nodes evenly. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1b6fe8ca71e6..489f235502db 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -1422,6 +1423,36 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) } #endif /* CONFIG_MOVABLE_NODE */ +static int __init cmdline_parse_movable_node(char *p) +{ +#ifdef CONFIG_MOVABLE_NODE + /* + * Memory used by the kernel cannot be hot-removed because Linux + * cannot migrate the kernel pages. When memory hotplug is + * enabled, we should prevent memblock from allocating memory + * for the kernel. + * + * ACPI SRAT records all hotpluggable memory ranges. But before + * SRAT is parsed, we don't know about it. + * + * The kernel image is loaded into memory at very early time. We + * cannot prevent this anyway. So on NUMA system, we set any + * node the kernel resides in as un-hotpluggable. + * + * Since on modern servers, one node could have double-digit + * gigabytes memory, we can assume the memory around the kernel + * image is also un-hotpluggable. So before SRAT is parsed, just + * allocate memory near the kernel image to try the best to keep + * the kernel away from hotpluggable memory. + */ + memblock_set_bottom_up(true); +#else + pr_warn("movable_node option not supported\n"); +#endif + return 0; +} +early_param("movable_node", cmdline_parse_movable_node); + /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) -- cgit v1.2.3 From d7e0b37a87c39f5c02dd7b5d55c7a3ec2f65b943 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Nov 2013 15:08:12 -0800 Subject: mm: set N_CPU to node_states during boot After a system booted, N_CPU is not set to any node as has_cpu shows an empty line. 
# cat /sys/devices/system/node/has_cpu (show-empty-line) setup_vmstat() registers its CPU notifier callback, vmstat_cpuup_callback(), which marks N_CPU to a node when a CPU is put into online. However, setup_vmstat() is called after all CPUs are launched in the boot sequence. Changed setup_vmstat() to mark N_CPU to the nodes with online CPUs at boot, which is consistent with other operations in vmstat_cpuup_callback(), i.e. start_cpu_timer() and refresh_zone_stat_thresholds(). Also added get_online_cpus() to protect the for_each_online_cpu() loop. Signed-off-by: Toshi Kani Acked-by: Christoph Lameter Reviewed-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 9bb314577911..0a1f7de972b3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1276,8 +1276,12 @@ static int __init setup_vmstat(void) register_cpu_notifier(&vmstat_notifier); - for_each_online_cpu(cpu) + get_online_cpus(); + for_each_online_cpu(cpu) { start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); + } + put_online_cpus(); #endif #ifdef CONFIG_PROC_FS proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); -- cgit v1.2.3 From 807a1bd2b2a38845fd422b93328e7d69f13eb13a Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 12 Nov 2013 15:08:13 -0800 Subject: mm: clear N_CPU from node_states at CPU offline vmstat_cpuup_callback() is a CPU notifier callback, which marks N_CPU to a node at CPU online event. However, it does not update this N_CPU info at CPU offline event. Changed vmstat_cpuup_callback() to clear N_CPU when the last CPU in the node is put into offline, i.e. the node no longer has any online CPU. Signed-off-by: Toshi Kani Acked-by: Christoph Lameter Reviewed-by: Yasuaki Ishimatsu Tested-by: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmstat.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/vmstat.c b/mm/vmstat.c index 0a1f7de972b3..b6d17edf8cf3 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1229,6 +1229,20 @@ static void start_cpu_timer(int cpu) schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } +static void vmstat_cpu_dead(int node) +{ + int cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (cpu_to_node(cpu) == node) + goto end; + + node_clear_state(node, N_CPU); +end: + put_online_cpus(); +} + /* * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. 
@@ -1258,6 +1272,7 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: refresh_zone_stat_thresholds(); + vmstat_cpu_dead(cpu_to_node(cpu)); break; default: break; -- cgit v1.2.3 From 4a099fb4bdb9982759ce2625f50636a6fbd173bc Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Tue, 12 Nov 2013 15:08:14 -0800 Subject: mm/bootmem.c: remove unused local `map' Signed-off-by: Daeseok Youn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index 95b528cd4de7..90bd3507b413 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -172,11 +172,12 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long start, end, pages, count = 0; + unsigned long *map, start, end, pages, count = 0; if (!bdata->node_bootmem_map) return 0; + map = bdata->node_bootmem_map; start = bdata->node_min_pfn; end = bdata->node_low_pfn; @@ -184,10 +185,9 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) bdata - bootmem_node_data, start, end); while (start < end) { - unsigned long *map, idx, vec; + unsigned long idx, vec; unsigned shift; - map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; shift = idx & (BITS_PER_LONG - 1); /* -- cgit v1.2.3 From af248a0c67457e5c6d2bcf288f07b4b2ed064f1f Mon Sep 17 00:00:00 2001 From: Damien Ramonda Date: Tue, 12 Nov 2013 15:08:16 -0800 Subject: readahead: fix sequential read cache miss detection The kernel's readahead algorithm sometimes interprets random read accesses as sequential and triggers unnecessary data prefecthing from storage device (impacting random read average latency). In order to identify sequential cache read misses, the readahead algorithm intends to check whether offset - previous offset == 1 (trivial sequential reads) or offset - previous offset == 0 (sequential reads not aligned on page boundary): if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) The current offset is stored in the "offset" variable of type "pgoff_t" (unsigned long), while previous offset is stored in "ra->prev_pos" of type "loff_t" (long long). Therefore, operands of the if statement are implicitly converted to type long long. Consequently, when previous offset > current offset (which happens on random pattern), the if condition is true and access is wrongly interpeted as sequential. An unnecessary data prefetching is triggered, impacting the average random read latency. Storing the previous offset value in a "pgoff_t" variable (unsigned long) fixes the sequential read detection logic. 
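The promotion pitfall can be reproduced in plain user space; the demo below mirrors a 32-bit pgoff_t with unsigned int and is an editor's illustration, not the kernel code.

#include <stdio.h>

int main(void)
{
    unsigned int offset = 100;      /* current page index (32-bit pgoff_t) */
    long long prev_pos = 200;       /* previous index, as stored in loff_t */

    /* offset is promoted to long long: 100 - 200 == -100, so this is true */
    if (offset - prev_pos <= 1)
        printf("wrongly detected as a sequential miss\n");

    /* keeping both sides unsigned (the fix): the difference wraps and fails the test */
    unsigned int prev_offset = (unsigned int)prev_pos;
    if (offset - prev_offset <= 1U)
        printf("sequential\n");
    else
        printf("correctly treated as random\n");

    return 0;
}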
Signed-off-by: Damien Ramonda Reviewed-by: Fengguang Wu Acked-by: Pierre Tardy Acked-by: David Cohen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/readahead.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/readahead.c b/mm/readahead.c index 50241836fe82..7cdbb44aa90b 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -401,6 +401,7 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); + pgoff_t prev_offset; /* * start of file @@ -452,8 +453,11 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss + * trivial case: (offset - prev_offset) == 1 + * unaligned reads: (offset - prev_offset) == 0 */ - if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) + prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; + if (offset - prev_offset <= 1UL) goto initial_readahead; /* -- cgit v1.2.3 From 5d0f3f72efb1c1968ce1f56c58f9e3e6495effa6 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 12 Nov 2013 15:08:18 -0800 Subject: mm: fix page_group_by_mobility_disabled breakage Currently, set_pageblock_migratetype() screws up MIGRATE_CMA and MIGRATE_ISOLATE if page_group_by_mobility_disabled is true. It rewrites the argument to MIGRATE_UNMOVABLE and we lost these attribute. The problem was introduced by commit 49255c619fbd ("page allocator: move check for disabled anti-fragmentation out of fastpath"). So a 4 year old issue may mean that nobody uses page_group_by_mobility_disabled. But anyway, this patch fixes the problem. Signed-off-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 770dbb43465f..5a9883614d99 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) { - - if (unlikely(page_group_by_mobility_disabled)) + if (unlikely(page_group_by_mobility_disabled && + migratetype < MIGRATE_PCPTYPES)) migratetype = MIGRATE_UNMOVABLE; set_pageblock_flags_group(page, (unsigned long)migratetype, -- cgit v1.2.3 From 52c8f6a5aeb0bdd396849ecaa72d96f8175528f5 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 12 Nov 2013 15:08:19 -0800 Subject: mm: get rid of unnecessary overhead of trace_mm_page_alloc_extfrag() In general, every tracepoint should be zero overhead if it is disabled. However, trace_mm_page_alloc_extfrag() is one of exception. It evaluate "new_type == start_migratetype" even if tracepoint is disabled. However, the code can be moved into tracepoint's TP_fast_assign() and TP_fast_assign exist exactly such purpose. This patch does it. 
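For readers unfamiliar with the mechanism, here is a minimal fragment for a hypothetical trace event showing the pattern; the usual TRACE_SYSTEM/CREATE_TRACE_POINTS header boilerplate is omitted and the event name is made up.

TRACE_EVENT(example_extfrag,
    TP_PROTO(int new_type, int alloc_type),
    TP_ARGS(new_type, alloc_type),
    TP_STRUCT__entry(
        __field(int, change_ownership)
    ),
    TP_fast_assign(
        /* evaluated only when the event is actually enabled */
        __entry->change_ownership = (new_type == alloc_type);
    ),
    TP_printk("change_ownership=%d", __entry->change_ownership)
);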
Signed-off-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/kmem.h | 10 ++++------ mm/page_alloc.c | 5 ++--- 2 files changed, 6 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h index d0c613476620..aece1346ceb7 100644 --- a/include/trace/events/kmem.h +++ b/include/trace/events/kmem.h @@ -267,14 +267,12 @@ DEFINE_EVENT_PRINT(mm_page, mm_page_pcpu_drain, TRACE_EVENT(mm_page_alloc_extfrag, TP_PROTO(struct page *page, - int alloc_order, int fallback_order, - int alloc_migratetype, int fallback_migratetype, - int change_ownership), + int alloc_order, int fallback_order, + int alloc_migratetype, int fallback_migratetype, int new_migratetype), TP_ARGS(page, alloc_order, fallback_order, - alloc_migratetype, fallback_migratetype, - change_ownership), + alloc_migratetype, fallback_migratetype, new_migratetype), TP_STRUCT__entry( __field( struct page *, page ) @@ -291,7 +289,7 @@ TRACE_EVENT(mm_page_alloc_extfrag, __entry->fallback_order = fallback_order; __entry->alloc_migratetype = alloc_migratetype; __entry->fallback_migratetype = fallback_migratetype; - __entry->change_ownership = change_ownership; + __entry->change_ownership = (new_migratetype == alloc_migratetype); ), TP_printk("page=%p pfn=%lu alloc_order=%d fallback_order=%d pageblock_order=%d alloc_migratetype=%d fallback_migratetype=%d fragmenting=%d change_ownership=%d", diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a9883614d99..442f1298f9a7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1103,9 +1103,8 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) is_migrate_cma(migratetype) ? migratetype : start_migratetype); - trace_mm_page_alloc_extfrag(page, order, - current_order, start_migratetype, migratetype, - new_type == start_migratetype); + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype, new_type); return page; } -- cgit v1.2.3 From 0cbef29a782162a3896487901eca4550bfa397ef Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 12 Nov 2013 15:08:20 -0800 Subject: mm: __rmqueue_fallback() should respect pageblock type When __rmqueue_fallback() doesn't find a free block with the required size it splits a larger page and puts the rest of the page onto the free list. But it has one serious mistake. When putting back, __rmqueue_fallback() always use start_migratetype if type is not CMA. However, __rmqueue_fallback() is only called when all of the start_migratetype queue is empty. That said, __rmqueue_fallback always puts back memory to the wrong queue except try_to_steal_freepages() changed pageblock type (i.e. requested size is smaller than half of page block). The end result is that the antifragmentation framework increases fragmenation instead of decreasing it. Mel's original anti fragmentation does the right thing. But commit 47118af076f6 ("mm: mmzone: MIGRATE_CMA migration type added") broke it. This patch restores sane and old behavior. It also removes an incorrect comment which was introduced by commit fef903efcf0c ("mm/page_alloc.c: restructure free-page stealing code and fix a bug"). 
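A toy user-space model of the restored rule follows; it is not the buddy allocator, only an illustration of where the split remainder must go.

#include <stdio.h>

enum mt { UNMOVABLE, MOVABLE, NR_TYPES };

static int nr_free[NR_TYPES];   /* free blocks per migratetype, all orders lumped */

/* split one order-'high' block down to order-'low', parking the tails on 'freelist' */
static void expand(int low, int high, enum mt freelist)
{
    while (high-- > low)
        nr_free[freelist]++;
}

int main(void)
{
    enum mt start_migratetype = UNMOVABLE; /* what the caller asked for */
    enum mt new_type = MOVABLE;            /* steal decision: block stays movable */

    /* restored behaviour: the remainder goes back to new_type's list */
    expand(0, 3, new_type);

    printf("new_type list: %d blocks, start_migratetype list: %d blocks\n",
           nr_free[new_type], nr_free[start_migratetype]);
    return 0;
}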
Signed-off-by: KOSAKI Motohiro Cc: Mel Gorman Cc: Michal Nazarewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 442f1298f9a7..cb17e8b800d6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1027,6 +1027,10 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, { int current_order = page_order(page); + /* + * When borrowing from MIGRATE_CMA, we need to release the excess + * buddy pages to CMA itself. + */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1091,17 +1095,8 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - /* - * Borrow the excess buddy pages as well, irrespective - * of whether we stole freepages, or took ownership of - * the pageblock or not. - * - * Exception: When borrowing from MIGRATE_CMA, release - * the excess buddy pages to CMA itself. - */ expand(zone, page, order, current_order, area, - is_migrate_cma(migratetype) - ? migratetype : start_migratetype); + new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); -- cgit v1.2.3 From 2afc745f3e3079ab16c826be4860da2529054dd2 Mon Sep 17 00:00:00 2001 From: Akira Takeuchi Date: Tue, 12 Nov 2013 15:08:21 -0800 Subject: mm: ensure get_unmapped_area() returns higher address than mmap_min_addr This patch fixes the problem that get_unmapped_area() can return illegal address and result in failing mmap(2) etc. In case that the address higher than PAGE_SIZE is set to /proc/sys/vm/mmap_min_addr, the address lower than mmap_min_addr can be returned by get_unmapped_area(), even if you do not pass any virtual address hint (i.e. the second argument). This is because the current get_unmapped_area() code does not take into account mmap_min_addr. This leads to two actual problems as follows: 1. mmap(2) can fail with EPERM on the process without CAP_SYS_RAWIO, although any illegal parameter is not passed. 2. The bottom-up search path after the top-down search might not work in arch_get_unmapped_area_topdown(). Note: The first and third chunk of my patch, which changes "len" check, are for more precise check using mmap_min_addr, and not for solving the above problem. 
[How to reproduce] --- test.c ------------------------------------------------- #include #include #include #include int main(int argc, char *argv[]) { void *ret = NULL, *last_map; size_t pagesize = sysconf(_SC_PAGESIZE); do { last_map = ret; ret = mmap(0, pagesize, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); // printf("ret=%p\n", ret); } while (ret != MAP_FAILED); if (errno != ENOMEM) { printf("ERR: unexpected errno: %d (last map=%p)\n", errno, last_map); } return 0; } --------------------------------------------------------------- $ gcc -m32 -o test test.c $ sudo sysctl -w vm.mmap_min_addr=65536 vm.mmap_min_addr = 65536 $ ./test (run as non-priviledge user) ERR: unexpected errno: 1 (last map=0x10000) Signed-off-by: Akira Takeuchi Signed-off-by: Kiyoshi Owada Reviewed-by: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index fa206abb5c9d..3d3e224be771 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1856,7 +1856,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; struct vm_unmapped_area_info info; - if (len > TASK_SIZE) + if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -1865,7 +1865,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } @@ -1895,7 +1895,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, struct vm_unmapped_area_info info; /* requested length too big for entire address space */ - if (len > TASK_SIZE) + if (len > TASK_SIZE - mmap_min_addr) return -ENOMEM; if (flags & MAP_FIXED) @@ -1905,14 +1905,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && + if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && (!vma || addr + len <= vma->vm_start)) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = PAGE_SIZE; + info.low_limit = max(PAGE_SIZE, mmap_min_addr); info.high_limit = mm->mmap_base; info.align_mask = 0; addr = vm_unmapped_area(&info); -- cgit v1.2.3 From f35c3a8eed52878c2dde9c4b9a87b276127f7f8d Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Tue, 12 Nov 2013 15:08:22 -0800 Subject: memcg, kmem: use is_root_cache instead of hard code Signed-off-by: Qiang Huang Reviewed-by: Pekka Enberg Acked-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c89072443166..3d28d5a61efd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -59,6 +59,7 @@ #include #include #include +#include "slab.h" #include @@ -3131,7 +3132,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; - VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); + VM_BUG_ON(!is_root_cache(s)); if (num_groups > memcg_limited_groups_array_size) { int i; -- cgit v1.2.3 From 2ade4de871172b17dd81b336cf0488a83885ffde Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Tue, 12 Nov 2013 
15:08:23 -0800 Subject: memcg, kmem: rename cache_from_memcg to cache_from_memcg_idx We can't see the relationship with memcg from the parameters, so the name with memcg_idx would be more reasonable. Signed-off-by: Qiang Huang Reviewed-by: Pekka Enberg Acked-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 +- mm/slab.h | 6 ++++-- mm/slab_common.c | 2 +- mm/slub.c | 2 +- 4 files changed, 7 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 2580db062df9..0c8967bb2018 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -3982,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, VM_BUG_ON(!mutex_is_locked(&slab_mutex)); for_each_memcg_cache_index(i) { - c = cache_from_memcg(cachep, i); + c = cache_from_memcg_idx(cachep, i); if (c) /* return value determined by the parent cache only */ __do_tune_cpucache(c, limit, batchcount, shared, gfp); diff --git a/mm/slab.h b/mm/slab.h index a535033f7e9a..0859c4241ba1 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -160,7 +160,8 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) { if (!s->memcg_params) return NULL; @@ -204,7 +205,8 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) +static inline struct kmem_cache * +cache_from_memcg_idx(struct kmem_cache *s, int idx) { return NULL; } diff --git a/mm/slab_common.c b/mm/slab_common.c index e2e98af703ea..0b7bb399b0e4 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) return; for_each_memcg_cache_index(i) { - c = cache_from_memcg(s, i); + c = cache_from_memcg_idx(s, i); if (!c) continue; diff --git a/mm/slub.c b/mm/slub.c index c3eb3d3ca835..92737a0b787b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4983,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, * through the descendants with best-effort propagation. 
*/ for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg(s, i); + struct kmem_cache *c = cache_from_memcg_idx(s, i); if (c) attribute->store(c, buf, len); } -- cgit v1.2.3 From 7a67d7abcc8da30a16ed64c3909d3fea004bde93 Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Tue, 12 Nov 2013 15:08:24 -0800 Subject: memcg, kmem: use cache_from_memcg_idx instead of hard code Signed-off-by: Qiang Huang Reviewed-by: Pekka Enberg Acked-by: David Rientjes Cc: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d28d5a61efd..3d4bb07c7679 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2956,7 +2956,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) VM_BUG_ON(p->is_root_cache); cachep = p->root_cache; - return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; + return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); } #ifdef CONFIG_SLABINFO @@ -3393,7 +3393,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, idx = memcg_cache_id(memcg); mutex_lock(&memcg_cache_mutex); - new_cachep = cachep->memcg_params->memcg_caches[idx]; + new_cachep = cache_from_memcg_idx(cachep, idx); if (new_cachep) { css_put(&memcg->css); goto out; @@ -3439,8 +3439,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) * we'll take the set_limit_mutex to protect ourselves against this. */ mutex_lock(&set_limit_mutex); - for (i = 0; i < memcg_limited_groups_array_size; i++) { - c = s->memcg_params->memcg_caches[i]; + for_each_memcg_cache_index(i) { + c = cache_from_memcg_idx(s, i); if (!c) continue; @@ -3573,8 +3573,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (likely(cachep->memcg_params->memcg_caches[idx])) { - cachep = cachep->memcg_params->memcg_caches[idx]; + if (likely(cache_from_memcg_idx(cachep, idx))) { + cachep = cache_from_memcg_idx(cachep, idx); goto out; } -- cgit v1.2.3 From 67d13fe846c57a54d12578e7a4518f68c5c86ad7 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Tue, 12 Nov 2013 15:08:26 -0800 Subject: mm/zswap: bugfix: memory leak when invalidate and reclaim occur concurrently Consider the following scenario: thread 0: reclaim entry x (get refcount, but not call zswap_get_swap_cache_page) thread 1: call zswap_frontswap_invalidate_page to invalidate entry x. finished, entry x and its zbud is not freed as its refcount != 0 now, the swap_map[x] = 0 thread 0: now call zswap_get_swap_cache_page swapcache_prepare return -ENOENT because entry x is not used any more zswap_get_swap_cache_page return ZSWAP_SWAPCACHE_NOMEM zswap_writeback_entry do nothing except put refcount Now, the memory of zswap_entry x and its zpage leak. Modify: - check the refcount in fail path, free memory if it is not referenced. - use ZSWAP_SWAPCACHE_FAIL instead of ZSWAP_SWAPCACHE_NOMEM as the fail path can be not only caused by nomem but also by invalidate. 
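A user-space sketch of the fixed failure path is shown below; the names are illustrative and the -1 return stands in for -ENOMEM.

#include <stdio.h>
#include <stdlib.h>

struct entry {
    int refcount;
    void *compressed;   /* stands in for the zbud handle */
};

static void entry_free(struct entry *e)
{
    free(e->compressed);
    free(e);
}

/* returns the remaining refcount, mirroring the old zswap_entry_put() */
static int entry_put(struct entry *e)
{
    return --e->refcount;
}

static int writeback(struct entry *e)
{
    /* ... swapcache lookup failed: either OOM or a concurrent invalidate ... */
    if (entry_put(e) <= 0) {
        /* invalidate already dropped its reference: we are the last user */
        entry_free(e);
        return 0;       /* treat the writeback as done */
    }
    return -1;
}

int main(void)
{
    struct entry *e = calloc(1, sizeof(*e));

    e->compressed = malloc(64);
    e->refcount = 1;    /* only the writeback path holds it: invalidate already ran */
    printf("writeback returned %d\n", writeback(e));
    return 0;
}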
Signed-off-by: Weijie Yang Reviewed-by: Bob Liu Reviewed-by: Minchan Kim Acked-by: Seth Jennings Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 001474c1a594..0ffcad03baea 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -387,7 +387,7 @@ static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) enum zswap_get_swap_ret { ZSWAP_SWAPCACHE_NEW, ZSWAP_SWAPCACHE_EXIST, - ZSWAP_SWAPCACHE_NOMEM + ZSWAP_SWAPCACHE_FAIL, }; /* @@ -401,9 +401,10 @@ enum zswap_get_swap_ret { * added to the swap cache, and returned in retpage. * * If success, the swap cache page is returned in retpage - * Returns 0 if page was already in the swap cache, page is not locked - * Returns 1 if the new page needs to be populated, page is locked - * Returns <0 on error + * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache + * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated, + * the new page is added to swapcache and locked + * Returns ZSWAP_SWAPCACHE_FAIL on error */ static int zswap_get_swap_cache_page(swp_entry_t entry, struct page **retpage) @@ -475,7 +476,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry, if (new_page) page_cache_release(new_page); if (!found_page) - return ZSWAP_SWAPCACHE_NOMEM; + return ZSWAP_SWAPCACHE_FAIL; *retpage = found_page; return ZSWAP_SWAPCACHE_EXIST; } @@ -529,11 +530,11 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) /* try to allocate swap cache page */ switch (zswap_get_swap_cache_page(swpentry, &page)) { - case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ + case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */ ret = -ENOMEM; goto fail; - case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ + case ZSWAP_SWAPCACHE_EXIST: /* page is already in the swap cache, ignore for now */ page_cache_release(page); ret = -EEXIST; @@ -594,7 +595,12 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) fail: spin_lock(&tree->lock); - zswap_entry_put(entry); + refcount = zswap_entry_put(entry); + if (refcount <= 0) { + /* invalidate happened, consider writeback as success */ + zswap_free_entry(tree, entry); + ret = 0; + } spin_unlock(&tree->lock); return ret; } -- cgit v1.2.3 From 0ab0abcf511545d1fddbe72a36b3ca73388ac937 Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Tue, 12 Nov 2013 15:08:27 -0800 Subject: mm/zswap: refactor the get/put routines The old refcount handling did not fit the kernel get/put semantics exactly: too many places inspected the refcount directly, and the refcount could go negative. This patch does the following:
- move the refcount check into zswap_entry_put() so that the resource-freeing function is hidden from callers.
- add a new function, zswap_entry_find_get(), so that callers can easily use the following pattern:
  zswap_entry_find_get
  ... /* do something */
  zswap_entry_put
- move some function definitions earlier to eliminate compile errors.
This patch is based on Minchan Kim's idea and suggestion.
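For illustration only (not part of the patch): a minimal sketch of the caller pattern the refactor is aiming for, mirroring what zswap_frontswap_load() looks like after the diff below. The function name zswap_example_use is hypothetical.

static int zswap_example_use(struct zswap_tree *tree, pgoff_t offset)
{
	struct zswap_entry *entry;

	/* look the entry up and take a reference under the tree lock */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	spin_unlock(&tree->lock);
	if (!entry)
		return -1;

	/* ... use entry->handle here, without holding the tree lock ... */

	/* drop the reference; the entry is freed here if this was the last one */
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);
	return 0;
}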
Signed-off-by: Weijie Yang Cc: Seth Jennings Acked-by: Minchan Kim Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 182 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 88 insertions(+), 94 deletions(-) (limited to 'mm') diff --git a/mm/zswap.c b/mm/zswap.c index 0ffcad03baea..5a63f78a5601 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -217,6 +217,7 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) if (!entry) return NULL; entry->refcount = 1; + RB_CLEAR_NODE(&entry->rbnode); return entry; } @@ -225,19 +226,6 @@ static void zswap_entry_cache_free(struct zswap_entry *entry) kmem_cache_free(zswap_entry_cache, entry); } -/* caller must hold the tree lock */ -static void zswap_entry_get(struct zswap_entry *entry) -{ - entry->refcount++; -} - -/* caller must hold the tree lock */ -static int zswap_entry_put(struct zswap_entry *entry) -{ - entry->refcount--; - return entry->refcount; -} - /********************************* * rbtree functions **********************************/ @@ -285,6 +273,61 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, return 0; } +static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) +{ + if (!RB_EMPTY_NODE(&entry->rbnode)) { + rb_erase(&entry->rbnode, root); + RB_CLEAR_NODE(&entry->rbnode); + } +} + +/* + * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_free_entry(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + zbud_free(tree->pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_pages = zbud_get_pool_size(tree->pool); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock +* remove from the tree and free it, if nobody reference the entry +*/ +static void zswap_entry_put(struct zswap_tree *tree, + struct zswap_entry *entry) +{ + int refcount = --entry->refcount; + + BUG_ON(refcount < 0); + if (refcount == 0) { + zswap_rb_erase(&tree->rbroot, entry); + zswap_free_entry(tree, entry); + } +} + +/* caller must hold the tree lock */ +static struct zswap_entry *zswap_entry_find_get(struct rb_root *root, + pgoff_t offset) +{ + struct zswap_entry *entry = NULL; + + entry = zswap_rb_search(root, offset); + if (entry) + zswap_entry_get(entry); + + return entry; +} + /********************************* * per-cpu code **********************************/ @@ -368,18 +411,6 @@ static bool zswap_is_full(void) zswap_pool_pages); } -/* - * Carries out the common pattern of freeing and entry's zsmalloc allocation, - * freeing the entry itself, and decrementing the number of stored pages. 
- */ -static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) -{ - zbud_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - zswap_pool_pages = zbud_get_pool_size(tree->pool); -} - /********************************* * writeback code **********************************/ @@ -503,7 +534,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) struct page *page; u8 *src, *dst; unsigned int dlen; - int ret, refcount; + int ret; struct writeback_control wbc = { .sync_mode = WB_SYNC_NONE, }; @@ -518,13 +549,12 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) /* find and ref zswap entry */ spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = zswap_entry_find_get(&tree->rbroot, offset); if (!entry) { /* entry was invalidated */ spin_unlock(&tree->lock); return 0; } - zswap_entry_get(entry); spin_unlock(&tree->lock); BUG_ON(offset != entry->offset); @@ -566,42 +596,35 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) zswap_written_back_pages++; spin_lock(&tree->lock); - /* drop local reference */ - zswap_entry_put(entry); - /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); + zswap_entry_put(tree, entry); /* - * There are three possible values for refcount here: - * (1) refcount is 1, load is in progress, unlink from rbtree, - * load will free - * (2) refcount is 0, (normal case) entry is valid, - * remove from rbtree and free entry - * (3) refcount is -1, invalidate happened during writeback, - * free entry - */ - if (refcount >= 0) { - /* no invalidate yet, remove from rbtree */ - rb_erase(&entry->rbnode, &tree->rbroot); - } + * There are two possible situations for entry here: + * (1) refcount is 1(normal case), entry is valid and on the tree + * (2) refcount is 0, entry is freed and not on the tree + * because invalidate happened during writeback + * search the tree and free the entry if find entry + */ + if (entry == zswap_rb_search(&tree->rbroot, offset)) + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - if (refcount <= 0) { - /* free the entry */ - zswap_free_entry(tree, entry); - return 0; - } - return -EAGAIN; + goto end; + + /* + * if we get here due to ZSWAP_SWAPCACHE_EXIST + * a load may happening concurrently + * it is safe and okay to not free the entry + * if we free the entry in the following put + * it it either okay to return !0 + */ fail: spin_lock(&tree->lock); - refcount = zswap_entry_put(entry); - if (refcount <= 0) { - /* invalidate happened, consider writeback as success */ - zswap_free_entry(tree, entry); - ret = 0; - } + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); + +end: return ret; } @@ -685,11 +708,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, if (ret == -EEXIST) { zswap_duplicate_entry++; /* remove from rbtree */ - rb_erase(&dupentry->rbnode, &tree->rbroot); - if (!zswap_entry_put(dupentry)) { - /* free */ - zswap_free_entry(tree, dupentry); - } + zswap_rb_erase(&tree->rbroot, dupentry); + zswap_entry_put(tree, dupentry); } } while (ret == -EEXIST); spin_unlock(&tree->lock); @@ -718,17 +738,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, struct zswap_entry *entry; u8 *src, *dst; unsigned int dlen; - int refcount, ret; + int ret; /* find */ spin_lock(&tree->lock); - entry = zswap_rb_search(&tree->rbroot, offset); + entry = zswap_entry_find_get(&tree->rbroot, offset); if 
(!entry) { /* entry was written back */ spin_unlock(&tree->lock); return -1; } - zswap_entry_get(entry); spin_unlock(&tree->lock); /* decompress */ @@ -743,22 +762,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, BUG_ON(ret); spin_lock(&tree->lock); - refcount = zswap_entry_put(entry); - if (likely(refcount)) { - spin_unlock(&tree->lock); - return 0; - } + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - /* - * We don't have to unlink from the rbtree because - * zswap_writeback_entry() or zswap_frontswap_invalidate page() - * has already done this for us if we are the last reference. - */ - /* free */ - - zswap_free_entry(tree, entry); - return 0; } @@ -767,7 +773,6 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) { struct zswap_tree *tree = zswap_trees[type]; struct zswap_entry *entry; - int refcount; /* find */ spin_lock(&tree->lock); @@ -779,20 +784,12 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) } /* remove from rbtree */ - rb_erase(&entry->rbnode, &tree->rbroot); + zswap_rb_erase(&tree->rbroot, entry); /* drop the initial reference from entry creation */ - refcount = zswap_entry_put(entry); + zswap_entry_put(tree, entry); spin_unlock(&tree->lock); - - if (refcount) { - /* writeback in progress, writeback will free */ - return; - } - - /* free */ - zswap_free_entry(tree, entry); } /* frees all zswap entries for the given swap type */ @@ -806,11 +803,8 @@ static void zswap_frontswap_invalidate_area(unsigned type) /* walk the tree and free everything */ spin_lock(&tree->lock); - rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) { - zbud_free(tree->pool, entry->handle); - zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); - } + rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) + zswap_free_entry(tree, entry); tree->rbroot = RB_ROOT; spin_unlock(&tree->lock); -- cgit v1.2.3 From a1aeb65a4c80d8e97eca8bcde02a5aeae11f6201 Mon Sep 17 00:00:00 2001 From: Zhi Yong Wu Date: Tue, 12 Nov 2013 15:08:29 -0800 Subject: mm/page_alloc.c: fix comment in zlc_setup() Signed-off-by: Zhi Yong Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cb17e8b800d6..580a5f075ed0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1705,7 +1705,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, * comments in mmzone.h. Reduces cache footprint of zonelist scans * that have to skip over a lot of full or unallowed zones. * - * If the zonelist cache is present in the passed in zonelist, then + * If the zonelist cache is present in the passed zonelist, then * returns a pointer to the allowed node mask (either the current * tasks mems_allowed, or node_states[N_MEMORY].) * -- cgit v1.2.3 From 00619bcc44d6b779aa366130b354153c222e4380 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 12 Nov 2013 15:08:31 -0800 Subject: mm: factor commit limit calculation The same calculation is currently done in three different places. Factor that code so future changes have to be made in only one place.
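As an illustrative worked example (the numbers are invented, not taken from the patch): on a machine with 2,097,152 total RAM pages (8 GiB with 4 KiB pages), no hugetlb pages, sysctl_overcommit_ratio = 50 and 524,288 pages of swap (2 GiB), the factored helper returns (2,097,152 - 0) * 50 / 100 + 524,288 = 1,572,864 pages, i.e. a 6 GiB commit limit. This is the value /proc/meminfo reports as CommitLimit and the limit __vm_enough_memory() enforces under the OVERCOMMIT_NEVER policy.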
[akpm@linux-foundation.org: uninline vm_commit_limit()] Signed-off-by: Jerome Marchand Cc: Dave Hansen Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 5 +---- include/linux/mman.h | 2 ++ mm/mmap.c | 4 +--- mm/nommu.c | 3 +-- mm/util.c | 13 +++++++++++++ 5 files changed, 18 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 59d85d608898..c805d5b69ba1 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -24,7 +24,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) { struct sysinfo i; unsigned long committed; - unsigned long allowed; struct vmalloc_info vmi; long cached; unsigned long pages[NR_LRU_LISTS]; @@ -37,8 +36,6 @@ static int meminfo_proc_show(struct seq_file *m, void *v) si_meminfo(&i); si_swapinfo(&i); committed = percpu_counter_read_positive(&vm_committed_as); - allowed = ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; cached = global_page_state(NR_FILE_PAGES) - total_swapcache_pages() - i.bufferram; @@ -147,7 +144,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), K(global_page_state(NR_WRITEBACK_TEMP)), - K(allowed), + K(vm_commit_limit()), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, vmi.used >> 10, diff --git a/include/linux/mman.h b/include/linux/mman.h index 92dc257251e4..7f7f8dae4b1d 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -87,4 +87,6 @@ calc_vm_flag_bits(unsigned long flags) _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); } + +unsigned long vm_commit_limit(void); #endif /* _LINUX_MMAN_H */ diff --git a/mm/mmap.c b/mm/mmap.c index 3d3e224be771..803048e9c568 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -179,14 +179,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = (totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100; + allowed = vm_commit_limit(); /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed += total_swap_pages; /* * Don't let a single process grow so big a user can't recover diff --git a/mm/nommu.c b/mm/nommu.c index ecd1f158548e..d8a957bb9e31 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1948,13 +1948,12 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = totalram_pages * sysctl_overcommit_ratio / 100; + allowed = vm_commit_limit(); /* * Reserve some 3% for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); - allowed += total_swap_pages; /* * Don't let a single process grow so big a user can't recover diff --git a/mm/util.c b/mm/util.c index eaf63fc2c92f..f7bc2096071c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,6 +7,9 @@ #include #include #include +#include +#include + #include #include "internal.h" @@ -398,6 +401,16 @@ struct address_space *page_mapping(struct page *page) return mapping; } +/* + * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used + */ +unsigned long vm_commit_limit(void) +{ + return ((totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100) + total_swap_pages; +} + + /* Tracepoints definitions. 
*/ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); -- cgit v1.2.3 From 72403b4a0fbdf433c1fe0127e49864658f6f6468 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 12 Nov 2013 15:08:32 -0800 Subject: mm: numa: return the number of base pages altered by protection changes Commit 0255d4918480 ("mm: Account for a THP NUMA hinting update as one PTE update") was added to account for the number of PTE updates when marking pages prot_numa. task_numa_work was using the old return value to track how much address space had been updated. Altering the return value causes the scanner to do more work than it is configured or documented to in a single unit of work. This patch reverts that commit and accounts for the number of THP updates separately in vmstat. It is up to the administrator to interpret the pair of values correctly. This is a straightforward operation and likely to only be of interest when actively debugging NUMA balancing problems. The impact of this patch is that the NUMA PTE scanner will scan slower when THP is enabled and workloads may converge slower as a result. On the flip side, system CPU usage should be lower than recent tests reported. This is an illustrative example of a short single JVM specjbb test

specjbb
                       3.12.0                3.12.0
                      vanilla           acctupdates
TPut 1       26143.00 (  0.00%)    25747.00 ( -1.51%)
TPut 7      185257.00 (  0.00%)   183202.00 ( -1.11%)
TPut 13     329760.00 (  0.00%)   346577.00 (  5.10%)
TPut 19     442502.00 (  0.00%)   460146.00 (  3.99%)
TPut 25     540634.00 (  0.00%)   549053.00 (  1.56%)
TPut 31     512098.00 (  0.00%)   519611.00 (  1.47%)
TPut 37     461276.00 (  0.00%)   474973.00 (  2.97%)
TPut 43     403089.00 (  0.00%)   414172.00 (  2.75%)

              3.12.0      3.12.0
             vanilla acctupdates
User         5169.64     5184.14
System        100.45       80.02
Elapsed       252.75      251.85

Performance is similar but note the reduction in system CPU time. While this showed a performance gain, it will not be universal, but at least the scanner will behave as documented. The vmstats are obviously different; here is an interpretation of them from mmtests.

                           3.12.0      3.12.0
                          vanilla acctupdates
NUMA page range updates   1408326    11043064
NUMA huge PMD updates           0       21040
NUMA PTE updates          1408326      291624

"NUMA page range updates" == nr_pte_updates and is the value returned to the NUMA pte scanner. NUMA huge PMD updates were the number of THP updates which in combination can be used to calculate how many ptes were updated from userspace.
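As a hedged illustration of that combination (assuming 4 KiB base pages and 2 MiB transparent hugepages, so HPAGE_PMD_NR = 512): the 21040 huge PMD updates above correspond to roughly 21040 * 512 = 10,772,480 base pages, which is why the "NUMA page range updates" figure for the patched kernel is around 11 million while the per-PTE counter stays near 0.3 million. Both counters are exported via /proc/vmstat as numa_pte_updates and numa_huge_pte_updates.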
Signed-off-by: Mel Gorman Reported-by: Alex Thorlton Reviewed-by: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vm_event_item.h | 1 + mm/mprotect.c | 10 +++++++--- mm/vmstat.c | 1 + 3 files changed, 9 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 1855f0a22add..c557c6d096de 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -39,6 +39,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, PAGEOUTRUN, ALLOCSTALL, PGROTATED, #ifdef CONFIG_NUMA_BALANCING NUMA_PTE_UPDATES, + NUMA_HUGE_PTE_UPDATES, NUMA_HINT_FAULTS, NUMA_HINT_FAULTS_LOCAL, NUMA_PAGE_MIGRATE, diff --git a/mm/mprotect.c b/mm/mprotect.c index a597f2ffcd6f..26667971c824 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -112,6 +112,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; + unsigned long nr_huge_updates = 0; pmd = pmd_offset(pud, addr); do { @@ -126,9 +127,10 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, newprot, prot_numa); if (nr_ptes) { - if (nr_ptes == HPAGE_PMD_NR) - pages++; - + if (nr_ptes == HPAGE_PMD_NR) { + pages += HPAGE_PMD_NR; + nr_huge_updates++; + } continue; } } @@ -141,6 +143,8 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pages += this_pages; } while (pmd++, addr = next, addr != end); + if (nr_huge_updates) + count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; } diff --git a/mm/vmstat.c b/mm/vmstat.c index b6d17edf8cf3..72496140ac08 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -812,6 +812,7 @@ const char * const vmstat_text[] = { #ifdef CONFIG_NUMA_BALANCING "numa_pte_updates", + "numa_huge_pte_updates", "numa_hint_faults", "numa_hint_faults_local", "numa_pages_migrated", -- cgit v1.2.3