diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/backing-dev.c | 29 | ||||
-rw-r--r-- | mm/bounce.c | 9 | ||||
-rw-r--r-- | mm/compaction.c | 26 | ||||
-rw-r--r-- | mm/debug-pagealloc.c | 56 | ||||
-rw-r--r-- | mm/filemap.c | 9 | ||||
-rw-r--r-- | mm/highmem.c | 6 | ||||
-rw-r--r-- | mm/huge_memory.c | 91 | ||||
-rw-r--r-- | mm/internal.h | 46 | ||||
-rw-r--r-- | mm/ksm.c | 3 | ||||
-rw-r--r-- | mm/memblock.c | 11 | ||||
-rw-r--r-- | mm/memcontrol.c | 1219 | ||||
-rw-r--r-- | mm/memory-failure.c | 12 | ||||
-rw-r--r-- | mm/memory.c | 2 | ||||
-rw-r--r-- | mm/mempolicy.c | 11 | ||||
-rw-r--r-- | mm/migrate.c | 83 | ||||
-rw-r--r-- | mm/mlock.c | 13 | ||||
-rw-r--r-- | mm/mmap.c | 9 | ||||
-rw-r--r-- | mm/mremap.c | 42 | ||||
-rw-r--r-- | mm/oom_kill.c | 53 | ||||
-rw-r--r-- | mm/page-writeback.c | 19 | ||||
-rw-r--r-- | mm/page_alloc.c | 17 | ||||
-rw-r--r-- | mm/page_cgroup.c | 12 | ||||
-rw-r--r-- | mm/process_vm_access.c | 496 | ||||
-rw-r--r-- | mm/rmap.c | 2 | ||||
-rw-r--r-- | mm/shmem.c | 12 | ||||
-rw-r--r-- | mm/slab.c | 19 | ||||
-rw-r--r-- | mm/slub.c | 613 | ||||
-rw-r--r-- | mm/swap.c | 83 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/thrash.c | 2 | ||||
-rw-r--r-- | mm/vmalloc.c | 95 | ||||
-rw-r--r-- | mm/vmscan.c | 396 | ||||
-rw-r--r-- | mm/vmstat.c | 7 |
35 files changed, 2128 insertions, 1383 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f2f1ca19ed53..011b110365c8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -131,6 +131,9 @@ config SPARSEMEM_VMEMMAP config HAVE_MEMBLOCK boolean +config NO_BOOTMEM + boolean + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" diff --git a/mm/Makefile b/mm/Makefile index 836e4163c1bf..50ec00ef2a0e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,8 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o pgtable-generic.o + vmalloc.o pagewalk.o pgtable-generic.o \ + process_vm_access.o obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d6edf8d14f9c..7520ef0bfd47 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -359,6 +359,17 @@ static unsigned long bdi_longest_inactive(void) return max(5UL * 60 * HZ, interval); } +/* + * Clear pending bit and wakeup anybody waiting for flusher thread creation or + * shutdown + */ +static void bdi_clear_pending(struct backing_dev_info *bdi) +{ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); +} + static int bdi_forker_thread(void *ptr) { struct bdi_writeback *me = ptr; @@ -390,6 +401,12 @@ static int bdi_forker_thread(void *ptr) } spin_lock_bh(&bdi_lock); + /* + * In the following loop we are going to check whether we have + * some work to do without any synchronization with tasks + * waking us up to do work for them. Set the task state here + * so that we don't miss wakeups after verifying conditions. + */ set_current_state(TASK_INTERRUPTIBLE); list_for_each_entry(bdi, &bdi_list, bdi_list) { @@ -469,11 +486,13 @@ static int bdi_forker_thread(void *ptr) spin_unlock_bh(&bdi->wb_lock); wake_up_process(task); } + bdi_clear_pending(bdi); break; case KILL_THREAD: __set_current_state(TASK_RUNNING); kthread_stop(task); + bdi_clear_pending(bdi); break; case NO_ACTION: @@ -489,16 +508,8 @@ static int bdi_forker_thread(void *ptr) else schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); try_to_freeze(); - /* Back to the main loop */ - continue; + break; } - - /* - * Clear pending bit and wakeup anybody waiting to tear us down. - */ - clear_bit(BDI_pending, &bdi->state); - smp_mb__after_clear_bit(); - wake_up_bit(&bdi->state, BDI_pending); } return 0; diff --git a/mm/bounce.c b/mm/bounce.c index 1481de68184b..434fb4f0c5e4 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/bootmem.h> #include <asm/tlbflush.h> #include <trace/events/block.h> @@ -26,12 +27,10 @@ static mempool_t *page_pool, *isa_page_pool; #ifdef CONFIG_HIGHMEM static __init int init_emergency_pool(void) { - struct sysinfo i; - si_meminfo(&i); - si_swapinfo(&i); - - if (!i.totalhigh) +#ifndef CONFIG_MEMORY_HOTPLUG + if (max_pfn <= max_low_pfn) return 0; +#endif page_pool = mempool_create_page_pool(POOL_SIZE, 0); BUG_ON(!page_pool); diff --git a/mm/compaction.c b/mm/compaction.c index 6cc604bd5649..899d95638586 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -35,10 +35,6 @@ struct compact_control { unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ - /* Account for isolated anon and file pages */ - unsigned long nr_anon; - unsigned long nr_file; - unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; @@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone, static void acct_isolated(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned int count[NR_LRU_LISTS] = { 0, }; + unsigned int count[2] = { 0, }; - list_for_each_entry(page, &cc->migratepages, lru) { - int lru = page_lru_base_type(page); - count[lru]++; - } + list_for_each_entry(page, &cc->migratepages, lru) + count[!!page_is_file_cache(page)]++; - cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; - cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; - __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); - __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); + __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]); } /* Similar to reclaim, but different enough that they don't share logic */ @@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; + isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE; /* Do not scan outside zone boundaries */ low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); @@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, continue; } + if (!cc->sync) + mode |= ISOLATE_CLEAN; + /* Try isolate the page */ - if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) + if (__isolate_lru_page(page, mode, 0) != 0) continue; VM_BUG_ON(PageTransCompound(page)); @@ -586,7 +582,7 @@ out: return ret; } -unsigned long compact_zone_order(struct zone *zone, +static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, bool sync) { diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c index a1e3324de2b5..7cea557407f4 100644 --- a/mm/debug-pagealloc.c +++ b/mm/debug-pagealloc.c @@ -1,7 +1,10 @@ #include <linux/kernel.h> +#include <linux/string.h> #include <linux/mm.h> +#include <linux/highmem.h> #include <linux/page-debug-flags.h> #include <linux/poison.h> +#include <linux/ratelimit.h> static inline void set_page_poison(struct page *page) { @@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page) return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags); } -static void poison_highpage(struct page *page) -{ - /* - * Page poisoning for highmem pages is not implemented. - * - * This can be called from interrupt contexts. - * So we need to create a new kmap_atomic slot for this - * application and it will need interrupt protection. - */ -} - static void poison_page(struct page *page) { - void *addr; + void *addr = kmap_atomic(page); - if (PageHighMem(page)) { - poison_highpage(page); - return; - } set_page_poison(page); - addr = page_address(page); memset(addr, PAGE_POISON, PAGE_SIZE); + kunmap_atomic(addr); } static void poison_pages(struct page *page, int n) @@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b) static void check_poison_mem(unsigned char *mem, size_t bytes) { + static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10); unsigned char *start; unsigned char *end; - for (start = mem; start < mem + bytes; start++) { - if (*start != PAGE_POISON) - break; - } - if (start == mem + bytes) + start = memchr_inv(mem, PAGE_POISON, bytes); + if (!start) return; for (end = mem + bytes - 1; end > start; end--) { @@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) break; } - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit)) return; else if (start == end && single_bit_flip(*start, PAGE_POISON)) printk(KERN_ERR "pagealloc: single bit error\n"); @@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes) dump_stack(); } -static void unpoison_highpage(struct page *page) -{ - /* - * See comment in poison_highpage(). - * Highmem pages should not be poisoned for now - */ - BUG_ON(page_poison(page)); -} - static void unpoison_page(struct page *page) { - if (PageHighMem(page)) { - unpoison_highpage(page); + void *addr; + + if (!page_poison(page)) return; - } - if (page_poison(page)) { - void *addr = page_address(page); - check_poison_mem(addr, PAGE_SIZE); - clear_page_poison(page); - } + addr = kmap_atomic(page); + check_poison_mem(addr, PAGE_SIZE); + clear_page_poison(page); + kunmap_atomic(addr); } static void unpoison_pages(struct page *page, int n) diff --git a/mm/filemap.c b/mm/filemap.c index 645a080ba4df..5cf820a7c8ec 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -827,13 +827,14 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, { unsigned int i; unsigned int ret; - unsigned int nr_found; + unsigned int nr_found, nr_skip; rcu_read_lock(); restart: nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, (void ***)pages, NULL, start, nr_pages); ret = 0; + nr_skip = 0; for (i = 0; i < nr_found; i++) { struct page *page; repeat: @@ -856,6 +857,7 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ + nr_skip++; continue; } @@ -876,7 +878,7 @@ repeat: * If all entries were removed before we could secure them, * try again, because callers stop trying once 0 is returned. */ - if (unlikely(!ret && nr_found)) + if (unlikely(!ret && nr_found > nr_skip)) goto restart; rcu_read_unlock(); return ret; @@ -2113,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) } else { const struct iovec *iov = i->iov; size_t base = i->iov_offset; + unsigned long nr_segs = i->nr_segs; /* * The !iov->iov_len check ensures we skip over unlikely @@ -2128,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes) base += copy; if (iov->iov_len == base) { iov++; + nr_segs--; base = 0; } } i->iov = iov; i->iov_offset = base; + i->nr_segs = nr_segs; } } EXPORT_SYMBOL(iov_iter_advance); diff --git a/mm/highmem.c b/mm/highmem.c index 693394daa2ed..e159a7b1cc22 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page) #endif /** - * kunmap_high - map a highmem page into memory + * kunmap_high - unmap a highmem page into memory * @page: &struct page to unmap * * If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called @@ -326,7 +326,7 @@ static struct page_address_slot { spinlock_t lock; /* Protect this bucket's list */ } ____cacheline_aligned_in_smp page_address_htable[1<<PA_HASH_ORDER]; -static struct page_address_slot *page_slot(struct page *page) +static struct page_address_slot *page_slot(const struct page *page) { return &page_address_htable[hash_ptr(page, PA_HASH_ORDER)]; } @@ -337,7 +337,7 @@ static struct page_address_slot *page_slot(struct page *page) * * Returns the page's virtual address. */ -void *page_address(struct page *page) +void *page_address(const struct page *page) { unsigned long flags; void *ret; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e2d1587be269..4298abaae153 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -89,7 +89,8 @@ struct khugepaged_scan { struct list_head mm_head; struct mm_slot *mm_slot; unsigned long address; -} khugepaged_scan = { +}; +static struct khugepaged_scan khugepaged_scan = { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; @@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { copy_user_highpage(pages[i], page + i, - haddr + PAGE_SHIFT*i, vma); + haddr + PAGE_SIZE * i, vma); __SetPageUptodate(pages[i]); cond_resched(); } @@ -989,7 +990,7 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON(!PageCompound(page)); if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); out: return page; @@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, return ret; } +int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, + unsigned long old_addr, + unsigned long new_addr, unsigned long old_end, + pmd_t *old_pmd, pmd_t *new_pmd) +{ + int ret = 0; + pmd_t pmd; + + struct mm_struct *mm = vma->vm_mm; + + if ((old_addr & ~HPAGE_PMD_MASK) || + (new_addr & ~HPAGE_PMD_MASK) || + old_end - old_addr < HPAGE_PMD_SIZE || + (new_vma->vm_flags & VM_NOHUGEPAGE)) + goto out; + + /* + * The destination pmd shouldn't be established, free_pgtables() + * should have release it. + */ + if (WARN_ON(!pmd_none(*new_pmd))) { + VM_BUG_ON(pmd_trans_huge(*new_pmd)); + goto out; + } + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*old_pmd))) { + if (pmd_trans_splitting(*old_pmd)) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, old_pmd); + ret = -1; + } else { + pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); + VM_BUG_ON(!pmd_none(*new_pmd)); + set_pmd_at(mm, new_addr, new_pmd, pmd); + spin_unlock(&mm->page_table_lock); + ret = 1; + } + } else { + spin_unlock(&mm->page_table_lock); + } +out: + return ret; +} + int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot) { @@ -1156,6 +1202,7 @@ static void __split_huge_page_refcount(struct page *page) unsigned long head_index = page->index; struct zone *zone = page_zone(page); int zonestat; + int tail_count = 0; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1164,11 +1211,27 @@ static void __split_huge_page_refcount(struct page *page) for (i = 1; i < HPAGE_PMD_NR; i++) { struct page *page_tail = page + i; - /* tail_page->_count cannot change */ - atomic_sub(atomic_read(&page_tail->_count), &page->_count); - BUG_ON(page_count(page) <= 0); - atomic_add(page_mapcount(page) + 1, &page_tail->_count); - BUG_ON(atomic_read(&page_tail->_count) <= 0); + /* tail_page->_mapcount cannot change */ + BUG_ON(page_mapcount(page_tail) < 0); + tail_count += page_mapcount(page_tail); + /* check for overflow */ + BUG_ON(tail_count < 0); + BUG_ON(atomic_read(&page_tail->_count) != 0); + /* + * tail_page->_count is zero and not changing from + * under us. But get_page_unless_zero() may be running + * from under us on the tail_page. If we used + * atomic_set() below instead of atomic_add(), we + * would then run atomic_set() concurrently with + * get_page_unless_zero(), and atomic_set() is + * implemented in C not using locked ops. spin_unlock + * on x86 sometime uses locked ops because of PPro + * errata 66, 92, so unless somebody can guarantee + * atomic_set() here would be safe on all archs (and + * not only on x86), it's safer to use atomic_add(). + */ + atomic_add(page_mapcount(page) + page_mapcount(page_tail) + 1, + &page_tail->_count); /* after clearing PageTail the gup refcount can be released */ smp_mb(); @@ -1186,10 +1249,7 @@ static void __split_huge_page_refcount(struct page *page) (1L << PG_uptodate))); page_tail->flags |= (1L << PG_dirty); - /* - * 1) clear PageTail before overwriting first_page - * 2) clear PageTail before clearing PageHead for VM_BUG_ON - */ + /* clear PageTail before overwriting first_page */ smp_wmb(); /* @@ -1206,7 +1266,6 @@ static void __split_huge_page_refcount(struct page *page) * status is achieved setting a reserved bit in the * pmd, not by clearing the present bit. */ - BUG_ON(page_mapcount(page_tail)); page_tail->_mapcount = page->_mapcount; BUG_ON(page_tail->mapping); @@ -1223,6 +1282,8 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + atomic_sub(tail_count, &page->_count); + BUG_ON(atomic_read(&page->_count) <= 0); __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); @@ -1906,7 +1967,7 @@ static void collapse_huge_page(struct mm_struct *mm, BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); set_pmd_at(mm, address, pmd, _pmd); - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, _pmd); prepare_pmd_huge_pte(pgtable, mm); mm->nr_ptes--; spin_unlock(&mm->page_table_lock); @@ -2024,6 +2085,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot) static unsigned int khugepaged_scan_mm_slot(unsigned int pages, struct page **hpage) + __releases(&khugepaged_mm_lock) + __acquires(&khugepaged_mm_lock) { struct mm_slot *mm_slot; struct mm_struct *mm; diff --git a/mm/internal.h b/mm/internal.h index d071d380fb49..2189af491783 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,52 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +static inline void __get_page_tail_foll(struct page *page, + bool get_page_head) +{ + /* + * If we're getting a tail page, the elevated page->_count is + * required only in the head page and we will elevate the head + * page->_count and tail page->_mapcount. + * + * We elevate page_tail->_mapcount for tail pages to force + * page_tail->_count to be zero at all times to avoid getting + * false positives from get_page_unless_zero() with + * speculative page access (like in + * page_cache_get_speculative()) on tail pages. + */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); + VM_BUG_ON(page_mapcount(page) < 0); + if (get_page_head) + atomic_inc(&page->first_page->_count); + atomic_inc(&page->_mapcount); +} + +/* + * This is meant to be called as the FOLL_GET operation of + * follow_page() and it must be called while holding the proper PT + * lock while the pte (or pmd_trans_huge) is still mapping the page. + */ +static inline void get_page_foll(struct page *page) +{ + if (unlikely(PageTail(page))) + /* + * This is safe only because + * __split_huge_page_refcount() can't run under + * get_page_foll() because we hold the proper PT lock. + */ + __get_page_tail_foll(page, true); + else { + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. + */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_inc(&page->_count); + } +} + extern unsigned long highest_memmap_pfn; /* @@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = unmerge_and_remove_all_rmap_items(); - test_set_oom_score_adj(oom_score_adj); + compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, + oom_score_adj); if (err) { ksm_run = KSM_RUN_STOP; count = err; diff --git a/mm/memblock.c b/mm/memblock.c index ccbf97339592..84bec4969ed5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -58,7 +58,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); } -long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +static long __init_memblock memblock_overlaps_region(struct memblock_type *type, + phys_addr_t base, phys_addr_t size) { unsigned long i; @@ -267,7 +268,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type) return 0; } -extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, +int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, phys_addr_t addr2, phys_addr_t size2) { return 1; @@ -626,6 +627,12 @@ phys_addr_t __init memblock_phys_mem_size(void) return memblock.memory_size; } +/* lowest address */ +phys_addr_t __init_memblock memblock_start_of_DRAM(void) +{ + return memblock.memory.regions[0].base; +} + phys_addr_t __init_memblock memblock_end_of_DRAM(void) { int idx = memblock.memory.cnt - 1; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ec4e7ca4cd..7af1d5ee1598 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -201,52 +201,8 @@ struct mem_cgroup_eventfd_list { struct eventfd_ctx *eventfd; }; -static void mem_cgroup_threshold(struct mem_cgroup *mem); -static void mem_cgroup_oom_notify(struct mem_cgroup *mem); - -enum { - SCAN_BY_LIMIT, - SCAN_BY_SYSTEM, - NR_SCAN_CONTEXT, - SCAN_BY_SHRINK, /* not recorded now */ -}; - -enum { - SCAN, - SCAN_ANON, - SCAN_FILE, - ROTATE, - ROTATE_ANON, - ROTATE_FILE, - FREED, - FREED_ANON, - FREED_FILE, - ELAPSED, - NR_SCANSTATS, -}; - -struct scanstat { - spinlock_t lock; - unsigned long stats[NR_SCAN_CONTEXT][NR_SCANSTATS]; - unsigned long rootstats[NR_SCAN_CONTEXT][NR_SCANSTATS]; -}; - -const char *scanstat_string[NR_SCANSTATS] = { - "scanned_pages", - "scanned_anon_pages", - "scanned_file_pages", - "rotated_pages", - "rotated_anon_pages", - "rotated_file_pages", - "freed_pages", - "freed_anon_pages", - "freed_file_pages", - "elapsed_ns", -}; -#define SCANSTAT_WORD_LIMIT "_by_limit" -#define SCANSTAT_WORD_SYSTEM "_by_system" -#define SCANSTAT_WORD_HIERARCHY "_under_hierarchy" - +static void mem_cgroup_threshold(struct mem_cgroup *memcg); +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg); /* * The memory controller data structure. The memory controller controls both @@ -313,8 +269,7 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* For recording LRU-scan statistics */ - struct scanstat scanstat; + /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? @@ -407,29 +362,29 @@ enum charge_type { #define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 #define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) -static void mem_cgroup_get(struct mem_cgroup *mem); -static void mem_cgroup_put(struct mem_cgroup *mem); -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); -static void drain_all_stock_async(struct mem_cgroup *mem); +static void mem_cgroup_get(struct mem_cgroup *memcg); +static void mem_cgroup_put(struct mem_cgroup *memcg); +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); +static void drain_all_stock_async(struct mem_cgroup *memcg); static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; } -struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) { - return &mem->css; + return &memcg->css; } static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct mem_cgroup *mem, struct page *page) +page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page) { int nid = page_to_nid(page); int zid = page_zonenum(page); - return mem_cgroup_zoneinfo(mem, nid, zid); + return mem_cgroup_zoneinfo(memcg, nid, zid); } static struct mem_cgroup_tree_per_zone * @@ -448,7 +403,7 @@ soft_limit_tree_from_page(struct page *page) } static void -__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, +__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz, unsigned long long new_usage_in_excess) @@ -482,7 +437,7 @@ __mem_cgroup_insert_exceeded(struct mem_cgroup *mem, } static void -__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { @@ -493,17 +448,17 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem, } static void -mem_cgroup_remove_exceeded(struct mem_cgroup *mem, +mem_cgroup_remove_exceeded(struct mem_cgroup *memcg, struct mem_cgroup_per_zone *mz, struct mem_cgroup_tree_per_zone *mctz) { spin_lock(&mctz->lock); - __mem_cgroup_remove_exceeded(mem, mz, mctz); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); spin_unlock(&mctz->lock); } -static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page) { unsigned long long excess; struct mem_cgroup_per_zone *mz; @@ -516,9 +471,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) * Necessary to update all ancestors when hierarchy is used. * because their event counter is not touched. */ - for (; mem; mem = parent_mem_cgroup(mem)) { - mz = mem_cgroup_zoneinfo(mem, nid, zid); - excess = res_counter_soft_limit_excess(&mem->res); + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + mz = mem_cgroup_zoneinfo(memcg, nid, zid); + excess = res_counter_soft_limit_excess(&memcg->res); /* * We have to update the tree if mz is on RB-tree or * mem is over its softlimit. @@ -527,18 +482,18 @@ static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) spin_lock(&mctz->lock); /* if on-tree, remove it */ if (mz->on_tree) - __mem_cgroup_remove_exceeded(mem, mz, mctz); + __mem_cgroup_remove_exceeded(memcg, mz, mctz); /* * Insert again. mz->usage_in_excess will be updated. * If excess is 0, no tree ops. */ - __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + __mem_cgroup_insert_exceeded(memcg, mz, mctz, excess); spin_unlock(&mctz->lock); } } } -static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg) { int node, zone; struct mem_cgroup_per_zone *mz; @@ -546,9 +501,9 @@ static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) for_each_node_state(node, N_POSSIBLE) { for (zone = 0; zone < MAX_NR_ZONES; zone++) { - mz = mem_cgroup_zoneinfo(mem, node, zone); + mz = mem_cgroup_zoneinfo(memcg, node, zone); mctz = soft_limit_tree_node_zone(node, zone); - mem_cgroup_remove_exceeded(mem, mz, mctz); + mem_cgroup_remove_exceeded(memcg, mz, mctz); } } } @@ -609,7 +564,7 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) * common workload, threashold and synchonization as vmstat[] should be * implemented. */ -static long mem_cgroup_read_stat(struct mem_cgroup *mem, +static long mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { long val = 0; @@ -617,81 +572,83 @@ static long mem_cgroup_read_stat(struct mem_cgroup *mem, get_online_cpus(); for_each_online_cpu(cpu) - val += per_cpu(mem->stat->count[idx], cpu); + val += per_cpu(memcg->stat->count[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&mem->pcp_counter_lock); - val += mem->nocpu_base.count[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + val += memcg->nocpu_base.count[idx]; + spin_unlock(&memcg->pcp_counter_lock); #endif put_online_cpus(); return val; } -static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, +static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 1 : -1; - this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); } -void mem_cgroup_pgfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgfault(struct mem_cgroup *memcg, int val) { - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); + this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT], val); } -void mem_cgroup_pgmajfault(struct mem_cgroup *mem, int val) +void mem_cgroup_pgmajfault(struct mem_cgroup *memcg, int val) { - this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); + this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT], val); } -static unsigned long mem_cgroup_read_events(struct mem_cgroup *mem, +static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, enum mem_cgroup_events_index idx) { unsigned long val = 0; int cpu; for_each_online_cpu(cpu) - val += per_cpu(mem->stat->events[idx], cpu); + val += per_cpu(memcg->stat->events[idx], cpu); #ifdef CONFIG_HOTPLUG_CPU - spin_lock(&mem->pcp_counter_lock); - val += mem->nocpu_base.events[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + val += memcg->nocpu_base.events[idx]; + spin_unlock(&memcg->pcp_counter_lock); #endif return val; } -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, +static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, bool file, int nr_pages) { preempt_disable(); if (file) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE], + nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); + __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS], + nr_pages); /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); + __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]); else { - __this_cpu_inc(mem->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); + __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]); nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(mem->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); + __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT], nr_pages); preempt_enable(); } unsigned long -mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, +mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid, unsigned int lru_mask) { struct mem_cgroup_per_zone *mz; enum lru_list l; unsigned long ret = 0; - mz = mem_cgroup_zoneinfo(mem, nid, zid); + mz = mem_cgroup_zoneinfo(memcg, nid, zid); for_each_lru(l) { if (BIT(l) & lru_mask) @@ -701,44 +658,45 @@ mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *mem, int nid, int zid, } static unsigned long -mem_cgroup_node_nr_lru_pages(struct mem_cgroup *mem, +mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, int nid, unsigned int lru_mask) { u64 total = 0; int zid; for (zid = 0; zid < MAX_NR_ZONES; zid++) - total += mem_cgroup_zone_nr_lru_pages(mem, nid, zid, lru_mask); + total += mem_cgroup_zone_nr_lru_pages(memcg, + nid, zid, lru_mask); return total; } -static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *mem, +static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg, unsigned int lru_mask) { int nid; u64 total = 0; for_each_node_state(nid, N_HIGH_MEMORY) - total += mem_cgroup_node_nr_lru_pages(mem, nid, lru_mask); + total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask); return total; } -static bool __memcg_event_check(struct mem_cgroup *mem, int target) +static bool __memcg_event_check(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); - next = this_cpu_read(mem->stat->targets[target]); + val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); + next = __this_cpu_read(memcg->stat->targets[target]); /* from time_after() in jiffies.h */ return ((long)next - (long)val < 0); } -static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) +static void __mem_cgroup_target_update(struct mem_cgroup *memcg, int target) { unsigned long val, next; - val = this_cpu_read(mem->stat->events[MEM_CGROUP_EVENTS_COUNT]); + val = __this_cpu_read(memcg->stat->events[MEM_CGROUP_EVENTS_COUNT]); switch (target) { case MEM_CGROUP_TARGET_THRESH: @@ -754,34 +712,36 @@ static void __mem_cgroup_target_update(struct mem_cgroup *mem, int target) return; } - this_cpu_write(mem->stat->targets[target], next); + __this_cpu_write(memcg->stat->targets[target], next); } /* * Check events in order. * */ -static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) { + preempt_disable(); /* threshold event is triggered in finer grain than soft limit */ - if (unlikely(__memcg_event_check(mem, MEM_CGROUP_TARGET_THRESH))) { - mem_cgroup_threshold(mem); - __mem_cgroup_target_update(mem, MEM_CGROUP_TARGET_THRESH); - if (unlikely(__memcg_event_check(mem, + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_THRESH))) { + mem_cgroup_threshold(memcg); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_THRESH); + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_SOFTLIMIT))) { - mem_cgroup_update_tree(mem, page); - __mem_cgroup_target_update(mem, + mem_cgroup_update_tree(memcg, page); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_SOFTLIMIT); } #if MAX_NUMNODES > 1 - if (unlikely(__memcg_event_check(mem, + if (unlikely(__memcg_event_check(memcg, MEM_CGROUP_TARGET_NUMAINFO))) { - atomic_inc(&mem->numainfo_events); - __mem_cgroup_target_update(mem, + atomic_inc(&memcg->numainfo_events); + __mem_cgroup_target_update(memcg, MEM_CGROUP_TARGET_NUMAINFO); } #endif } + preempt_enable(); } static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) @@ -807,7 +767,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; if (!mm) return NULL; @@ -818,25 +778,25 @@ struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) */ rcu_read_lock(); do { - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!mem)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) break; - } while (!css_tryget(&mem->css)); + } while (!css_tryget(&memcg->css)); rcu_read_unlock(); - return mem; + return memcg; } /* The caller has to guarantee "mem" exists before calling this */ -static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *memcg) { struct cgroup_subsys_state *css; int found; - if (!mem) /* ROOT cgroup has the smallest ID */ + if (!memcg) /* ROOT cgroup has the smallest ID */ return root_mem_cgroup; /*css_put/get against root is ignored*/ - if (!mem->use_hierarchy) { - if (css_tryget(&mem->css)) - return mem; + if (!memcg->use_hierarchy) { + if (css_tryget(&memcg->css)) + return memcg; return NULL; } rcu_read_lock(); @@ -844,13 +804,13 @@ static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) * searching a memory cgroup which has the smallest ID under given * ROOT cgroup. (ID >= 1) */ - css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); + css = css_get_next(&mem_cgroup_subsys, 1, &memcg->css, &found); if (css && css_tryget(css)) - mem = container_of(css, struct mem_cgroup, css); + memcg = container_of(css, struct mem_cgroup, css); else - mem = NULL; + memcg = NULL; rcu_read_unlock(); - return mem; + return memcg; } static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, @@ -904,29 +864,29 @@ static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, for_each_mem_cgroup_tree_cond(iter, NULL, true) -static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { - return (mem == root_mem_cgroup); + return (memcg == root_mem_cgroup); } void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; if (!mm) return; rcu_read_lock(); - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!mem)) + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) goto out; switch (idx) { case PGMAJFAULT: - mem_cgroup_pgmajfault(mem, 1); + mem_cgroup_pgmajfault(memcg, 1); break; case PGFAULT: - mem_cgroup_pgfault(mem, 1); + mem_cgroup_pgfault(memcg, 1); break; default: BUG(); @@ -1035,6 +995,16 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); + /* + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. + */ + smp_mb(); if (!PageCgroupUsed(pc)) return; /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ @@ -1086,7 +1056,16 @@ static void mem_cgroup_lru_add_after_commit(struct page *page) unsigned long flags; struct zone *zone = page_zone(page); struct page_cgroup *pc = lookup_page_cgroup(page); - + /* + * putback: charge: + * SetPageLRU SetPageCgroupUsed + * smp_mb smp_mb + * PageCgroupUsed && add to memcg LRU PageLRU && add to memcg LRU + * + * Ensure that one of the two sides adds the page to the memcg + * LRU during a race. + */ + smp_mb(); /* taking care of that the page is added to LRU while we commit it */ if (likely(!PageLRU(page))) return; @@ -1108,21 +1087,21 @@ void mem_cgroup_move_lists(struct page *page, } /* - * Checks whether given mem is same or in the root_mem's + * Checks whether given mem is same or in the root_mem_cgroup's * hierarchy subtree */ -static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_mem, - struct mem_cgroup *mem) +static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, + struct mem_cgroup *memcg) { - if (root_mem != mem) { - return (root_mem->use_hierarchy && - css_is_ancestor(&mem->css, &root_mem->css)); + if (root_memcg != memcg) { + return (root_memcg->use_hierarchy && + css_is_ancestor(&memcg->css, &root_memcg->css)); } return true; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) +int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) { int ret; struct mem_cgroup *curr = NULL; @@ -1136,25 +1115,29 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) if (!curr) return 0; /* - * We should check use_hierarchy of "mem" not "curr". Because checking + * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is - * enabled in "curr" and "curr" is a child of "mem" in *cgroup* - * hierarchy(even if use_hierarchy is disabled in "mem"). + * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* + * hierarchy(even if use_hierarchy is disabled in "memcg"). */ - ret = mem_cgroup_same_or_subtree(mem, curr); + ret = mem_cgroup_same_or_subtree(memcg, curr); css_put(&curr->css); return ret; } -static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) +int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg, struct zone *zone) { - unsigned long active; + unsigned long inactive_ratio; + int nid = zone_to_nid(zone); + int zid = zone_idx(zone); unsigned long inactive; + unsigned long active; unsigned long gb; - unsigned long inactive_ratio; - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_INACTIVE_ANON)); + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_ACTIVE_ANON)); gb = (inactive + active) >> (30 - PAGE_SHIFT); if (gb) @@ -1162,39 +1145,20 @@ static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_ else inactive_ratio = 1; - if (present_pages) { - present_pages[0] = inactive; - present_pages[1] = active; - } - - return inactive_ratio; + return inactive * inactive_ratio < active; } -int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg) -{ - unsigned long active; - unsigned long inactive; - unsigned long present_pages[2]; - unsigned long inactive_ratio; - - inactive_ratio = calc_inactive_ratio(memcg, present_pages); - - inactive = present_pages[0]; - active = present_pages[1]; - - if (inactive * inactive_ratio < active) - return 1; - - return 0; -} - -int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg) +int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg, struct zone *zone) { unsigned long active; unsigned long inactive; + int zid = zone_idx(zone); + int nid = zone_to_nid(zone); - inactive = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); - active = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); + inactive = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_INACTIVE_FILE)); + active = mem_cgroup_zone_nr_lru_pages(memcg, nid, zid, + BIT(LRU_ACTIVE_FILE)); return (active > inactive); } @@ -1230,7 +1194,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, - int mode, struct zone *z, + isolate_mode_t mode, + struct zone *z, struct mem_cgroup *mem_cont, int active, int file) { @@ -1298,13 +1263,13 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * Returns the maximum amount of memory @mem can be charged with, in * pages. */ -static unsigned long mem_cgroup_margin(struct mem_cgroup *mem) +static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) { unsigned long long margin; - margin = res_counter_margin(&mem->res); + margin = res_counter_margin(&memcg->res); if (do_swap_account) - margin = min(margin, res_counter_margin(&mem->memsw)); + margin = min(margin, res_counter_margin(&memcg->memsw)); return margin >> PAGE_SHIFT; } @@ -1319,33 +1284,33 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg) return memcg->swappiness; } -static void mem_cgroup_start_move(struct mem_cgroup *mem) +static void mem_cgroup_start_move(struct mem_cgroup *memcg) { int cpu; get_online_cpus(); - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for_each_online_cpu(cpu) - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; - mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; + spin_unlock(&memcg->pcp_counter_lock); put_online_cpus(); synchronize_rcu(); } -static void mem_cgroup_end_move(struct mem_cgroup *mem) +static void mem_cgroup_end_move(struct mem_cgroup *memcg) { int cpu; - if (!mem) + if (!memcg) return; get_online_cpus(); - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for_each_online_cpu(cpu) - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; - mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + memcg->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; + spin_unlock(&memcg->pcp_counter_lock); put_online_cpus(); } /* @@ -1360,13 +1325,13 @@ static void mem_cgroup_end_move(struct mem_cgroup *mem) * waiting at hith-memory prressure caused by "move". */ -static bool mem_cgroup_stealed(struct mem_cgroup *mem) +static bool mem_cgroup_stealed(struct mem_cgroup *memcg) { VM_BUG_ON(!rcu_read_lock_held()); - return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; + return this_cpu_read(memcg->stat->count[MEM_CGROUP_ON_MOVE]) > 0; } -static bool mem_cgroup_under_move(struct mem_cgroup *mem) +static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; struct mem_cgroup *to; @@ -1381,17 +1346,17 @@ static bool mem_cgroup_under_move(struct mem_cgroup *mem) if (!from) goto unlock; - ret = mem_cgroup_same_or_subtree(mem, from) - || mem_cgroup_same_or_subtree(mem, to); + ret = mem_cgroup_same_or_subtree(memcg, from) + || mem_cgroup_same_or_subtree(memcg, to); unlock: spin_unlock(&mc.lock); return ret; } -static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) { if (mc.moving_task && current != mc.moving_task) { - if (mem_cgroup_under_move(mem)) { + if (mem_cgroup_under_move(memcg)) { DEFINE_WAIT(wait); prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); /* moving charge context might have finished. */ @@ -1475,12 +1440,12 @@ done: * This function returns the number of memcg under hierarchy tree. Returns * 1(self count) if no children. */ -static int mem_cgroup_count_children(struct mem_cgroup *mem) +static int mem_cgroup_count_children(struct mem_cgroup *memcg) { int num = 0; struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) num++; return num; } @@ -1510,21 +1475,21 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) * that to reclaim free pages from. */ static struct mem_cgroup * -mem_cgroup_select_victim(struct mem_cgroup *root_mem) +mem_cgroup_select_victim(struct mem_cgroup *root_memcg) { struct mem_cgroup *ret = NULL; struct cgroup_subsys_state *css; int nextid, found; - if (!root_mem->use_hierarchy) { - css_get(&root_mem->css); - ret = root_mem; + if (!root_memcg->use_hierarchy) { + css_get(&root_memcg->css); + ret = root_memcg; } while (!ret) { rcu_read_lock(); - nextid = root_mem->last_scanned_child + 1; - css = css_get_next(&mem_cgroup_subsys, nextid, &root_mem->css, + nextid = root_memcg->last_scanned_child + 1; + css = css_get_next(&mem_cgroup_subsys, nextid, &root_memcg->css, &found); if (css && css_tryget(css)) ret = container_of(css, struct mem_cgroup, css); @@ -1533,9 +1498,9 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) /* Updates scanning parameter */ if (!css) { /* this means start scan from ID:1 */ - root_mem->last_scanned_child = 0; + root_memcg->last_scanned_child = 0; } else - root_mem->last_scanned_child = found; + root_memcg->last_scanned_child = found; } return ret; @@ -1551,14 +1516,14 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * reclaimable pages on a node. Returns true if there are any reclaimable * pages in the node. */ -static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, +static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg, int nid, bool noswap) { - if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_FILE)) + if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE)) return true; if (noswap || !total_swap_pages) return false; - if (mem_cgroup_node_nr_lru_pages(mem, nid, LRU_ALL_ANON)) + if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON)) return true; return false; @@ -1571,29 +1536,29 @@ static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *mem, * nodes based on the zonelist. So update the list loosely once per 10 secs. * */ -static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) +static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg) { int nid; /* * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET * pagein/pageout changes since the last update. */ - if (!atomic_read(&mem->numainfo_events)) + if (!atomic_read(&memcg->numainfo_events)) return; - if (atomic_inc_return(&mem->numainfo_updating) > 1) + if (atomic_inc_return(&memcg->numainfo_updating) > 1) return; /* make a nodemask where this memcg uses memory from */ - mem->scan_nodes = node_states[N_HIGH_MEMORY]; + memcg->scan_nodes = node_states[N_HIGH_MEMORY]; for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) { - if (!test_mem_cgroup_node_reclaimable(mem, nid, false)) - node_clear(nid, mem->scan_nodes); + if (!test_mem_cgroup_node_reclaimable(memcg, nid, false)) + node_clear(nid, memcg->scan_nodes); } - atomic_set(&mem->numainfo_events, 0); - atomic_set(&mem->numainfo_updating, 0); + atomic_set(&memcg->numainfo_events, 0); + atomic_set(&memcg->numainfo_updating, 0); } /* @@ -1608,16 +1573,16 @@ static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem) * * Now, we use round-robin. Better algorithm is welcomed. */ -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { int node; - mem_cgroup_may_update_nodemask(mem); - node = mem->last_scanned_node; + mem_cgroup_may_update_nodemask(memcg); + node = memcg->last_scanned_node; - node = next_node(node, mem->scan_nodes); + node = next_node(node, memcg->scan_nodes); if (node == MAX_NUMNODES) - node = first_node(mem->scan_nodes); + node = first_node(memcg->scan_nodes); /* * We call this when we hit limit, not when pages are added to LRU. * No LRU may hold pages because all pages are UNEVICTABLE or @@ -1627,7 +1592,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) if (unlikely(node == MAX_NUMNODES)) node = numa_node_id(); - mem->last_scanned_node = node; + memcg->last_scanned_node = node; return node; } @@ -1637,7 +1602,7 @@ int mem_cgroup_select_victim_node(struct mem_cgroup *mem) * unused nodes. But scan_nodes is lazily updated and may not cotain * enough new information. We need to do double check. */ -bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { int nid; @@ -1645,12 +1610,12 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) * quick check...making use of scan_node. * We can skip unused nodes. */ - if (!nodes_empty(mem->scan_nodes)) { - for (nid = first_node(mem->scan_nodes); + if (!nodes_empty(memcg->scan_nodes)) { + for (nid = first_node(memcg->scan_nodes); nid < MAX_NUMNODES; - nid = next_node(nid, mem->scan_nodes)) { + nid = next_node(nid, memcg->scan_nodes)) { - if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) return true; } } @@ -1658,77 +1623,39 @@ bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) * Check rest of nodes. */ for_each_node_state(nid, N_HIGH_MEMORY) { - if (node_isset(nid, mem->scan_nodes)) + if (node_isset(nid, memcg->scan_nodes)) continue; - if (test_mem_cgroup_node_reclaimable(mem, nid, noswap)) + if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) return true; } return false; } #else -int mem_cgroup_select_victim_node(struct mem_cgroup *mem) +int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) { return 0; } -bool mem_cgroup_reclaimable(struct mem_cgroup *mem, bool noswap) +bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) { - return test_mem_cgroup_node_reclaimable(mem, 0, noswap); + return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); } #endif -static void __mem_cgroup_record_scanstat(unsigned long *stats, - struct memcg_scanrecord *rec) -{ - - stats[SCAN] += rec->nr_scanned[0] + rec->nr_scanned[1]; - stats[SCAN_ANON] += rec->nr_scanned[0]; - stats[SCAN_FILE] += rec->nr_scanned[1]; - - stats[ROTATE] += rec->nr_rotated[0] + rec->nr_rotated[1]; - stats[ROTATE_ANON] += rec->nr_rotated[0]; - stats[ROTATE_FILE] += rec->nr_rotated[1]; - - stats[FREED] += rec->nr_freed[0] + rec->nr_freed[1]; - stats[FREED_ANON] += rec->nr_freed[0]; - stats[FREED_FILE] += rec->nr_freed[1]; - - stats[ELAPSED] += rec->elapsed; -} - -static void mem_cgroup_record_scanstat(struct memcg_scanrecord *rec) -{ - struct mem_cgroup *mem; - int context = rec->context; - - if (context >= NR_SCAN_CONTEXT) - return; - - mem = rec->mem; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.stats[context], rec); - spin_unlock(&mem->scanstat.lock); - - mem = rec->root; - spin_lock(&mem->scanstat.lock); - __mem_cgroup_record_scanstat(mem->scanstat.rootstats[context], rec); - spin_unlock(&mem->scanstat.lock); -} - /* * Scan the hierarchy if needed to reclaim memory. We remember the last child * we reclaimed from, so that we don't end up penalizing one child extensively * based on its position in the children list. * - * root_mem is the original ancestor that we've been reclaim from. + * root_memcg is the original ancestor that we've been reclaim from. * - * We give up and return to the caller when we visit root_mem twice. + * We give up and return to the caller when we visit root_memcg twice. * (other groups can be removed while we're walking....) * * If shrink==true, for avoiding to free too much, this returns immedieately. */ -static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, +static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_memcg, struct zone *zone, gfp_t gfp_mask, unsigned long reclaim_options, @@ -1740,28 +1667,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; - struct memcg_scanrecord rec; unsigned long excess; - unsigned long scanned; + unsigned long nr_scanned; - excess = res_counter_soft_limit_excess(&root_mem->res) >> PAGE_SHIFT; + excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; /* If memsw_is_minimum==1, swap-out is of-no-use. */ - if (!check_soft && !shrink && root_mem->memsw_is_minimum) + if (!check_soft && !shrink && root_memcg->memsw_is_minimum) noswap = true; - if (shrink) - rec.context = SCAN_BY_SHRINK; - else if (check_soft) - rec.context = SCAN_BY_SYSTEM; - else - rec.context = SCAN_BY_LIMIT; - - rec.root = root_mem; - while (1) { - victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) { + victim = mem_cgroup_select_victim(root_memcg); + if (victim == root_memcg) { loop++; /* * We are not draining per cpu cached charges during @@ -1770,7 +1687,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * charges will not give any. */ if (!check_soft && loop >= 1) - drain_all_stock_async(root_mem); + drain_all_stock_async(root_memcg); if (loop >= 2) { /* * If we have not been able to reclaim @@ -1799,23 +1716,14 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, css_put(&victim->css); continue; } - rec.mem = victim; - rec.nr_scanned[0] = 0; - rec.nr_scanned[1] = 0; - rec.nr_rotated[0] = 0; - rec.nr_rotated[1] = 0; - rec.nr_freed[0] = 0; - rec.nr_freed[1] = 0; - rec.elapsed = 0; /* we use swappiness of local cgroup */ if (check_soft) { ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, - noswap, zone, &rec, &scanned); - *total_scanned += scanned; + noswap, zone, &nr_scanned); + *total_scanned += nr_scanned; } else ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, - noswap, &rec); - mem_cgroup_record_scanstat(&rec); + noswap); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -1826,9 +1734,9 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, return ret; total += ret; if (check_soft) { - if (!res_counter_soft_limit_excess(&root_mem->res)) + if (!res_counter_soft_limit_excess(&root_memcg->res)) return total; - } else if (mem_cgroup_margin(root_mem)) + } else if (mem_cgroup_margin(root_memcg)) return total; } return total; @@ -1839,69 +1747,62 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * If someone is running, return false. * Has to be called with memcg_oom_lock */ -static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) +static bool mem_cgroup_oom_lock(struct mem_cgroup *memcg) { - int lock_count = -1; struct mem_cgroup *iter, *failed = NULL; bool cond = true; - for_each_mem_cgroup_tree_cond(iter, mem, cond) { - bool locked = iter->oom_lock; - - iter->oom_lock = true; - if (lock_count == -1) - lock_count = iter->oom_lock; - else if (lock_count != locked) { + for_each_mem_cgroup_tree_cond(iter, memcg, cond) { + if (iter->oom_lock) { /* * this subtree of our hierarchy is already locked * so we cannot give a lock. */ - lock_count = 0; failed = iter; cond = false; - } + } else + iter->oom_lock = true; } if (!failed) - goto done; + return true; /* * OK, we failed to lock the whole subtree so we have to clean up * what we set up to the failing subtree */ cond = true; - for_each_mem_cgroup_tree_cond(iter, mem, cond) { + for_each_mem_cgroup_tree_cond(iter, memcg, cond) { if (iter == failed) { cond = false; continue; } iter->oom_lock = false; } -done: - return lock_count; + return false; } /* * Has to be called with memcg_oom_lock */ -static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) +static int mem_cgroup_oom_unlock(struct mem_cgroup *memcg) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) iter->oom_lock = false; return 0; } -static void mem_cgroup_mark_under_oom(struct mem_cgroup *mem) +static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) atomic_inc(&iter->under_oom); } -static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) +static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) { struct mem_cgroup *iter; @@ -1910,7 +1811,7 @@ static void mem_cgroup_unmark_under_oom(struct mem_cgroup *mem) * mem_cgroup_oom_lock() may not be called. We have to use * atomic_add_unless() here. */ - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) atomic_add_unless(&iter->under_oom, -1, 0); } @@ -1925,85 +1826,85 @@ struct oom_wait_info { static int memcg_oom_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *arg) { - struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg, - *oom_wait_mem; + struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg, + *oom_wait_memcg; struct oom_wait_info *oom_wait_info; oom_wait_info = container_of(wait, struct oom_wait_info, wait); - oom_wait_mem = oom_wait_info->mem; + oom_wait_memcg = oom_wait_info->mem; /* * Both of oom_wait_info->mem and wake_mem are stable under us. * Then we can use css_is_ancestor without taking care of RCU. */ - if (!mem_cgroup_same_or_subtree(oom_wait_mem, wake_mem) - && !mem_cgroup_same_or_subtree(wake_mem, oom_wait_mem)) + if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) + && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) return 0; return autoremove_wake_function(wait, mode, sync, arg); } -static void memcg_wakeup_oom(struct mem_cgroup *mem) +static void memcg_wakeup_oom(struct mem_cgroup *memcg) { - /* for filtering, pass "mem" as argument. */ - __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); + /* for filtering, pass "memcg" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); } -static void memcg_oom_recover(struct mem_cgroup *mem) +static void memcg_oom_recover(struct mem_cgroup *memcg) { - if (mem && atomic_read(&mem->under_oom)) - memcg_wakeup_oom(mem); + if (memcg && atomic_read(&memcg->under_oom)) + memcg_wakeup_oom(memcg); } /* * try to call OOM killer. returns false if we should exit memory-reclaim loop. */ -bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) +bool mem_cgroup_handle_oom(struct mem_cgroup *memcg, gfp_t mask) { struct oom_wait_info owait; bool locked, need_to_kill; - owait.mem = mem; + owait.mem = memcg; owait.wait.flags = 0; owait.wait.func = memcg_oom_wake_function; owait.wait.private = current; INIT_LIST_HEAD(&owait.wait.task_list); need_to_kill = true; - mem_cgroup_mark_under_oom(mem); + mem_cgroup_mark_under_oom(memcg); - /* At first, try to OOM lock hierarchy under mem.*/ + /* At first, try to OOM lock hierarchy under memcg.*/ spin_lock(&memcg_oom_lock); - locked = mem_cgroup_oom_lock(mem); + locked = mem_cgroup_oom_lock(memcg); /* * Even if signal_pending(), we can't quit charge() loop without * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL * under OOM is always welcomed, use TASK_KILLABLE here. */ prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); - if (!locked || mem->oom_kill_disable) + if (!locked || memcg->oom_kill_disable) need_to_kill = false; if (locked) - mem_cgroup_oom_notify(mem); + mem_cgroup_oom_notify(memcg); spin_unlock(&memcg_oom_lock); if (need_to_kill) { finish_wait(&memcg_oom_waitq, &owait.wait); - mem_cgroup_out_of_memory(mem, mask); + mem_cgroup_out_of_memory(memcg, mask); } else { schedule(); finish_wait(&memcg_oom_waitq, &owait.wait); } spin_lock(&memcg_oom_lock); if (locked) - mem_cgroup_oom_unlock(mem); - memcg_wakeup_oom(mem); + mem_cgroup_oom_unlock(memcg); + memcg_wakeup_oom(memcg); spin_unlock(&memcg_oom_lock); - mem_cgroup_unmark_under_oom(mem); + mem_cgroup_unmark_under_oom(memcg); if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) return false; /* Give chance to dying process */ - schedule_timeout(1); + schedule_timeout_uninterruptible(1); return true; } @@ -2034,7 +1935,7 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) void mem_cgroup_update_page_stat(struct page *page, enum mem_cgroup_page_stat_item idx, int val) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; unsigned long uninitialized_var(flags); @@ -2043,16 +1944,16 @@ void mem_cgroup_update_page_stat(struct page *page, return; rcu_read_lock(); - mem = pc->mem_cgroup; - if (unlikely(!mem || !PageCgroupUsed(pc))) + memcg = pc->mem_cgroup; + if (unlikely(!memcg || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { + if (unlikely(mem_cgroup_stealed(memcg)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ move_lock_page_cgroup(pc, &flags); need_unlock = true; - mem = pc->mem_cgroup; - if (!mem || !PageCgroupUsed(pc)) + memcg = pc->mem_cgroup; + if (!memcg || !PageCgroupUsed(pc)) goto out; } @@ -2068,7 +1969,7 @@ void mem_cgroup_update_page_stat(struct page *page, BUG(); } - this_cpu_add(mem->stat->count[idx], val); + this_cpu_add(memcg->stat->count[idx], val); out: if (unlikely(need_unlock)) @@ -2091,6 +1992,7 @@ struct memcg_stock_pcp { #define FLUSHING_CACHED_CHARGE (0) }; static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static DEFINE_MUTEX(percpu_charge_mutex); /* * Try to consume stocked charge on this cpu. If success, one page is consumed @@ -2098,13 +2000,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); * cgroup which is not current target, returns false. This stock will be * refilled. */ -static bool consume_stock(struct mem_cgroup *mem) +static bool consume_stock(struct mem_cgroup *memcg) { struct memcg_stock_pcp *stock; bool ret = true; stock = &get_cpu_var(memcg_stock); - if (mem == stock->cached && stock->nr_pages) + if (memcg == stock->cached && stock->nr_pages) stock->nr_pages--; else /* need to call res_counter_charge */ ret = false; @@ -2145,44 +2047,38 @@ static void drain_local_stock(struct work_struct *dummy) * Cache charges(val) which is from res_counter, to local per_cpu area. * This will be consumed by consume_stock() function, later. */ -static void refill_stock(struct mem_cgroup *mem, unsigned int nr_pages) +static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) { struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); - if (stock->cached != mem) { /* reset if necessary */ + if (stock->cached != memcg) { /* reset if necessary */ drain_stock(stock); - stock->cached = mem; + stock->cached = memcg; } stock->nr_pages += nr_pages; put_cpu_var(memcg_stock); } /* - * Drains all per-CPU charge caches for given root_mem resp. subtree + * Drains all per-CPU charge caches for given root_memcg resp. subtree * of the hierarchy under it. sync flag says whether we should block * until the work is done. */ -static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) +static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) { int cpu, curcpu; /* Notify other cpus that system-wide "drain" is running */ get_online_cpus(); - /* - * Get a hint for avoiding draining charges on the current cpu, - * which must be exhausted by our charging. It is not required that - * this be a precise check, so we use raw_smp_processor_id() instead of - * getcpu()/putcpu(). - */ - curcpu = raw_smp_processor_id(); + curcpu = get_cpu(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - struct mem_cgroup *mem; + struct mem_cgroup *memcg; - mem = stock->cached; - if (!mem || !stock->nr_pages) + memcg = stock->cached; + if (!memcg || !stock->nr_pages) continue; - if (!mem_cgroup_same_or_subtree(root_mem, mem)) + if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) continue; if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { if (cpu == curcpu) @@ -2191,14 +2087,14 @@ static void drain_all_stock(struct mem_cgroup *root_mem, bool sync) schedule_work_on(cpu, &stock->work); } } + put_cpu(); if (!sync) goto out; for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); - if (mem_cgroup_same_or_subtree(root_mem, stock->cached) && - test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) + if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) flush_work(&stock->work); } out: @@ -2211,51 +2107,59 @@ out: * expects some charges will be back to res_counter later but cannot wait for * it. */ -static void drain_all_stock_async(struct mem_cgroup *root_mem) +static void drain_all_stock_async(struct mem_cgroup *root_memcg) { - drain_all_stock(root_mem, false); + /* + * If someone calls draining, avoid adding more kworker runs. + */ + if (!mutex_trylock(&percpu_charge_mutex)) + return; + drain_all_stock(root_memcg, false); + mutex_unlock(&percpu_charge_mutex); } /* This is a synchronous drain interface. */ -static void drain_all_stock_sync(struct mem_cgroup *root_mem) +static void drain_all_stock_sync(struct mem_cgroup *root_memcg) { /* called when force_empty is called */ - drain_all_stock(root_mem, true); + mutex_lock(&percpu_charge_mutex); + drain_all_stock(root_memcg, true); + mutex_unlock(&percpu_charge_mutex); } /* * This function drains percpu counter value from DEAD cpu and * move it to local cpu. Note that this function can be preempted. */ -static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) { int i; - spin_lock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { - long x = per_cpu(mem->stat->count[i], cpu); + long x = per_cpu(memcg->stat->count[i], cpu); - per_cpu(mem->stat->count[i], cpu) = 0; - mem->nocpu_base.count[i] += x; + per_cpu(memcg->stat->count[i], cpu) = 0; + memcg->nocpu_base.count[i] += x; } for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { - unsigned long x = per_cpu(mem->stat->events[i], cpu); + unsigned long x = per_cpu(memcg->stat->events[i], cpu); - per_cpu(mem->stat->events[i], cpu) = 0; - mem->nocpu_base.events[i] += x; + per_cpu(memcg->stat->events[i], cpu) = 0; + memcg->nocpu_base.events[i] += x; } /* need to clear ON_MOVE value, works as a kind of lock. */ - per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; - spin_unlock(&mem->pcp_counter_lock); + per_cpu(memcg->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; + spin_unlock(&memcg->pcp_counter_lock); } -static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) +static void synchronize_mem_cgroup_on_move(struct mem_cgroup *memcg, int cpu) { int idx = MEM_CGROUP_ON_MOVE; - spin_lock(&mem->pcp_counter_lock); - per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; - spin_unlock(&mem->pcp_counter_lock); + spin_lock(&memcg->pcp_counter_lock); + per_cpu(memcg->stat->count[idx], cpu) = memcg->nocpu_base.count[idx]; + spin_unlock(&memcg->pcp_counter_lock); } static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, @@ -2293,7 +2197,7 @@ enum { CHARGE_OOM_DIE, /* the current is killed because of OOM */ }; -static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, +static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages, bool oom_check) { unsigned long csize = nr_pages * PAGE_SIZE; @@ -2302,16 +2206,16 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, unsigned long flags = 0; int ret; - ret = res_counter_charge(&mem->res, csize, &fail_res); + ret = res_counter_charge(&memcg->res, csize, &fail_res); if (likely(!ret)) { if (!do_swap_account) return CHARGE_OK; - ret = res_counter_charge(&mem->memsw, csize, &fail_res); + ret = res_counter_charge(&memcg->memsw, csize, &fail_res); if (likely(!ret)) return CHARGE_OK; - res_counter_uncharge(&mem->res, csize); + res_counter_uncharge(&memcg->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else @@ -2369,12 +2273,12 @@ static int mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, unsigned int nr_pages, - struct mem_cgroup **memcg, + struct mem_cgroup **ptr, bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; int ret; /* @@ -2392,17 +2296,17 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - if (!*memcg && !mm) + if (!*ptr && !mm) goto bypass; again: - if (*memcg) { /* css should be a valid one */ - mem = *memcg; - VM_BUG_ON(css_is_removed(&mem->css)); - if (mem_cgroup_is_root(mem)) + if (*ptr) { /* css should be a valid one */ + memcg = *ptr; + VM_BUG_ON(css_is_removed(&memcg->css)); + if (mem_cgroup_is_root(memcg)) goto done; - if (nr_pages == 1 && consume_stock(mem)) + if (nr_pages == 1 && consume_stock(memcg)) goto done; - css_get(&mem->css); + css_get(&memcg->css); } else { struct task_struct *p; @@ -2410,7 +2314,7 @@ again: p = rcu_dereference(mm->owner); /* * Because we don't have task_lock(), "p" can exit. - * In that case, "mem" can point to root or p can be NULL with + * In that case, "memcg" can point to root or p can be NULL with * race with swapoff. Then, we have small risk of mis-accouning. * But such kind of mis-account by race always happens because * we don't have cgroup_mutex(). It's overkill and we allo that @@ -2418,12 +2322,12 @@ again: * (*) swapoff at el will charge against mm-struct not against * task-struct. So, mm->owner can be NULL. */ - mem = mem_cgroup_from_task(p); - if (!mem || mem_cgroup_is_root(mem)) { + memcg = mem_cgroup_from_task(p); + if (!memcg || mem_cgroup_is_root(memcg)) { rcu_read_unlock(); goto done; } - if (nr_pages == 1 && consume_stock(mem)) { + if (nr_pages == 1 && consume_stock(memcg)) { /* * It seems dagerous to access memcg without css_get(). * But considering how consume_stok works, it's not @@ -2436,7 +2340,7 @@ again: goto done; } /* after here, we may be blocked. we need to get refcnt */ - if (!css_tryget(&mem->css)) { + if (!css_tryget(&memcg->css)) { rcu_read_unlock(); goto again; } @@ -2448,7 +2352,7 @@ again: /* If killed, bypass charge */ if (fatal_signal_pending(current)) { - css_put(&mem->css); + css_put(&memcg->css); goto bypass; } @@ -2458,43 +2362,43 @@ again: nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; } - ret = mem_cgroup_do_charge(mem, gfp_mask, batch, oom_check); + ret = mem_cgroup_do_charge(memcg, gfp_mask, batch, oom_check); switch (ret) { case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ batch = nr_pages; - css_put(&mem->css); - mem = NULL; + css_put(&memcg->css); + memcg = NULL; goto again; case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ - css_put(&mem->css); + css_put(&memcg->css); goto nomem; case CHARGE_NOMEM: /* OOM routine works */ if (!oom) { - css_put(&mem->css); + css_put(&memcg->css); goto nomem; } /* If oom, we never return -ENOMEM */ nr_oom_retries--; break; case CHARGE_OOM_DIE: /* Killed by OOM Killer */ - css_put(&mem->css); + css_put(&memcg->css); goto bypass; } } while (ret != CHARGE_OK); if (batch > nr_pages) - refill_stock(mem, batch - nr_pages); - css_put(&mem->css); + refill_stock(memcg, batch - nr_pages); + css_put(&memcg->css); done: - *memcg = mem; + *ptr = memcg; return 0; nomem: - *memcg = NULL; + *ptr = NULL; return -ENOMEM; bypass: - *memcg = NULL; + *ptr = NULL; return 0; } @@ -2503,15 +2407,15 @@ bypass: * This function is for that and do uncharge, put css's refcnt. * gotten by try_charge(). */ -static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, +static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages) { - if (!mem_cgroup_is_root(mem)) { + if (!mem_cgroup_is_root(memcg)) { unsigned long bytes = nr_pages * PAGE_SIZE; - res_counter_uncharge(&mem->res, bytes); + res_counter_uncharge(&memcg->res, bytes); if (do_swap_account) - res_counter_uncharge(&mem->memsw, bytes); + res_counter_uncharge(&memcg->memsw, bytes); } } @@ -2536,7 +2440,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; @@ -2546,23 +2450,23 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { - mem = pc->mem_cgroup; - if (mem && !css_tryget(&mem->css)) - mem = NULL; + memcg = pc->mem_cgroup; + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup(ent); rcu_read_lock(); - mem = mem_cgroup_lookup(id); - if (mem && !css_tryget(&mem->css)) - mem = NULL; + memcg = mem_cgroup_lookup(id); + if (memcg && !css_tryget(&memcg->css)) + memcg = NULL; rcu_read_unlock(); } unlock_page_cgroup(pc); - return mem; + return memcg; } -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, +static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, struct page *page, unsigned int nr_pages, struct page_cgroup *pc, @@ -2571,14 +2475,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - __mem_cgroup_cancel_charge(mem, nr_pages); + __mem_cgroup_cancel_charge(memcg, nr_pages); return; } /* * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. */ - pc->mem_cgroup = mem; + pc->mem_cgroup = memcg; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). * Especially when a page_cgroup is taken from a page, pc->mem_cgroup @@ -2601,14 +2505,14 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. * if they exceeds softlimit. */ - memcg_check_events(mem, page); + memcg_check_events(memcg, page); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -2795,7 +2699,7 @@ out: static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; bool oom = true; @@ -2814,11 +2718,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, pc = lookup_page_cgroup(page); BUG_ON(!pc); /* XXX: remove this and move pc lookup into commit */ - ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &mem, oom); - if (ret || !mem) + ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); + if (ret || !memcg) return ret; - __mem_cgroup_commit_charge(mem, page, nr_pages, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, nr_pages, pc, ctype); return 0; } @@ -2847,7 +2751,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, enum charge_type ctype); static void -__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, +__mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) { struct page_cgroup *pc = lookup_page_cgroup(page); @@ -2857,7 +2761,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, * LRU. Take care of it. */ mem_cgroup_lru_del_before_commit(page); - __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); mem_cgroup_lru_add_after_commit(page); return; } @@ -2865,7 +2769,7 @@ __mem_cgroup_commit_charge_lrucare(struct page *page, struct mem_cgroup *mem, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; int ret; if (mem_cgroup_disabled()) @@ -2877,8 +2781,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, mm = &init_mm; if (page_is_file_cache(page)) { - ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &mem, true); - if (ret || !mem) + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); + if (ret || !memcg) return ret; /* @@ -2886,15 +2790,15 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, * put that would remove them from the LRU list, make * sure that they get relinked properly. */ - __mem_cgroup_commit_charge_lrucare(page, mem, + __mem_cgroup_commit_charge_lrucare(page, memcg, MEM_CGROUP_CHARGE_TYPE_CACHE); return ret; } /* shmem */ if (PageSwapCache(page)) { - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) - __mem_cgroup_commit_charge_swapin(page, mem, + __mem_cgroup_commit_charge_swapin(page, memcg, MEM_CGROUP_CHARGE_TYPE_SHMEM); } else ret = mem_cgroup_charge_common(page, mm, gfp_mask, @@ -2913,7 +2817,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, gfp_t mask, struct mem_cgroup **ptr) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; int ret; *ptr = NULL; @@ -2931,12 +2835,12 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, */ if (!PageSwapCache(page)) goto charge_cur_mm; - mem = try_get_mem_cgroup_from_page(page); - if (!mem) + memcg = try_get_mem_cgroup_from_page(page); + if (!memcg) goto charge_cur_mm; - *ptr = mem; + *ptr = memcg; ret = __mem_cgroup_try_charge(NULL, mask, 1, ptr, true); - css_put(&mem->css); + css_put(&memcg->css); return ret; charge_cur_mm: if (unlikely(!mm)) @@ -2996,16 +2900,16 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) MEM_CGROUP_CHARGE_TYPE_MAPPED); } -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) +void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return; - if (!mem) + if (!memcg) return; - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(memcg, 1); } -static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, +static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages, const enum charge_type ctype) { @@ -3023,7 +2927,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, * uncharges. Then, it's ok to ignore memcg's refcnt. */ if (!batch->memcg) - batch->memcg = mem; + batch->memcg = memcg; /* * do_batch > 0 when unmapping pages or inode invalidate/truncate. * In those cases, all pages freed continuously can be expected to be in @@ -3043,7 +2947,7 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, * merge a series of uncharges to an uncharge of res_counter. * If not, we uncharge res_counter ony by one. */ - if (batch->memcg != mem) + if (batch->memcg != memcg) goto direct_uncharge; /* remember freed charge and uncharge it later */ batch->nr_pages++; @@ -3051,11 +2955,11 @@ static void mem_cgroup_do_uncharge(struct mem_cgroup *mem, batch->memsw_nr_pages++; return; direct_uncharge: - res_counter_uncharge(&mem->res, nr_pages * PAGE_SIZE); + res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, nr_pages * PAGE_SIZE); - if (unlikely(batch->memcg != mem)) - memcg_oom_recover(mem); + res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); + if (unlikely(batch->memcg != memcg)) + memcg_oom_recover(memcg); return; } @@ -3065,7 +2969,7 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; struct page_cgroup *pc; @@ -3088,7 +2992,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) lock_page_cgroup(pc); - mem = pc->mem_cgroup; + memcg = pc->mem_cgroup; if (!PageCgroupUsed(pc)) goto unlock_out; @@ -3111,7 +3015,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -nr_pages); + mem_cgroup_charge_statistics(memcg, PageCgroupCache(pc), -nr_pages); ClearPageCgroupUsed(pc); /* @@ -3123,18 +3027,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) unlock_page_cgroup(pc); /* - * even after unlock, we have mem->res.usage here and this memcg + * even after unlock, we have memcg->res.usage here and this memcg * will never be freed. */ - memcg_check_events(mem, page); + memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { - mem_cgroup_swap_statistics(mem, true); - mem_cgroup_get(mem); + mem_cgroup_swap_statistics(memcg, true); + mem_cgroup_get(memcg); } - if (!mem_cgroup_is_root(mem)) - mem_cgroup_do_uncharge(mem, nr_pages, ctype); + if (!mem_cgroup_is_root(memcg)) + mem_cgroup_do_uncharge(memcg, nr_pages, ctype); - return mem; + return memcg; unlock_out: unlock_page_cgroup(pc); @@ -3324,7 +3228,7 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; + struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; int ret = 0; @@ -3338,8 +3242,8 @@ int mem_cgroup_prepare_migration(struct page *page, pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { - mem = pc->mem_cgroup; - css_get(&mem->css); + memcg = pc->mem_cgroup; + css_get(&memcg->css); /* * At migrating an anonymous page, its mapcount goes down * to 0 and uncharge() will be called. But, even if it's fully @@ -3377,12 +3281,12 @@ int mem_cgroup_prepare_migration(struct page *page, * If the page is not charged at this point, * we return here. */ - if (!mem) + if (!memcg) return 0; - *ptr = mem; + *ptr = memcg; ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, ptr, false); - css_put(&mem->css);/* drop extra refcnt */ + css_put(&memcg->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { lock_page_cgroup(pc); @@ -3408,21 +3312,21 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, page, 1, pc, ctype); + __mem_cgroup_commit_charge(memcg, page, 1, pc, ctype); return ret; } /* remove redundant charge if migration failed*/ -void mem_cgroup_end_migration(struct mem_cgroup *mem, +void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; - if (!mem) + if (!memcg) return; /* blocks rmdir() */ - cgroup_exclude_rmdir(&mem->css); + cgroup_exclude_rmdir(&memcg->css); if (!migration_ok) { used = oldpage; unused = newpage; @@ -3458,7 +3362,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, * So, rmdir()->pre_destroy() can be called while we do this charge. * In that case, we need to call pre_destroy() again. check it here. */ - cgroup_release_and_wakeup_rmdir(&mem->css); + cgroup_release_and_wakeup_rmdir(&memcg->css); } #ifdef CONFIG_DEBUG_VM @@ -3537,7 +3441,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee mem->res.limit < mem->memsw.limit. + * We have to guarantee memcg->res.limit < memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); @@ -3599,7 +3503,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee mem->res.limit < mem->memsw.limit. + * We have to guarantee memcg->res.limit < memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); @@ -3737,7 +3641,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. */ -static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, +static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { struct zone *zone; @@ -3748,7 +3652,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; - mz = mem_cgroup_zoneinfo(mem, node, zid); + mz = mem_cgroup_zoneinfo(memcg, node, zid); list = &mz->lists[lru]; loop = MEM_CGROUP_ZSTAT(mz, lru); @@ -3775,7 +3679,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, page = lookup_cgroup_page(pc); - ret = mem_cgroup_move_parent(page, pc, mem, GFP_KERNEL); + ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); if (ret == -ENOMEM) break; @@ -3796,14 +3700,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, * make mem_cgroup's charge to be 0 if there is no task. * This enables deleting this mem_cgroup. */ -static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) +static int mem_cgroup_force_empty(struct mem_cgroup *memcg, bool free_all) { int ret; int node, zid, shrink; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct cgroup *cgrp = mem->css.cgroup; + struct cgroup *cgrp = memcg->css.cgroup; - css_get(&mem->css); + css_get(&memcg->css); shrink = 0; /* should free all ? */ @@ -3819,14 +3723,14 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); - drain_all_stock_sync(mem); + drain_all_stock_sync(memcg); ret = 0; - mem_cgroup_start_move(mem); + mem_cgroup_start_move(memcg); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; for_each_lru(l) { - ret = mem_cgroup_force_empty_list(mem, + ret = mem_cgroup_force_empty_list(memcg, node, zid, l); if (ret) break; @@ -3835,16 +3739,16 @@ move_account: if (ret) break; } - mem_cgroup_end_move(mem); - memcg_oom_recover(mem); + mem_cgroup_end_move(memcg); + memcg_oom_recover(memcg); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) goto try_to_free; cond_resched(); /* "ret" should also be checked to ensure all lists are empty. */ - } while (mem->res.usage > 0 || ret); + } while (memcg->res.usage > 0 || ret); out: - css_put(&mem->css); + css_put(&memcg->css); return ret; try_to_free: @@ -3857,19 +3761,15 @@ try_to_free: lru_add_drain_all(); /* try to free all pages in this cgroup */ shrink = 1; - while (nr_retries && mem->res.usage > 0) { - struct memcg_scanrecord rec; + while (nr_retries && memcg->res.usage > 0) { int progress; if (signal_pending(current)) { ret = -EINTR; goto out; } - rec.context = SCAN_BY_SHRINK; - rec.mem = mem; - rec.root = mem; - progress = try_to_free_mem_cgroup_pages(mem, GFP_KERNEL, - false, &rec); + progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, + false); if (!progress) { nr_retries--; /* maybe some writeback is necessary */ @@ -3897,12 +3797,12 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, u64 val) { int retval = 0; - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); struct cgroup *parent = cont->parent; - struct mem_cgroup *parent_mem = NULL; + struct mem_cgroup *parent_memcg = NULL; if (parent) - parent_mem = mem_cgroup_from_cont(parent); + parent_memcg = mem_cgroup_from_cont(parent); cgroup_lock(); /* @@ -3913,10 +3813,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, * For the root cgroup, parent_mem is NULL, we allow value to be * set if there are no children. */ - if ((!parent_mem || !parent_mem->use_hierarchy) && + if ((!parent_memcg || !parent_memcg->use_hierarchy) && (val == 1 || val == 0)) { if (list_empty(&cont->children)) - mem->use_hierarchy = val; + memcg->use_hierarchy = val; else retval = -EBUSY; } else @@ -3927,14 +3827,14 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, } -static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, +static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx) { struct mem_cgroup *iter; long val = 0; /* Per-cpu values can be negative, use a signed accumulator */ - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) val += mem_cgroup_read_stat(iter, idx); if (val < 0) /* race ? */ @@ -3942,29 +3842,29 @@ static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *mem, return val; } -static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) +static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) { u64 val; - if (!mem_cgroup_is_root(mem)) { + if (!mem_cgroup_is_root(memcg)) { if (!swap) - return res_counter_read_u64(&mem->res, RES_USAGE); + return res_counter_read_u64(&memcg->res, RES_USAGE); else - return res_counter_read_u64(&mem->memsw, RES_USAGE); + return res_counter_read_u64(&memcg->memsw, RES_USAGE); } - val = mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_CACHE); - val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_recursive_stat(mem, MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); return val << PAGE_SHIFT; } static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); u64 val; int type, name; @@ -3973,15 +3873,15 @@ static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) switch (type) { case _MEM: if (name == RES_USAGE) - val = mem_cgroup_usage(mem, false); + val = mem_cgroup_usage(memcg, false); else - val = res_counter_read_u64(&mem->res, name); + val = res_counter_read_u64(&memcg->res, name); break; case _MEMSWAP: if (name == RES_USAGE) - val = mem_cgroup_usage(mem, true); + val = mem_cgroup_usage(memcg, true); else - val = res_counter_read_u64(&mem->memsw, name); + val = res_counter_read_u64(&memcg->memsw, name); break; default: BUG(); @@ -4069,24 +3969,24 @@ out: static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) { - struct mem_cgroup *mem; + struct mem_cgroup *memcg; int type, name; - mem = mem_cgroup_from_cont(cont); + memcg = mem_cgroup_from_cont(cont); type = MEMFILE_TYPE(event); name = MEMFILE_ATTR(event); switch (name) { case RES_MAX_USAGE: if (type == _MEM) - res_counter_reset_max(&mem->res); + res_counter_reset_max(&memcg->res); else - res_counter_reset_max(&mem->memsw); + res_counter_reset_max(&memcg->memsw); break; case RES_FAILCNT: if (type == _MEM) - res_counter_reset_failcnt(&mem->res); + res_counter_reset_failcnt(&memcg->res); else - res_counter_reset_failcnt(&mem->memsw); + res_counter_reset_failcnt(&memcg->memsw); break; } @@ -4103,7 +4003,7 @@ static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, static int mem_cgroup_move_charge_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); if (val >= (1 << NR_MOVE_TYPE)) return -EINVAL; @@ -4113,7 +4013,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, * inconsistent. */ cgroup_lock(); - mem->move_charge_at_immigrate = val; + memcg->move_charge_at_immigrate = val; cgroup_unlock(); return 0; @@ -4170,49 +4070,49 @@ struct { static void -mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +mem_cgroup_get_local_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) { s64 val; /* per cpu stat */ - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE); s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED); s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGIN); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGIN); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGPGOUT); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGPGOUT); s->stat[MCS_PGPGOUT] += val; if (do_swap_account) { - val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); + val = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); s->stat[MCS_SWAP] += val * PAGE_SIZE; } - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGFAULT); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGFAULT); s->stat[MCS_PGFAULT] += val; - val = mem_cgroup_read_events(mem, MEM_CGROUP_EVENTS_PGMAJFAULT); + val = mem_cgroup_read_events(memcg, MEM_CGROUP_EVENTS_PGMAJFAULT); s->stat[MCS_PGMAJFAULT] += val; /* per zone stat */ - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_ANON)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_ANON)); s->stat[MCS_INACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_ANON)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_ANON)); s->stat[MCS_ACTIVE_ANON] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_INACTIVE_FILE)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_INACTIVE_FILE)); s->stat[MCS_INACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_ACTIVE_FILE)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_ACTIVE_FILE)); s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; - val = mem_cgroup_nr_lru_pages(mem, BIT(LRU_UNEVICTABLE)); + val = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; } static void -mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) +mem_cgroup_get_total_stat(struct mem_cgroup *memcg, struct mcs_total_stat *s) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) mem_cgroup_get_local_stat(iter, s); } @@ -4298,8 +4198,6 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } #ifdef CONFIG_DEBUG_VM - cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); - { int nid, zid; struct mem_cgroup_per_zone *mz; @@ -4436,20 +4334,20 @@ static int compare_thresholds(const void *a, const void *b) return _a->threshold - _b->threshold; } -static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) { struct mem_cgroup_eventfd_list *ev; - list_for_each_entry(ev, &mem->oom_notify, list) + list_for_each_entry(ev, &memcg->oom_notify, list) eventfd_signal(ev->eventfd, 1); return 0; } -static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) { struct mem_cgroup *iter; - for_each_mem_cgroup_tree(iter, mem) + for_each_mem_cgroup_tree(iter, memcg) mem_cgroup_oom_notify_cb(iter); } @@ -4639,7 +4537,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp, static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, struct cftype *cft, struct eventfd_ctx *eventfd) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup_eventfd_list *ev, *tmp; int type = MEMFILE_TYPE(cft->private); @@ -4647,7 +4545,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, spin_lock(&memcg_oom_lock); - list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) { if (ev->eventfd == eventfd) { list_del(&ev->list); kfree(ev); @@ -4660,11 +4558,11 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, static int mem_cgroup_oom_control_read(struct cgroup *cgrp, struct cftype *cft, struct cgroup_map_cb *cb) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); - cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); + cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable); - if (atomic_read(&mem->under_oom)) + if (atomic_read(&memcg->under_oom)) cb->fill(cb, "under_oom", 1); else cb->fill(cb, "under_oom", 0); @@ -4674,7 +4572,7 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp, static int mem_cgroup_oom_control_write(struct cgroup *cgrp, struct cftype *cft, u64 val) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent; /* cannot set to root cgroup and only 0 and 1 are allowed */ @@ -4686,13 +4584,13 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, cgroup_lock(); /* oom-kill-disable is a flag for subhierarchy. */ if ((parent->use_hierarchy) || - (mem->use_hierarchy && !list_empty(&cgrp->children))) { + (memcg->use_hierarchy && !list_empty(&cgrp->children))) { cgroup_unlock(); return -EINVAL; } - mem->oom_kill_disable = val; + memcg->oom_kill_disable = val; if (!val) - memcg_oom_recover(mem); + memcg_oom_recover(memcg); cgroup_unlock(); return 0; } @@ -4713,54 +4611,6 @@ static int mem_control_numa_stat_open(struct inode *unused, struct file *file) } #endif /* CONFIG_NUMA */ -static int mem_cgroup_vmscan_stat_read(struct cgroup *cgrp, - struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - char string[64]; - int i; - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_LIMIT][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - cb->fill(cb, string, mem->scanstat.stats[SCAN_BY_SYSTEM][i]); - } - - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_LIMIT); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_LIMIT][i]); - } - for (i = 0; i < NR_SCANSTATS; i++) { - strcpy(string, scanstat_string[i]); - strcat(string, SCANSTAT_WORD_SYSTEM); - strcat(string, SCANSTAT_WORD_HIERARCHY); - cb->fill(cb, string, mem->scanstat.rootstats[SCAN_BY_SYSTEM][i]); - } - return 0; -} - -static int mem_cgroup_reset_vmscan_stat(struct cgroup *cgrp, - unsigned int event) -{ - struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); - - spin_lock(&mem->scanstat.lock); - memset(&mem->scanstat.stats, 0, sizeof(mem->scanstat.stats)); - memset(&mem->scanstat.rootstats, 0, sizeof(mem->scanstat.rootstats)); - spin_unlock(&mem->scanstat.lock); - return 0; -} - - static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", @@ -4831,11 +4681,6 @@ static struct cftype mem_cgroup_files[] = { .mode = S_IRUGO, }, #endif - { - .name = "vmscan_stat", - .read_map = mem_cgroup_vmscan_stat_read, - .trigger = mem_cgroup_reset_vmscan_stat, - }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -4881,7 +4726,7 @@ static int register_memsw_files(struct cgroup *cont, struct cgroup_subsys *ss) } #endif -static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; @@ -4901,21 +4746,21 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) if (!pn) return 1; - mem->info.nodeinfo[node] = pn; for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); mz->usage_in_excess = 0; mz->on_tree = false; - mz->mem = mem; + mz->mem = memcg; } + memcg->info.nodeinfo[node] = pn; return 0; } -static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) +static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(mem->info.nodeinfo[node]); + kfree(memcg->info.nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) @@ -4957,51 +4802,51 @@ out_free: * Removal of cgroup itself succeeds regardless of refs from swap. */ -static void __mem_cgroup_free(struct mem_cgroup *mem) +static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; - mem_cgroup_remove_from_trees(mem); - free_css_id(&mem_cgroup_subsys, &mem->css); + mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node_state(node, N_POSSIBLE) - free_mem_cgroup_per_zone_info(mem, node); + free_mem_cgroup_per_zone_info(memcg, node); - free_percpu(mem->stat); + free_percpu(memcg->stat); if (sizeof(struct mem_cgroup) < PAGE_SIZE) - kfree(mem); + kfree(memcg); else - vfree(mem); + vfree(memcg); } -static void mem_cgroup_get(struct mem_cgroup *mem) +static void mem_cgroup_get(struct mem_cgroup *memcg) { - atomic_inc(&mem->refcnt); + atomic_inc(&memcg->refcnt); } -static void __mem_cgroup_put(struct mem_cgroup *mem, int count) +static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) { - if (atomic_sub_and_test(count, &mem->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(mem); - __mem_cgroup_free(mem); + if (atomic_sub_and_test(count, &memcg->refcnt)) { + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + __mem_cgroup_free(memcg); if (parent) mem_cgroup_put(parent); } } -static void mem_cgroup_put(struct mem_cgroup *mem) +static void mem_cgroup_put(struct mem_cgroup *memcg) { - __mem_cgroup_put(mem, 1); + __mem_cgroup_put(memcg, 1); } /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ -static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem) +static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { - if (!mem->res.parent) + if (!memcg->res.parent) return NULL; - return mem_cgroup_from_res_counter(mem->res.parent, res); + return mem_cgroup_from_res_counter(memcg->res.parent, res); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -5044,16 +4889,16 @@ static int mem_cgroup_soft_limit_tree_init(void) static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem, *parent; + struct mem_cgroup *memcg, *parent; long error = -ENOMEM; int node; - mem = mem_cgroup_alloc(); - if (!mem) + memcg = mem_cgroup_alloc(); + if (!memcg) return ERR_PTR(error); for_each_node_state(node, N_POSSIBLE) - if (alloc_mem_cgroup_per_zone_info(mem, node)) + if (alloc_mem_cgroup_per_zone_info(memcg, node)) goto free_out; /* root ? */ @@ -5061,7 +4906,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int cpu; enable_swap_cgroup(); parent = NULL; - root_mem_cgroup = mem; + root_mem_cgroup = memcg; if (mem_cgroup_soft_limit_tree_init()) goto free_out; for_each_possible_cpu(cpu) { @@ -5072,13 +4917,13 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); - mem->use_hierarchy = parent->use_hierarchy; - mem->oom_kill_disable = parent->oom_kill_disable; + memcg->use_hierarchy = parent->use_hierarchy; + memcg->oom_kill_disable = parent->oom_kill_disable; } if (parent && parent->use_hierarchy) { - res_counter_init(&mem->res, &parent->res); - res_counter_init(&mem->memsw, &parent->memsw); + res_counter_init(&memcg->res, &parent->res); + res_counter_init(&memcg->memsw, &parent->memsw); /* * We increment refcnt of the parent to ensure that we can * safely access it on res_counter_charge/uncharge. @@ -5087,22 +4932,21 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) */ mem_cgroup_get(parent); } else { - res_counter_init(&mem->res, NULL); - res_counter_init(&mem->memsw, NULL); + res_counter_init(&memcg->res, NULL); + res_counter_init(&memcg->memsw, NULL); } - mem->last_scanned_child = 0; - mem->last_scanned_node = MAX_NUMNODES; - INIT_LIST_HEAD(&mem->oom_notify); + memcg->last_scanned_child = 0; + memcg->last_scanned_node = MAX_NUMNODES; + INIT_LIST_HEAD(&memcg->oom_notify); if (parent) - mem->swappiness = mem_cgroup_swappiness(parent); - atomic_set(&mem->refcnt, 1); - mem->move_charge_at_immigrate = 0; - mutex_init(&mem->thresholds_lock); - spin_lock_init(&mem->scanstat.lock); - return &mem->css; + memcg->swappiness = mem_cgroup_swappiness(parent); + atomic_set(&memcg->refcnt, 1); + memcg->move_charge_at_immigrate = 0; + mutex_init(&memcg->thresholds_lock); + return &memcg->css; free_out: - __mem_cgroup_free(mem); + __mem_cgroup_free(memcg); root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -5110,17 +4954,17 @@ free_out: static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - return mem_cgroup_force_empty(mem, false); + return mem_cgroup_force_empty(memcg, false); } static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - mem_cgroup_put(mem); + mem_cgroup_put(memcg); } static int mem_cgroup_populate(struct cgroup_subsys *ss, @@ -5143,9 +4987,9 @@ static int mem_cgroup_do_precharge(unsigned long count) { int ret = 0; int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *mem = mc.to; + struct mem_cgroup *memcg = mc.to; - if (mem_cgroup_is_root(mem)) { + if (mem_cgroup_is_root(memcg)) { mc.precharge += count; /* we don't need css_get for root */ return ret; @@ -5154,16 +4998,16 @@ static int mem_cgroup_do_precharge(unsigned long count) if (count > 1) { struct res_counter *dummy; /* - * "mem" cannot be under rmdir() because we've already checked + * "memcg" cannot be under rmdir() because we've already checked * by cgroup_lock_live_cgroup() that it is not removed and we * are still under the same cgroup_mutex. So we can postpone * css_get(). */ - if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) + if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy)) goto one_by_one; - if (do_swap_account && res_counter_charge(&mem->memsw, + if (do_swap_account && res_counter_charge(&memcg->memsw, PAGE_SIZE * count, &dummy)) { - res_counter_uncharge(&mem->res, PAGE_SIZE * count); + res_counter_uncharge(&memcg->res, PAGE_SIZE * count); goto one_by_one; } mc.precharge += count; @@ -5180,8 +5024,9 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, 1, &mem, false); - if (ret || !mem) + ret = __mem_cgroup_try_charge(NULL, + GFP_KERNEL, 1, &memcg, false); + if (ret || !memcg) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; mc.precharge++; @@ -5455,13 +5300,13 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, struct task_struct *p) { int ret = 0; - struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgroup); - if (mem->move_charge_at_immigrate) { + if (memcg->move_charge_at_immigrate) { struct mm_struct *mm; struct mem_cgroup *from = mem_cgroup_from_task(p); - VM_BUG_ON(from == mem); + VM_BUG_ON(from == memcg); mm = get_task_mm(p); if (!mm) @@ -5476,7 +5321,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; - mc.to = mem; + mc.to = memcg; spin_unlock(&mc.lock); /* We set mc.moving_task later */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2b43ba051ac9..edc388db730a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1310,7 +1310,7 @@ int unpoison_memory(unsigned long pfn) * to the end. */ if (PageHuge(page)) { - pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); return 0; } if (TestClearPageHWPoison(p)) @@ -1419,7 +1419,7 @@ static int soft_offline_huge_page(struct page *page, int flags) if (PageHWPoison(hpage)) { put_page(hpage); - pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + pr_info("soft offline: %#lx hugepage already poisoned\n", pfn); return -EBUSY; } @@ -1433,8 +1433,8 @@ static int soft_offline_huge_page(struct page *page, int flags) list_for_each_entry_safe(page1, page2, &pagelist, lru) put_page(page1); - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", - pfn, ret, page->flags); + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); if (ret > 0) ret = -EIO; return ret; @@ -1505,7 +1505,7 @@ int soft_offline_page(struct page *page, int flags) } if (!PageLRU(page)) { pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", - pfn, page->flags); + pfn, page->flags); return -EIO; } @@ -1566,7 +1566,7 @@ int soft_offline_page(struct page *page, int flags) } } else { pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", - pfn, ret, page_count(page), page->flags); + pfn, ret, page_count(page), page->flags); } if (ret) return ret; diff --git a/mm/memory.c b/mm/memory.c index a56e3ba816b2..b2b87315cdc6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1503,7 +1503,7 @@ split_fallthrough: } if (flags & FOLL_GET) - get_page(page); + get_page_foll(page); if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && !pte_dirty(pte) && !PageDirty(page)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8b57173c1dd5..cd237f478304 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -111,7 +111,7 @@ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ -struct mempolicy default_policy = { +static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, .flags = MPOL_F_LOCAL, @@ -636,7 +636,6 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; - pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; @@ -649,9 +648,9 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, new_pol); + vma->anon_vma, vma->vm_file, vma->vm_pgoff, + new_pol); if (prev) { vma = prev; next = vma->vm_next; @@ -1412,7 +1411,9 @@ asmlinkage long compat_sys_get_mempolicy(int __user *policy, err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags); if (!err && nmask) { - err = copy_from_user(bm, nm, alloc_size); + unsigned long copy_size; + copy_size = min_t(unsigned long, sizeof(bm), alloc_size); + err = copy_from_user(bm, nm, copy_size); /* ensure entire bitmap is zeroed */ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); err |= compat_put_bitmap(nmask, bm, nr_bits); diff --git a/mm/migrate.c b/mm/migrate.c index 666e4e677414..33358f878111 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = pte_offset_map(pmd, addr); - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - goto out; - } + /* + * Peek to check is_swap_pte() before taking ptlock? No, we + * can race mremap's move_ptes(), which skips anon_vma lock. + */ ptl = pte_lockptr(mm, pmd); } @@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page, return rc; } -/* - * Obtain the lock on page, remove all ptes and migrate the page - * to the newly allocated page in newpage. - */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, bool offlining, bool sync) +static int __unmap_and_move(struct page *page, struct page *newpage, + int force, bool offlining, bool sync) { - int rc = 0; - int *result = NULL; - struct page *newpage = get_new_page(page, private, &result); + int rc = -EAGAIN; int remap_swapcache = 1; int charge = 0; struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; - if (!newpage) - return -ENOMEM; - - if (page_count(page) == 1) { - /* page was freed from under us. So we are done. */ - goto move_newpage; - } - if (unlikely(PageTransHuge(page))) - if (unlikely(split_huge_page(page))) - goto move_newpage; - - /* prepare cgroup just returns 0 or -ENOMEM */ - rc = -EAGAIN; - if (!trylock_page(page)) { if (!force || !sync) - goto move_newpage; + goto out; /* * It's not safe for direct compaction to call lock_page. @@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * altogether. */ if (current->flags & PF_MEMALLOC) - goto move_newpage; + goto out; lock_page(page); } @@ -785,27 +765,52 @@ uncharge: mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); +out: + return rc; +} -move_newpage: +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force, bool offlining, bool sync) +{ + int rc = 0; + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); + + if (!newpage) + return -ENOMEM; + + if (page_count(page) == 1) { + /* page was freed from under us. So we are done. */ + goto out; + } + + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto out; + + rc = __unmap_and_move(page, newpage, force, offlining, sync); +out: if (rc != -EAGAIN) { - /* - * A page that has been migrated has all references - * removed and will be freed. A page that has not been - * migrated will have kepts its references and be - * restored. - */ - list_del(&page->lru); + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. */ putback_lru_page(newpage); - if (result) { if (rc) *result = rc; diff --git a/mm/mlock.c b/mm/mlock.c index 048260c4e02e..bd34b3a10852 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page) if (TestClearPageMlocked(page)) { dec_zone_page_state(page, NR_MLOCK); if (!isolate_lru_page(page)) { - int ret = try_to_munlock(page); + int ret = SWAP_AGAIN; + + /* + * Optimization: if the page was mapped just once, + * that's our mapping and we don't need to check all the + * other vmas. + */ + if (page_mapcount(page) > 1) + ret = try_to_munlock(page); /* * did try_to_unlock() succeed or punt? */ @@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags) if (!can_do_mlock()) goto out; - lru_add_drain_all(); /* flush pagevec */ + if (flags & MCL_CURRENT) + lru_add_drain_all(); /* flush pagevec */ down_write(¤t->mm->mmap_sem); diff --git a/mm/mmap.c b/mm/mmap.c index a65efd4db3e1..3c0061f744f5 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2558,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; - int ret = -EINTR; BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2579,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_anon_vma(mm, avc->anon_vma); } - ret = 0; + return 0; out_unlock: - if (ret) - mm_drop_all_locks(mm); - - return ret; + mm_drop_all_locks(mm); + return -EINTR; } static void vm_unlock_anon_vma(struct anon_vma *anon_vma) diff --git a/mm/mremap.c b/mm/mremap.c index 506fa44403df..d6959cb4df58 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); - split_huge_page_pmd(mm, pmd); - if (pmd_none_or_clear_bad(pmd)) + if (pmd_none(*pmd)) return NULL; return pmd; @@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); - if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) - return NULL; return pmd; } @@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; spinlock_t *old_ptl, *new_ptl; - unsigned long old_start; - old_start = old_addr; - mmu_notifier_invalidate_range_start(vma->vm_mm, - old_start, old_end); if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before @@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, new_pte++, new_addr += PAGE_SIZE) { if (pte_none(*old_pte)) continue; - pte = ptep_clear_flush(vma, old_addr, old_pte); + pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); set_pte_at(mm, new_addr, new_pte, pte); } @@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) mutex_unlock(&mapping->i_mmap_mutex); - mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end); } #define LATENCY_LIMIT (64 * PAGE_SIZE) @@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma, { unsigned long extent, next, old_end; pmd_t *old_pmd, *new_pmd; + bool need_flush = false; old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); + mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end); + for (; old_addr < old_end; old_addr += extent, new_addr += extent) { cond_resched(); next = (old_addr + PMD_SIZE) & PMD_MASK; - if (next - 1 > old_end) - next = old_end; + /* even if next overflowed, extent below will be ok */ extent = next - old_addr; + if (extent > old_end - old_addr) + extent = old_end - old_addr; old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; + if (pmd_trans_huge(*old_pmd)) { + int err = 0; + if (extent == HPAGE_PMD_SIZE) + err = move_huge_pmd(vma, new_vma, old_addr, + new_addr, old_end, + old_pmd, new_pmd); + if (err > 0) { + need_flush = true; + continue; + } else if (!err) { + split_huge_page_pmd(vma->vm_mm, old_pmd); + } + VM_BUG_ON(pmd_trans_huge(*old_pmd)); + } + if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma, + new_pmd, new_addr)) + break; next = (new_addr + PMD_SIZE) & PMD_MASK; if (extent > next - new_addr) extent = next - new_addr; @@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma, extent = LATENCY_LIMIT; move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma, new_pmd, new_addr); + need_flush = true; } + if (likely(need_flush)) + flush_tlb_range(vma, old_end-len, old_addr); + + mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end); return len + old_addr - old_end; /* how much done */ } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 626303b52f3c..e916168b6e0a 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -32,12 +32,32 @@ #include <linux/mempolicy.h> #include <linux/security.h> #include <linux/ptrace.h> +#include <linux/freezer.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); +/* + * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj + * @old_val: old oom_score_adj for compare + * @new_val: new oom_score_adj for swap + * + * Sets the oom_score_adj value for current to @new_val iff its present value is + * @old_val. Usually used to reinstate a previous value to prevent racing with + * userspacing tuning the value in the interim. + */ +void compare_swap_oom_score_adj(int old_val, int new_val) +{ + struct sighand_struct *sighand = current->sighand; + + spin_lock_irq(&sighand->siglock); + if (current->signal->oom_score_adj == old_val) + current->signal->oom_score_adj = new_val; + spin_unlock_irq(&sighand->siglock); +} + /** * test_set_oom_score_adj() - set current's oom_score_adj and return old value * @new_val: new oom_score_adj value @@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val) spin_lock_irq(&sighand->siglock); old_val = current->signal->oom_score_adj; - if (new_val != old_val) { - if (new_val == OOM_SCORE_ADJ_MIN) - atomic_inc(¤t->mm->oom_disable_count); - else if (old_val == OOM_SCORE_ADJ_MIN) - atomic_dec(¤t->mm->oom_disable_count); - current->signal->oom_score_adj = new_val; - } + current->signal->oom_score_adj = new_val; spin_unlock_irq(&sighand->siglock); return old_val; @@ -172,16 +186,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, return 0; /* - * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN - * so the entire heuristic doesn't need to be executed for something - * that cannot be killed. - */ - if (atomic_read(&p->mm->oom_disable_count)) { - task_unlock(p); - return 0; - } - - /* * The memory controller may have a limit of 0 bytes, so avoid a divide * by zero, if necessary. */ @@ -317,8 +321,11 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * blocked waiting for another task which itself is waiting * for memory. Is there a better alternative? */ - if (test_tsk_thread_flag(p, TIF_MEMDIE)) + if (test_tsk_thread_flag(p, TIF_MEMDIE)) { + if (unlikely(frozen(p))) + thaw_process(p); return ERR_PTR(-1UL); + } if (!p->mm) continue; @@ -435,7 +442,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) task_unlock(p); /* - * Kill all processes sharing p->mm in other thread groups, if any. + * Kill all user processes sharing p->mm in other thread groups, if any. * They don't get access to memory reserves or a higher scheduler * priority, though, to avoid depletion of all memory or task * starvation. This prevents mm->mmap_sem livelock when an oom killed @@ -445,7 +452,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) * signal. */ for_each_process(q) - if (q->mm == mm && !same_thread_group(q, p)) { + if (q->mm == mm && !same_thread_group(q, p) && + !(q->flags & PF_KTHREAD)) { + if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + continue; + task_lock(q); /* Protect ->comm from prctl() */ pr_err("Kill process %d (%s) sharing same memory\n", task_pid_nr(q), q->comm); @@ -722,7 +733,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && !oom_unkillable_task(current, NULL, nodemask) && - current->mm && !atomic_read(¤t->mm->oom_disable_count)) { + current->mm) { /* * oom_kill_process() needs tasklist_lock held. If it returns * non-zero, current could not be killed so we must fallback to diff --git a/mm/page-writeback.c b/mm/page-writeback.c index d1960744f881..793e9874de51 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -305,7 +305,9 @@ static unsigned long task_min_dirty_limit(unsigned long bdi_dirty) } /* - * + * bdi_min_ratio keeps the sum of the minimum dirty shares of all + * registered backing devices, which, for obvious reasons, can not + * exceed 100%. */ static unsigned int bdi_min_ratio; @@ -754,21 +756,10 @@ static void balance_dirty_pages(struct address_space *mapping, * 200ms is typically more than enough to curb heavy dirtiers; * (b) the pause time limit makes the dirtiers more responsive. */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_MAXPAUSE_AREA && + if (nr_dirty < dirty_thresh && + bdi_dirty < (task_bdi_thresh + bdi_thresh) / 2 && time_after(jiffies, start_time + MAX_PAUSE)) break; - /* - * pass-good area. When some bdi gets blocked (eg. NFS server - * not responding), or write bandwidth dropped dramatically due - * to concurrent reads, or dirty threshold suddenly dropped and - * the dirty pages cannot be brought down anytime soon (eg. on - * slow USB stick), at least let go of the good bdi's. - */ - if (nr_dirty < dirty_thresh + - dirty_thresh / DIRTY_PASSGOOD_AREA && - bdi_dirty < bdi_thresh) - break; /* * Increase the delay for each loop, up to our previous diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e8ecb6e021c..9dd443d89d8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -318,6 +318,7 @@ static void bad_page(struct page *page) current->comm, page_to_pfn(page)); dump_page(page); + print_modules(); dump_stack(); out: /* Leave bad fields for debug, except PageBuddy could make trouble */ @@ -1753,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs, void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { - va_list args; unsigned int filter = SHOW_MEM_FILTER_NODES; if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) @@ -1772,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) filter &= ~SHOW_MEM_FILTER_NODES; if (fmt) { - printk(KERN_WARNING); + struct va_format vaf; + va_list args; + va_start(args, fmt); - vprintk(fmt, args); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_warn("%pV", &vaf); + va_end(args); } - pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n", - current->comm, order, gfp_mask); + pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n", + current->comm, order, gfp_mask); dump_stack(); if (!should_suppress_show_mem()) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 39d216d535ea..2d123f94a8df 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -133,10 +133,13 @@ struct page *lookup_cgroup_page(struct page_cgroup *pc) static void *__meminit alloc_page_cgroup(size_t size, int nid) { void *addr = NULL; + gfp_t flags = GFP_KERNEL | __GFP_NOWARN; - addr = alloc_pages_exact_nid(nid, size, GFP_KERNEL | __GFP_NOWARN); - if (addr) + addr = alloc_pages_exact_nid(nid, size, flags); + if (addr) { + kmemleak_alloc(addr, size, 1, flags); return addr; + } if (node_state(nid, N_HIGH_MEMORY)) addr = vmalloc_node(size, nid); @@ -357,7 +360,7 @@ struct swap_cgroup_ctrl { spinlock_t lock; }; -struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; +static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; struct swap_cgroup { unsigned short id; @@ -513,11 +516,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); - array = vmalloc(array_size); + array = vzalloc(array_size); if (!array) goto nomem; - memset(array, 0, array_size); ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); ctrl->length = length; diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c new file mode 100644 index 000000000000..e920aa3ce104 --- /dev/null +++ b/mm/process_vm_access.c @@ -0,0 +1,496 @@ +/* + * linux/mm/process_vm_access.c + * + * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/mm.h> +#include <linux/uio.h> +#include <linux/sched.h> +#include <linux/highmem.h> +#include <linux/ptrace.h> +#include <linux/slab.h> +#include <linux/syscalls.h> + +#ifdef CONFIG_COMPAT +#include <linux/compat.h> +#endif + +/** + * process_vm_rw_pages - read/write pages from task specified + * @task: task to read/write from + * @mm: mm for task + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @pa: address of page in task to start copying from/to + * @start_offset: offset in page to start copying from/to + * @len: number of bytes to copy + * @lvec: iovec array specifying where to copy to/from + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @vm_write: 0 means copy from, 1 means copy to + * @nr_pages_to_copy: number of pages to copy + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success, error code otherwise + */ +static int process_vm_rw_pages(struct task_struct *task, + struct mm_struct *mm, + struct page **process_pages, + unsigned long pa, + unsigned long start_offset, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + int vm_write, + unsigned int nr_pages_to_copy, + ssize_t *bytes_copied) +{ + int pages_pinned; + void *target_kaddr; + int pgs_copied = 0; + int j; + int ret; + ssize_t bytes_to_copy; + ssize_t rc = 0; + + *bytes_copied = 0; + + /* Get the pages we're interested in */ + down_read(&mm->mmap_sem); + pages_pinned = get_user_pages(task, mm, pa, + nr_pages_to_copy, + vm_write, 0, process_pages, NULL); + up_read(&mm->mmap_sem); + + if (pages_pinned != nr_pages_to_copy) { + rc = -EFAULT; + goto end; + } + + /* Do the copy for each page */ + for (pgs_copied = 0; + (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt); + pgs_copied++) { + /* Make sure we have a non zero length iovec */ + while (*lvec_current < lvec_cnt + && lvec[*lvec_current].iov_len == 0) + (*lvec_current)++; + if (*lvec_current == lvec_cnt) + break; + + /* + * Will copy smallest of: + * - bytes remaining in page + * - bytes remaining in destination iovec + */ + bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset, + len - *bytes_copied); + bytes_to_copy = min_t(ssize_t, bytes_to_copy, + lvec[*lvec_current].iov_len + - *lvec_offset); + + target_kaddr = kmap(process_pages[pgs_copied]) + start_offset; + + if (vm_write) + ret = copy_from_user(target_kaddr, + lvec[*lvec_current].iov_base + + *lvec_offset, + bytes_to_copy); + else + ret = copy_to_user(lvec[*lvec_current].iov_base + + *lvec_offset, + target_kaddr, bytes_to_copy); + kunmap(process_pages[pgs_copied]); + if (ret) { + *bytes_copied += bytes_to_copy - ret; + pgs_copied++; + rc = -EFAULT; + goto end; + } + *bytes_copied += bytes_to_copy; + *lvec_offset += bytes_to_copy; + if (*lvec_offset == lvec[*lvec_current].iov_len) { + /* + * Need to copy remaining part of page into the + * next iovec if there are any bytes left in page + */ + (*lvec_current)++; + *lvec_offset = 0; + start_offset = (start_offset + bytes_to_copy) + % PAGE_SIZE; + if (start_offset) + pgs_copied--; + } else { + start_offset = 0; + } + } + +end: + if (vm_write) { + for (j = 0; j < pages_pinned; j++) { + if (j < pgs_copied) + set_page_dirty_lock(process_pages[j]); + put_page(process_pages[j]); + } + } else { + for (j = 0; j < pages_pinned; j++) + put_page(process_pages[j]); + } + + return rc; +} + +/* Maximum number of pages kmalloc'd to hold struct page's during copy */ +#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2) + +/** + * process_vm_rw_single_vec - read/write pages from task specified + * @addr: start memory address of target process + * @len: size of area to copy to/from + * @lvec: iovec array specifying where to copy to/from locally + * @lvec_cnt: number of elements in iovec array + * @lvec_current: index in iovec array we are up to + * @lvec_offset: offset in bytes from current iovec iov_base we are up to + * @process_pages: struct pages area that can store at least + * nr_pages_to_copy struct page pointers + * @mm: mm for task + * @task: task to read/write from + * @vm_write: 0 means copy from, 1 means copy to + * @bytes_copied: returns number of bytes successfully copied + * Returns 0 on success or on failure error code + */ +static int process_vm_rw_single_vec(unsigned long addr, + unsigned long len, + const struct iovec *lvec, + unsigned long lvec_cnt, + unsigned long *lvec_current, + size_t *lvec_offset, + struct page **process_pages, + struct mm_struct *mm, + struct task_struct *task, + int vm_write, + ssize_t *bytes_copied) +{ + unsigned long pa = addr & PAGE_MASK; + unsigned long start_offset = addr - pa; + unsigned long nr_pages; + ssize_t bytes_copied_loop; + ssize_t rc = 0; + unsigned long nr_pages_copied = 0; + unsigned long nr_pages_to_copy; + unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES + / sizeof(struct pages *); + + *bytes_copied = 0; + + /* Work out address and page range required */ + if (len == 0) + return 0; + nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1; + + while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) { + nr_pages_to_copy = min(nr_pages - nr_pages_copied, + max_pages_per_loop); + + rc = process_vm_rw_pages(task, mm, process_pages, pa, + start_offset, len, + lvec, lvec_cnt, + lvec_current, lvec_offset, + vm_write, nr_pages_to_copy, + &bytes_copied_loop); + start_offset = 0; + *bytes_copied += bytes_copied_loop; + + if (rc < 0) { + return rc; + } else { + len -= bytes_copied_loop; + nr_pages_copied += nr_pages_to_copy; + pa += nr_pages_to_copy * PAGE_SIZE; + } + } + + return rc; +} + +/* Maximum number of entries for process pages array + which lives on stack */ +#define PVM_MAX_PP_ARRAY_COUNT 16 + +/** + * process_vm_rw_core - core of reading/writing pages from task specified + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec, + unsigned long liovcnt, + const struct iovec *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct task_struct *task; + struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT]; + struct page **process_pages = pp_stack; + struct mm_struct *mm; + unsigned long i; + ssize_t rc = 0; + ssize_t bytes_copied_loop; + ssize_t bytes_copied = 0; + unsigned long nr_pages = 0; + unsigned long nr_pages_iov; + unsigned long iov_l_curr_idx = 0; + size_t iov_l_curr_offset = 0; + ssize_t iov_len; + + /* + * Work out how many pages of struct pages we're going to need + * when eventually calling get_user_pages + */ + for (i = 0; i < riovcnt; i++) { + iov_len = rvec[i].iov_len; + if (iov_len > 0) { + nr_pages_iov = ((unsigned long)rvec[i].iov_base + + iov_len) + / PAGE_SIZE - (unsigned long)rvec[i].iov_base + / PAGE_SIZE + 1; + nr_pages = max(nr_pages, nr_pages_iov); + } + } + + if (nr_pages == 0) + return 0; + + if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) { + /* For reliability don't try to kmalloc more than + 2 pages worth */ + process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES, + sizeof(struct pages *)*nr_pages), + GFP_KERNEL); + + if (!process_pages) + return -ENOMEM; + } + + /* Get process information */ + rcu_read_lock(); + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + if (!task) { + rc = -ESRCH; + goto free_proc_pages; + } + + task_lock(task); + if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) { + task_unlock(task); + rc = -EPERM; + goto put_task_struct; + } + mm = task->mm; + + if (!mm || (task->flags & PF_KTHREAD)) { + task_unlock(task); + rc = -EINVAL; + goto put_task_struct; + } + + atomic_inc(&mm->mm_users); + task_unlock(task); + + for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) { + rc = process_vm_rw_single_vec( + (unsigned long)rvec[i].iov_base, rvec[i].iov_len, + lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset, + process_pages, mm, task, vm_write, &bytes_copied_loop); + bytes_copied += bytes_copied_loop; + if (rc != 0) { + /* If we have managed to copy any data at all then + we return the number of bytes copied. Otherwise + we return the error code */ + if (bytes_copied) + rc = bytes_copied; + goto put_mm; + } + } + + rc = bytes_copied; +put_mm: + mmput(mm); + +put_task_struct: + put_task_struct(task); + +free_proc_pages: + if (process_pages != pp_stack) + kfree(process_pages); + return rc; +} + +/** + * process_vm_rw - check iovecs before calling core routine + * @pid: PID of process to read/write from/to + * @lvec: iovec array specifying where to copy to/from locally + * @liovcnt: size of lvec array + * @rvec: iovec array specifying where to copy to/from in the other process + * @riovcnt: size of rvec array + * @flags: currently unused + * @vm_write: 0 if reading from other process, 1 if writing to other process + * Returns the number of bytes read/written or error code. May + * return less bytes than expected if an error occurs during the copying + * process. + */ +static ssize_t process_vm_rw(pid_t pid, + const struct iovec __user *lvec, + unsigned long liovcnt, + const struct iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc; + + if (flags != 0) + return -EINVAL; + + /* Check iovecs */ + if (vm_write) + rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + else + rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV, + iovstack_l, &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + + rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV, + iovstack_r, &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + + return rc; +} + +SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0); +} + +SYSCALL_DEFINE6(process_vm_writev, pid_t, pid, + const struct iovec __user *, lvec, + unsigned long, liovcnt, const struct iovec __user *, rvec, + unsigned long, riovcnt, unsigned long, flags) +{ + return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1); +} + +#ifdef CONFIG_COMPAT + +asmlinkage ssize_t +compat_process_vm_rw(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags, int vm_write) +{ + struct iovec iovstack_l[UIO_FASTIOV]; + struct iovec iovstack_r[UIO_FASTIOV]; + struct iovec *iov_l = iovstack_l; + struct iovec *iov_r = iovstack_r; + ssize_t rc = -EFAULT; + + if (flags != 0) + return -EINVAL; + + if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec))) + goto out; + + if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec))) + goto out; + + if (vm_write) + rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + else + rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt, + UIO_FASTIOV, iovstack_l, + &iov_l, 1); + if (rc <= 0) + goto free_iovecs; + rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt, + UIO_FASTIOV, iovstack_r, + &iov_r, 0); + if (rc <= 0) + goto free_iovecs; + + rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags, + vm_write); + +free_iovecs: + if (iov_r != iovstack_r) + kfree(iov_r); + if (iov_l != iovstack_l) + kfree(iov_l); + +out: + return rc; +} + +asmlinkage ssize_t +compat_sys_process_vm_readv(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 0); +} + +asmlinkage ssize_t +compat_sys_process_vm_writev(compat_pid_t pid, + const struct compat_iovec __user *lvec, + unsigned long liovcnt, + const struct compat_iovec __user *rvec, + unsigned long riovcnt, + unsigned long flags) +{ + return compat_process_vm_rw(pid, lvec, liovcnt, rvec, + riovcnt, flags, 1); +} + +#endif diff --git a/mm/rmap.c b/mm/rmap.c index 8005080fb9e3..6541cf7fd1d3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1164,7 +1164,7 @@ void page_remove_rmap(struct page *page) /* * Subfunctions of try_to_unmap: try_to_unmap_one called - * repeatedly from either try_to_unmap_anon or try_to_unmap_file. + * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file. */ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, enum ttu_flags flags) diff --git a/mm/shmem.c b/mm/shmem.c index 32f6763f16fb..45b9acb575f9 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1068,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); + /* + * Ensure that a racing putback_lru_page() can see + * the pages of this mapping are evictable when we + * skip them due to !PageLRU during the scan. + */ + smp_mb__after_clear_bit(); scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; @@ -1458,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, - &dentry->d_name, NULL, + &dentry->d_name, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { @@ -1598,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (!inode) return -ENOSPC; - error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, + error = security_inode_init_security(inode, dir, &dentry->d_name, NULL, NULL); if (error) { if (error != -EOPNOTSUPP) { @@ -2497,7 +2503,7 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags d_instantiate(path.dentry, inode); inode->i_size = size; - inode->i_nlink = 0; /* It is unlinked */ + clear_nlink(inode); /* It is unlinked */ #ifndef CONFIG_MMU error = ramfs_nommu_expand_for_mapping(inode, size); if (error) diff --git a/mm/slab.c b/mm/slab.c index 6d90a091fdca..708efe886154 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1851,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit) unsigned char error = 0; int bad_count = 0; - printk(KERN_ERR "%03x:", offset); + printk(KERN_ERR "%03x: ", offset); for (i = 0; i < limit; i++) { if (data[offset + i] != POISON_FREE) { error = data[offset + i]; bad_count++; } - printk(" %02x", (unsigned char)data[offset + i]); } - printk("\n"); + print_hex_dump(KERN_CONT, "", 0, 16, 1, + &data[offset], limit, 1); if (bad_count == 1) { error ^= POISON_FREE; @@ -3039,14 +3039,9 @@ bad: printk(KERN_ERR "slab: Internal list corruption detected in " "cache '%s'(%d), slabp %p(%d). Hexdump:\n", cachep->name, cachep->num, slabp, slabp->inuse); - for (i = 0; - i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t); - i++) { - if (i % 16 == 0) - printk("\n%03x:", i); - printk(" %02x", ((unsigned char *)slabp)[i]); - } - printk("\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, + sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), + 1); BUG(); } } @@ -4584,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); #endif diff --git a/mm/slub.c b/mm/slub.c index eb5a8f93338a..7d2a996c307e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -467,34 +467,8 @@ static int disable_higher_order_debug; */ static void print_section(char *text, u8 *addr, unsigned int length) { - int i, offset; - int newline = 1; - char ascii[17]; - - ascii[16] = 0; - - for (i = 0; i < length; i++) { - if (newline) { - printk(KERN_ERR "%8s 0x%p: ", text, addr + i); - newline = 0; - } - printk(KERN_CONT " %02x", addr[i]); - offset = i % 16; - ascii[offset] = isgraph(addr[i]) ? addr[i] : '.'; - if (offset == 15) { - printk(KERN_CONT " %s\n", ascii); - newline = 1; - } - } - if (!newline) { - i %= 16; - while (i < 16) { - printk(KERN_CONT " "); - ascii[i] = ' '; - i++; - } - printk(KERN_CONT " %s\n", ascii); - } + print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr, + length, 1); } static struct track *get_track(struct kmem_cache *s, void *object, @@ -625,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (p > addr + 16) - print_section("Bytes b4", p - 16, 16); - - print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE)); + print_section("Bytes b4 ", p - 16, 16); + print_section("Object ", p, min_t(unsigned long, s->objsize, + PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section("Redzone", p + s->objsize, + print_section("Redzone ", p + s->objsize, s->inuse - s->objsize); if (s->offset) @@ -643,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != s->size) /* Beginning of the filler is the free pointer */ - print_section("Padding", p + off, s->size - off); + print_section("Padding ", p + off, s->size - off); dump_stack(); } @@ -681,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val) memset(p + s->objsize, val, s->inuse - s->objsize); } -static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes) -{ - while (bytes) { - if (*start != value) - return start; - start++; - bytes--; - } - return NULL; -} - -static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes) -{ - u64 value64; - unsigned int words, prefix; - - if (bytes <= 16) - return check_bytes8(start, value, bytes); - - value64 = value | value << 8 | value << 16 | value << 24; - value64 = value64 | value64 << 32; - prefix = 8 - ((unsigned long)start) % 8; - - if (prefix) { - u8 *r = check_bytes8(start, value, prefix); - if (r) - return r; - start += prefix; - bytes -= prefix; - } - - words = bytes / 8; - - while (words) { - if (*(u64 *)start != value64) - return check_bytes8(start, value, 8); - start += 8; - words--; - } - - return check_bytes8(start, value, bytes % 8); -} - static void restore_bytes(struct kmem_cache *s, char *message, u8 data, void *from, void *to) { @@ -738,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, u8 *fault; u8 *end; - fault = check_bytes(start, value, bytes); + fault = memchr_inv(start, value, bytes); if (!fault) return 1; @@ -831,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; - fault = check_bytes(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(end - remainder, POISON_INUSE, remainder); if (!fault) return 1; while (end > fault && end[-1] == POISON_INUSE) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section("Padding", end - remainder, remainder); + print_section("Padding ", end - remainder, remainder); restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; @@ -987,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, page->freelist); if (!alloc) - print_section("Object", (void *)object, s->objsize); + print_section("Object ", (void *)object, s->objsize); dump_stack(); } @@ -1447,7 +1378,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) set_freepointer(s, last, NULL); page->freelist = start; - page->inuse = 0; + page->inuse = page->objects; page->frozen = 1; out: return page; @@ -1534,7 +1465,7 @@ static inline void add_partial(struct kmem_cache_node *n, struct page *page, int tail) { n->nr_partial++; - if (tail) + if (tail == DEACTIVATE_TO_TAIL) list_add_tail(&page->lru, &n->partial); else list_add(&page->lru, &n->partial); @@ -1554,10 +1485,13 @@ static inline void remove_partial(struct kmem_cache_node *n, * Lock slab, remove from the partial list and put the object into the * per cpu freelist. * + * Returns a list of objects or NULL if it fails. + * * Must hold list_lock. */ -static inline int acquire_slab(struct kmem_cache *s, - struct kmem_cache_node *n, struct page *page) +static inline void *acquire_slab(struct kmem_cache *s, + struct kmem_cache_node *n, struct page *page, + int mode) { void *freelist; unsigned long counters; @@ -1572,7 +1506,8 @@ static inline int acquire_slab(struct kmem_cache *s, freelist = page->freelist; counters = page->counters; new.counters = counters; - new.inuse = page->objects; + if (mode) + new.inuse = page->objects; VM_BUG_ON(new.frozen); new.frozen = 1; @@ -1583,32 +1518,19 @@ static inline int acquire_slab(struct kmem_cache *s, "lock and freeze")); remove_partial(n, page); - - if (freelist) { - /* Populate the per cpu freelist */ - this_cpu_write(s->cpu_slab->freelist, freelist); - this_cpu_write(s->cpu_slab->page, page); - this_cpu_write(s->cpu_slab->node, page_to_nid(page)); - return 1; - } else { - /* - * Slab page came from the wrong list. No object to allocate - * from. Put it onto the correct list and continue partial - * scan. - */ - printk(KERN_ERR "SLUB: %s : Page without available objects on" - " partial list\n", s->name); - return 0; - } + return freelist; } +static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain); + /* * Try to allocate a partial slab from a specific node. */ -static struct page *get_partial_node(struct kmem_cache *s, - struct kmem_cache_node *n) +static void *get_partial_node(struct kmem_cache *s, + struct kmem_cache_node *n, struct kmem_cache_cpu *c) { - struct page *page; + struct page *page, *page2; + void *object = NULL; /* * Racy check. If we mistakenly see no partial slabs then we @@ -1620,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache *s, return NULL; spin_lock(&n->list_lock); - list_for_each_entry(page, &n->partial, lru) - if (acquire_slab(s, n, page)) - goto out; - page = NULL; -out: + list_for_each_entry_safe(page, page2, &n->partial, lru) { + void *t = acquire_slab(s, n, page, object == NULL); + int available; + + if (!t) + break; + + if (!object) { + c->page = page; + c->node = page_to_nid(page); + stat(s, ALLOC_FROM_PARTIAL); + object = t; + available = page->objects - page->inuse; + } else { + page->freelist = t; + available = put_cpu_partial(s, page, 0); + } + if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + break; + + } spin_unlock(&n->list_lock); - return page; + return object; } /* * Get a page from somewhere. Search in increasing NUMA distances. */ -static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) +static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags, + struct kmem_cache_cpu *c) { #ifdef CONFIG_NUMA struct zonelist *zonelist; struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(flags); - struct page *page; + void *object; /* * The defrag ratio allows a configuration of the tradeoffs between @@ -1672,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { - page = get_partial_node(s, n); - if (page) { + object = get_partial_node(s, n, c); + if (object) { put_mems_allowed(); - return page; + return object; } } } @@ -1687,16 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) /* * Get a partial page, lock it and return it. */ -static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) +static void *get_partial(struct kmem_cache *s, gfp_t flags, int node, + struct kmem_cache_cpu *c) { - struct page *page; + void *object; int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; - page = get_partial_node(s, get_node(s, searchnode)); - if (page || node != NUMA_NO_NODE) - return page; + object = get_partial_node(s, get_node(s, searchnode), c); + if (object || node != NUMA_NO_NODE) + return object; - return get_any_partial(s, flags); + return get_any_partial(s, flags, c); } #ifdef CONFIG_PREEMPT @@ -1765,9 +1705,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s) for_each_possible_cpu(cpu) per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu); } -/* - * Remove the cpu slab - */ /* * Remove the cpu slab @@ -1781,13 +1718,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) enum slab_modes l = M_NONE, m = M_NONE; void *freelist; void *nextfree; - int tail = 0; + int tail = DEACTIVATE_TO_HEAD; struct page new; struct page old; if (page->freelist) { stat(s, DEACTIVATE_REMOTE_FREES); - tail = 1; + tail = DEACTIVATE_TO_TAIL; } c->tid = next_tid(c->tid); @@ -1854,7 +1791,7 @@ redo: new.frozen = 0; - if (!new.inuse && n->nr_partial < s->min_partial) + if (!new.inuse && n->nr_partial > s->min_partial) m = M_FREE; else if (new.freelist) { m = M_PARTIAL; @@ -1893,7 +1830,7 @@ redo: if (m == M_PARTIAL) { add_partial(n, page, tail); - stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail); } else if (m == M_FULL) { @@ -1920,6 +1857,123 @@ redo: } } +/* Unfreeze all the cpu partial slabs */ +static void unfreeze_partials(struct kmem_cache *s) +{ + struct kmem_cache_node *n = NULL; + struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab); + struct page *page; + + while ((page = c->partial)) { + enum slab_modes { M_PARTIAL, M_FREE }; + enum slab_modes l, m; + struct page new; + struct page old; + + c->partial = page->next; + l = M_FREE; + + do { + + old.freelist = page->freelist; + old.counters = page->counters; + VM_BUG_ON(!old.frozen); + + new.counters = old.counters; + new.freelist = old.freelist; + + new.frozen = 0; + + if (!new.inuse && (!n || n->nr_partial > s->min_partial)) + m = M_FREE; + else { + struct kmem_cache_node *n2 = get_node(s, + page_to_nid(page)); + + m = M_PARTIAL; + if (n != n2) { + if (n) + spin_unlock(&n->list_lock); + + n = n2; + spin_lock(&n->list_lock); + } + } + + if (l != m) { + if (l == M_PARTIAL) + remove_partial(n, page); + else + add_partial(n, page, 1); + + l = m; + } + + } while (!cmpxchg_double_slab(s, page, + old.freelist, old.counters, + new.freelist, new.counters, + "unfreezing slab")); + + if (m == M_FREE) { + stat(s, DEACTIVATE_EMPTY); + discard_slab(s, page); + stat(s, FREE_SLAB); + } + } + + if (n) + spin_unlock(&n->list_lock); +} + +/* + * Put a page that was just frozen (in __slab_free) into a partial page + * slot if available. This is done without interrupts disabled and without + * preemption disabled. The cmpxchg is racy and may put the partial page + * onto a random cpus partial slot. + * + * If we did not find a slot then simply move all the partials to the + * per node partial list. + */ +int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) +{ + struct page *oldpage; + int pages; + int pobjects; + + do { + pages = 0; + pobjects = 0; + oldpage = this_cpu_read(s->cpu_slab->partial); + + if (oldpage) { + pobjects = oldpage->pobjects; + pages = oldpage->pages; + if (drain && pobjects > s->cpu_partial) { + unsigned long flags; + /* + * partial array is full. Move the existing + * set to the per node partial list. + */ + local_irq_save(flags); + unfreeze_partials(s); + local_irq_restore(flags); + pobjects = 0; + pages = 0; + } + } + + pages++; + pobjects += page->objects - page->inuse; + + page->pages = pages; + page->pobjects = pobjects; + page->next = oldpage; + + } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); + stat(s, CPU_PARTIAL_FREE); + return pobjects; +} + static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { stat(s, CPUSLAB_FLUSH); @@ -1935,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); - if (likely(c && c->page)) - flush_slab(s, c); + if (likely(c)) { + if (c->page) + flush_slab(s, c); + + unfreeze_partials(s); + } } static void flush_cpu_slab(void *d) @@ -2027,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) } } +static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, + int node, struct kmem_cache_cpu **pc) +{ + void *object; + struct kmem_cache_cpu *c; + struct page *page = new_slab(s, flags, node); + + if (page) { + c = __this_cpu_ptr(s->cpu_slab); + if (c->page) + flush_slab(s, c); + + /* + * No other reference to the page yet so we can + * muck around with it freely without cmpxchg + */ + object = page->freelist; + page->freelist = NULL; + + stat(s, ALLOC_SLAB); + c->node = page_to_nid(page); + c->page = page; + *pc = c; + } else + object = NULL; + + return object; +} + /* * Slow path. The lockless freelist is empty or we need to perform * debugging duties. * - * Interrupts are disabled. - * * Processing is still very fast if new objects have been freed to the * regular freelist. In that case we simply take over the regular freelist * as the lockless freelist and zap the regular freelist. @@ -2049,7 +2134,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, unsigned long addr, struct kmem_cache_cpu *c) { void **object; - struct page *page; unsigned long flags; struct page new; unsigned long counters; @@ -2064,13 +2148,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - /* We handle __GFP_ZERO in the caller */ - gfpflags &= ~__GFP_ZERO; - - page = c->page; - if (!page) + if (!c->page) goto new_slab; - +redo: if (unlikely(!node_match(c, node))) { stat(s, ALLOC_NODE_MISMATCH); deactivate_slab(s, c); @@ -2080,8 +2160,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_SLOWPATH); do { - object = page->freelist; - counters = page->counters; + object = c->page->freelist; + counters = c->page->counters; new.counters = counters; VM_BUG_ON(!new.frozen); @@ -2093,17 +2173,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, * * If there are objects left then we retrieve them * and use them to refill the per cpu queue. - */ + */ - new.inuse = page->objects; + new.inuse = c->page->objects; new.frozen = object != NULL; - } while (!__cmpxchg_double_slab(s, page, + } while (!__cmpxchg_double_slab(s, c->page, object, counters, NULL, new.counters, "__slab_alloc")); - if (unlikely(!object)) { + if (!object) { c->page = NULL; stat(s, DEACTIVATE_BYPASS); goto new_slab; @@ -2112,58 +2192,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, stat(s, ALLOC_REFILL); load_freelist: - VM_BUG_ON(!page->frozen); c->freelist = get_freepointer(s, object); c->tid = next_tid(c->tid); local_irq_restore(flags); return object; new_slab: - page = get_partial(s, gfpflags, node); - if (page) { - stat(s, ALLOC_FROM_PARTIAL); - object = c->freelist; - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + if (c->partial) { + c->page = c->partial; + c->partial = c->page->next; + c->node = page_to_nid(c->page); + stat(s, CPU_PARTIAL_ALLOC); + c->freelist = NULL; + goto redo; } - page = new_slab(s, gfpflags, node); + /* Then do expensive stuff like retrieving pages from the partial lists */ + object = get_partial(s, gfpflags, node, c); - if (page) { - c = __this_cpu_ptr(s->cpu_slab); - if (c->page) - flush_slab(s, c); + if (unlikely(!object)) { - /* - * No other reference to the page yet so we can - * muck around with it freely without cmpxchg - */ - object = page->freelist; - page->freelist = NULL; - page->inuse = page->objects; + object = new_slab_objects(s, gfpflags, node, &c); - stat(s, ALLOC_SLAB); - c->node = page_to_nid(page); - c->page = page; + if (unlikely(!object)) { + if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(s, gfpflags, node); - if (kmem_cache_debug(s)) - goto debug; - goto load_freelist; + local_irq_restore(flags); + return NULL; + } } - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - local_irq_restore(flags); - return NULL; -debug: - if (!object || !alloc_debug_processing(s, page, object, addr)) - goto new_slab; + if (likely(!kmem_cache_debug(s))) + goto load_freelist; + + /* Only entered in the debug case */ + if (!alloc_debug_processing(s, c->page, object, addr)) + goto new_slab; /* Slab failed checks. Next slab needed */ c->freelist = get_freepointer(s, object); deactivate_slab(s, c); - c->page = NULL; c->node = NUMA_NO_NODE; local_irq_restore(flags); return object; @@ -2333,16 +2402,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page, was_frozen = new.frozen; new.inuse--; if ((!new.inuse || !prior) && !was_frozen && !n) { - n = get_node(s, page_to_nid(page)); - /* - * Speculatively acquire the list_lock. - * If the cmpxchg does not succeed then we may - * drop the list_lock without any processing. - * - * Otherwise the list_lock will synchronize with - * other processors updating the list of slabs. - */ - spin_lock_irqsave(&n->list_lock, flags); + + if (!kmem_cache_debug(s) && !prior) + + /* + * Slab was on no list before and will be partially empty + * We can defer the list move and instead freeze it. + */ + new.frozen = 1; + + else { /* Needs to be taken off a list */ + + n = get_node(s, page_to_nid(page)); + /* + * Speculatively acquire the list_lock. + * If the cmpxchg does not succeed then we may + * drop the list_lock without any processing. + * + * Otherwise the list_lock will synchronize with + * other processors updating the list of slabs. + */ + spin_lock_irqsave(&n->list_lock, flags); + + } } inuse = new.inuse; @@ -2352,7 +2434,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page, "__slab_free")); if (likely(!n)) { - /* + + /* + * If we just froze the page then put it onto the + * per cpu partial list. + */ + if (new.frozen && !was_frozen) + put_cpu_partial(s, page, 1); + + /* * The list lock was not taken therefore no list * activity can be necessary. */ @@ -2377,7 +2467,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, */ if (unlikely(!prior)) { remove_full(s, page); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } } @@ -2387,11 +2477,13 @@ static void __slab_free(struct kmem_cache *s, struct page *page, slab_empty: if (prior) { /* - * Slab still on the partial list. + * Slab on the partial list. */ remove_partial(n, page); stat(s, FREE_REMOVE_PARTIAL); - } + } else + /* Slab must be on the full list */ + remove_full(s, page); spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); @@ -2419,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s, slab_free_hook(s, x); redo: - /* * Determine the currently cpus per cpu slab. * The cpu may change afterward. However that does not matter since @@ -2683,7 +2774,7 @@ static void early_kmem_cache_node_alloc(int node) n = page->freelist; BUG_ON(!n); page->freelist = get_freepointer(kmem_cache_node, n); - page->inuse++; + page->inuse = 1; page->frozen = 0; kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG @@ -2693,7 +2784,7 @@ static void early_kmem_cache_node_alloc(int node) init_kmem_cache_node(n, kmem_cache_node); inc_slabs_node(kmem_cache_node, node, page->objects); - add_partial(n, page, 0); + add_partial(n, page, DEACTIVATE_TO_HEAD); } static void free_kmem_cache_nodes(struct kmem_cache *s) @@ -2909,7 +3000,34 @@ static int kmem_cache_open(struct kmem_cache *s, * The larger the object size is, the more pages we want on the partial * list to avoid pounding the page allocator excessively. */ - set_min_partial(s, ilog2(s->size)); + set_min_partial(s, ilog2(s->size) / 2); + + /* + * cpu_partial determined the maximum number of objects kept in the + * per cpu partial lists of a processor. + * + * Per cpu partial lists mainly contain slabs that just have one + * object freed. If they are used for allocation then they can be + * filled up again with minimal effort. The slab will never hit the + * per node partial lists and therefore no locking will be required. + * + * This setting also determines + * + * A) The number of objects from per cpu partial slabs dumped to the + * per node list when we reach the limit. + * B) The number of objects in cpu partial slabs to extract from the + * per node list when we run out of per cpu objects. We only fetch 50% + * to keep some capacity around for frees. + */ + if (s->size >= PAGE_SIZE) + s->cpu_partial = 2; + else if (s->size >= 1024) + s->cpu_partial = 6; + else if (s->size >= 256) + s->cpu_partial = 13; + else + s->cpu_partial = 30; + s->refcount = 1; #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; @@ -2968,13 +3086,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, /* * Attempt to free all partial slabs on a node. + * This is called from kmem_cache_close(). We must be the last thread + * using the cache and therefore we do not need to lock anymore. */ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) { - unsigned long flags; struct page *page, *h; - spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { remove_partial(n, page); @@ -2984,7 +3102,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) "Objects remaining on kmem_cache_close()"); } } - spin_unlock_irqrestore(&n->list_lock, flags); } /* @@ -3018,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); + up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); @@ -3026,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); sysfs_slab_remove(s); - } - up_write(&slub_lock); + } else + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -3345,23 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s) * list_lock. page->inuse here is the upper limit. */ list_for_each_entry_safe(page, t, &n->partial, lru) { - if (!page->inuse) { - remove_partial(n, page); - discard_slab(s, page); - } else { - list_move(&page->lru, - slabs_by_inuse + page->inuse); - } + list_move(&page->lru, slabs_by_inuse + page->inuse); + if (!page->inuse) + n->nr_partial--; } /* * Rebuild the partial list with the slabs filled up most * first and the least used slabs at the end. */ - for (i = objects - 1; i >= 0; i--) + for (i = objects - 1; i > 0; i--) list_splice(slabs_by_inuse + i, n->partial.prev); spin_unlock_irqrestore(&n->list_lock, flags); + + /* Release empty slabs */ + list_for_each_entry_safe(page, t, slabs_by_inuse, lru) + discard_slab(s, page); } kfree(slabs_by_inuse); @@ -4317,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, for_each_possible_cpu(cpu) { struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + struct page *page; if (!c || c->node < 0) continue; @@ -4332,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s, total += x; nodes[c->node] += x; } + page = c->partial; + + if (page) { + x = page->pobjects; + total += x; + nodes[c->node] += x; + } per_cpu[c->node]++; } } @@ -4410,11 +4536,12 @@ struct slab_attribute { }; #define SLAB_ATTR_RO(_name) \ - static struct slab_attribute _name##_attr = __ATTR_RO(_name) + static struct slab_attribute _name##_attr = \ + __ATTR(_name, 0400, _name##_show, NULL) #define SLAB_ATTR(_name) \ static struct slab_attribute _name##_attr = \ - __ATTR(_name, 0644, _name##_show, _name##_store) + __ATTR(_name, 0600, _name##_show, _name##_store) static ssize_t slab_size_show(struct kmem_cache *s, char *buf) { @@ -4483,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(min_partial); +static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%u\n", s->cpu_partial); +} + +static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + unsigned long objects; + int err; + + err = strict_strtoul(buf, 10, &objects); + if (err) + return err; + + s->cpu_partial = objects; + flush_all(s); + return length; +} +SLAB_ATTR(cpu_partial); + static ssize_t ctor_show(struct kmem_cache *s, char *buf) { if (!s->ctor) @@ -4521,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf) +{ + int objects = 0; + int pages = 0; + int cpu; + int len; + + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial; + + if (page) { + pages += page->pages; + objects += page->pobjects; + } + } + + len = sprintf(buf, "%d(%d)", objects, pages); + +#ifdef CONFIG_SMP + for_each_online_cpu(cpu) { + struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial; + + if (page && len < PAGE_SIZE - 20) + len += sprintf(buf + len, " C%d=%d(%d)", cpu, + page->pobjects, page->pages); + } +#endif + return len + sprintf(buf + len, "\n"); +} +SLAB_ATTR_RO(slabs_cpu_partial); + static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); @@ -4843,6 +5022,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass); STAT_ATTR(ORDER_FALLBACK, order_fallback); STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); +STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); +STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); #endif static struct attribute *slab_attrs[] = { @@ -4851,6 +5032,7 @@ static struct attribute *slab_attrs[] = { &objs_per_slab_attr.attr, &order_attr.attr, &min_partial_attr.attr, + &cpu_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, &partial_attr.attr, @@ -4863,6 +5045,7 @@ static struct attribute *slab_attrs[] = { &destroy_by_rcu_attr.attr, &shrink_attr.attr, &reserved_attr.attr, + &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, &slabs_attr.attr, @@ -4904,6 +5087,8 @@ static struct attribute *slab_attrs[] = { &order_fallback_attr.attr, &cmpxchg_double_fail_attr.attr, &cmpxchg_double_cpu_fail_attr.attr, + &cpu_partial_alloc_attr.attr, + &cpu_partial_free_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, @@ -5255,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); + proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); diff --git a/mm/swap.c b/mm/swap.c index 3a442f18b0b3..87627f181c3f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -78,39 +78,22 @@ static void put_compound_page(struct page *page) { if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ - struct page *page_head = page->first_page; - smp_rmb(); - /* - * If PageTail is still set after smp_rmb() we can be sure - * that the page->first_page we read wasn't a dangling pointer. - * See __split_huge_page_refcount() smp_wmb(). - */ - if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && + get_page_unless_zero(page_head))) { unsigned long flags; /* - * Verify that our page_head wasn't converted - * to a a regular page before we got a - * reference on it. + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. */ - if (unlikely(!PageHead(page_head))) { - /* PageHead is cleared after PageTail */ - smp_rmb(); - VM_BUG_ON(PageTail(page)); - goto out_put_head; - } - /* - * Only run compound_lock on a valid PageHead, - * after having it pinned with - * get_page_unless_zero() above. - */ - smp_mb(); - /* page_head wasn't a dangling pointer */ flags = compound_lock_irqsave(page_head); if (unlikely(!PageTail(page))) { /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); VM_BUG_ON(PageHead(page_head)); - out_put_head: if (put_page_testzero(page_head)) __put_single_page(page_head); out_put_single: @@ -121,16 +104,17 @@ static void put_compound_page(struct page *page) VM_BUG_ON(page_head != page->first_page); /* * We can release the refcount taken by - * get_page_unless_zero now that - * split_huge_page_refcount is blocked on the - * compound_lock. + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on + * the compound_lock. */ if (put_page_testzero(page_head)) VM_BUG_ON(1); /* __split_huge_page_refcount will wait now */ - VM_BUG_ON(atomic_read(&page->_count) <= 0); - atomic_dec(&page->_count); + VM_BUG_ON(page_mapcount(page) <= 0); + atomic_dec(&page->_mapcount); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); if (put_page_testzero(page_head)) { if (PageHead(page_head)) @@ -160,6 +144,45 @@ void put_page(struct page *page) } EXPORT_SYMBOL(put_page); +/* + * This function is exported but must not be called by anything other + * than get_page(). It implements the slow path of get_page(). + */ +bool __get_page_tail(struct page *page) +{ + /* + * This takes care of get_page() if run on a tail page + * returned by one of the get_user_pages/follow_page variants. + * get_user_pages/follow_page itself doesn't need the compound + * lock because it runs __get_page_tail_foll() under the + * proper PT lock that already serializes against + * split_huge_page(). + */ + unsigned long flags; + bool got = false; + struct page *page_head = compound_trans_head(page); + + if (likely(page != page_head && get_page_unless_zero(page_head))) { + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + got = true; + } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); + } + return got; +} +EXPORT_SYMBOL(__get_page_tail); + /** * put_pages_list() - release a list of pages * @pages: list of pages threaded on page->lru diff --git a/mm/swapfile.c b/mm/swapfile.c index 17bc224bce68..c9d654009125 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1617,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = try_to_unuse(type); - test_set_oom_score_adj(oom_score_adj); + compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { /* diff --git a/mm/thrash.c b/mm/thrash.c index e53f7d02c17c..57ad495dbd54 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -29,7 +29,7 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; -struct mem_cgroup *swap_token_memcg; +static struct mem_cgroup *swap_token_memcg; #ifdef CONFIG_CGROUP_MEM_RES_CTLR static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 464621d18eb2..b669aa6f6caf 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -725,9 +725,10 @@ static void free_unmap_vmap_area_addr(unsigned long addr) #define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) #define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ #define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ -#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ - VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ - VMALLOC_PAGES / NR_CPUS / 16)) +#define VMAP_BBMAP_BITS \ + VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16)) #define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) @@ -1252,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, +static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, void *caller) { - struct vm_struct *tmp, **p; - vm->flags = flags; vm->addr = (void *)va->va_start; vm->size = va->va_end - va->va_start; vm->caller = caller; va->private = vm; va->flags |= VM_VM_AREA; +} +static void insert_vmalloc_vmlist(struct vm_struct *vm) +{ + struct vm_struct *tmp, **p; + + vm->flags &= ~VM_UNLIST; write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { if (tmp->addr >= vm->addr) @@ -1274,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, write_unlock(&vmlist_lock); } +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + setup_vmalloc_vm(vm, va, flags, caller); + insert_vmalloc_vmlist(vm); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, void *caller) @@ -1312,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - insert_vmalloc_vm(area, va, flags, caller); + /* + * When this function is called from __vmalloc_node_range, + * we do not add vm_struct to vmlist here to avoid + * accessing uninitialized members of vm_struct such as + * pages and nr_pages fields. They will be set later. + * To distinguish it from others, we use a VM_UNLIST flag. + */ + if (flags & VM_UNLIST) + setup_vmalloc_vm(area, va, flags, caller); + else + insert_vmalloc_vm(area, va, flags, caller); + return area; } @@ -1380,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr) va = find_vmap_area((unsigned long)addr); if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; - struct vm_struct *tmp, **p; - /* - * remove from list and disallow access to this vm_struct - * before unmap. (address range confliction is maintained by - * vmap.) - */ - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) - ; - *p = tmp->next; - write_unlock(&vmlist_lock); + + if (!(vm->flags & VM_UNLIST)) { + struct vm_struct *tmp, **p; + /* + * remove from list and disallow access to + * this vm_struct before unmap. (address range + * confliction is maintained by vmap.) + */ + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + } vmap_debug_free_range(va->va_start, va->va_end); free_unmap_vmap_area(va); @@ -1567,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, return area->addr; fail: - warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, " - "allocated %ld of %ld bytes\n", + warn_alloc_failed(gfp_mask, order, + "vmalloc: allocation failure, allocated %ld of %ld bytes\n", (area->nr_pages*PAGE_SIZE), area->size); vfree(area->addr); return NULL; @@ -1599,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, size = PAGE_ALIGN(size); if (!size || (size >> PAGE_SHIFT) > totalram_pages) - return NULL; - - area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, - gfp_mask, caller); + goto fail; + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + start, end, node, gfp_mask, caller); if (!area) - return NULL; + goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); /* + * In this function, newly allocated vm_struct is not added + * to vmlist at __get_vm_area_node(). so, it is added here. + */ + insert_vmalloc_vmlist(area); + + /* * A ref_count = 3 is needed because the vm_struct and vmap_area * structures allocated in the __get_vm_area_node() function contain * references to the virtual address of the vmalloc'ed block. @@ -1617,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, kmemleak_alloc(addr, real_size, 3, gfp_mask); return addr; + +fail: + warn_alloc_failed(gfp_mask, 0, + "vmalloc: allocation failure: %lu bytes\n", + real_size); + return NULL; } /** @@ -2139,6 +2176,14 @@ struct vm_struct *alloc_vm_area(size_t size) return NULL; } + /* + * If the allocated address space is passed to a hypercall + * before being used then we cannot rely on a page fault to + * trigger an update of the page tables. So sync all the page + * tables here. + */ + vmalloc_sync_all(); + return area; } EXPORT_SYMBOL_GPL(alloc_vm_area); diff --git a/mm/vmscan.c b/mm/vmscan.c index 7ef69124fa3e..132d1ddb2238 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -105,7 +105,6 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; - struct memcg_scanrecord *memcg_record; /* * Nodemask of nodes allowed by the caller. If NULL, all nodes @@ -496,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, return PAGE_ACTIVATE; } - /* - * Wait on writeback if requested to. This happens when - * direct reclaiming a large contiguous area and the - * first attempt to free a range of pages fails. - */ - if (PageWriteback(page) && - (sc->reclaim_mode & RECLAIM_MODE_SYNC)) - wait_on_page_writeback(page); - if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); @@ -643,13 +633,14 @@ redo: lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); /* - * When racing with an mlock clearing (page is - * unlocked), make sure that if the other thread does - * not observe our setting of PG_lru and fails - * isolation, we see PG_mlocked cleared below and move + * When racing with an mlock or AS_UNEVICTABLE clearing + * (page is unlocked) make sure that if the other thread + * does not observe our setting of PG_lru and fails + * isolation/check_move_unevictable_page, + * we see PG_mlocked/AS_UNEVICTABLE cleared below and move * the page back to the evictable list. * - * The other side is TestClearPageMlocked(). + * The other side is TestClearPageMlocked() or shmem_lock(). */ smp_mb(); } @@ -760,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages) */ static unsigned long shrink_page_list(struct list_head *page_list, struct zone *zone, - struct scan_control *sc) + struct scan_control *sc, + int priority, + unsigned long *ret_nr_dirty, + unsigned long *ret_nr_writeback) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); @@ -768,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; + unsigned long nr_writeback = 0; cond_resched(); @@ -804,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); if (PageWriteback(page)) { + nr_writeback++; /* - * Synchronous reclaim is performed in two passes, - * first an asynchronous pass over the list to - * start parallel writeback, and a second synchronous - * pass to wait for the IO to complete. Wait here - * for any page for which writeback has already - * started. + * Synchronous reclaim cannot queue pages for + * writeback due to the possibility of stack overflow + * but if it encounters a page under writeback, wait + * for the IO to complete. */ if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) @@ -866,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (PageDirty(page)) { nr_dirty++; + /* + * Only kswapd can writeback filesystem pages to + * avoid risk of stack overflow but do not writeback + * unless under significant pressure. + */ + if (page_is_file_cache(page) && + (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) { + /* + * Immediately reclaim when written back. + * Similar in principal to deactivate_page() + * except we already have the page isolated + * and know it's dirty + */ + inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE); + SetPageReclaim(page); + + goto keep_locked; + } + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) @@ -1000,6 +1013,8 @@ keep_lumpy: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + *ret_nr_dirty += nr_dirty; + *ret_nr_writeback += nr_writeback; return nr_reclaimed; } @@ -1013,23 +1028,27 @@ keep_lumpy: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode, int file) +int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file) { + bool all_lru_mode; int ret = -EINVAL; /* Only take pages on the LRU. */ if (!PageLRU(page)) return ret; + all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) == + (ISOLATE_ACTIVE|ISOLATE_INACTIVE); + /* * When checking the active state, we need to be sure we are * dealing with comparible boolean values. Take the logical not * of each. */ - if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) + if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE)) return ret; - if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) + if (!all_lru_mode && !!page_is_file_cache(page) != file) return ret; /* @@ -1042,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file) ret = -EBUSY; + if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page))) + return ret; + + if ((mode & ISOLATE_UNMAPPED) && page_mapped(page)) + return ret; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -1077,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode, int file) + unsigned long *scanned, int order, isolate_mode_t mode, + int file) { unsigned long nr_taken = 0; unsigned long nr_lumpy_taken = 0; @@ -1202,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, - int mode, struct zone *z, - int active, int file) + isolate_mode_t mode, + struct zone *z, int active, int file) { int lru = LRU_BASE; if (active) @@ -1349,8 +1375,6 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1394,14 +1418,10 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone, reclaim_stat->recent_scanned[0] += *nr_anon; reclaim_stat->recent_scanned[1] += *nr_file; - if (!scanning_global_lru(sc)) { - sc->memcg_record->nr_scanned[0] += *nr_anon; - sc->memcg_record->nr_scanned[1] += *nr_file; - } } /* - * Returns true if the caller should wait to clean dirty/writeback pages. + * Returns true if a direct reclaim should wait on pages under writeback. * * If we are direct reclaiming for contiguous pages and we do not reclaim * everything in the list, try again and wait for writeback IO to complete. @@ -1423,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; - /* If we have relaimed everything on the isolated list, no stall */ + /* If we have reclaimed everything on the isolated list, no stall */ if (nr_freed == nr_taken) return false; @@ -1455,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, unsigned long nr_taken; unsigned long nr_anon; unsigned long nr_file; + unsigned long nr_dirty = 0; + unsigned long nr_writeback = 0; + isolate_mode_t reclaim_mode = ISOLATE_INACTIVE; while (unlikely(too_many_isolated(zone, file, sc))) { congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -1465,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } set_reclaim_mode(priority, sc, false); + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) + reclaim_mode |= ISOLATE_ACTIVE; + lru_add_drain(); + + if (!sc->may_unmap) + reclaim_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + reclaim_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { - nr_taken = isolate_pages_global(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, 0, file); + nr_taken = isolate_pages_global(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) __count_zone_vm_events(PGSCAN_KSWAPD, zone, @@ -1482,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scanned); } else { - nr_taken = mem_cgroup_isolate_pages(nr_to_scan, - &page_list, &nr_scanned, sc->order, - sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, - 0, file); + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, + &nr_scanned, sc->order, reclaim_mode, zone, + sc->mem_cgroup, 0, file); /* * mem_cgroup_isolate_pages() keeps track of * scanned pages on its own. @@ -1503,17 +1529,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, spin_unlock_irq(&zone->lru_lock); - nr_reclaimed = shrink_page_list(&page_list, zone, sc); + nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority, + &nr_dirty, &nr_writeback); /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { set_reclaim_mode(priority, sc, true); - nr_reclaimed += shrink_page_list(&page_list, zone, sc); + nr_reclaimed += shrink_page_list(&page_list, zone, sc, + priority, &nr_dirty, &nr_writeback); } - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_freed[file] += nr_reclaimed; - local_irq_disable(); if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_reclaimed); @@ -1521,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + /* + * If reclaim is isolating dirty pages under writeback, it implies + * that the long-lived page allocation rate is exceeding the page + * laundering rate. Either the global limits are not being effective + * at throttling processes due to the page distribution throughout + * zones or there is heavy usage of a slow backing device. The + * only option is to throttle from reclaim context which is not ideal + * as there is no guarantee the dirtying process is throttled in the + * same way balance_dirty_pages() manages. + * + * This scales the number of dirty pages that must be under writeback + * before throttling depending on priority. It is a simple backoff + * function that has the most effect in the range DEF_PRIORITY to + * DEF_PRIORITY-2 which is the priority reclaim is considered to be + * in trouble and reclaim is considered to be in trouble. + * + * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle + * DEF_PRIORITY-1 50% must be PageWriteback + * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble + * ... + * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any + * isolated page is PageWriteback + */ + if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority))) + wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, zone_idx(zone), nr_scanned, nr_reclaimed, @@ -1592,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); unsigned long nr_rotated = 0; + isolate_mode_t reclaim_mode = ISOLATE_ACTIVE; lru_add_drain(); + + if (!sc->may_unmap) + reclaim_mode |= ISOLATE_UNMAPPED; + if (!sc->may_writepage) + reclaim_mode |= ISOLATE_CLEAN; + spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, + reclaim_mode, zone, 1, file); zone->pages_scanned += pgscanned; } else { nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, + reclaim_mode, zone, sc->mem_cgroup, 1, file); /* * mem_cgroup_isolate_pages() keeps track of @@ -1613,8 +1671,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } reclaim_stat->recent_scanned[file] += nr_taken; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) @@ -1666,8 +1722,6 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * get_scan_ratio. */ reclaim_stat->recent_rotated[file] += nr_rotated; - if (!scanning_global_lru(sc)) - sc->memcg_record->nr_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); @@ -1713,7 +1767,7 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else - low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup, zone); return low; } #else @@ -1756,7 +1810,7 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) if (scanning_global_lru(sc)) low = inactive_file_is_low_global(zone); else - low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup, zone); return low; } @@ -1808,23 +1862,22 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, u64 fraction[2], denominator; enum lru_list l; int noswap = 0; - int force_scan = 0; - unsigned long nr_force_scan[2]; - + bool force_scan = false; - anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); - - if (((anon + file) >> priority) < SWAP_CLUSTER_MAX) { - /* kswapd does zone balancing and need to scan this zone */ - if (scanning_global_lru(sc) && current_is_kswapd()) - force_scan = 1; - /* memcg may have small limit and need to avoid priority drop */ - if (!scanning_global_lru(sc)) - force_scan = 1; - } + /* + * If the zone or memcg is small, nr[l] can be 0. This + * results in no scanning on this priority and a potential + * priority drop. Global direct reclaim can go to the next + * zone and tends to have no problems. Global kswapd is for + * zone balancing and it needs to scan a minimum amount. When + * reclaiming for a memcg, a priority drop can cause high + * latencies, so it's better to scan a minimum amount there as + * well. + */ + if (scanning_global_lru(sc) && current_is_kswapd()) + force_scan = true; + if (!scanning_global_lru(sc)) + force_scan = true; /* If we have no swap space, do not bother scanning anon pages. */ if (!sc->may_swap || (nr_swap_pages <= 0)) { @@ -1832,11 +1885,14 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 0; fraction[1] = 1; denominator = 1; - nr_force_scan[0] = 0; - nr_force_scan[1] = SWAP_CLUSTER_MAX; goto out; } + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, @@ -1845,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = 1; fraction[1] = 0; denominator = 1; - nr_force_scan[0] = SWAP_CLUSTER_MAX; - nr_force_scan[1] = 0; goto out; } } @@ -1895,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc, fraction[0] = ap; fraction[1] = fp; denominator = ap + fp + 1; - if (force_scan) { - unsigned long scan = SWAP_CLUSTER_MAX; - nr_force_scan[0] = div64_u64(scan * ap, denominator); - nr_force_scan[1] = div64_u64(scan * fp, denominator); - } out: for_each_evictable_lru(l) { int file = is_file_lru(l); @@ -1908,20 +1957,10 @@ out: scan = zone_nr_lru_pages(zone, sc, l); if (priority || noswap) { scan >>= priority; + if (!scan && force_scan) + scan = SWAP_CLUSTER_MAX; scan = div64_u64(scan * fraction[file], denominator); } - - /* - * If zone is small or memcg is small, nr[l] can be 0. - * This results no-scan on this priority and priority drop down. - * For global direct reclaim, it can visit next zone and tend - * not to have problems. For global kswapd, it's for zone - * balancing and it need to scan a small amounts. When using - * memcg, priority drop can cause big latency. So, it's better - * to scan small amount. See may_noscan above. - */ - if (!scan && force_scan) - scan = nr_force_scan[file]; nr[l] = scan; } } @@ -2000,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct blk_plug plug; restart: nr_reclaimed = 0; nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { @@ -2029,6 +2070,7 @@ restart: if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } + blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; /* @@ -2061,14 +2103,19 @@ restart: * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. + * + * This function returns true if a zone is being reclaimed for a costly + * high-order allocation and compaction is either ready to begin or deferred. + * This indicates to the caller that it should retry the allocation or fail. */ -static void shrink_zones(int priority, struct zonelist *zonelist, +static bool shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { struct zoneref *z; struct zone *zone; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; + bool should_abort_reclaim = false; for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(sc->gfp_mask), sc->nodemask) { @@ -2083,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist, continue; if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ + if (COMPACTION_BUILD) { + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. + * Even though compaction is invoked for any + * non-zero order, only frequent costly order + * reclamation is disruptive enough to become a + * noticable problem, like transparent huge page + * allocations. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER && + (compaction_suitable(zone, sc->order) || + compaction_deferred(zone))) { + should_abort_reclaim = true; + continue; + } + } /* * This steals pages from memory cgroups over softlimit * and returns the number of reclaimed pages and @@ -2100,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist, shrink_zone(priority, zone, sc); } + + return should_abort_reclaim; } static bool zone_reclaimable(struct zone *zone) @@ -2164,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(sc->mem_cgroup); - shrink_zones(priority, zonelist, sc); + if (shrink_zones(priority, zonelist, sc)) + break; + /* * Don't shrink slabs when reclaiming memory from * over limit cgroups @@ -2268,10 +2336,9 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, - gfp_t gfp_mask, bool noswap, - struct zone *zone, - struct memcg_scanrecord *rec, - unsigned long *scanned) + gfp_t gfp_mask, bool noswap, + struct zone *zone, + unsigned long *nr_scanned) { struct scan_control sc = { .nr_scanned = 0, @@ -2281,9 +2348,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_swap = !noswap, .order = 0, .mem_cgroup = mem, - .memcg_record = rec, }; - unsigned long start, end; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -2292,7 +2357,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, sc.may_writepage, sc.gfp_mask); - start = sched_clock(); /* * NOTE: Although we can get the priority field, using it * here is not a good idea, since it limits the pages we can scan. @@ -2301,25 +2365,19 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, * the priority and make it zero. */ shrink_zone(0, zone, &sc); - end = sched_clock(); - - if (rec) - rec->elapsed += end - start; - *scanned = sc.nr_scanned; trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; } unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, - bool noswap, - struct memcg_scanrecord *rec) + bool noswap) { struct zonelist *zonelist; unsigned long nr_reclaimed; - unsigned long start, end; int nid; struct scan_control sc = { .may_writepage = !laptop_mode, @@ -2328,7 +2386,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .nr_to_reclaim = SWAP_CLUSTER_MAX, .order = 0, .mem_cgroup = mem_cont, - .memcg_record = rec, .nodemask = NULL, /* we don't care the placement */ .gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK), @@ -2337,7 +2394,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .gfp_mask = sc.gfp_mask, }; - start = sched_clock(); /* * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't * take care of from where we get pages. So the node where we start the @@ -2352,9 +2408,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, sc.gfp_mask); nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink); - end = sched_clock(); - if (rec) - rec->elapsed += end - start; trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); @@ -2529,6 +2582,9 @@ loop_again: high_wmark_pages(zone), 0, 0)) { end_zone = i; break; + } else { + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); } } if (i < 0) @@ -2719,6 +2775,8 @@ out: /* If balanced, clear the congested flag */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= *classzone_idx) + balanced += zone->present_pages; } } @@ -2792,7 +2850,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) static int kswapd(void *p) { unsigned long order, new_order; + unsigned balanced_order; int classzone_idx, new_classzone_idx; + int balanced_classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2823,7 +2883,9 @@ static int kswapd(void *p) set_freezable(); order = new_order = 0; + balanced_order = 0; classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; + balanced_classzone_idx = classzone_idx; for ( ; ; ) { int ret; @@ -2832,7 +2894,8 @@ static int kswapd(void *p) * new request of a similar or harder type will succeed soon * so consider going to sleep on the basis we reclaimed at */ - if (classzone_idx >= new_classzone_idx && order == new_order) { + if (balanced_classzone_idx >= new_classzone_idx && + balanced_order == new_order) { new_order = pgdat->kswapd_max_order; new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2847,9 +2910,12 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order, classzone_idx); + kswapd_try_to_sleep(pgdat, balanced_order, + balanced_classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + new_order = order; + new_classzone_idx = classzone_idx; pgdat->kswapd_max_order = 0; pgdat->classzone_idx = pgdat->nr_zones - 1; } @@ -2864,7 +2930,9 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, &classzone_idx); + balanced_classzone_idx = classzone_idx; + balanced_order = balance_pgdat(pgdat, order, + &balanced_classzone_idx); } } return 0; @@ -3376,66 +3444,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) } -/** - * scan_zone_unevictable_pages - check unevictable list for evictable pages - * @zone - zone of which to scan the unevictable list - * - * Scan @zone's unevictable LRU lists to check for pages that have become - * evictable. Move those that have to @zone's inactive list where they - * become candidates for reclaim, unless shrink_inactive_zone() decides - * to reactivate them. Pages that are still unevictable are rotated - * back onto @zone's unevictable list. - */ -#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -static void scan_zone_unevictable_pages(struct zone *zone) +static void warn_scan_unevictable_pages(void) { - struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; - unsigned long scan; - unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); - - while (nr_to_scan > 0) { - unsigned long batch_size = min(nr_to_scan, - SCAN_UNEVICTABLE_BATCH_SIZE); - - spin_lock_irq(&zone->lru_lock); - for (scan = 0; scan < batch_size; scan++) { - struct page *page = lru_to_page(l_unevictable); - - if (!trylock_page(page)) - continue; - - prefetchw_prev_lru_page(page, l_unevictable, flags); - - if (likely(PageLRU(page) && PageUnevictable(page))) - check_move_unevictable_page(page, zone); - - unlock_page(page); - } - spin_unlock_irq(&zone->lru_lock); - - nr_to_scan -= batch_size; - } -} - - -/** - * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages - * - * A really big hammer: scan all zones' unevictable LRU lists to check for - * pages that have become evictable. Move those back to the zones' - * inactive list where they become candidates for reclaim. - * This occurs when, e.g., we have unswappable pages on the unevictable lists, - * and we add swap to the system. As such, it runs in the context of a task - * that has possibly/probably made some previously unevictable pages - * evictable. - */ -static void scan_all_zones_unevictable_pages(void) -{ - struct zone *zone; - - for_each_zone(zone) { - scan_zone_unevictable_pages(zone); - } + printk_once(KERN_WARNING + "The scan_unevictable_pages sysctl/node-interface has been " + "disabled for lack of a legitimate use case. If you have " + "one, please send an email to linux-mm@kvack.org.\n"); } /* @@ -3448,11 +3462,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { + warn_scan_unevictable_pages(); proc_doulongvec_minmax(table, write, buffer, length, ppos); - - if (write && *(unsigned long *)table->data) - scan_all_zones_unevictable_pages(); - scan_unevictable_pages = 0; return 0; } @@ -3467,6 +3478,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, char *buf) { + warn_scan_unevictable_pages(); return sprintf(buf, "0\n"); /* always zero; should fit... */ } @@ -3474,19 +3486,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev, struct sysdev_attribute *attr, const char *buf, size_t count) { - struct zone *node_zones = NODE_DATA(dev->id)->node_zones; - struct zone *zone; - unsigned long res; - unsigned long req = strict_strtoul(buf, 10, &res); - - if (!req) - return 1; /* zero is no-op */ - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!populated_zone(zone)) - continue; - scan_zone_unevictable_pages(zone); - } + warn_scan_unevictable_pages(); return 1; } diff --git a/mm/vmstat.c b/mm/vmstat.c index 20c18b7694b2..8fd603b1665e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu) * * vm_stat contains the global counters */ -atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; +atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp; EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP @@ -659,7 +659,7 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, } #endif -#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) +#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA) #ifdef CONFIG_ZONE_DMA #define TEXT_FOR_DMA(xx) xx "_dma", #else @@ -702,6 +702,7 @@ const char * const vmstat_text[] = { "nr_unstable", "nr_bounce", "nr_vmscan_write", + "nr_vmscan_immediate_reclaim", "nr_writeback_temp", "nr_isolated_anon", "nr_isolated_file", @@ -788,7 +789,7 @@ const char * const vmstat_text[] = { #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; -#endif /* CONFIG_PROC_FS || CONFIG_SYSFS */ +#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ #ifdef CONFIG_PROC_FS |