summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
authorIngo Molnar <mingo@kernel.org>2016-03-04 12:12:08 +0100
committerIngo Molnar <mingo@kernel.org>2016-03-04 12:12:08 +0100
commitbc94b99636dc7bcccce439a9fb9c00065e2e2627 (patch)
treebddbd29a5fd7b2d270d039efc1d6858791a2c98f /mm
parent4650bac1fc45d64aef62ab99aa4db93d41dedbd9 (diff)
parentfc77dbd34c5c99bce46d40a2491937c3bcbd10af (diff)
Merge tag 'v4.5-rc6' into core/resources, to resolve conflict
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/filemap.c13
-rw-r--r--mm/gup.c4
-rw-r--r--mm/huge_memory.c102
-rw-r--r--mm/hugetlb.c13
-rw-r--r--mm/internal.h31
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memory.c28
-rw-r--r--mm/mempolicy.c14
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mmap.c119
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page_alloc.c7
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/slab.c12
-rw-r--r--mm/slab.h1
-rw-r--r--mm/slab_common.c1
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c38
-rw-r--r--mm/util.c27
-rw-r--r--mm/vmpressure.c3
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c70
25 files changed, 307 insertions, 215 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06b15c0..03cbfa072f42 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
config DEFERRED_STRUCT_PAGE_INIT
- bool "Defer initialisation of struct pages to kswapd"
+ bool "Defer initialisation of struct pages to kthreads"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
a subset of memmap at boot and then initialise the rest in parallel
- when kswapd starts. This has a potential performance impact on
- processes running early in the lifetime of the systemm until kswapd
- finishes the initialisation.
+ by starting one-off "pgdatinitX" kernel thread for each node X. This
+ has a potential performance impact on processes running early in the
+ lifetime of the system until these kthreads finish the
+ initialisation.
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index cc5d29d2da9b..c554d173a65f 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
return 0;
out_destroy_stat:
- while (--i)
+ while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
else
cond_resched();
diff --git a/mm/filemap.c b/mm/filemap.c
index bc943867d68c..3461d97ecb30 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -446,7 +446,8 @@ int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
@@ -482,13 +483,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
- if (dax_mapping(mapping) && mapping->nrexceptional) {
- err = dax_writeback_mapping_range(mapping, lstart, lend);
- if (err)
- return err;
- }
-
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -1890,6 +1886,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
* @offset: page index
+ * @gfp_mask: memory allocation flags
*
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
diff --git a/mm/gup.c b/mm/gup.c
index b64a36175884..7bf19ffa2199 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -430,10 +430,8 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* Anon pages in shared mappings are surprising: now
* just reject it.
*/
- if (!is_cow_mapping(vm_flags)) {
- WARN_ON_ONCE(vm_flags & VM_MAYWRITE);
+ if (!is_cow_mapping(vm_flags))
return -EFAULT;
- }
}
} else if (!(vm_flags & VM_READ)) {
if (!(gup_flags & FOLL_FORCE))
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd3a07b3e6f4..e10a4fee88d2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ if (pgtable)
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
- pgtable_t pgtable;
+ pgtable_t pgtable = NULL;
int ret;
- ret = -ENOMEM;
- pgtable = pte_alloc_one(dst_mm, addr);
- if (unlikely(!pgtable))
- goto out;
+ if (!vma_is_dax(vma)) {
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+ }
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (pmd_trans_huge(pmd)) {
+ if (!vma_is_dax(vma)) {
/* thp accounting separate from pmd_devmap accounting */
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
+ vma_is_anonymous(vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -2835,6 +2836,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t _pmd;
bool young, write, dirty;
+ unsigned long addr;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2860,10 +2862,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
+ pmdp_huge_split_prepare(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry, *pte;
/*
* Note that NUMA hinting access restrictions are not
@@ -2884,9 +2887,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
if (dirty)
SetPageDirty(page + i);
- pte = pte_offset_map(&_pmd, haddr);
+ pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
+ set_pte_at(mm, addr, pte, entry);
atomic_inc(&page[i]._mapcount);
pte_unmap(pte);
}
@@ -2936,7 +2939,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pmd_populate(mm, pmd, pgtable);
if (freeze) {
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
page_remove_rmap(page + i, false);
put_page(page + i);
}
@@ -3358,6 +3361,7 @@ int total_mapcount(struct page *page)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct anon_vma *anon_vma;
int count, mapcount, ret;
bool mlocked;
@@ -3401,19 +3405,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* Prevent deferred_split_scan() touching ->_count */
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && count == 1) {
if (!list_empty(page_deferred_list(head))) {
- split_queue_len--;
+ pgdata->split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
__split_huge_page(page, list);
ret = 0;
} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
@@ -3421,7 +3425,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
unfreeze_page(anon_vma, head);
ret = -EBUSY;
}
@@ -3436,64 +3440,65 @@ out:
void free_transhuge_page(struct page *page)
{
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- split_queue_len--;
+ pgdata->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
- list_add_tail(page_deferred_list(page), &split_queue);
- split_queue_len++;
+ list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+ pgdata->split_queue_len++;
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- /*
- * Split a page from split_queue will free up at least one page,
- * at most HPAGE_PMD_NR - 1. We don't track exact number.
- * Let's use HPAGE_PMD_NR / 2 as ballpark.
- */
- return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ return ACCESS_ONCE(pgdata->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&split_queue_lock, flags);
- list_splice_init(&split_queue, &list);
-
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &list) {
+ list_for_each_safe(pos, next, &pgdata->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
- /* race with put_compound_page() */
- if (!get_page_unless_zero(page)) {
+ if (get_page_unless_zero(page)) {
+ list_move(page_deferred_list(page), &list);
+ } else {
+ /* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- split_queue_len--;
+ pgdata->split_queue_len--;
}
+ if (!--sc->nr_to_scan)
+ break;
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3510,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
put_page(page);
}
- spin_lock_irqsave(&split_queue_lock, flags);
- list_splice_tail(&list, &split_queue);
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ list_splice_tail(&list, &pgdata->split_queue);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- return split * HPAGE_PMD_NR / 2;
+ /*
+ * Stop shrinker if we didn't split any page, but the queue is empty.
+ * This can happen if pages were freed under us.
+ */
+ if (!split && list_empty(&pgdata->split_queue))
+ return SHRINK_STOP;
+ return split;
}
static struct shrinker deferred_split_shrinker = {
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
};
#ifdef CONFIG_DEBUG_FS
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 12908dcf5831..01f2b48c8618 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
- BUG_ON(page_count(page));
- BUG_ON(page_mapcount(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_page_count(p, 0);
set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
/*
@@ -2629,8 +2630,10 @@ static int __init hugetlb_init(void)
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (default_hstate_max_huge_pages) {
+ if (!default_hstate.max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ }
hugetlb_init_hstates();
gather_bootmem_prealloc();
diff --git a/mm/internal.h b/mm/internal.h
index ed8b5ffcf9b1..a38a21ebddb4 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -216,6 +216,37 @@ static inline bool is_cow_mapping(vm_flags_t flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
+/*
+ * These three helpers classifies VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area - atomatically grows in one direction
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+ return (flags & VM_STACK) == VM_STACK;
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
/* mm/util.c */
void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent);
diff --git a/mm/memblock.c b/mm/memblock.c
index d2ed81e59a94..dd7989929f13 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
* Remaining API functions
*/
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
return memblock.memory.total_size;
}
diff --git a/mm/memory.c b/mm/memory.c
index 30991f83d0bf..8132787ae4d5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1591,10 +1591,15 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr,
* than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP
* without pte special, it would there be refcounted as a normal page.
*/
- if (!HAVE_PTE_SPECIAL && pfn_t_valid(pfn)) {
+ if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) {
struct page *page;
- page = pfn_t_to_page(pfn);
+ /*
+ * At this point we are committed to insert_page()
+ * regardless of whether the caller specified flags that
+ * result in pfn_t_has_page() == false.
+ */
+ page = pfn_to_page(pfn_t_to_pfn(pfn));
return insert_page(vma, addr, page, vma->vm_page_prot);
}
return insert_pfn(vma, addr, pfn, vma->vm_page_prot);
@@ -2232,11 +2237,6 @@ static int wp_page_shared(struct mm_struct *mm, struct vm_area_struct *vma,
page_cache_get(old_page);
- /*
- * Only catch write-faults on shared writable pages,
- * read-only shared pages can get COWed by
- * get_user_pages(.write=1, .force=1).
- */
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
@@ -3404,8 +3404,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+ * didn't become pmd_trans_huge under us and then back to pmd_none, as
+ * a result of MADV_DONTNEED running immediately after a huge pmd fault
+ * in a different thread of this mm, in turn leading to a misleading
+ * pmd_trans_huge() retval. All we have to ensure is that it is a
+ * regular pmd that we can walk with pte_offset_map() and we can do that
+ * through an atomic read in C, which is what pmd_trans_unstable()
+ * provides.
+ */
+ if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27d135408a22..4c4187c0e1de 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -548,8 +548,7 @@ retry:
goto retry;
}
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, qp->pagelist, flags);
+ migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (vma->vm_flags & VM_PFNMAP)
+ if (!vma_migratable(vma))
return 1;
if (endvma > end)
@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma_migratable(vma) &&
- vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
change_prot_numa(vma, start, endvma);
return 1;
}
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))
- /* queue pages from current vma */
+ /* queue pages from current vma */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index b1034f9c77e7..3ad0fea5c438 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1582,7 +1582,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
- ~(__GFP_IO | __GFP_FS), 0);
+ ~__GFP_RECLAIM, 0);
return newpage;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 84b12624ceb0..76d1ec29149b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -42,6 +42,7 @@
#include <linux/memory.h>
#include <linux/printk.h>
#include <linux/userfaultfd_k.h>
+#include <linux/moduleparam.h>
#include <asm/uaccess.h>
#include <asm/cacheflush.h>
@@ -69,6 +70,8 @@ const int mmap_rnd_compat_bits_max = CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX;
int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
#endif
+static bool ignore_rlimit_data = true;
+core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
static void unmap_region(struct mm_struct *mm,
struct vm_area_struct *vma, struct vm_area_struct *prev,
@@ -387,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
}
#ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
@@ -411,12 +415,14 @@ static int browse_rb(struct rb_root *root)
vma->vm_start, vma->vm_end);
bug = 1;
}
+ spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
+ spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
@@ -453,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;
while (vma) {
+ struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
- vma_lock_anon_vma(vma);
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_verify(avc);
- vma_unlock_anon_vma(vma);
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
@@ -472,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
- i = browse_rb(&mm->mm_rb);
+ i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -2139,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- int error;
+ int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
+ /* Guard against wrapping around to address 0. */
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
- * Also guard against wrapping around to address 0.
*/
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else {
- vma_unlock_anon_vma(vma);
- return -ENOMEM;
- }
- error = 0;
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
@@ -2182,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2205,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2221,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int error;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
- if (unlikely(anon_vma_prepare(vma)))
- return -ENOMEM;
-
address &= PAGE_MASK;
error = security_mmap_addr(address);
if (error)
return error;
- vma_lock_anon_vma(vma);
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
@@ -2257,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2278,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2663,12 +2664,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
- if (start < vma->vm_start || start + size > vma->vm_end)
+ if (start < vma->vm_start)
goto out;
- if (pgoff == linear_page_index(vma, start)) {
- ret = 0;
- goto out;
+ if (start + size > vma->vm_end) {
+ struct vm_area_struct *next;
+
+ for (next = vma->vm_next; next; next = next->vm_next) {
+ /* hole between vmas ? */
+ if (next->vm_start != next->vm_prev->vm_end)
+ goto out;
+
+ if (next->vm_file != vma->vm_file)
+ goto out;
+
+ if (next->vm_flags != vma->vm_flags)
+ goto out;
+
+ if (start + size <= next->vm_end)
+ break;
+ }
+
+ if (!next)
+ goto out;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
@@ -2678,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED) {
+ struct vm_area_struct *tmp;
flags |= MAP_LOCKED;
+
/* drop PG_Mlocked flag for over-mapped range */
- munlock_vma_pages_range(vma, start, start + size);
+ for (tmp = vma; tmp->vm_start >= start + size;
+ tmp = tmp->vm_next) {
+ munlock_vma_pages_range(tmp,
+ max(tmp->vm_start, start),
+ min(tmp->vm_end, start + size));
+ }
}
file = get_file(vma->vm_file);
@@ -2982,9 +3007,17 @@ bool may_expand_vm(struct mm_struct *mm, vm_flags_t flags, unsigned long npages)
if (mm->total_vm + npages > rlimit(RLIMIT_AS) >> PAGE_SHIFT)
return false;
- if ((flags & (VM_WRITE | VM_SHARED | (VM_STACK_FLAGS &
- (VM_GROWSUP | VM_GROWSDOWN)))) == VM_WRITE)
- return mm->data_vm + npages <= rlimit(RLIMIT_DATA);
+ if (is_data_mapping(flags) &&
+ mm->data_vm + npages > rlimit(RLIMIT_DATA) >> PAGE_SHIFT) {
+ if (ignore_rlimit_data)
+ pr_warn_once("%s (%d): VmData %lu exceed data ulimit "
+ "%lu. Will be forbidden soon.\n",
+ current->comm, current->pid,
+ (mm->data_vm + npages) << PAGE_SHIFT,
+ rlimit(RLIMIT_DATA));
+ else
+ return false;
+ }
return true;
}
@@ -2993,11 +3026,11 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
{
mm->total_vm += npages;
- if ((flags & (VM_EXEC | VM_WRITE)) == VM_EXEC)
+ if (is_exec_mapping(flags))
mm->exec_vm += npages;
- else if (flags & (VM_STACK_FLAGS & (VM_GROWSUP | VM_GROWSDOWN)))
+ else if (is_stack_mapping(flags))
mm->stack_vm += npages;
- else if ((flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
+ else if (is_data_mapping(flags))
mm->data_vm += npages;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8eb7bb40dc40..f7cb3d4d9c2e 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE)
+ if (next - addr != HPAGE_PMD_SIZE) {
split_huge_pmd(vma, pmd, addr);
- else {
+ if (pmd_none(*pmd))
+ continue;
+ } else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index d77946a997f7..8eeba02fc991 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
}
}
split_huge_pmd(vma, old_pmd, old_addr);
+ if (pmd_none(*old_pmd))
+ continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 63358d9f9aa9..838ca8bb64f7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -5210,6 +5210,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
pgdat->numabalancing_migrate_nr_pages = 0;
pgdat->numabalancing_migrate_next_window = jiffies;
#endif
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ spin_lock_init(&pgdat->split_queue_lock);
+ INIT_LIST_HEAD(&pgdat->split_queue);
+ pgdat->split_queue_len = 0;
+#endif
init_waitqueue_head(&pgdat->kswapd_wait);
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
@@ -6615,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
return !has_unmovable_pages(zone, page, 0, true);
}
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9d4767698a1c..06a005b979a7 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
* ARCHes with special requirements for evicting THP backing TLB entries can
* implement this. Otherwise also, it can help optimize normal TLB flush in
* THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB TLB if flush span is greater than a threshhold, which will
+ * entire TLB if flush span is greater than a threshold, which will
* likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desitable.
+ * invalidate the entire TLB which is not desirable.
* e.g. see arch/arc: flush_pmd_tlb_range
*/
#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
@@ -195,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(pmd_trans_huge(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ /* collapse entails shooting down ptes not pmd */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
#endif
diff --git a/mm/slab.c b/mm/slab.c
index 6ecc697a8bc4..621fbcb35a36 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2275,7 +2275,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
err = setup_cpu_cache(cachep, gfp);
if (err) {
- __kmem_cache_shutdown(cachep);
+ __kmem_cache_release(cachep);
return err;
}
@@ -2414,12 +2414,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
+ return __kmem_cache_shrink(cachep, false);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
+{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep, false);
-
- if (rc)
- return rc;
free_percpu(cachep->cpu_cache);
@@ -2430,7 +2431,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
kfree(n);
cachep->node[i] = NULL;
}
- return 0;
}
/*
diff --git a/mm/slab.h b/mm/slab.h
index 834ad240c0bb..2eedacea439d 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -140,6 +140,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b50aef01ccf7..065b7bdabdc3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -693,6 +693,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
void slab_kmem_cache_release(struct kmem_cache *s)
{
+ __kmem_cache_release(s);
destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
diff --git a/mm/slob.c b/mm/slob.c
index 17e8f8cc7c53..5ec158054ffe 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
diff --git a/mm/slub.c b/mm/slub.c
index 2e1355ac056b..d8fbd4a6ed59 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1592,18 +1592,12 @@ static inline void add_partial(struct kmem_cache_node *n,
__add_partial(n, page, tail);
}
-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
- list_del(&page->lru);
- n->nr_partial--;
-}
-
static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- __remove_partial(n, page);
+ list_del(&page->lru);
+ n->nr_partial--;
}
/*
@@ -3184,6 +3178,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
}
}
+void __kmem_cache_release(struct kmem_cache *s)
+{
+ free_percpu(s->cpu_slab);
+ free_kmem_cache_nodes(s);
+}
+
static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
@@ -3443,28 +3443,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs file might still access partial list after the shutdowning.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
struct page *page, *h;
+ BUG_ON(irqs_disabled());
+ spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- __remove_partial(n, page);
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on kmem_cache_close()");
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
+ spin_unlock_irq(&n->list_lock);
}
/*
* Release all resources used by a slab cache.
*/
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
{
int node;
struct kmem_cache_node *n;
@@ -3476,16 +3479,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- free_percpu(s->cpu_slab);
- free_kmem_cache_nodes(s);
return 0;
}
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
- return kmem_cache_close(s);
-}
-
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
@@ -3980,7 +3976,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
- kmem_cache_close(s);
+ __kmem_cache_release(s);
return err;
}
diff --git a/mm/util.c b/mm/util.c
index c108a6542d05..4fb14ca5a419 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -230,36 +230,11 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Check if the vma is being used as a stack by this task */
-static int vm_is_stack_for_task(struct task_struct *t,
- struct vm_area_struct *vma)
+int vma_is_stack_for_task(struct vm_area_struct *vma, struct task_struct *t)
{
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
-/*
- * Check if the vma is being used as a stack.
- * If is_group is non-zero, check in the entire thread group or else
- * just check in the current task. Returns the task_struct of the task
- * that the vma is stack for. Must be called under rcu_read_lock().
- */
-struct task_struct *task_of_stack(struct task_struct *task,
- struct vm_area_struct *vma, bool in_group)
-{
- if (vm_is_stack_for_task(task, vma))
- return task;
-
- if (in_group) {
- struct task_struct *t;
-
- for_each_thread(task, t) {
- if (vm_is_stack_for_task(t, vma))
- return t;
- }
- }
-
- return NULL;
-}
-
#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT)
void arch_pick_mmap_layout(struct mm_struct *mm)
{
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index 9a6c0704211c..149fdf6c5c56 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -248,9 +248,8 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, bool tree,
if (tree) {
spin_lock(&vmpr->sr_lock);
- vmpr->tree_scanned += scanned;
+ scanned = vmpr->tree_scanned += scanned;
vmpr->tree_reclaimed += reclaimed;
- scanned = vmpr->scanned;
spin_unlock(&vmpr->sr_lock);
if (scanned < vmpressure_win)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb3dd37ccd7c..71b1c29948db 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
+ WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 40b2c74ddf16..084c6725b373 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
+ * If we were marked on cpu_stat_off clear the flag
+ * so that vmstat_shepherd doesn't schedule us again.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
- this_cpu_ptr(&vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+ cpu_stat_off)) {
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ }
} else {
/*
* We did not update any counters so the app may be in
@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
-{
- if (system_state != SYSTEM_RUNNING)
- return;
-
- do {
- if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
- cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
- } while (refresh_cpu_vm_stats(false));
-}
-
/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
return false;
}
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ /*
+ * If we are already in hands of the shepherd then there
+ * is nothing for us to do here.
+ */
+ if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ return;
+
+ if (!need_update(smp_processor_id()))
+ return;
+
+ /*
+ * Just refresh counters and do not care about the pending delayed
+ * vmstat_update. It doesn't fire that often to matter and canceling
+ * it would be too expensive from this path.
+ * vmstat_shepherd will take care about that for us.
+ */
+ refresh_cpu_vm_stats(false);
+}
+
/*
* Shepherd worker thread that checks the
@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off)
- if (need_update(cpu) &&
- cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
- queue_delayed_work_on(cpu, vmstat_wq,
- &per_cpu(vmstat_work, cpu), 0);
+ for_each_cpu(cpu, cpu_stat_off) {
+ struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ if (need_update(cpu)) {
+ if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ } else {
+ /*
+ * Cancel the work if quiet_vmstat has put this
+ * cpu on cpu_stat_off because the work item might
+ * be still scheduled
+ */
+ cancel_delayed_work(dw);
+ }
+ }
put_online_cpus();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
-
}
static void __init start_shepherd_timer(void)
@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
- INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))