summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/gup.c24
-rw-r--r--mm/huge_memory.c28
-rw-r--r--mm/madvise.c2
-rw-r--r--mm/memcontrol.c4
-rw-r--r--mm/memory.c365
-rw-r--r--mm/memory_hotplug.c5
-rw-r--r--mm/migrate.c7
-rw-r--r--mm/page_alloc.c29
-rw-r--r--mm/slab.c8
-rw-r--r--mm/slub.c6
-rw-r--r--mm/swapfile.c2
12 files changed, 363 insertions, 123 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 8c8f2deee4e3..f6d36ccc2351 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2365,7 +2365,11 @@ readpage:
}
if (!PageUptodate(page)) {
- error = lock_page_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ)
+ error = lock_page_async(page, iocb->ki_waitq);
+ else
+ error = lock_page_killable(page);
+
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
diff --git a/mm/gup.c b/mm/gup.c
index e5739a1974d5..e869c634cc9a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1255,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
BUG_ON(*locked != 1);
}
+ if (flags & FOLL_PIN)
+ atomic_set(&mm->has_pinned, 1);
+
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
* is to set FOLL_GET if the caller wants pages[] filled in (but has
@@ -2485,13 +2488,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 1;
}
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
- pmdp = pmd_offset(&pud, addr);
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
pmd_t pmd = READ_ONCE(*pmdp);
@@ -2528,13 +2531,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
}
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
- pudp = pud_offset(&p4d, addr);
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
do {
pud_t pud = READ_ONCE(*pudp);
@@ -2549,20 +2552,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
+ } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
return 1;
}
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
- p4dp = p4d_offset(&pgd, addr);
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
do {
p4d_t p4d = READ_ONCE(*p4dp);
@@ -2574,7 +2577,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
+ } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -2602,7 +2605,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
+ } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -2660,6 +2663,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
FOLL_FAST_ONLY)))
return -EINVAL;
+ if (gup_flags & FOLL_PIN)
+ atomic_set(&current->mm->has_pinned, 1);
+
if (!(gup_flags & FOLL_FAST_ONLY))
might_lock_read(&current->mm->mmap_lock);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index faadc449cca5..da397779a6d4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1074,6 +1074,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+ /*
+ * If this page is a potentially pinned page, split and retry the fault
+ * with smaller page size. Normally this should not happen because the
+ * userspace should use MADV_DONTFORK upon pinned regions. This is a
+ * best effort that the pinned pages won't be replaced by another
+ * random page during the coming copy-on-write.
+ */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(src_page))) {
+ pte_free(dst_mm, pgtable);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+ return -EAGAIN;
+ }
+
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1177,6 +1195,16 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* No huge zero pud yet */
}
+ /* Please refer to comments in copy_huge_pmd() */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(pud_page(pud)))) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pud(vma, src_pud, addr);
+ return -EAGAIN;
+ }
+
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
diff --git a/mm/madvise.c b/mm/madvise.c
index d4aa5f776543..0e0d61003fc6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -381,9 +381,9 @@ huge_unlock:
return 0;
}
+regular_page:
if (pmd_trans_unstable(pmd))
return 0;
-regular_page:
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cfa6cbad21d5..6877c765b8d0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1538,9 +1538,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
seq_buf_printf(&s, "workingset_activate_file %lu\n",
memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
- seq_buf_printf(&s, "workingset_restore %lu\n",
+ seq_buf_printf(&s, "workingset_restore_anon %lu\n",
memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
- seq_buf_printf(&s, "workingset_restore %lu\n",
+ seq_buf_printf(&s, "workingset_restore_file %lu\n",
memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
diff --git a/mm/memory.c b/mm/memory.c
index 469af373ae76..fcfc4ca36eba 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -695,84 +695,218 @@ out:
* covered by this vma.
*/
-static inline unsigned long
-copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static unsigned long
+copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
{
unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ rss[MM_SWAPENTS]++;
+ } else if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
- /* pte contains position in swap or file, so copy. */
- if (unlikely(!pte_present(pte))) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (likely(!non_swap_entry(entry))) {
- if (swap_duplicate(entry) < 0)
- return entry.val;
-
- /* make sure dst_mm is on swapoff's mmlist. */
- if (unlikely(list_empty(&dst_mm->mmlist))) {
- spin_lock(&mmlist_lock);
- if (list_empty(&dst_mm->mmlist))
- list_add(&dst_mm->mmlist,
- &src_mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- rss[MM_SWAPENTS]++;
- } else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
-
- rss[mm_counter(page)]++;
-
- if (is_write_migration_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- /*
- * COW mappings require pages in both
- * parent and child to be set to read.
- */
- make_migration_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
- pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*src_pte))
- pte = pte_swp_mkuffd_wp(pte);
- set_pte_at(src_mm, addr, src_pte, pte);
- }
- } else if (is_device_private_entry(entry)) {
- page = device_private_entry_to_page(entry);
+ rss[mm_counter(page)]++;
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
/*
- * Update rss count even for unaddressable pages, as
- * they should treated just like normal pages in this
- * respect.
- *
- * We will likely want to have some new rss counters
- * for unaddressable pages, at some point. But for now
- * keep things as they are.
+ * COW mappings require pages in both
+ * parent and child to be set to read.
*/
- get_page(page);
- rss[mm_counter(page)]++;
- page_dup_rmap(page, false);
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
- /*
- * We do not preserve soft-dirty information, because so
- * far, checkpoint/restore is the only feature that
- * requires that. And checkpoint/restore does not work
- * when a device driver is involved (you cannot easily
- * save and restore device driver state).
- */
- if (is_write_device_private_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- make_device_private_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*src_pte))
- pte = pte_swp_mkuffd_wp(pte);
- set_pte_at(src_mm, addr, src_pte, pte);
- }
+ /*
+ * Update rss count even for unaddressable pages, as
+ * they should treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
}
- goto out_set_pte;
+ }
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte,
+ struct vm_area_struct *vma, struct vm_area_struct *new,
+ unsigned long addr, int *rss, struct page **prealloc,
+ pte_t pte, struct page *page)
+{
+ struct page *new_page;
+
+ if (!is_cow_mapping(vma->vm_flags))
+ return 1;
+
+ /*
+ * The trick starts.
+ *
+ * What we want to do is to check whether this page may
+ * have been pinned by the parent process. If so,
+ * instead of wrprotect the pte on both sides, we copy
+ * the page immediately so that we'll always guarantee
+ * the pinned page won't be randomly replaced in the
+ * future.
+ *
+ * To achieve this, we do the following:
+ *
+ * 1. Write-protect the pte if it's writable. This is
+ * to protect concurrent write fast-gup with
+ * FOLL_PIN, so that we'll fail the fast-gup with
+ * the write bit removed.
+ *
+ * 2. Check page_maybe_dma_pinned() to see whether this
+ * page may have been pinned.
+ *
+ * The order of these steps is important to serialize
+ * against the fast-gup code (gup_pte_range()) on the
+ * pte check and try_grab_compound_head(), so that
+ * we'll make sure either we'll capture that fast-gup
+ * so we'll copy the pinned page here, or we'll fail
+ * that fast-gup.
+ *
+ * NOTE! Even if we don't end up copying the page,
+ * we won't undo this wrprotect(), because the normal
+ * reference copy will need it anyway.
+ */
+ if (pte_write(pte))
+ ptep_set_wrprotect(src_mm, addr, src_pte);
+
+ /*
+ * These are the "normally we can just copy by reference"
+ * checks.
+ */
+ if (likely(!atomic_read(&src_mm->has_pinned)))
+ return 1;
+ if (likely(!page_maybe_dma_pinned(page)))
+ return 1;
+
+ /*
+ * Uhhuh. It looks like the page might be a pinned page,
+ * and we actually need to copy it. Now we can set the
+ * source pte back to being writable.
+ */
+ if (pte_write(pte))
+ set_pte_at(src_mm, addr, src_pte, pte);
+
+ new_page = *prealloc;
+ if (!new_page)
+ return -EAGAIN;
+
+ /*
+ * We have a prealloc page, all good! Take it
+ * over and copy the page & arm it.
+ */
+ *prealloc = NULL;
+ copy_user_highpage(new_page, page, addr, vma);
+ __SetPageUptodate(new_page);
+ page_add_new_anon_rmap(new_page, new, addr, false);
+ lru_cache_add_inactive_or_unevictable(new_page, new);
+ rss[mm_counter(new_page)]++;
+
+ /* All done, just insert the new page copy in the child */
+ pte = mk_pte(new_page, new->vm_page_prot);
+ pte = maybe_mkwrite(pte_mkdirty(pte), new);
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+/*
+ * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
+copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
+ unsigned long addr, int *rss, struct page **prealloc)
+{
+ unsigned long vm_flags = vma->vm_flags;
+ pte_t pte = *src_pte;
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ int retval;
+
+ retval = copy_present_page(dst_mm, src_mm,
+ dst_pte, src_pte,
+ vma, new,
+ addr, rss, prealloc,
+ pte, page);
+ if (retval <= 0)
+ return retval;
+
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
}
/*
@@ -800,35 +934,51 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!(vm_flags & VM_UFFD_WP))
pte = pte_clear_uffd_wp(pte);
- page = vm_normal_page(vma, addr, pte);
- if (page) {
- get_page(page);
- page_dup_rmap(page, false);
- rss[mm_counter(page)]++;
- }
-
-out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *new_page;
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+ if (!new_page)
+ return NULL;
+
+ if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ return NULL;
+ }
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+ return new_page;
+}
+
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
- int progress = 0;
+ int progress, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
+ struct page *prealloc = NULL;
again:
+ progress = 0;
init_rss_vec(rss);
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
- if (!dst_pte)
- return -ENOMEM;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -851,10 +1001,34 @@ again:
progress++;
continue;
}
- entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+ if (unlikely(!pte_present(*src_pte))) {
+ entry.val = copy_nonpresent_pte(dst_mm, src_mm,
+ dst_pte, src_pte,
vma, addr, rss);
- if (entry.val)
+ if (entry.val)
+ break;
+ progress += 8;
+ continue;
+ }
+ /* copy_present_pte() will clear `*prealloc' if consumed */
+ ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+ vma, new, addr, rss, &prealloc);
+ /*
+ * If we need a pre-allocated page for this pte, drop the
+ * locks, allocate, and try again.
+ */
+ if (unlikely(ret == -EAGAIN))
break;
+ if (unlikely(prealloc)) {
+ /*
+ * pre-alloc page cannot be reused by next time so as
+ * to strictly follow mempolicy (e.g., alloc_page_vma()
+ * will allocate page according to address). This
+ * could only happen if one pinned pte changed.
+ */
+ put_page(prealloc);
+ prealloc = NULL;
+ }
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -866,17 +1040,30 @@ again:
cond_resched();
if (entry.val) {
- if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ entry.val = 0;
+ } else if (ret) {
+ WARN_ON_ONCE(ret != -EAGAIN);
+ prealloc = page_copy_prealloc(src_mm, vma, addr);
+ if (!prealloc)
return -ENOMEM;
- progress = 0;
+ /* We've captured and resolved the error. Reset, try again. */
+ ret = 0;
}
if (addr != end)
goto again;
- return 0;
+out:
+ if (unlikely(prealloc))
+ put_page(prealloc);
+ return ret;
}
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
pmd_t *src_pmd, *dst_pmd;
@@ -903,7 +1090,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
- vma, addr, next))
+ vma, new, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
@@ -911,6 +1098,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
pud_t *src_pud, *dst_pud;
@@ -937,7 +1125,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
- vma, addr, next))
+ vma, new, addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
@@ -945,6 +1133,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
p4d_t *src_p4d, *dst_p4d;
@@ -959,14 +1148,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
if (p4d_none_or_clear_bad(src_p4d))
continue;
if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
- vma, addr, next))
+ vma, new, addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, struct vm_area_struct *new)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
@@ -1021,7 +1210,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (pgd_none_or_clear_bad(src_pgd))
continue;
if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, addr, next))) {
+ vma, new, addr, next))) {
ret = -ENOMEM;
break;
}
@@ -2955,8 +3144,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* page count reference, and the page is locked,
* it's dark out, and we're wearing sunglasses. Hit it.
*/
- wp_page_reuse(vmf);
unlock_page(page);
+ wp_page_reuse(vmf);
return VM_FAULT_WRITE;
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index b11a269e2356..ce3e73e3a5c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -729,7 +729,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMMAP_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap);
set_zone_contiguous(zone);
}
@@ -1080,7 +1080,8 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
+ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
BUG_ON(ret);
/* create new memmap entry */
diff --git a/mm/migrate.c b/mm/migrate.c
index aecb1433cf3c..04a98bb2f568 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1446,7 +1446,7 @@ retry:
* Capture required information that might get lost
* during migration.
*/
- is_thp = PageTransHuge(page);
+ is_thp = PageTransHuge(page) && !PageHuge(page);
nr_subpages = thp_nr_pages(page);
cond_resched();
@@ -1472,7 +1472,7 @@ retry:
* we encounter them after the rest of the list
* is processed.
*/
- if (PageTransHuge(page) && !PageHuge(page)) {
+ if (is_thp) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);
@@ -1481,8 +1481,7 @@ retry:
nr_thp_split++;
goto retry;
}
- }
- if (is_thp) {
+
nr_thp_failed++;
nr_failed += nr_subpages;
goto out;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0608f7f1236d..9fba8859ecd7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -3367,9 +3367,16 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ /*
+ * MIGRATE_MOVABLE pcplist could have the pages on CMA area and
+ * we need to skip it when CMA area isn't allowed.
+ */
+ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
+ migratetype != MIGRATE_MOVABLE) {
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
migratetype, alloc_flags);
- goto out;
+ goto out;
+ }
}
/*
@@ -3381,7 +3388,13 @@ struct page *rmqueue(struct zone *preferred_zone,
do {
page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -5975,7 +5988,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context,
+ unsigned long start_pfn, enum meminit_context context,
struct vmem_altmap *altmap)
{
unsigned long pfn, end_pfn = start_pfn + size;
@@ -6007,7 +6020,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
+ if (context == MEMINIT_EARLY) {
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -6016,7 +6029,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
+ if (context == MEMINIT_HOTPLUG)
__SetPageReserved(page);
/*
@@ -6099,7 +6112,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
@@ -6137,7 +6150,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid,
if (end_pfn > start_pfn) {
size = end_pfn - start_pfn;
memmap_init_zone(size, nid, zone, start_pfn,
- MEMMAP_EARLY, NULL);
+ MEMINIT_EARLY, NULL);
}
}
}
diff --git a/mm/slab.c b/mm/slab.c
index 3160dff6fd76..f658e86ec8ce 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1632,6 +1632,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
kmem_cache_free(cachep->freelist_cache, freelist);
}
+/*
+ * Update the size of the caches before calling slabs_destroy as it may
+ * recursively call kfree.
+ */
static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
struct page *page, *n;
@@ -2153,8 +2157,8 @@ static void do_drain(void *arg)
spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
spin_unlock(&n->list_lock);
- slabs_destroy(cachep, &list);
ac->avail = 0;
+ slabs_destroy(cachep, &list);
}
static void drain_cpu_caches(struct kmem_cache *cachep)
@@ -3402,9 +3406,9 @@ free_done:
}
#endif
spin_unlock(&n->list_lock);
- slabs_destroy(cachep, &list);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
+ slabs_destroy(cachep, &list);
}
/*
diff --git a/mm/slub.c b/mm/slub.c
index d4177aecedf6..6d3574013b2f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1413,10 +1413,6 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
char *next_block;
slab_flags_t block_flags;
- /* If slub_debug = 0, it folds into the if conditional. */
- if (!slub_debug_string)
- return flags | slub_debug;
-
len = strlen(name);
next_block = slub_debug_string;
/* Go through all blocks of debug options, see if any matches our slab's name */
@@ -1450,7 +1446,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
}
}
- return slub_debug;
+ return flags | slub_debug;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 12f59e641b5e..debc94155f74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1078,7 +1078,7 @@ start_over:
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FS))
+ if (si->flags & SWP_BLKDEV)
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,