diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 6 | ||||
-rw-r--r-- | mm/gup.c | 24 | ||||
-rw-r--r-- | mm/huge_memory.c | 28 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 4 | ||||
-rw-r--r-- | mm/memory.c | 365 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 5 | ||||
-rw-r--r-- | mm/migrate.c | 7 | ||||
-rw-r--r-- | mm/page_alloc.c | 29 | ||||
-rw-r--r-- | mm/slab.c | 8 | ||||
-rw-r--r-- | mm/slub.c | 6 | ||||
-rw-r--r-- | mm/swapfile.c | 2 |
12 files changed, 363 insertions, 123 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index 8c8f2deee4e3..f6d36ccc2351 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2365,7 +2365,11 @@ readpage: } if (!PageUptodate(page)) { - error = lock_page_killable(page); + if (iocb->ki_flags & IOCB_WAITQ) + error = lock_page_async(page, iocb->ki_waitq); + else + error = lock_page_killable(page); + if (unlikely(error)) goto readpage_error; if (!PageUptodate(page)) { @@ -1255,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct mm_struct *mm, BUG_ON(*locked != 1); } + if (flags & FOLL_PIN) + atomic_set(&mm->has_pinned, 1); + /* * FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior * is to set FOLL_GET if the caller wants pages[] filled in (but has @@ -2485,13 +2488,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr, return 1; } -static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, +static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pmd_t *pmdp; - pmdp = pmd_offset(&pud, addr); + pmdp = pmd_offset_lockless(pudp, pud, addr); do { pmd_t pmd = READ_ONCE(*pmdp); @@ -2528,13 +2531,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, return 1; } -static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, +static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; pud_t *pudp; - pudp = pud_offset(&p4d, addr); + pudp = pud_offset_lockless(p4dp, p4d, addr); do { pud_t pud = READ_ONCE(*pudp); @@ -2549,20 +2552,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(pud_val(pud)), addr, PUD_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr)) + } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr)) return 0; } while (pudp++, addr = next, addr != end); return 1; } -static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, +static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end, unsigned int flags, struct page **pages, int *nr) { unsigned long next; p4d_t *p4dp; - p4dp = p4d_offset(&pgd, addr); + p4dp = p4d_offset_lockless(pgdp, pgd, addr); do { p4d_t p4d = READ_ONCE(*p4dp); @@ -2574,7 +2577,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr, P4D_SHIFT, next, flags, pages, nr)) return 0; - } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr)) + } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr)) return 0; } while (p4dp++, addr = next, addr != end); @@ -2602,7 +2605,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end, if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr, PGDIR_SHIFT, next, flags, pages, nr)) return; - } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr)) + } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr)) return; } while (pgdp++, addr = next, addr != end); } @@ -2660,6 +2663,9 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, FOLL_FAST_ONLY))) return -EINVAL; + if (gup_flags & FOLL_PIN) + atomic_set(¤t->mm->has_pinned, 1); + if (!(gup_flags & FOLL_FAST_ONLY)) might_lock_read(¤t->mm->mmap_lock); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index faadc449cca5..da397779a6d4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1074,6 +1074,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, src_page = pmd_page(pmd); VM_BUG_ON_PAGE(!PageHead(src_page), src_page); + + /* + * If this page is a potentially pinned page, split and retry the fault + * with smaller page size. Normally this should not happen because the + * userspace should use MADV_DONTFORK upon pinned regions. This is a + * best effort that the pinned pages won't be replaced by another + * random page during the coming copy-on-write. + */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(src_page))) { + pte_free(dst_mm, pgtable); + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pmd(vma, src_pmd, addr, false, NULL); + return -EAGAIN; + } + get_page(src_page); page_dup_rmap(src_page, true); add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -1177,6 +1195,16 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, /* No huge zero pud yet */ } + /* Please refer to comments in copy_huge_pmd() */ + if (unlikely(is_cow_mapping(vma->vm_flags) && + atomic_read(&src_mm->has_pinned) && + page_maybe_dma_pinned(pud_page(pud)))) { + spin_unlock(src_ptl); + spin_unlock(dst_ptl); + __split_huge_pud(vma, src_pud, addr); + return -EAGAIN; + } + pudp_set_wrprotect(src_mm, addr, src_pud); pud = pud_mkold(pud_wrprotect(pud)); set_pud_at(dst_mm, addr, dst_pud, pud); diff --git a/mm/madvise.c b/mm/madvise.c index d4aa5f776543..0e0d61003fc6 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -381,9 +381,9 @@ huge_unlock: return 0; } +regular_page: if (pmd_trans_unstable(pmd)) return 0; -regular_page: #endif tlb_change_page_size(tlb, PAGE_SIZE); orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cfa6cbad21d5..6877c765b8d0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1538,9 +1538,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON)); seq_buf_printf(&s, "workingset_activate_file %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE)); - seq_buf_printf(&s, "workingset_restore %lu\n", + seq_buf_printf(&s, "workingset_restore_anon %lu\n", memcg_page_state(memcg, WORKINGSET_RESTORE_ANON)); - seq_buf_printf(&s, "workingset_restore %lu\n", + seq_buf_printf(&s, "workingset_restore_file %lu\n", memcg_page_state(memcg, WORKINGSET_RESTORE_FILE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); diff --git a/mm/memory.c b/mm/memory.c index 469af373ae76..fcfc4ca36eba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -695,84 +695,218 @@ out: * covered by this vma. */ -static inline unsigned long -copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, +static unsigned long +copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) { unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; + swp_entry_t entry = pte_to_swp_entry(pte); + + if (likely(!non_swap_entry(entry))) { + if (swap_duplicate(entry) < 0) + return entry.val; + + /* make sure dst_mm is on swapoff's mmlist. */ + if (unlikely(list_empty(&dst_mm->mmlist))) { + spin_lock(&mmlist_lock); + if (list_empty(&dst_mm->mmlist)) + list_add(&dst_mm->mmlist, + &src_mm->mmlist); + spin_unlock(&mmlist_lock); + } + rss[MM_SWAPENTS]++; + } else if (is_migration_entry(entry)) { + page = migration_entry_to_page(entry); - /* pte contains position in swap or file, so copy. */ - if (unlikely(!pte_present(pte))) { - swp_entry_t entry = pte_to_swp_entry(pte); - - if (likely(!non_swap_entry(entry))) { - if (swap_duplicate(entry) < 0) - return entry.val; - - /* make sure dst_mm is on swapoff's mmlist. */ - if (unlikely(list_empty(&dst_mm->mmlist))) { - spin_lock(&mmlist_lock); - if (list_empty(&dst_mm->mmlist)) - list_add(&dst_mm->mmlist, - &src_mm->mmlist); - spin_unlock(&mmlist_lock); - } - rss[MM_SWAPENTS]++; - } else if (is_migration_entry(entry)) { - page = migration_entry_to_page(entry); - - rss[mm_counter(page)]++; - - if (is_write_migration_entry(entry) && - is_cow_mapping(vm_flags)) { - /* - * COW mappings require pages in both - * parent and child to be set to read. - */ - make_migration_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_soft_dirty(*src_pte)) - pte = pte_swp_mksoft_dirty(pte); - if (pte_swp_uffd_wp(*src_pte)) - pte = pte_swp_mkuffd_wp(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } - } else if (is_device_private_entry(entry)) { - page = device_private_entry_to_page(entry); + rss[mm_counter(page)]++; + if (is_write_migration_entry(entry) && + is_cow_mapping(vm_flags)) { /* - * Update rss count even for unaddressable pages, as - * they should treated just like normal pages in this - * respect. - * - * We will likely want to have some new rss counters - * for unaddressable pages, at some point. But for now - * keep things as they are. + * COW mappings require pages in both + * parent and child to be set to read. */ - get_page(page); - rss[mm_counter(page)]++; - page_dup_rmap(page, false); + make_migration_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_soft_dirty(*src_pte)) + pte = pte_swp_mksoft_dirty(pte); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); + set_pte_at(src_mm, addr, src_pte, pte); + } + } else if (is_device_private_entry(entry)) { + page = device_private_entry_to_page(entry); - /* - * We do not preserve soft-dirty information, because so - * far, checkpoint/restore is the only feature that - * requires that. And checkpoint/restore does not work - * when a device driver is involved (you cannot easily - * save and restore device driver state). - */ - if (is_write_device_private_entry(entry) && - is_cow_mapping(vm_flags)) { - make_device_private_entry_read(&entry); - pte = swp_entry_to_pte(entry); - if (pte_swp_uffd_wp(*src_pte)) - pte = pte_swp_mkuffd_wp(pte); - set_pte_at(src_mm, addr, src_pte, pte); - } + /* + * Update rss count even for unaddressable pages, as + * they should treated just like normal pages in this + * respect. + * + * We will likely want to have some new rss counters + * for unaddressable pages, at some point. But for now + * keep things as they are. + */ + get_page(page); + rss[mm_counter(page)]++; + page_dup_rmap(page, false); + + /* + * We do not preserve soft-dirty information, because so + * far, checkpoint/restore is the only feature that + * requires that. And checkpoint/restore does not work + * when a device driver is involved (you cannot easily + * save and restore device driver state). + */ + if (is_write_device_private_entry(entry) && + is_cow_mapping(vm_flags)) { + make_device_private_entry_read(&entry); + pte = swp_entry_to_pte(entry); + if (pte_swp_uffd_wp(*src_pte)) + pte = pte_swp_mkuffd_wp(pte); + set_pte_at(src_mm, addr, src_pte, pte); } - goto out_set_pte; + } + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy a present and normal page if necessary. + * + * NOTE! The usual case is that this doesn't need to do + * anything, and can just return a positive value. That + * will let the caller know that it can just increase + * the page refcount and re-use the pte the traditional + * way. + * + * But _if_ we need to copy it because it needs to be + * pinned in the parent (and the child should get its own + * copy rather than just a reference to the same page), + * we'll do that here and return zero to let the caller + * know we're done. + * + * And if we need a pre-allocated page but don't yet have + * one, return a negative error to let the preallocation + * code know so that it can do so outside the page table + * lock. + */ +static inline int +copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, + struct vm_area_struct *vma, struct vm_area_struct *new, + unsigned long addr, int *rss, struct page **prealloc, + pte_t pte, struct page *page) +{ + struct page *new_page; + + if (!is_cow_mapping(vma->vm_flags)) + return 1; + + /* + * The trick starts. + * + * What we want to do is to check whether this page may + * have been pinned by the parent process. If so, + * instead of wrprotect the pte on both sides, we copy + * the page immediately so that we'll always guarantee + * the pinned page won't be randomly replaced in the + * future. + * + * To achieve this, we do the following: + * + * 1. Write-protect the pte if it's writable. This is + * to protect concurrent write fast-gup with + * FOLL_PIN, so that we'll fail the fast-gup with + * the write bit removed. + * + * 2. Check page_maybe_dma_pinned() to see whether this + * page may have been pinned. + * + * The order of these steps is important to serialize + * against the fast-gup code (gup_pte_range()) on the + * pte check and try_grab_compound_head(), so that + * we'll make sure either we'll capture that fast-gup + * so we'll copy the pinned page here, or we'll fail + * that fast-gup. + * + * NOTE! Even if we don't end up copying the page, + * we won't undo this wrprotect(), because the normal + * reference copy will need it anyway. + */ + if (pte_write(pte)) + ptep_set_wrprotect(src_mm, addr, src_pte); + + /* + * These are the "normally we can just copy by reference" + * checks. + */ + if (likely(!atomic_read(&src_mm->has_pinned))) + return 1; + if (likely(!page_maybe_dma_pinned(page))) + return 1; + + /* + * Uhhuh. It looks like the page might be a pinned page, + * and we actually need to copy it. Now we can set the + * source pte back to being writable. + */ + if (pte_write(pte)) + set_pte_at(src_mm, addr, src_pte, pte); + + new_page = *prealloc; + if (!new_page) + return -EAGAIN; + + /* + * We have a prealloc page, all good! Take it + * over and copy the page & arm it. + */ + *prealloc = NULL; + copy_user_highpage(new_page, page, addr, vma); + __SetPageUptodate(new_page); + page_add_new_anon_rmap(new_page, new, addr, false); + lru_cache_add_inactive_or_unevictable(new_page, new); + rss[mm_counter(new_page)]++; + + /* All done, just insert the new page copy in the child */ + pte = mk_pte(new_page, new->vm_page_prot); + pte = maybe_mkwrite(pte_mkdirty(pte), new); + set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; +} + +/* + * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page + * is required to copy this pte. + */ +static inline int +copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, + struct vm_area_struct *new, + unsigned long addr, int *rss, struct page **prealloc) +{ + unsigned long vm_flags = vma->vm_flags; + pte_t pte = *src_pte; + struct page *page; + + page = vm_normal_page(vma, addr, pte); + if (page) { + int retval; + + retval = copy_present_page(dst_mm, src_mm, + dst_pte, src_pte, + vma, new, + addr, rss, prealloc, + pte, page); + if (retval <= 0) + return retval; + + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; } /* @@ -800,35 +934,51 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!(vm_flags & VM_UFFD_WP)) pte = pte_clear_uffd_wp(pte); - page = vm_normal_page(vma, addr, pte); - if (page) { - get_page(page); - page_dup_rmap(page, false); - rss[mm_counter(page)]++; - } - -out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } +static inline struct page * +page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, + unsigned long addr) +{ + struct page *new_page; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); + if (!new_page) + return NULL; + + if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { + put_page(new_page); + return NULL; + } + cgroup_throttle_swaprate(new_page, GFP_KERNEL); + + return new_page; +} + static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; - int progress = 0; + int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; + struct page *prealloc = NULL; again: + progress = 0; init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); - if (!dst_pte) - return -ENOMEM; + if (!dst_pte) { + ret = -ENOMEM; + goto out; + } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); @@ -851,10 +1001,34 @@ again: progress++; continue; } - entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + if (unlikely(!pte_present(*src_pte))) { + entry.val = copy_nonpresent_pte(dst_mm, src_mm, + dst_pte, src_pte, vma, addr, rss); - if (entry.val) + if (entry.val) + break; + progress += 8; + continue; + } + /* copy_present_pte() will clear `*prealloc' if consumed */ + ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, new, addr, rss, &prealloc); + /* + * If we need a pre-allocated page for this pte, drop the + * locks, allocate, and try again. + */ + if (unlikely(ret == -EAGAIN)) break; + if (unlikely(prealloc)) { + /* + * pre-alloc page cannot be reused by next time so as + * to strictly follow mempolicy (e.g., alloc_page_vma() + * will allocate page according to address). This + * could only happen if one pinned pte changed. + */ + put_page(prealloc); + prealloc = NULL; + } progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); @@ -866,17 +1040,30 @@ again: cond_resched(); if (entry.val) { - if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { + ret = -ENOMEM; + goto out; + } + entry.val = 0; + } else if (ret) { + WARN_ON_ONCE(ret != -EAGAIN); + prealloc = page_copy_prealloc(src_mm, vma, addr); + if (!prealloc) return -ENOMEM; - progress = 0; + /* We've captured and resolved the error. Reset, try again. */ + ret = 0; } if (addr != end) goto again; - return 0; +out: + if (unlikely(prealloc)) + put_page(prealloc); + return ret; } static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pmd_t *src_pmd, *dst_pmd; @@ -903,7 +1090,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; @@ -911,6 +1098,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { pud_t *src_pud, *dst_pud; @@ -937,7 +1125,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; @@ -945,6 +1133,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma, + struct vm_area_struct *new, unsigned long addr, unsigned long end) { p4d_t *src_p4d, *dst_p4d; @@ -959,14 +1148,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d, - vma, addr, next)) + vma, new, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - struct vm_area_struct *vma) + struct vm_area_struct *vma, struct vm_area_struct *new) { pgd_t *src_pgd, *dst_pgd; unsigned long next; @@ -1021,7 +1210,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd, - vma, addr, next))) { + vma, new, addr, next))) { ret = -ENOMEM; break; } @@ -2955,8 +3144,8 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * page count reference, and the page is locked, * it's dark out, and we're wearing sunglasses. Hit it. */ - wp_page_reuse(vmf); unlock_page(page); + wp_page_reuse(vmf); return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b11a269e2356..ce3e73e3a5c1 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -729,7 +729,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn, * are reserved so nobody should be touching them so we should be safe */ memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn, - MEMMAP_HOTPLUG, altmap); + MEMINIT_HOTPLUG, altmap); set_zone_contiguous(zone); } @@ -1080,7 +1080,8 @@ int __ref add_memory_resource(int nid, struct resource *res) } /* link memory sections under this node.*/ - ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1)); + ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1), + MEMINIT_HOTPLUG); BUG_ON(ret); /* create new memmap entry */ diff --git a/mm/migrate.c b/mm/migrate.c index aecb1433cf3c..04a98bb2f568 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1446,7 +1446,7 @@ retry: * Capture required information that might get lost * during migration. */ - is_thp = PageTransHuge(page); + is_thp = PageTransHuge(page) && !PageHuge(page); nr_subpages = thp_nr_pages(page); cond_resched(); @@ -1472,7 +1472,7 @@ retry: * we encounter them after the rest of the list * is processed. */ - if (PageTransHuge(page) && !PageHuge(page)) { + if (is_thp) { lock_page(page); rc = split_huge_page_to_list(page, from); unlock_page(page); @@ -1481,8 +1481,7 @@ retry: nr_thp_split++; goto retry; } - } - if (is_thp) { + nr_thp_failed++; nr_failed += nr_subpages; goto out; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0608f7f1236d..9fba8859ecd7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3367,9 +3367,16 @@ struct page *rmqueue(struct zone *preferred_zone, struct page *page; if (likely(order == 0)) { - page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, + /* + * MIGRATE_MOVABLE pcplist could have the pages on CMA area and + * we need to skip it when CMA area isn't allowed. + */ + if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA || + migratetype != MIGRATE_MOVABLE) { + page = rmqueue_pcplist(preferred_zone, zone, gfp_flags, migratetype, alloc_flags); - goto out; + goto out; + } } /* @@ -3381,7 +3388,13 @@ struct page *rmqueue(struct zone *preferred_zone, do { page = NULL; - if (alloc_flags & ALLOC_HARDER) { + /* + * order-0 request can reach here when the pcplist is skipped + * due to non-CMA allocation context. HIGHATOMIC area is + * reserved for high-order atomic allocation, so order-0 + * request should skip it. + */ + if (order > 0 && alloc_flags & ALLOC_HARDER) { page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC); if (page) trace_mm_page_alloc_zone_locked(page, order, migratetype); @@ -5975,7 +5988,7 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn) * done. Non-atomic initialization, single-pass. */ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, - unsigned long start_pfn, enum memmap_context context, + unsigned long start_pfn, enum meminit_context context, struct vmem_altmap *altmap) { unsigned long pfn, end_pfn = start_pfn + size; @@ -6007,7 +6020,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, * There can be holes in boot-time mem_map[]s handed to this * function. They do not exist on hotplugged memory. */ - if (context == MEMMAP_EARLY) { + if (context == MEMINIT_EARLY) { if (overlap_memmap_init(zone, &pfn)) continue; if (defer_init(nid, pfn, end_pfn)) @@ -6016,7 +6029,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, page = pfn_to_page(pfn); __init_single_page(page, pfn, zone, nid); - if (context == MEMMAP_HOTPLUG) + if (context == MEMINIT_HOTPLUG) __SetPageReserved(page); /* @@ -6099,7 +6112,7 @@ void __ref memmap_init_zone_device(struct zone *zone, * check here not to call set_pageblock_migratetype() against * pfn out of zone. * - * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ if (!(pfn & (pageblock_nr_pages - 1))) { @@ -6137,7 +6150,7 @@ void __meminit __weak memmap_init(unsigned long size, int nid, if (end_pfn > start_pfn) { size = end_pfn - start_pfn; memmap_init_zone(size, nid, zone, start_pfn, - MEMMAP_EARLY, NULL); + MEMINIT_EARLY, NULL); } } } diff --git a/mm/slab.c b/mm/slab.c index 3160dff6fd76..f658e86ec8ce 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1632,6 +1632,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page) kmem_cache_free(cachep->freelist_cache, freelist); } +/* + * Update the size of the caches before calling slabs_destroy as it may + * recursively call kfree. + */ static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list) { struct page *page, *n; @@ -2153,8 +2157,8 @@ static void do_drain(void *arg) spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); ac->avail = 0; + slabs_destroy(cachep, &list); } static void drain_cpu_caches(struct kmem_cache *cachep) @@ -3402,9 +3406,9 @@ free_done: } #endif spin_unlock(&n->list_lock); - slabs_destroy(cachep, &list); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); + slabs_destroy(cachep, &list); } /* diff --git a/mm/slub.c b/mm/slub.c index d4177aecedf6..6d3574013b2f 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1413,10 +1413,6 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, char *next_block; slab_flags_t block_flags; - /* If slub_debug = 0, it folds into the if conditional. */ - if (!slub_debug_string) - return flags | slub_debug; - len = strlen(name); next_block = slub_debug_string; /* Go through all blocks of debug options, see if any matches our slab's name */ @@ -1450,7 +1446,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, } } - return slub_debug; + return flags | slub_debug; } #else /* !CONFIG_SLUB_DEBUG */ static inline void setup_object_debug(struct kmem_cache *s, diff --git a/mm/swapfile.c b/mm/swapfile.c index 12f59e641b5e..debc94155f74 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1078,7 +1078,7 @@ start_over: goto nextsi; } if (size == SWAPFILE_CLUSTER) { - if (!(si->flags & SWP_FS)) + if (si->flags & SWP_BLKDEV) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, |