author    | Andrew Morton <akpm@linux-foundation.org> | 2023-08-21 14:26:20 -0700
committer | Andrew Morton <akpm@linux-foundation.org> | 2023-08-21 14:26:20 -0700
commit    | 5994eabf3bbbea550166ae90de0c854fc984c95d (patch)
tree      | 6b242a70d3254c408c68157a5f5b2fa7eb9f6a4b /mm
parent    | e45a2e947dfa6da2d73e2cf91ed6399c12522d4f (diff)
parent    | 6867c7a3320669cbe44b905a3eb35db725c6d470 (diff)
merge mm-hotfixes-stable into mm-stable to pick up depended-upon changes
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c     |  8
-rw-r--r-- | mm/damon/core.c     |  1
-rw-r--r-- | mm/damon/vaddr.c    |  2
-rw-r--r-- | mm/gup.c            | 30
-rw-r--r-- | mm/hmm.c            |  1
-rw-r--r-- | mm/huge_memory.c    |  3
-rw-r--r-- | mm/hugetlb.c        | 75
-rw-r--r-- | mm/internal.h       | 17
-rw-r--r-- | mm/ksm.c            | 27
-rw-r--r-- | mm/madvise.c        |  3
-rw-r--r-- | mm/memcontrol.c     |  2
-rw-r--r-- | mm/memory-failure.c | 41
-rw-r--r-- | mm/mempolicy.c      | 22
-rw-r--r-- | mm/migrate_device.c |  1
-rw-r--r-- | mm/mincore.c        |  1
-rw-r--r-- | mm/mlock.c          |  1
-rw-r--r-- | mm/mprotect.c       |  1
-rw-r--r-- | mm/pagewalk.c       | 36
-rw-r--r-- | mm/swapfile.c       |  8
-rw-r--r-- | mm/vmalloc.c        |  4
-rw-r--r-- | mm/vmscan.c         | 14
-rw-r--r-- | mm/zsmalloc.c       | 14
22 files changed, 226 insertions(+), 86 deletions(-)
diff --git a/mm/compaction.c b/mm/compaction.c
index 216081ab325a..38c8d216c6a3 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -933,11 +933,12 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
 		/*
 		 * Check if the pageblock has already been marked skipped.
-		 * Only the aligned PFN is checked as the caller isolates
+		 * Only the first PFN is checked as the caller isolates
 		 * COMPACT_CLUSTER_MAX at a time so the second call must
 		 * not falsely conclude that the block should be skipped.
 		 */
-		if (!valid_page && pageblock_aligned(low_pfn)) {
+		if (!valid_page && (pageblock_aligned(low_pfn) ||
+				    low_pfn == cc->zone->zone_start_pfn)) {
 			if (!isolation_suitable(cc, page)) {
 				low_pfn = end_pfn;
 				folio = NULL;
@@ -2030,7 +2031,8 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc)
 		 * before making it "skip" so other compaction instances do
 		 * not scan the same block.
 		 */
-		if (pageblock_aligned(low_pfn) &&
+		if ((pageblock_aligned(low_pfn) ||
+		     low_pfn == cc->zone->zone_start_pfn) &&
 		    !fast_find_block && !isolation_suitable(cc, page))
 			continue;
diff --git a/mm/damon/core.c b/mm/damon/core.c
index c1f1483c5082..bcd2bd9d6c10 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -273,6 +273,7 @@ struct damos_filter *damos_new_filter(enum damos_filter_type type,
 		return NULL;
 	filter->type = type;
 	filter->matching = matching;
+	INIT_LIST_HEAD(&filter->list);
 	return filter;
 }
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index d01cc46f4bf4..4c81a9dbd044 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -389,6 +389,7 @@ out:
 static const struct mm_walk_ops damon_mkold_ops = {
 	.pmd_entry = damon_mkold_pmd_entry,
 	.hugetlb_entry = damon_mkold_hugetlb_entry,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
@@ -532,6 +533,7 @@ out:
 static const struct mm_walk_ops damon_young_ops = {
 	.pmd_entry = damon_young_pmd_entry,
 	.hugetlb_entry = damon_young_hugetlb_entry,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -597,7 +597,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	pte = ptep_get(ptep);
 	if (!pte_present(pte))
 		goto no_page;
-	if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+	if (pte_protnone(pte) && !gup_can_follow_protnone(vma, flags))
 		goto no_page;
 
 	page = vm_normal_page(vma, address, pte);
@@ -714,7 +714,7 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma,
 	if (likely(!pmd_trans_huge(pmdval)))
 		return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap);
 
-	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags))
+	if (pmd_protnone(pmdval) && !gup_can_follow_protnone(vma, flags))
 		return no_page_table(vma, flags);
 
 	ptl = pmd_lock(mm, pmd);
@@ -844,6 +844,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 	if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
 		return NULL;
 
+	/*
+	 * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect
+	 * to fail on PROT_NONE-mapped pages.
+	 */
 	page = follow_page_mask(vma, address, foll_flags, &ctx);
 	if (ctx.pgmap)
 		put_dev_pagemap(ctx.pgmap);
@@ -2240,6 +2244,13 @@ static bool is_valid_gup_args(struct page **pages, int *locked,
 		gup_flags |= FOLL_UNLOCKABLE;
 	}
 
+	/*
+	 * For now, always trigger NUMA hinting faults. Some GUP users like
+	 * KVM require the hint to be as the calling context of GUP is
+	 * functionally similar to a memory reference from task context.
+	 */
+	gup_flags |= FOLL_HONOR_NUMA_FAULT;
+
 	/* FOLL_GET and FOLL_PIN are mutually exclusive. */
 	if (WARN_ON_ONCE((gup_flags & (FOLL_PIN | FOLL_GET)) ==
 			 (FOLL_PIN | FOLL_GET)))
@@ -2564,7 +2575,14 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
 		struct page *page;
 		struct folio *folio;
 
-		if (pte_protnone(pte) && !gup_can_follow_protnone(flags))
+		/*
+		 * Always fallback to ordinary GUP on PROT_NONE-mapped pages:
+		 * pte_access_permitted() better should reject these pages
+		 * either way: otherwise, GUP-fast might succeed in
+		 * cases where ordinary GUP would fail due to VMA access
+		 * permissions.
+		 */
+		if (pte_protnone(pte))
 			goto pte_unmap;
 
 		if (!pte_access_permitted(pte, flags & FOLL_WRITE))
@@ -2983,8 +3001,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo
 
 	if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) ||
 		     pmd_devmap(pmd))) {
-		if (pmd_protnone(pmd) &&
-		    !gup_can_follow_protnone(flags))
+		/* See gup_pte_range() */
+		if (pmd_protnone(pmd))
 			return 0;
 
 		if (!gup_huge_pmd(pmd, pmdp, addr, next, flags,
@@ -3164,7 +3182,7 @@ static int internal_get_user_pages_fast(unsigned long start,
 	if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
 				       FOLL_FORCE | FOLL_PIN | FOLL_GET |
 				       FOLL_FAST_ONLY | FOLL_NOFAULT |
-				       FOLL_PCI_P2PDMA)))
+				       FOLL_PCI_P2PDMA | FOLL_HONOR_NUMA_FAULT)))
		return -EINVAL;
 
 	if (gup_flags & FOLL_PIN)
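
Editor's note: the gup.c hunks above assume that gup_can_follow_protnone() has grown a VMA argument. A minimal sketch of the helper's post-series shape, assuming the include/linux/mm.h definition from this series (comment wording is ours):

static inline bool gup_can_follow_protnone(struct vm_area_struct *vma,
					   unsigned int flags)
{
	/* Callers that did not ask to honor NUMA faults may follow the PTE. */
	if (!(flags & FOLL_HONOR_NUMA_FAULT))
		return true;

	/*
	 * A pte_protnone() entry in an inaccessible (PROT_NONE) VMA is real
	 * protection, not a NUMA hint; only hint-induced protnone entries in
	 * accessible VMAs should force a fault.
	 */
	return !vma_is_accessible(vma);
}

The effect is that GUP honors NUMA hinting faults by default again, while pte_protnone() in a genuinely inaccessible VMA is treated as real protection.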
diff --git a/mm/hmm.c b/mm/hmm.c
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -562,6 +562,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
 	.pte_hole = hmm_vma_walk_hole,
 	.hugetlb_entry = hmm_vma_walk_hugetlb_entry,
 	.test_walk = hmm_vma_walk_test,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 /**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e371503f7746..154c210892a1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1467,8 +1467,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 	if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd))
 		return ERR_PTR(-EFAULT);
 
-	/* Full NUMA hinting faults to serialise migration in fault paths */
-	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags))
+	if (pmd_protnone(*pmd) && !gup_can_follow_protnone(vma, flags))
 		return NULL;
 
 	if (!pmd_write(*pmd) && gup_must_unshare(vma, flags, page))
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index abfdcaf114f1..5f498e8025cc 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1580,9 +1580,37 @@ static inline void destroy_compound_gigantic_folio(struct folio *folio,
 					unsigned int order) { }
 #endif
 
+static inline void __clear_hugetlb_destructor(struct hstate *h,
+						struct folio *folio)
+{
+	lockdep_assert_held(&hugetlb_lock);
+
+	/*
+	 * Very subtle
+	 *
+	 * For non-gigantic pages set the destructor to the normal compound
+	 * page dtor.  This is needed in case someone takes an additional
+	 * temporary ref to the page, and freeing is delayed until they drop
+	 * their reference.
+	 *
+	 * For gigantic pages set the destructor to the null dtor.  This
+	 * destructor will never be called.  Before freeing the gigantic
+	 * page destroy_compound_gigantic_folio will turn the folio into a
+	 * simple group of pages.  After this the destructor does not
+	 * apply.
+	 *
+	 */
+	if (hstate_is_gigantic(h))
+		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
+	else
+		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
+}
+
 /*
- * Remove hugetlb folio from lists, and update dtor so that the folio appears
- * as just a compound page.
+ * Remove hugetlb folio from lists.
+ * If vmemmap exists for the folio, update dtor so that the folio appears
+ * as just a compound page.  Otherwise, wait until after allocating vmemmap
+ * to update dtor.
  *
  * A reference is held on the folio, except in the case of demote.
  *
@@ -1613,31 +1641,19 @@ static void __remove_hugetlb_folio(struct hstate *h, struct folio *folio,
 	}
 
 	/*
-	 * Very subtle
-	 *
-	 * For non-gigantic pages set the destructor to the normal compound
-	 * page dtor.  This is needed in case someone takes an additional
-	 * temporary ref to the page, and freeing is delayed until they drop
-	 * their reference.
-	 *
-	 * For gigantic pages set the destructor to the null dtor.  This
-	 * destructor will never be called.  Before freeing the gigantic
-	 * page destroy_compound_gigantic_folio will turn the folio into a
-	 * simple group of pages.  After this the destructor does not
-	 * apply.
-	 *
-	 * This handles the case where more than one ref is held when and
-	 * after update_and_free_hugetlb_folio is called.
-	 *
-	 * In the case of demote we do not ref count the page as it will soon
-	 * be turned into a page of smaller size.
+	 * We can only clear the hugetlb destructor after allocating vmemmap
+	 * pages.  Otherwise, someone (memory error handling) may try to write
+	 * to tail struct pages.
+	 */
+	if (!folio_test_hugetlb_vmemmap_optimized(folio))
+		__clear_hugetlb_destructor(h, folio);
+
+	/*
+	 * In the case of demote we do not ref count the page as it will soon
+	 * be turned into a page of smaller size.
 	 */
 	if (!demote)
 		folio_ref_unfreeze(folio, 1);
-	if (hstate_is_gigantic(h))
-		folio_set_compound_dtor(folio, NULL_COMPOUND_DTOR);
-	else
-		folio_set_compound_dtor(folio, COMPOUND_PAGE_DTOR);
 
 	h->nr_huge_pages--;
 	h->nr_huge_pages_node[nid]--;
@@ -1706,6 +1722,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 {
 	int i;
 	struct page *subpage;
+	bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);
 
 	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
 		return;
@@ -1736,6 +1753,16 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 	if (unlikely(folio_test_hwpoison(folio)))
 		folio_clear_hugetlb_hwpoison(folio);
 
+	/*
+	 * If vmemmap pages were allocated above, then we need to clear the
+	 * hugetlb destructor under the hugetlb lock.
+	 */
+	if (clear_dtor) {
+		spin_lock_irq(&hugetlb_lock);
+		__clear_hugetlb_destructor(h, folio);
+		spin_unlock_irq(&hugetlb_lock);
+	}
+
 	for (i = 0; i < pages_per_huge_page(h); i++) {
 		subpage = folio_page(folio, i);
 		subpage->flags &= ~(1 << PG_locked | 1 << PG_error |
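
Editor's note: the ordering above is the heart of the hugetlb fix. A condensed sketch of the resulting free path (function names are real, the flow is heavily abridged and the narration is ours):

static void update_and_free_sketch(struct hstate *h, struct folio *folio)
{
	bool clear_dtor = folio_test_hugetlb_vmemmap_optimized(folio);

	/*
	 * While the folio's vmemmap is optimized, tail struct pages are
	 * backed by a shared read-only mapping: nobody may write to them.
	 */
	if (hugetlb_vmemmap_restore(h, &folio->page))
		return;	/* couldn't allocate vmemmap; still a hugetlb folio */

	/*
	 * Only after real vmemmap pages exist may the folio be exposed as an
	 * ordinary compound page (e.g. to memory_failure(), which writes to
	 * tail struct pages), so clear the destructor under hugetlb_lock.
	 */
	if (clear_dtor) {
		spin_lock_irq(&hugetlb_lock);
		__clear_hugetlb_destructor(h, folio);
		spin_unlock_irq(&hugetlb_lock);
	}
}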
diff --git a/mm/internal.h b/mm/internal.h
index c6ed10f0a5ad..d99ffb473f90 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -941,6 +941,13 @@ int migrate_device_coherent_page(struct page *page);
 struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags);
 int __must_check try_grab_page(struct page *page, unsigned int flags);
 
+/*
+ * mm/huge_memory.c
+ */
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
+				   unsigned long addr, pmd_t *pmd,
+				   unsigned int flags);
+
 enum {
 	/* mark page accessed */
 	FOLL_TOUCH = 1 << 16,
@@ -1015,6 +1022,16 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma,
 	smp_rmb();
 
 	/*
+	 * During GUP-fast we might not get called on the head page for a
+	 * hugetlb page that is mapped using cont-PTE, because GUP-fast does
+	 * not work with the abstracted hugetlb PTEs that always point at the
+	 * head page.  For hugetlb, PageAnonExclusive only applies on the head
+	 * page (as it cannot be partially COW-shared), so lookup the head page.
+	 */
+	if (unlikely(!PageHead(page) && PageHuge(page)))
+		page = compound_head(page);
+
+	/*
 	 * Note that PageKsm() pages cannot be exclusive, and consequently,
 	 * cannot get pinned.
 	 */
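
Editor's note: a self-contained toy model (ours, not kernel code) of why the gup_must_unshare() hunk looks up the head page: per-folio state such as PageAnonExclusive lives on the head page, while a cont-PTE GUP-fast walk can hand the callee a tail page.

#include <assert.h>
#include <stdbool.h>

/*
 * Toy model: a compound page is an array of struct page; every tail carries
 * a pointer to the head (the kernel encodes this via page->compound_head).
 * For hugetlb, the anon-exclusive flag is meaningful on the head only.
 */
struct page {
	struct page *head;	/* NULL for the head page itself */
	bool anon_exclusive;	/* valid on the head only */
};

static struct page *compound_head(struct page *p)
{
	return p->head ? p->head : p;
}

int main(void)
{
	struct page folio[4] = { { 0 } };

	for (int i = 1; i < 4; i++)
		folio[i].head = &folio[0];
	folio[0].anon_exclusive = true;

	/*
	 * A cont-PTE GUP-fast walk may hand us folio[2]; the flag must be
	 * read from the head, which is what the mm/internal.h hunk ensures.
	 */
	assert(compound_head(&folio[2])->anon_exclusive);
	return 0;
}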
diff --git a/mm/ksm.c b/mm/ksm.c
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -462,6 +462,12 @@ static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long nex
 
 static const struct mm_walk_ops break_ksm_ops = {
 	.pmd_entry = break_ksm_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops break_ksm_lock_vma_ops = {
+	.pmd_entry = break_ksm_pmd_entry,
+	.walk_lock = PGWALK_WRLOCK,
 };
 
 /*
@@ -477,16 +483,17 @@ static const struct mm_walk_ops break_ksm_ops = {
  * of the process that owns 'vma'.  We also do not want to enforce
  * protection keys here anyway.
  */
-static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
+static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
 {
 	vm_fault_t ret = 0;
+	const struct mm_walk_ops *ops = lock_vma ?
+				&break_ksm_lock_vma_ops : &break_ksm_ops;
 
 	do {
 		int ksm_page;
 
 		cond_resched();
-		ksm_page = walk_page_range_vma(vma, addr, addr + 1,
-					       &break_ksm_ops, NULL);
+		ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
 		if (WARN_ON_ONCE(ksm_page < 0))
 			return ksm_page;
 		if (!ksm_page)
@@ -572,7 +579,7 @@ static void break_cow(struct ksm_rmap_item *rmap_item)
 	mmap_read_lock(mm);
 	vma = find_mergeable_vma(mm, addr);
 	if (vma)
-		break_ksm(vma, addr);
+		break_ksm(vma, addr, false);
 	mmap_read_unlock(mm);
 }
@@ -878,7 +885,7 @@ static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list)
  * in cmp_and_merge_page on one of the rmap_items we would be removing.
  */
 static int unmerge_ksm_pages(struct vm_area_struct *vma,
-			     unsigned long start, unsigned long end)
+			     unsigned long start, unsigned long end, bool lock_vma)
 {
 	unsigned long addr;
 	int err = 0;
@@ -889,7 +896,7 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma,
 		if (signal_pending(current))
 			err = -ERESTARTSYS;
 		else
-			err = break_ksm(vma, addr);
+			err = break_ksm(vma, addr, lock_vma);
 	}
 	return err;
 }
@@ -1036,7 +1043,7 @@ static int unmerge_and_remove_all_rmap_items(void)
 			if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma)
 				continue;
 			err = unmerge_ksm_pages(vma,
-						vma->vm_start, vma->vm_end);
+						vma->vm_start, vma->vm_end, false);
 			if (err)
 				goto error;
 		}
@@ -2546,7 +2553,7 @@ static int __ksm_del_vma(struct vm_area_struct *vma)
 		return 0;
 
 	if (vma->anon_vma) {
-		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+		err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end, true);
 		if (err)
 			return err;
 	}
@@ -2684,7 +2691,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		return 0;		/* just ignore the advice */
 
 	if (vma->anon_vma) {
-		err = unmerge_ksm_pages(vma, start, end);
+		err = unmerge_ksm_pages(vma, start, end, true);
 		if (err)
 			return err;
 	}
@@ -2800,6 +2807,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
 		    anon_vma->root == vma->anon_vma->root) {
 			return page;	/* still no need to copy it */
 		}
+		if (PageHWPoison(page))
+			return ERR_PTR(-EHWPOISON);
 		if (!PageUptodate(page))
 			return page;	/* let do_swap_page report the error */
diff --git a/mm/madvise.c b/mm/madvise.c
index 8498f700c284..1fb2a11d77d9 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -232,6 +232,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 
 static const struct mm_walk_ops swapin_walk_ops = {
 	.pmd_entry = swapin_walk_pmd_entry,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static void shmem_swapin_range(struct vm_area_struct *vma,
@@ -537,6 +538,7 @@ regular_folio:
 
 static const struct mm_walk_ops cold_walk_ops = {
 	.pmd_entry = madvise_cold_or_pageout_pte_range,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static void madvise_cold_page_range(struct mmu_gather *tlb,
@@ -760,6 +762,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 
 static const struct mm_walk_ops madvise_free_walk_ops = {
 	.pmd_entry = madvise_free_pte_range,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static int madvise_free_single_vma(struct vm_area_struct *vma,
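
Editor's note: the pattern above — one mm_walk_ops per locking mode — is how every converted walker now declares its expectation. A minimal sketch of a new read-side walker under this API (the demo_* names are ours):

#include <linux/pagewalk.h>

static int demo_pmd_entry(pmd_t *pmd, unsigned long addr,
			  unsigned long next, struct mm_walk *walk)
{
	/* examine one PMD-sized range; return 0 to keep walking */
	return 0;
}

/*
 * PGWALK_RDLOCK: the walk core asserts that mmap_lock is held (for read)
 * and leaves per-VMA locks alone; walkers that write must pick
 * PGWALK_WRLOCK or PGWALK_WRLOCK_VERIFY instead.
 */
static const struct mm_walk_ops demo_walk_ops = {
	.pmd_entry = demo_pmd_entry,
	.walk_lock = PGWALK_RDLOCK,
};

/* Usage:
 *	mmap_read_lock(mm);
 *	walk_page_range(mm, start, end, &demo_walk_ops, NULL);
 *	mmap_read_unlock(mm);
 */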
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d83f99580900..8e125aa5a18d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6013,6 +6013,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 
 static const struct mm_walk_ops precharge_walk_ops = {
 	.pmd_entry = mem_cgroup_count_precharge_pte_range,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
@@ -6292,6 +6293,7 @@ put:		/* get_mctgt_type() gets & locks the page */
 
 static const struct mm_walk_ops charge_walk_ops = {
 	.pmd_entry = mem_cgroup_move_charge_pte_range,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 static void mem_cgroup_move_charge(void)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 976747d28ce7..55dfe8a7bf4b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -827,6 +827,7 @@ static int hwpoison_hugetlb_range(pte_t *ptep, unsigned long hmask,
 static const struct mm_walk_ops hwp_walk_ops = {
 	.pmd_entry = hwpoison_pte_range,
 	.hugetlb_entry = hwpoison_hugetlb_range,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 /*
@@ -2500,7 +2501,7 @@ int unpoison_memory(unsigned long pfn)
 {
 	struct folio *folio;
 	struct page *p;
-	int ret = -EBUSY;
+	int ret = -EBUSY, ghp;
 	unsigned long count = 1;
 	bool huge = false;
 	static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
@@ -2533,6 +2534,13 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
 
+	if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
+		goto unlock_mutex;
+
+	/*
+	 * Note that folio->_mapcount is overloaded in SLAB, so the simple test
+	 * in folio_mapped() has to be done after folio_test_slab() is checked.
+	 */
 	if (folio_mapped(folio)) {
 		unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
 				 pfn, &unpoison_rs);
@@ -2545,32 +2553,28 @@ int unpoison_memory(unsigned long pfn)
 		goto unlock_mutex;
 	}
 
-	if (folio_test_slab(folio) || PageTable(&folio->page) || folio_test_reserved(folio))
-		goto unlock_mutex;
-
-	ret = get_hwpoison_page(p, MF_UNPOISON);
-	if (!ret) {
+	ghp = get_hwpoison_page(p, MF_UNPOISON);
+	if (!ghp) {
 		if (PageHuge(p)) {
 			huge = true;
 			count = folio_free_raw_hwp(folio, false);
-			if (count == 0) {
-				ret = -EBUSY;
+			if (count == 0)
 				goto unlock_mutex;
-			}
 		}
 		ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
-	} else if (ret < 0) {
-		if (ret == -EHWPOISON) {
+	} else if (ghp < 0) {
+		if (ghp == -EHWPOISON) {
 			ret = put_page_back_buddy(p) ? 0 : -EBUSY;
-		} else
+		} else {
+			ret = ghp;
 			unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
 					 pfn, &unpoison_rs);
+		}
 	} else {
 		if (PageHuge(p)) {
 			huge = true;
 			count = folio_free_raw_hwp(folio, false);
 			if (count == 0) {
-				ret = -EBUSY;
 				folio_put(folio);
 				goto unlock_mutex;
 			}
@@ -2771,10 +2775,13 @@ retry:
 	if (ret > 0) {
 		ret = soft_offline_in_use_page(page);
 	} else if (ret == 0) {
-		if (!page_handle_poison(page, true, false) && try_again) {
-			try_again = false;
-			flags &= ~MF_COUNT_INCREASED;
-			goto retry;
+		if (!page_handle_poison(page, true, false)) {
+			if (try_again) {
+				try_again = false;
+				flags &= ~MF_COUNT_INCREASED;
+				goto retry;
+			}
+			ret = -EBUSY;
 		}
 	}
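
Editor's note: the unpoison_memory() rework above follows a common pattern — keep the helper's status in its own variable (ghp) instead of reusing the function's result (ret), so the success accounting at the end cannot be confused by intermediate values. A condensed sketch (ours) of the resulting control flow:

	int ret = -EBUSY, ghp;

	ghp = get_hwpoison_page(p, MF_UNPOISON);
	if (!ghp) {
		/* got the page: clearing the hwpoison flag decides success */
		ret = folio_test_clear_hwpoison(folio) ? 0 : -EBUSY;
	} else if (ghp == -EHWPOISON) {
		ret = put_page_back_buddy(p) ? 0 : -EBUSY;
	} else if (ghp < 0) {
		ret = ghp;	/* propagate the real error, not a blanket -EBUSY */
	}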
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index c53f8beeb507..ec2eaceffd74 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -718,6 +718,14 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
 	.hugetlb_entry = queue_folios_hugetlb,
 	.pmd_entry = queue_folios_pte_range,
 	.test_walk = queue_pages_test_walk,
+	.walk_lock = PGWALK_RDLOCK,
+};
+
+static const struct mm_walk_ops queue_pages_lock_vma_walk_ops = {
+	.hugetlb_entry = queue_folios_hugetlb,
+	.pmd_entry = queue_folios_pte_range,
+	.test_walk = queue_pages_test_walk,
+	.walk_lock = PGWALK_WRLOCK,
 };
 
 /*
@@ -738,7 +746,7 @@ static const struct mm_walk_ops queue_pages_walk_ops = {
 static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		nodemask_t *nodes, unsigned long flags,
-		struct list_head *pagelist)
+		struct list_head *pagelist, bool lock_vma)
 {
 	int err;
 	struct queue_pages qp = {
@@ -749,8 +757,10 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		.end = end,
 		.first = NULL,
 	};
+	const struct mm_walk_ops *ops = lock_vma ?
+			&queue_pages_lock_vma_walk_ops : &queue_pages_walk_ops;
 
-	err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp);
+	err = walk_page_range(mm, start, end, ops, &qp);
 
 	if (!qp.first)
 		/* whole range in hole */
@@ -1078,7 +1088,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 	vma = find_vma(mm, 0);
 	VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)));
 	queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask,
-			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist, false);
 
 	if (!list_empty(&pagelist)) {
 		err = migrate_pages(&pagelist, alloc_migration_target, NULL,
@@ -1321,12 +1331,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 	 * Lock the VMAs before scanning for pages to migrate, to ensure we don't
 	 * miss a concurrently inserted page.
 	 */
-	vma_iter_init(&vmi, mm, start);
-	for_each_vma_range(vmi, vma, end)
-		vma_start_write(vma);
-
 	ret = queue_pages_range(mm, start, end, nmask,
-			  flags | MPOL_MF_INVERT, &pagelist);
+			  flags | MPOL_MF_INVERT, &pagelist, true);
 
 	if (ret < 0) {
 		err = ret;
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6c556b5876c6..d69131adc51c 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -279,6 +279,7 @@ next:
 static const struct mm_walk_ops migrate_vma_walk_ops = {
 	.pmd_entry = migrate_vma_collect_pmd,
 	.pte_hole = migrate_vma_collect_hole,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 /*
diff --git a/mm/mincore.c b/mm/mincore.c
index b7f7a516b26c..dad3622cc963 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -176,6 +176,7 @@ static const struct mm_walk_ops mincore_walk_ops = {
 	.pmd_entry = mincore_pte_range,
 	.pte_hole = mincore_unmapped_range,
 	.hugetlb_entry = mincore_hugetlb,
+	.walk_lock = PGWALK_RDLOCK,
 };
 
 /*
diff --git a/mm/mlock.c b/mm/mlock.c
index 1746600a2e14..06bdfab83b58 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -371,6 +371,7 @@ static void mlock_vma_pages_range(struct vm_area_struct *vma,
 {
 	static const struct mm_walk_ops mlock_walk_ops = {
 		.pmd_entry = mlock_pte_range,
+		.walk_lock = PGWALK_WRLOCK_VERIFY,
 	};
 
 	/*
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 7cd7f644da80..130db91d3a8c 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -568,6 +568,7 @@ static const struct mm_walk_ops prot_none_walk_ops = {
 	.pte_entry = prot_none_pte_entry,
 	.hugetlb_entry = prot_none_hugetlb_entry,
 	.test_walk = prot_none_test,
+	.walk_lock = PGWALK_WRLOCK,
 };
 
 int
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 2022333805d3..9b2d23fbf4d3 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -400,6 +400,33 @@ static int __walk_page_range(unsigned long start, unsigned long end,
 	return err;
 }
 
+static inline void process_mm_walk_lock(struct mm_struct *mm,
+					enum page_walk_lock walk_lock)
+{
+	if (walk_lock == PGWALK_RDLOCK)
+		mmap_assert_locked(mm);
+	else
+		mmap_assert_write_locked(mm);
+}
+
+static inline void process_vma_walk_lock(struct vm_area_struct *vma,
+					 enum page_walk_lock walk_lock)
+{
+#ifdef CONFIG_PER_VMA_LOCK
+	switch (walk_lock) {
+	case PGWALK_WRLOCK:
+		vma_start_write(vma);
+		break;
+	case PGWALK_WRLOCK_VERIFY:
+		vma_assert_write_locked(vma);
+		break;
+	case PGWALK_RDLOCK:
+		/* PGWALK_RDLOCK is handled by process_mm_walk_lock */
+		break;
+	}
+#endif
+}
+
 /**
  * walk_page_range - walk page table with caller specific callbacks
  * @mm:		mm_struct representing the target process of page table walk
@@ -459,7 +486,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
 	if (!walk.mm)
 		return -EINVAL;
 
-	mmap_assert_locked(walk.mm);
+	process_mm_walk_lock(walk.mm, ops->walk_lock);
 
 	vma = find_vma(walk.mm, start);
 	do {
@@ -474,6 +501,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
 			if (ops->pte_hole)
 				err = ops->pte_hole(start, next, -1, &walk);
 		} else { /* inside vma */
+			process_vma_walk_lock(vma, ops->walk_lock);
 			walk.vma = vma;
 			next = min(end, vma->vm_end);
 			vma = find_vma(mm, vma->vm_end);
@@ -549,7 +577,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start,
 	if (start < vma->vm_start || end > vma->vm_end)
 		return -EINVAL;
 
-	mmap_assert_locked(walk.mm);
+	process_mm_walk_lock(walk.mm, ops->walk_lock);
+	process_vma_walk_lock(vma, ops->walk_lock);
 	return __walk_page_range(start, end, &walk);
 }
 
@@ -566,7 +595,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
 	if (!walk.mm)
 		return -EINVAL;
 
-	mmap_assert_locked(walk.mm);
+	process_mm_walk_lock(walk.mm, ops->walk_lock);
+	process_vma_walk_lock(vma, ops->walk_lock);
 	return __walk_page_range(vma->vm_start, vma->vm_end, &walk);
 }
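
Editor's note: with PGWALK_WRLOCK the walk core itself calls vma_start_write() on each VMA it enters, which is why the open-coded write-locking loop could be removed from do_mbind() in the mempolicy.c hunks above. A sketch of a writer-side walker under this contract (demo_wr_ops and demo_pmd_entry are our hypothetical names, reusing the earlier sketch):

/*
 * PGWALK_WRLOCK: the core asserts mmap_lock is held for write and
 * write-locks each VMA before the callbacks run.  PGWALK_WRLOCK_VERIFY
 * (as mlock uses above) instead expects the caller to have already
 * write-locked the VMAs and only asserts it.
 */
static const struct mm_walk_ops demo_wr_ops = {
	.pmd_entry = demo_pmd_entry,
	.walk_lock = PGWALK_WRLOCK,
};

/* Usage:
 *	mmap_write_lock(mm);
 *	walk_page_range(mm, start, end, &demo_wr_ops, NULL);
 *	mmap_write_unlock(mm);
 */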
diff --git a/mm/swapfile.c b/mm/swapfile.c
index b52145c6bac2..d46933adf789 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1745,7 +1745,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	struct page *swapcache;
 	spinlock_t *ptl;
 	pte_t *pte, new_pte, old_pte;
-	bool hwposioned = false;
+	bool hwpoisoned = PageHWPoison(page);
 	int ret = 1;
 
 	swapcache = page;
@@ -1753,7 +1753,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	if (unlikely(!page))
 		return -ENOMEM;
 	else if (unlikely(PTR_ERR(page) == -EHWPOISON))
-		hwposioned = true;
+		hwpoisoned = true;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!pte || !pte_same_as_swp(ptep_get(pte),
@@ -1764,11 +1764,11 @@
 
 	old_pte = ptep_get(pte);
 
-	if (unlikely(hwposioned || !PageUptodate(page))) {
+	if (unlikely(hwpoisoned || !PageUptodate(page))) {
 		swp_entry_t swp_entry;
 
 		dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-		if (hwposioned) {
+		if (hwpoisoned) {
 			swp_entry = make_hwpoison_entry(swapcache);
 			page = swapcache;
 		} else {
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 93cf99aba335..228a4a5312f2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -2979,6 +2979,10 @@ void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
 		free_vm_area(area);
 		return NULL;
 	}
+
+	flush_cache_vmap((unsigned long)area->addr,
+			(unsigned long)area->addr + count * PAGE_SIZE);
+
 	return area->addr;
 }
 EXPORT_SYMBOL_GPL(vmap_pfn);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6cbe921ef662..c7c149cb8d66 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -4284,6 +4284,7 @@ static void walk_mm(struct lruvec *lruvec, struct mm_struct *mm, struct lru_gen_
 	static const struct mm_walk_ops mm_walk_ops = {
 		.test_walk = should_skip_vma,
 		.p4d_entry = walk_pud_range,
+		.walk_lock = PGWALK_RDLOCK,
 	};
 
 	int err;
@@ -4855,16 +4856,17 @@ void lru_gen_release_memcg(struct mem_cgroup *memcg)
 
 		spin_lock_irq(&pgdat->memcg_lru.lock);
 
-		VM_WARN_ON_ONCE(hlist_nulls_unhashed(&lruvec->lrugen.list));
+		if (hlist_nulls_unhashed(&lruvec->lrugen.list))
+			goto unlock;
 
 		gen = lruvec->lrugen.gen;
 
-		hlist_nulls_del_rcu(&lruvec->lrugen.list);
+		hlist_nulls_del_init_rcu(&lruvec->lrugen.list);
 		pgdat->memcg_lru.nr_memcgs[gen]--;
 
 		if (!pgdat->memcg_lru.nr_memcgs[gen] && gen == get_memcg_gen(pgdat->memcg_lru.seq))
 			WRITE_ONCE(pgdat->memcg_lru.seq, pgdat->memcg_lru.seq + 1);
-
+unlock:
 		spin_unlock_irq(&pgdat->memcg_lru.lock);
 	}
 }
@@ -5446,8 +5448,10 @@ restart:
 	rcu_read_lock();
 
 	hlist_nulls_for_each_entry_rcu(lrugen, pos, &pgdat->memcg_lru.fifo[gen][bin], list) {
-		if (op)
+		if (op) {
 			lru_gen_rotate_memcg(lruvec, op);
+			op = 0;
+		}
 
 		mem_cgroup_put(memcg);
 
@@ -5455,7 +5459,7 @@ restart:
 		memcg = lruvec_memcg(lruvec);
 
 		if (!mem_cgroup_tryget(memcg)) {
-			op = 0;
+			lru_gen_release_memcg(memcg);
 			memcg = NULL;
 			continue;
 		}
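
Editor's note: the switch to hlist_nulls_del_init_rcu() in lru_gen_release_memcg() above is what makes the new unhashed check work; a sketch of the invariant (narration ours, list primitives are the real rculist_nulls API):

	/*
	 * hlist_nulls_del_rcu() leaves list.pprev poisoned, so a second
	 * release could not tell that the node is already gone.
	 * hlist_nulls_del_init_rcu() resets pprev instead, making the node
	 * report hlist_nulls_unhashed() and letting a repeat call bail out.
	 */
	if (hlist_nulls_unhashed(&lruvec->lrugen.list))
		goto unlock;
	hlist_nulls_del_init_rcu(&lruvec->lrugen.list);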
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 32f5bc4074df..b58f957429f0 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -1777,6 +1777,7 @@ static void replace_sub_page(struct size_class *class, struct zspage *zspage,
 
 static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 {
+	struct zs_pool *pool;
 	struct zspage *zspage;
 
 	/*
@@ -1786,9 +1787,10 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode)
 	VM_BUG_ON_PAGE(PageIsolated(page), page);
 
 	zspage = get_zspage(page);
-	migrate_write_lock(zspage);
+	pool = zspage->pool;
+	spin_lock(&pool->lock);
 	inc_zspage_isolation(zspage);
-	migrate_write_unlock(zspage);
+	spin_unlock(&pool->lock);
 
 	return true;
 }
@@ -1854,12 +1856,12 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 	kunmap_atomic(s_addr);
 
 	replace_sub_page(class, zspage, newpage, page);
+	dec_zspage_isolation(zspage);
 	/*
 	 * Since we complete the data copy and set up new zspage structure,
 	 * it's okay to release the pool's lock.
 	 */
 	spin_unlock(&pool->lock);
-	dec_zspage_isolation(zspage);
 	migrate_write_unlock(zspage);
 
 	get_page(newpage);
@@ -1876,14 +1878,16 @@ static int zs_page_migrate(struct page *newpage, struct page *page,
 
 static void zs_page_putback(struct page *page)
 {
+	struct zs_pool *pool;
 	struct zspage *zspage;
 
 	VM_BUG_ON_PAGE(!PageIsolated(page), page);
 
 	zspage = get_zspage(page);
-	migrate_write_lock(zspage);
+	pool = zspage->pool;
+	spin_lock(&pool->lock);
 	dec_zspage_isolation(zspage);
-	migrate_write_unlock(zspage);
+	spin_unlock(&pool->lock);
 }
 
 static const struct movable_operations zsmalloc_mops = {
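
Editor's note: the net effect of the zsmalloc hunks, condensed (narration ours): the zspage isolation counter is now serialized by the same pool->lock that zs_page_migrate() holds while it updates the zspage, closing the window in which a putback could race with a concurrent migration.

static void zs_page_putback_sketch(struct page *page)
{
	struct zspage *zspage = get_zspage(page);
	struct zs_pool *pool = zspage->pool;

	/*
	 * pool->lock, not the per-zspage migrate lock, now guards the
	 * isolation count — the same lock the migration path holds when it
	 * drops the count after replace_sub_page().
	 */
	spin_lock(&pool->lock);
	dec_zspage_isolation(zspage);
	spin_unlock(&pool->lock);
}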