author    | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 12:32:41 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-05-26 12:32:41 -0700
commit    | 98931dd95fd489fcbfa97da563505a6f071d7c77 (patch)
tree      | 44683fc4a92efa614acdca2742a7ff19d26da1e3 /mm/huge_memory.c
parent    | df202b452fe6c6d6f1351bad485e2367ef1e644e (diff)
parent    | f403f22f8ccb12860b2b62fec3173c6ccd45938b (diff)
Merge tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
"Almost all of MM here. A few things are still getting finished off,
reviewed, etc.
- Yang Shi has improved the behaviour of khugepaged collapsing of
readonly file-backed transparent hugepages.
- Johannes Weiner has arranged for zswap memory use to be tracked and
managed on a per-cgroup basis.
- Muchun Song adds a /proc knob ("hugetlb_optimize_vmemmap") for
runtime enablement of the recent huge page vmemmap optimization
feature.
- Baolin Wang contributes a series to fix some issues around hugetlb
pagetable invalidation.
- Zhenwei Pi has fixed some interactions between hwpoisoned pages and
virtualization.
- Tong Tiangen has enabled the use of the presently x86-only
page_table_check debugging feature on arm64 and riscv.
- David Vernet has done some fixup work on the memcg selftests.
- Peter Xu has taught userfaultfd to handle write protection faults
against shmem- and hugetlbfs-backed files.
- More DAMON development from SeongJae Park - adding online tuning of
the feature and support for monitoring of fixed virtual address
ranges. Also easier discovery of which monitoring operations are
available.
- Nadav Amit has done some optimization of TLB flushing during
mprotect().
- Neil Brown continues to labor away at improving our swap-over-NFS
support.
- David Hildenbrand has some fixes to anon page COWing versus
get_user_pages().
- Peng Liu fixed some errors in the core hugetlb code.
- Joao Martins has reduced the amount of memory consumed by
device-dax's compound devmaps.
- Some cleanups of the arch-specific pagemap code from Anshuman
Khandual.
- Muchun Song has found and fixed some errors in the TLB flushing of
transparent hugepages.
- Roman Gushchin has done more work on the memcg selftests.
... and, of course, many smaller fixes and cleanups. Notably, the
customary million cleanup serieses from Miaohe Lin"
* tag 'mm-stable-2022-05-25' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (381 commits)
mm: kfence: use PAGE_ALIGNED helper
selftests: vm: add the "settings" file with timeout variable
selftests: vm: add "test_hmm.sh" to TEST_FILES
selftests: vm: check numa_available() before operating "merge_across_nodes" in ksm_tests
selftests: vm: add migration to the .gitignore
selftests/vm/pkeys: fix typo in comment
ksm: fix typo in comment
selftests: vm: add process_mrelease tests
Revert "mm/vmscan: never demote for memcg reclaim"
mm/kfence: print disabling or re-enabling message
include/trace/events/percpu.h: cleanup for "percpu: improve percpu_alloc_percpu event trace"
include/trace/events/mmflags.h: cleanup for "tracing: incorrect gfp_t conversion"
mm: fix a potential infinite loop in start_isolate_page_range()
MAINTAINERS: add Muchun as co-maintainer for HugeTLB
zram: fix Kconfig dependency warning
mm/shmem: fix shmem folio swapoff hang
cgroup: fix an error handling path in alloc_pagecache_max_30M()
mm: damon: use HPAGE_PMD_SIZE
tracing: incorrect isolate_mote_t cast in mm_vmscan_lru_isolate
nodemask.h: fix compilation error with GCC12
...
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 177 |
1 file changed, 123 insertions, 54 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 910a138e9859..a77c78a2b6b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -39,6 +39,7 @@
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
 #include "internal.h"
+#include "swap.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/thp.h>
@@ -68,13 +69,6 @@ static atomic_t huge_zero_refcount;
 struct page *huge_zero_page __read_mostly;
 unsigned long huge_zero_pfn __read_mostly = ~0UL;
 
-static inline bool file_thp_enabled(struct vm_area_struct *vma)
-{
-        return transhuge_vma_enabled(vma, vma->vm_flags) && vma->vm_file &&
-               !inode_is_open_for_write(vma->vm_file->f_inode) &&
-               (vma->vm_flags & VM_EXEC);
-}
-
 bool transparent_hugepage_active(struct vm_area_struct *vma)
 {
         /* The addr is used to check if the vma size fits */
@@ -86,8 +80,8 @@ bool transparent_hugepage_active(struct vm_area_struct *vma)
                 return __transparent_hugepage_enabled(vma);
         if (vma_is_shmem(vma))
                 return shmem_huge_enabled(vma);
-        if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS))
-                return file_thp_enabled(vma);
+        if (transhuge_vma_enabled(vma, vma->vm_flags) && file_thp_enabled(vma))
+                return true;
 
         return false;
 }
@@ -647,7 +641,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
 
                 entry = mk_huge_pmd(page, vma->vm_page_prot);
                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
-                page_add_new_anon_rmap(page, vma, haddr, true);
+                page_add_new_anon_rmap(page, vma, haddr);
                 lru_cache_add_inactive_or_unevictable(page, vma);
                 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
                 set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
@@ -725,15 +719,15 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 {
         struct vm_area_struct *vma = vmf->vma;
         gfp_t gfp;
-        struct page *page;
+        struct folio *folio;
         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
 
         if (!transhuge_vma_suitable(vma, haddr))
                 return VM_FAULT_FALLBACK;
         if (unlikely(anon_vma_prepare(vma)))
                 return VM_FAULT_OOM;
-        if (unlikely(khugepaged_enter(vma, vma->vm_flags)))
-                return VM_FAULT_OOM;
+        khugepaged_enter(vma, vma->vm_flags);
+
         if (!(vmf->flags & FAULT_FLAG_WRITE) &&
             !mm_forbids_zeropage(vma->vm_mm) &&
             transparent_hugepage_use_zero_page()) {
@@ -774,13 +768,12 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf)
                 return ret;
         }
         gfp = vma_thp_gfp_mask(vma);
-        page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
-        if (unlikely(!page)) {
+        folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true);
+        if (unlikely(!folio)) {
                 count_vm_event(THP_FAULT_FALLBACK);
                 return VM_FAULT_FALLBACK;
         }
-        prep_transhuge_page(page);
-        return __do_huge_pmd_anonymous_page(vmf, page, gfp);
+        return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp);
 }
 
 static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
@@ -1054,7 +1047,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 swp_entry_t entry = pmd_to_swp_entry(pmd);
 
                 VM_BUG_ON(!is_pmd_migration_entry(pmd));
-                if (is_writable_migration_entry(entry)) {
+                if (!is_readable_migration_entry(entry)) {
                         entry = make_readable_migration_entry(
                                                         swp_offset(entry));
                         pmd = swp_entry_to_pmd(entry);
@@ -1097,23 +1090,16 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         src_page = pmd_page(pmd);
         VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
 
-        /*
-         * If this page is a potentially pinned page, split and retry the fault
-         * with smaller page size. Normally this should not happen because the
-         * userspace should use MADV_DONTFORK upon pinned regions. This is a
-         * best effort that the pinned pages won't be replaced by another
-         * random page during the coming copy-on-write.
-         */
-        if (unlikely(page_needs_cow_for_dma(src_vma, src_page))) {
+        get_page(src_page);
+        if (unlikely(page_try_dup_anon_rmap(src_page, true, src_vma))) {
+                /* Page maybe pinned: split and retry the fault on PTEs. */
+                put_page(src_page);
                 pte_free(dst_mm, pgtable);
                 spin_unlock(src_ptl);
                 spin_unlock(dst_ptl);
                 __split_huge_pmd(src_vma, src_pmd, addr, false, NULL);
                 return -EAGAIN;
         }
-
-        get_page(src_page);
-        page_dup_rmap(src_page, true);
         add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
 out_zero_page:
         mm_inc_nr_ptes(dst_mm);
@@ -1217,14 +1203,10 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 /* No huge zero pud yet */
         }
 
-        /* Please refer to comments in copy_huge_pmd() */
-        if (unlikely(page_needs_cow_for_dma(vma, pud_page(pud)))) {
-                spin_unlock(src_ptl);
-                spin_unlock(dst_ptl);
-                __split_huge_pud(vma, src_pud, addr);
-                return -EAGAIN;
-        }
-
+        /*
+         * TODO: once we support anonymous pages, use page_try_dup_anon_rmap()
+         * and split if duplicating fails.
+         */
         pudp_set_wrprotect(src_mm, addr, src_pud);
         pud = pud_mkold(pud_wrprotect(pud));
         set_pud_at(dst_mm, addr, dst_pud, pud);
@@ -1282,6 +1264,7 @@ unlock:
 
 vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
 {
+        const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
         struct vm_area_struct *vma = vmf->vma;
         struct page *page;
         unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
@@ -1290,6 +1273,9 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
         vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
         VM_BUG_ON_VMA(!vma->anon_vma, vma);
 
+        VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
+        VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
+
         if (is_huge_zero_pmd(orig_pmd))
                 goto fallback;
@@ -1303,6 +1289,10 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
         page = pmd_page(orig_pmd);
         VM_BUG_ON_PAGE(!PageHead(page), page);
 
+        /* Early check when only holding the PT lock. */
+        if (PageAnonExclusive(page))
+                goto reuse;
+
         if (!trylock_page(page)) {
                 get_page(page);
                 spin_unlock(vmf->ptl);
@@ -1317,8 +1307,14 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
                 put_page(page);
         }
 
+        /* Recheck after temporarily dropping the PT lock. */
+        if (PageAnonExclusive(page)) {
+                unlock_page(page);
+                goto reuse;
+        }
+
         /*
-         * See do_wp_page(): we can only map the page writable if there are
+         * See do_wp_page(): we can only reuse the page exclusively if there are
          * no additional references. Note that we always drain the LRU
          * pagevecs immediately after adding a THP.
         */
@@ -1328,11 +1324,18 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
                 try_to_free_swap(page);
         if (page_count(page) == 1) {
                 pmd_t entry;
+
+                page_move_anon_rmap(page, vma);
+                unlock_page(page);
+reuse:
+                if (unlikely(unshare)) {
+                        spin_unlock(vmf->ptl);
+                        return 0;
+                }
                 entry = pmd_mkyoung(orig_pmd);
                 entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
                 if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
                         update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
-                unlock_page(page);
                 spin_unlock(vmf->ptl);
                 return VM_FAULT_WRITE;
         }
@@ -1379,6 +1382,12 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
         page = pmd_page(*pmd);
         VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page);
 
+        if (!pmd_write(*pmd) && gup_must_unshare(flags, page))
+                return ERR_PTR(-EMLINK);
+
+        VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
+                        !PageAnonExclusive(page), page);
+
         if (!try_grab_page(page, flags))
                 return ERR_PTR(-ENOMEM);
@@ -1692,18 +1701,21 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
  *      or if prot_numa but THP migration is not supported
  *  - HPAGE_PMD_NR if protections changed and TLB flush necessary
  */
-int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
-                unsigned long addr, pgprot_t newprot, unsigned long cp_flags)
+int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
+                    pmd_t *pmd, unsigned long addr, pgprot_t newprot,
+                    unsigned long cp_flags)
 {
         struct mm_struct *mm = vma->vm_mm;
         spinlock_t *ptl;
-        pmd_t entry;
+        pmd_t oldpmd, entry;
         bool preserve_write;
         int ret;
         bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
         bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
         bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
 
+        tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
+
         if (prot_numa && !thp_migration_supported())
                 return 1;
@@ -1717,6 +1729,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
         if (is_swap_pmd(*pmd)) {
                 swp_entry_t entry = pmd_to_swp_entry(*pmd);
+                struct page *page = pfn_swap_entry_to_page(entry);
 
                 VM_BUG_ON(!is_pmd_migration_entry(*pmd));
                 if (is_writable_migration_entry(entry)) {
@@ -1725,8 +1738,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
                          * A protection check is difficult so
                          * just be safe and disable write
                          */
-                        entry = make_readable_migration_entry(
-                                                        swp_offset(entry));
+                        if (PageAnon(page))
+                                entry = make_readable_exclusive_migration_entry(swp_offset(entry));
+                        else
+                                entry = make_readable_migration_entry(swp_offset(entry));
                         newpmd = swp_entry_to_pmd(entry);
                         if (pmd_swp_soft_dirty(*pmd))
                                 newpmd = pmd_swp_mksoft_dirty(newpmd);
@@ -1778,12 +1793,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
          * The race makes MADV_DONTNEED miss the huge pmd and don't clear it
          * which may break userspace.
          *
-         * pmdp_invalidate() is required to make sure we don't miss
+         * pmdp_invalidate_ad() is required to make sure we don't miss
          * dirty/young flags set by hardware.
         */
-        entry = pmdp_invalidate(vma, addr, pmd);
+        oldpmd = pmdp_invalidate_ad(vma, addr, pmd);
 
-        entry = pmd_modify(entry, newprot);
+        entry = pmd_modify(oldpmd, newprot);
         if (preserve_write)
                 entry = pmd_mk_savedwrite(entry);
         if (uffd_wp) {
@@ -1799,6 +1814,10 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
         }
         ret = HPAGE_PMD_NR;
         set_pmd_at(mm, addr, pmd, entry);
+
+        if (huge_pmd_needs_flush(oldpmd, entry))
+                tlb_flush_pmd_range(tlb, addr, HPAGE_PMD_SIZE);
+
         BUG_ON(vma_is_anonymous(vma) && !preserve_write && pmd_write(entry));
 unlock:
         spin_unlock(ptl);
@@ -1946,6 +1965,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
         pgtable_t pgtable;
         pmd_t old_pmd, _pmd;
         bool young, write, soft_dirty, pmd_migration = false, uffd_wp = false;
+        bool anon_exclusive = false;
         unsigned long addr;
         int i;
 
@@ -2027,6 +2047,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 entry = pmd_to_swp_entry(old_pmd);
                 page = pfn_swap_entry_to_page(entry);
                 write = is_writable_migration_entry(entry);
+                if (PageAnon(page))
+                        anon_exclusive = is_readable_exclusive_migration_entry(entry);
                 young = false;
                 soft_dirty = pmd_swp_soft_dirty(old_pmd);
                 uffd_wp = pmd_swp_uffd_wp(old_pmd);
@@ -2038,8 +2060,26 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 young = pmd_young(old_pmd);
                 soft_dirty = pmd_soft_dirty(old_pmd);
                 uffd_wp = pmd_uffd_wp(old_pmd);
+                VM_BUG_ON_PAGE(!page_count(page), page);
                 page_ref_add(page, HPAGE_PMD_NR - 1);
+
+                /*
+                 * Without "freeze", we'll simply split the PMD, propagating the
+                 * PageAnonExclusive() flag for each PTE by setting it for
+                 * each subpage -- no need to (temporarily) clear.
+                 *
+                 * With "freeze" we want to replace mapped pages by
+                 * migration entries right away. This is only possible if we
+                 * managed to clear PageAnonExclusive() -- see
+                 * set_pmd_migration_entry().
+                 *
+                 * In case we cannot clear PageAnonExclusive(), split the PMD
+                 * only and let try_to_migrate_one() fail later.
                 */
+                anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+                if (freeze && anon_exclusive && page_try_share_anon_rmap(page))
+                        freeze = false;
         }
 
         /*
@@ -2061,6 +2101,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                         if (write)
                                 swp_entry = make_writable_migration_entry(
                                                         page_to_pfn(page + i));
+                        else if (anon_exclusive)
+                                swp_entry = make_readable_exclusive_migration_entry(
+                                                        page_to_pfn(page + i));
                         else
                                 swp_entry = make_readable_migration_entry(
                                                         page_to_pfn(page + i));
@@ -2072,6 +2115,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
                 } else {
                         entry = mk_pte(page + i, READ_ONCE(vma->vm_page_prot));
                         entry = maybe_mkwrite(entry, vma);
+                        if (anon_exclusive)
+                                SetPageAnonExclusive(page + i);
                         if (!write)
                                 entry = pte_wrprotect(entry);
                         if (!young)
@@ -2249,8 +2294,6 @@ static void unmap_page(struct page *page)
                 try_to_migrate(folio, ttu_flags);
         else
                 try_to_unmap(folio, ttu_flags | TTU_IGNORE_MLOCK);
-
-        VM_WARN_ON_ONCE_PAGE(page_mapped(page), page);
 }
 
 static void remap_page(struct folio *folio, unsigned long nr)
@@ -2305,6 +2348,13 @@ static void __split_huge_page_tail(struct page *head, int tail,
          *
          * After successful get_page_unless_zero() might follow flags change,
          * for example lock_page() which set PG_waiters.
+         *
+         * Note that for mapped sub-pages of an anonymous THP,
+         * PG_anon_exclusive has been cleared in unmap_page() and is stored in
+         * the migration entry instead from where remap_page() will restore it.
+         * We can still have PG_anon_exclusive set on effectively unmapped and
+         * unreferenced sub-pages of an anonymous THP: we can simply drop
+         * PG_anon_exclusive (-> PG_mappedtodisk) for these here.
          */
         page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
         page_tail->flags |= (head->flags &
@@ -3035,25 +3085,35 @@ late_initcall(split_huge_pages_debugfs);
 #endif
 
 #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
-void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
+int set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
                 struct page *page)
 {
         struct vm_area_struct *vma = pvmw->vma;
         struct mm_struct *mm = vma->vm_mm;
         unsigned long address = pvmw->address;
+        bool anon_exclusive;
         pmd_t pmdval;
         swp_entry_t entry;
         pmd_t pmdswp;
 
         if (!(pvmw->pmd && !pvmw->pte))
-                return;
+                return 0;
 
         flush_cache_range(vma, address, address + HPAGE_PMD_SIZE);
         pmdval = pmdp_invalidate(vma, address, pvmw->pmd);
+
+        anon_exclusive = PageAnon(page) && PageAnonExclusive(page);
+        if (anon_exclusive && page_try_share_anon_rmap(page)) {
+                set_pmd_at(mm, address, pvmw->pmd, pmdval);
+                return -EBUSY;
+        }
+
         if (pmd_dirty(pmdval))
                 set_page_dirty(page);
         if (pmd_write(pmdval))
                 entry = make_writable_migration_entry(page_to_pfn(page));
+        else if (anon_exclusive)
+                entry = make_readable_exclusive_migration_entry(page_to_pfn(page));
         else
                 entry = make_readable_migration_entry(page_to_pfn(page));
         pmdswp = swp_entry_to_pmd(entry);
@@ -3063,6 +3123,8 @@ void set_pmd_migration_entry(struct page_vma_mapped_walk *pvmw,
         page_remove_rmap(page, vma, true);
         put_page(page);
         trace_set_migration_pmd(address, pmd_val(pmdswp));
+
+        return 0;
 }
 
 void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
@@ -3087,10 +3149,17 @@ void remove_migration_pmd(struct page_vma_mapped_walk *pvmw, struct page *new)
         if (pmd_swp_uffd_wp(*pvmw->pmd))
                 pmde = pmd_wrprotect(pmd_mkuffd_wp(pmde));
 
-        if (PageAnon(new))
-                page_add_anon_rmap(new, vma, mmun_start, true);
-        else
+        if (PageAnon(new)) {
+                rmap_t rmap_flags = RMAP_COMPOUND;
+
+                if (!is_readable_migration_entry(entry))
+                        rmap_flags |= RMAP_EXCLUSIVE;
+
+                page_add_anon_rmap(new, vma, mmun_start, rmap_flags);
+        } else {
                 page_add_file_rmap(new, vma, true);
+        }
+        VM_BUG_ON(pmd_write(pmde) && PageAnon(new) && !PageAnonExclusive(new));
         set_pmd_at(mm, mmun_start, pvmw->pmd, pmde);
 
         /* No need to invalidate - it was non-present before */
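For readers skimming the diff, the core behavioural change in do_huge_pmd_wp_page() above is the new reuse decision: an anonymous THP is kept and mapped writable when it is already marked PageAnonExclusive, or when the faulting process holds the sole reference, in which case it becomes exclusive via page_move_anon_rmap(); otherwise the fault falls back to copying or splitting. The standalone C sketch below only models that decision for illustration; the names model_page and can_reuse_thp() are invented for this sketch, and it deliberately ignores the page lock, swap-cache and LRU-pagevec handling the kernel performs around the check.

```c
/*
 * Illustrative userspace model only -- NOT kernel code. The struct and
 * function names (model_page, can_reuse_thp) are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

struct model_page {
        int refcount;           /* models page_count()        */
        bool anon_exclusive;    /* models PageAnonExclusive() */
};

/*
 * Returns true when a write fault may keep the existing THP mapped
 * writable ("reuse"); false means fall back to copying or splitting.
 */
static bool can_reuse_thp(struct model_page *page)
{
        /* Early check: an exclusive anonymous page belongs to us alone. */
        if (page->anon_exclusive)
                return true;

        /*
         * Otherwise reuse only with a sole reference; on reuse the page
         * also becomes exclusive (page_move_anon_rmap() in the diff).
         */
        if (page->refcount == 1) {
                page->anon_exclusive = true;
                return true;
        }
        return false;   /* shared: must copy (or split) instead */
}

int main(void)
{
        struct model_page shared  = { .refcount = 2, .anon_exclusive = false };
        struct model_page private = { .refcount = 1, .anon_exclusive = false };

        printf("shared THP reused:  %d\n", can_reuse_thp(&shared));  /* 0 */
        printf("private THP reused: %d\n", can_reuse_thp(&private)); /* 1 */
        return 0;
}
```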