diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-10 17:53:04 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-10 17:53:04 -0700 |
commit | 27bc50fc90647bbf7b734c3fc306a5e61350da53 (patch) | |
tree | 75fc525fbfec8c07a97a7875a89592317bcad4ca /mm/khugepaged.c | |
parent | 70442fc54e6889a2a77f0e9554e8188a1557f00e (diff) | |
parent | bbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 (diff) |
Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- Yu Zhao's Multi-Gen LRU patches are here. They've been under test in
linux-next for a couple of months without, to my knowledge, any
negative reports (or any positive ones, come to that).
- Also the Maple Tree from Liam Howlett. An overlapping range-based
tree for vmas. It it apparently slightly more efficient in its own
right, but is mainly targeted at enabling work to reduce mmap_lock
contention.
Liam has identified a number of other tree users in the kernel which
could be beneficially onverted to mapletrees.
Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat
at [1]. This has yet to be addressed due to Liam's unfortunately
timed vacation. He is now back and we'll get this fixed up.
- Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses
clang-generated instrumentation to detect used-unintialized bugs down
to the single bit level.
KMSAN keeps finding bugs. New ones, as well as the legacy ones.
- Yang Shi adds a userspace mechanism (madvise) to induce a collapse of
memory into THPs.
- Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to
support file/shmem-backed pages.
- userfaultfd updates from Axel Rasmussen
- zsmalloc cleanups from Alexey Romanov
- cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and
memory-failure
- Huang Ying adds enhancements to NUMA balancing memory tiering mode's
page promotion, with a new way of detecting hot pages.
- memcg updates from Shakeel Butt: charging optimizations and reduced
memory consumption.
- memcg cleanups from Kairui Song.
- memcg fixes and cleanups from Johannes Weiner.
- Vishal Moola provides more folio conversions
- Zhang Yi removed ll_rw_block() :(
- migration enhancements from Peter Xu
- migration error-path bugfixes from Huang Ying
- Aneesh Kumar added ability for a device driver to alter the memory
tiering promotion paths. For optimizations by PMEM drivers, DRM
drivers, etc.
- vma merging improvements from Jakub Matěn.
- NUMA hinting cleanups from David Hildenbrand.
- xu xin added aditional userspace visibility into KSM merging
activity.
- THP & KSM code consolidation from Qi Zheng.
- more folio work from Matthew Wilcox.
- KASAN updates from Andrey Konovalov.
- DAMON cleanups from Kaixu Xia.
- DAMON work from SeongJae Park: fixes, cleanups.
- hugetlb sysfs cleanups from Muchun Song.
- Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core.
Link: https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com [1]
* tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (555 commits)
hugetlb: allocate vma lock for all sharable vmas
hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer
hugetlb: fix vma lock handling during split vma and range unmapping
mglru: mm/vmscan.c: fix imprecise comments
mm/mglru: don't sync disk for each aging cycle
mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol
mm: memcontrol: use do_memsw_account() in a few more places
mm: memcontrol: deprecate swapaccounting=0 mode
mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled
mm/secretmem: remove reduntant return value
mm/hugetlb: add available_huge_pages() func
mm: remove unused inline functions from include/linux/mm_inline.h
selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory
selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd
selftests/vm: add thp collapse shmem testing
selftests/vm: add thp collapse file and tmpfs testing
selftests/vm: modularize thp collapse memory operations
selftests/vm: dedup THP helpers
mm/khugepaged: add tracepoint to hpage_collapse_scan_file()
mm/madvise: add file and shmem support to MADV_COLLAPSE
...
Diffstat (limited to 'mm/khugepaged.c')
-rw-r--r-- | mm/khugepaged.c | 1168 |
1 files changed, 739 insertions, 429 deletions
diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01dbc6dbd599..4734315f7940 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -23,16 +23,20 @@ #include <asm/tlb.h> #include <asm/pgalloc.h> #include "internal.h" +#include "mm_slot.h" enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_NONE, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -73,6 +77,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -85,18 +91,24 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + bool is_khugepaged; + + /* Num pages scanned per node */ + u32 node_load[MAX_NUMNODES]; + + /* Last target selected in hpage_collapse_find_target_node() */ + int last_target_node; +}; + /** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for + * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot * @nr_pte_mapped_thp: number of pte mapped THP * @pte_mapped_thp: address array corresponding pte mapped THP */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; +struct khugepaged_mm_slot { + struct mm_slot slot; /* pte-mapped THP in this mm */ int nr_pte_mapped_thp; @@ -113,7 +125,7 @@ struct mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; unsigned long address; }; @@ -377,8 +389,9 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); + sizeof(struct khugepaged_mm_slot), + __alignof__(struct khugepaged_mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -395,65 +408,38 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - -static inline int khugepaged_test_exit(struct mm_struct *mm) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } void __khugepaged_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; + slot = &mm_slot->slot; + /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); return; } spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); @@ -466,37 +452,38 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false)) + if (hugepage_vma_check(vma, vm_flags, false, false, true)) __khugepaged_enter(vma->vm_mm); } } void __khugepaged_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { /* * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_lock. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); @@ -546,11 +533,12 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; @@ -558,8 +546,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -579,11 +569,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } } if (PageCompound(page)) { @@ -646,10 +639,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -658,19 +655,19 @@ next: if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -735,9 +732,12 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, + .last_target_node = NUMA_NO_NODE, +}; -static bool khugepaged_scan_abort(int nid) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -749,11 +749,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; @@ -772,146 +772,63 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { + if (target_node <= cc->last_target_node) + for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == cc->node_load[nid]) { target_node = nid; break; } - last_khugepaged_target_node = target_node; + cc->last_target_node = target_node; return target_node; } - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +#else +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; + return 0; } +#endif -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) { - VM_BUG_ON_PAGE(*hpage, *hpage); - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; + return false; } prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - /* - * If the hpage allocated earlier was briefly exposed in page cache - * before collapse_file() failed, it is possible that racing lookups - * have not yet completed, and would then be unpleasantly surprised by - * finding the hpage reused for the same mapping at a different offset. - * Just release the previous allocation if there is any danger of that. - */ - if (*hpage && page_count(*hpage) > 1) { - put_page(*hpage); - *hpage = NULL; - } - - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - return true; } -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). + * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) + bool expect_anon, + struct vm_area_struct **vmap, + struct collapse_control *cc) { struct vm_area_struct *vma; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -920,7 +837,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -929,23 +847,62 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * hugepage_vma_check may return true for qualified file * vmas. */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return SCAN_VMA_CHECK; - return 0; + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; + return SCAN_SUCCEED; +} + +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmd_read_atomic(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (pmd_none(pmde)) + return SCAN_PMD_NONE; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Note that if false is returned, mmap_lock will be released. */ -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; @@ -976,12 +933,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, */ if (ret & VM_FAULT_RETRY) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + return SCAN_FAIL; } swapped_in++; } @@ -991,30 +949,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, lru_add_drain(); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced, int unmapped) +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + /* Only allocate from the target node */ + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE) | __GFP_THISNODE; + int node = hpage_collapse_find_target_node(cc); + + if (!hpage_collapse_alloc_page(hpage, gfp, node)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; - gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -1022,40 +991,34 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - /* - * __collapse_huge_page_swapin will return with mmap_lock released - * when it fails. So we jump out_nolock directly in that case. - * Continuing to collapse causes inconsistency. - */ - if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, - pmd, referenced)) { - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. + */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } mmap_read_unlock(mm); @@ -1065,11 +1028,12 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); + if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1095,11 +1059,11 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1111,7 +1075,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1121,8 +1084,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1130,42 +1093,43 @@ static void collapse_huge_page(struct mm_struct *mm, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. */ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); - trace_mm_collapse_huge_page(mm, isolated, result); - return; + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1175,19 +1139,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) goto out; - } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1205,8 +1169,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1236,27 +1202,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out_unmap; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } } page = compound_head(page); /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1291,43 +1260,51 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } -static void collect_mm_slot(struct mm_slot *mm_slot) +static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. @@ -1336,7 +1313,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) */ /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } @@ -1345,19 +1322,66 @@ static void collect_mm_slot(struct mm_slot *mm_slot) /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. + * + * Note that following race exists: + * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, + * emptying the A's ->pte_mapped_thp[] array. + * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and + * retract_page_tables() finds a VMA in mm_struct A mapping the same extent + * (at virtual address X) and adds an entry (for X) into mm_struct A's + * ->pte-mapped_thp[] array. + * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, + * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry + * (for X) into mm_struct A's ->pte-mapped_thp[] array. + * Thus, it's possible the same address is added multiple times for the same + * mm_struct. Should this happen, we'll simply attempt + * collapse_pte_mapped_thp() multiple times for the same address, under the same + * exclusive mmap_lock, and assuming the first call is successful, subsequent + * attempts will return quickly (without grabbing any additional locks) when + * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap + * check, and since this is a rare occurrence, the cost of preventing this + * "multiple-add" is thought to be more expensive than just handling it, should + * it occur. */ -static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; + bool ret = false; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + ret = true; + } spin_unlock(&khugepaged_mm_lock); + return ret; +} + +/* hpage must be locked, and mmap_lock must be held in write */ +static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct page *hpage) +{ + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .flags = 0, + .pmd = pmdp, + }; + + VM_BUG_ON(!PageTransHuge(hpage)); + mmap_assert_write_locked(vma->vm_mm); + + if (do_set_pmd(&vmf, hpage)) + return SCAN_FAIL; + + get_page(hpage); + return SCAN_SUCCEED; } static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1381,52 +1405,80 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v * * @mm: process address space where collapse happens * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. + * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ -void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { unsigned long haddr = addr & HPAGE_PMD_MASK; - struct vm_area_struct *vma = find_vma(mm, haddr); + struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd; spinlock_t *ptl; - int count = 0; + int count = 0, result = SCAN_FAIL; int i; + mmap_assert_write_locked(mm); + + /* Fast check before locking page if already PMD-mapped */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result == SCAN_PMD_MAPPED) + return result; + if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return; + return SCAN_VMA_CHECK; /* - * This vm_flags may not have VM_HUGEPAGE if the page was not - * collapsed by this mm. But we can still collapse if the page is - * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() - * will not fail the vma for missing VM_HUGEPAGE + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) - return; + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) - return; + return SCAN_PTE_UFFD_WP; hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) - return; + return SCAN_PAGE_NULL; + + if (!PageHead(hpage)) { + result = SCAN_FAIL; + goto drop_hpage; + } - if (!PageHead(hpage)) + if (compound_order(hpage) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; goto drop_hpage; + } - pmd = mm_find_pmd(mm, haddr); - if (!pmd) + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_PMD_NONE: + /* + * In MADV_COLLAPSE path, possible race with khugepaged where + * all pte entries have been removed and pmd cleared. If so, + * skip all the pte checks and just update the pmd mapping. + */ + goto maybe_install_pmd; + default: goto drop_hpage; + } start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + result = SCAN_FAIL; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1438,8 +1490,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) + if (!pte_present(*pte)) { + result = SCAN_PTE_NON_PRESENT; goto abort; + } page = vm_normal_page(vma, addr, *pte); if (WARN_ON_ONCE(page && is_zone_device_page(page))) @@ -1474,21 +1528,29 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); } - /* step 4: collapse pmd */ + /* step 4: remove pte entries */ collapse_and_free_pmd(mm, vma, haddr, pmd); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, hpage) + : SCAN_SUCCEED; + drop_hpage: unlock_page(hpage); put_page(hpage); - return; + return result; abort: pte_unmap_unlock(start_pte, ptl); goto drop_hpage; } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) @@ -1497,26 +1559,33 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (!mmap_write_trylock(mm)) return; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); } -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + struct mm_struct *target_mm, + unsigned long target_addr, struct page *hpage, + struct collapse_control *cc) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long addr; - pmd_t *pmd; + int target_result = SCAN_FAIL; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + int result = SCAN_FAIL; + struct mm_struct *mm = NULL; + unsigned long addr = 0; + pmd_t *pmd; + bool is_target = false; + /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth investing @@ -1533,25 +1602,34 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * ptl. It has higher chance to recover THP for the VMA, but * has higher cost too. */ - if (vma->anon_vma) - continue; + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto next; + } addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr & ~HPAGE_PMD_MASK) - continue; - if (vma->vm_end < addr + HPAGE_PMD_SIZE) - continue; + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) { + result = SCAN_VMA_CHECK; + goto next; + } mm = vma->vm_mm; - pmd = mm_find_pmd(mm, addr); - if (!pmd) - continue; + is_target = mm == target_mm && addr == target_addr; + result = find_pmd_or_thp_or_none(mm, addr, &pmd); + if (result != SCAN_SUCCEED) + goto next; /* * We need exclusive mmap_lock to retract page table. * * We use trylock due to lock inversion: we need to acquire * mmap_lock while holding page lock. Fault path does it in * reverse order. Trylock is a way to avoid deadlock. + * + * Also, it's not MADV_COLLAPSE's job to collapse other + * mappings - let khugepaged take care of them later. */ - if (mmap_write_trylock(mm)) { + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte @@ -1560,25 +1638,48 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) - collapse_and_free_pmd(mm, vma, addr, pmd); + if (hpage_collapse_test_exit(mm)) { + result = SCAN_ANY_PROCESS; + goto unlock_next; + } + if (userfaultfd_wp(vma)) { + result = SCAN_PTE_UFFD_WP; + goto unlock_next; + } + collapse_and_free_pmd(mm, vma, addr, pmd); + if (!cc->is_khugepaged && is_target) + result = set_huge_pmd(vma, addr, pmd, hpage); + else + result = SCAN_SUCCEED; + +unlock_next: mmap_write_unlock(mm); - } else { - /* Try again later */ + goto next; + } + /* + * Calling context will handle target mm/addr. Otherwise, let + * khugepaged try again later. + */ + if (!is_target) { khugepaged_add_pte_mapped_thp(mm, addr); + continue; } +next: + if (is_target) + target_result = result; } i_mmap_unlock_write(mapping); + return target_result; } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens + * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse - * @node: appointed node the new huge page allocate from + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1595,13 +1696,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, - struct file *file, pgoff_t start, - struct page **hpage, int node) +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - gfp_t gfp; - struct page *new_page; + struct page *hpage; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1612,20 +1712,9 @@ static void collapse_file(struct mm_struct *mm, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out; - } - - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* * Ensure we have slots for all the pages in the range. This is @@ -1643,14 +1732,14 @@ static void collapse_file(struct mm_struct *mm, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1678,19 +1767,22 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } if (xa_is_value(page) || !PageUptodate(page)) { + struct folio *folio; + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOALLOC)) { + if (shmem_get_folio(mapping->host, index, + &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } + page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); @@ -1757,9 +1849,16 @@ static void collapse_file(struct mm_struct *mm, /* * If file was truncated then extended, or hole-punched, before * we locked the first page, then a THP might be there already. + * This will be discovered on the first iteration. */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; goto out_unlock; } @@ -1820,19 +1919,19 @@ static void collapse_file(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1843,21 +1942,21 @@ out_unlock: smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1879,11 +1978,11 @@ xa_unlocked: index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1894,23 +1993,23 @@ xa_unlocked: index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + set_page_dirty(hpage); + lru_cache_add(hpage); /* * Remove pte page tables, so we can re-fault the page as huge. */ - retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); + unlock_page(hpage); + hpage = NULL; } else { struct page *page; @@ -1949,19 +2048,24 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + if (hpage) + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } /* TODO: tracepoints */ + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1972,14 +2076,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + ++swap; + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1988,20 +2094,32 @@ static void khugepaged_scan_file(struct mm_struct *mm, } /* - * XXX: khugepaged should compact smaller compound pages + * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. + */ break; } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -2030,54 +2148,68 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + result = collapse_file(mm, addr, file, start, cc); } } - /* TODO: tracepoints */ + trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname, + present, swap, result); + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { BUILD_BUG(); } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } + +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + return false; +} #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { - struct mm_slot *mm_slot; + struct vma_iterator vmi; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; - if (khugepaged_scan.mm_slot) + if (khugepaged_scan.mm_slot) { mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, + slot = &mm_slot->slot; + } else { + slot = list_entry(khugepaged_scan.mm_head.next, struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); - mm = mm_slot->mm; + mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. @@ -2085,19 +2217,21 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) - vma = find_vma(mm, khugepaged_scan.address); progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(hpage_collapse_test_exit(mm))) + goto breakouterloop; + + vma_iter_init(&vmi, mm, khugepaged_scan.address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { skip: progress++; continue; @@ -2111,9 +2245,10 @@ skip: VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2125,19 +2260,48 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, + file, pgoff, cc); + mmap_locked = false; fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); + } + switch (*result) { + case SCAN_PTE_MAPPED_HUGEPAGE: { + pmd_t *pmd; + + *result = find_pmd_or_thp_or_none(mm, + khugepaged_scan.address, + &pmd); + if (*result != SCAN_SUCCEED) + break; + if (!khugepaged_add_pte_mapped_thp(mm, + khugepaged_scan.address)) + break; + } fallthrough; + case SCAN_SUCCEED: + ++khugepaged_pages_collapsed; + break; + default: + break; } + /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2153,16 +2317,17 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); + if (slot->mm_node.next != &khugepaged_scan.mm_head) { + slot = list_entry(slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.mm_slot = + mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; @@ -2187,19 +2352,16 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - + while (true) { cond_resched(); if (unlikely(kthread_should_stop() || try_to_freeze())) @@ -2211,14 +2373,25 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. + */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) @@ -2249,13 +2422,13 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; set_freezable(); set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } @@ -2354,3 +2527,140 @@ void khugepaged_min_free_kbytes_update(void) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. + */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + *prev = vma; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + cc->last_target_node = NUMA_NO_NODE; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + + hend = vma->vm_end & HPAGE_PMD_MASK; + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, vma, addr, + &mmap_locked, cc); + } + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + +handle_result: + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + BUG_ON(*prev); + mmap_write_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_write_unlock(mm); + goto handle_result; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} |