Diffstat (limited to 'mm/migrate.c')
-rw-r--r-- | mm/migrate.c | 270
1 file changed, 168 insertions, 102 deletions
diff --git a/mm/migrate.c b/mm/migrate.c
index 923ea80ba744..dfdb3a136bf8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -20,7 +20,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/mm_inline.h>
-#include <linux/nsproxy.h>
 #include <linux/ksm.h>
 #include <linux/rmap.h>
 #include <linux/topology.h>
@@ -35,21 +34,16 @@
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <linux/hugetlb.h>
-#include <linux/hugetlb_cgroup.h>
 #include <linux/gfp.h>
 #include <linux/pfn_t.h>
-#include <linux/memremap.h>
-#include <linux/userfaultfd_k.h>
-#include <linux/balloon_compaction.h>
 #include <linux/page_idle.h>
 #include <linux/page_owner.h>
 #include <linux/sched/mm.h>
 #include <linux/ptrace.h>
-#include <linux/oom.h>
 #include <linux/memory.h>
-#include <linux/random.h>
 #include <linux/sched/sysctl.h>
 #include <linux/memory-tiers.h>
+#include <linux/pagewalk.h>
 
 #include <asm/tlbflush.h>
 
@@ -177,13 +171,83 @@ void putback_movable_pages(struct list_head *l)
         }
 }
 
+/* Must be called with an elevated refcount on the non-hugetlb folio */
+bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
+{
+        bool isolated, lru;
+
+        if (folio_test_hugetlb(folio))
+                return isolate_hugetlb(folio, list);
+
+        lru = !__folio_test_movable(folio);
+        if (lru)
+                isolated = folio_isolate_lru(folio);
+        else
+                isolated = isolate_movable_page(&folio->page,
+                                                ISOLATE_UNEVICTABLE);
+
+        if (!isolated)
+                return false;
+
+        list_add(&folio->lru, list);
+        if (lru)
+                node_stat_add_folio(folio, NR_ISOLATED_ANON +
+                                    folio_is_file_lru(folio));
+
+        return true;
+}
+
+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
+                                          struct folio *folio,
+                                          unsigned long idx)
+{
+        struct page *page = folio_page(folio, idx);
+        bool contains_data;
+        pte_t newpte;
+        void *addr;
+
+        VM_BUG_ON_PAGE(PageCompound(page), page);
+        VM_BUG_ON_PAGE(!PageAnon(page), page);
+        VM_BUG_ON_PAGE(!PageLocked(page), page);
+        VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+        if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
+            mm_forbids_zeropage(pvmw->vma->vm_mm))
+                return false;
+
+        /*
+         * The pmd entry mapping the old thp was flushed and the pte mapping
+         * this subpage has been non present. If the subpage is only zero-filled
+         * then map it to the shared zeropage.
+         */
+        addr = kmap_local_page(page);
+        contains_data = memchr_inv(addr, 0, PAGE_SIZE);
+        kunmap_local(addr);
+
+        if (contains_data)
+                return false;
+
+        newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
+                                       pvmw->vma->vm_page_prot));
+        set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+
+        dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
+        return true;
+}
+
+struct rmap_walk_arg {
+        struct folio *folio;
+        bool map_unused_to_zeropage;
+};
+
 /*
  * Restore a potential migration pte to a working pte entry
  */
 static bool remove_migration_pte(struct folio *folio,
-                struct vm_area_struct *vma, unsigned long addr, void *old)
+                struct vm_area_struct *vma, unsigned long addr, void *arg)
 {
-        DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+        struct rmap_walk_arg *rmap_walk_arg = arg;
+        DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
 
         while (page_vma_mapped_walk(&pvmw)) {
                 rmap_t rmap_flags = RMAP_NONE;
@@ -207,6 +271,9 @@ static bool remove_migration_pte(struct folio *folio,
                         continue;
                 }
 #endif
+                if (rmap_walk_arg->map_unused_to_zeropage &&
+                    try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+                        continue;
 
                 folio_get(folio);
                 pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -285,14 +352,21 @@ static bool remove_migration_pte(struct folio *folio,
  * Get rid of all migration entries and replace them by
  * references to the indicated page.
  */
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
 {
+        struct rmap_walk_arg rmap_walk_arg = {
+                .folio = src,
+                .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+        };
+
         struct rmap_walk_control rwc = {
                 .rmap_one = remove_migration_pte,
-                .arg = src,
+                .arg = &rmap_walk_arg,
         };
 
-        if (locked)
+        VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+
+        if (flags & RMP_LOCKED)
                 rmap_walk_locked(dst, &rwc);
         else
                 rmap_walk(dst, &rwc);
@@ -422,6 +496,8 @@ static int __folio_migrate_mapping(struct address_space *mapping,
                 /* No turning back from here */
                 newfolio->index = folio->index;
                 newfolio->mapping = folio->mapping;
+                if (folio_test_anon(folio) && folio_test_large(folio))
+                        mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
                 if (folio_test_swapbacked(folio))
                         __folio_set_swapbacked(newfolio);
 
@@ -446,6 +522,8 @@ static int __folio_migrate_mapping(struct address_space *mapping,
          */
         newfolio->index = folio->index;
         newfolio->mapping = folio->mapping;
+        if (folio_test_anon(folio) && folio_test_large(folio))
+                mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
         folio_ref_add(newfolio, nr); /* add cache reference */
         if (folio_test_swapbacked(folio)) {
                 __folio_set_swapbacked(newfolio);
@@ -585,8 +663,6 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
 {
         int cpupid;
 
-        if (folio_test_error(folio))
-                folio_set_error(newfolio);
         if (folio_test_referenced(folio))
                 folio_set_referenced(newfolio);
         if (folio_test_uptodate(folio))
@@ -640,7 +716,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
                 folio_migrate_ksm(newfolio, folio);
         /*
          * Please do not reorder this without considering how mm/ksm.c's
-         * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache().
+         * ksm_get_folio() depends upon ksm_migrate_page() and the
+         * swapcache flag.
          */
         if (folio_test_swapcache(folio))
                 folio_clear_swapcache(folio);
@@ -666,6 +743,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
                 folio_set_readahead(newfolio);
 
         folio_copy_owner(newfolio, folio);
+        pgalloc_tag_copy(newfolio, folio);
 
         mem_cgroup_migrate(folio, newfolio);
 }
@@ -904,7 +982,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
          * At this point we know that the migration attempt cannot
          * be successful.
          */
-        remove_migration_ptes(folio, folio, false);
+        remove_migration_ptes(folio, folio, 0);
 
         rc = mapping->a_ops->writepage(&folio->page, &wbc);
 
@@ -1068,7 +1146,7 @@ static void migrate_folio_undo_src(struct folio *src,
                                    struct list_head *ret)
 {
         if (page_was_mapped)
-                remove_migration_ptes(src, src, false);
+                remove_migration_ptes(src, src, 0);
         /* Drop an anon_vma reference if we took one */
         if (anon_vma)
                 put_anon_vma(anon_vma);
@@ -1306,7 +1384,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
         lru_add_drain();
 
         if (old_page_state & PAGE_WAS_MAPPED)
-                remove_migration_ptes(src, dst, false);
+                remove_migration_ptes(src, dst, 0);
 
 out_unlock_both:
         folio_unlock(dst);
@@ -1444,7 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
 
         if (page_was_mapped)
                 remove_migration_ptes(src,
-                        rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+                        rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
 
 unlock_put_anon:
         folio_unlock(dst);
@@ -1682,7 +1760,8 @@ static int migrate_pages_batch(struct list_head *from,
                          * use _deferred_list.
                          */
                         if (nr_pages > 2 &&
-                            !list_empty(&folio->_deferred_list)) {
+                            !list_empty(&folio->_deferred_list) &&
+                            folio_test_partially_mapped(folio)) {
                                 if (!try_split_folio(folio, split_folios, mode)) {
                                         nr_failed++;
                                         stats->nr_thp_failed += is_thp;
@@ -2111,76 +2190,66 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node)
         return err;
 }
 
+static int __add_folio_for_migration(struct folio *folio, int node,
+                struct list_head *pagelist, bool migrate_all)
+{
+        if (is_zero_folio(folio) || is_huge_zero_folio(folio))
+                return -EFAULT;
+
+        if (folio_is_zone_device(folio))
+                return -ENOENT;
+
+        if (folio_nid(folio) == node)
+                return 0;
+
+        if (folio_likely_mapped_shared(folio) && !migrate_all)
+                return -EACCES;
+
+        if (folio_test_hugetlb(folio)) {
+                if (isolate_hugetlb(folio, pagelist))
+                        return 1;
+        } else if (folio_isolate_lru(folio)) {
+                list_add_tail(&folio->lru, pagelist);
+                node_stat_mod_folio(folio,
+                        NR_ISOLATED_ANON + folio_is_file_lru(folio),
+                        folio_nr_pages(folio));
+                return 1;
+        }
+        return -EBUSY;
+}
+
 /*
- * Resolves the given address to a struct page, isolates it from the LRU and
+ * Resolves the given address to a struct folio, isolates it from the LRU and
  * puts it to the given pagelist.
  * Returns:
- *     errno - if the page cannot be found/isolated
+ *     errno - if the folio cannot be found/isolated
  *     0 - when it doesn't have to be migrated because it is already on the
  *         target node
  *     1 - when it has been queued
  */
-static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
+static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
                 int node, struct list_head *pagelist, bool migrate_all)
 {
         struct vm_area_struct *vma;
-        unsigned long addr;
-        struct page *page;
+        struct folio_walk fw;
         struct folio *folio;
-        int err;
+        unsigned long addr;
+        int err = -EFAULT;
 
         mmap_read_lock(mm);
         addr = (unsigned long)untagged_addr_remote(mm, p);
 
-        err = -EFAULT;
         vma = vma_lookup(mm, addr);
-        if (!vma || !vma_migratable(vma))
-                goto out;
-
-        /* FOLL_DUMP to ignore special (like zero) pages */
-        page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
-        err = PTR_ERR(page);
-        if (IS_ERR(page))
-                goto out;
-
-        err = -ENOENT;
-        if (!page)
-                goto out;
-
-        folio = page_folio(page);
-        if (folio_is_zone_device(folio))
-                goto out_putfolio;
-
-        err = 0;
-        if (folio_nid(folio) == node)
-                goto out_putfolio;
-
-        err = -EACCES;
-        if (folio_likely_mapped_shared(folio) && !migrate_all)
-                goto out_putfolio;
-
-        err = -EBUSY;
-        if (folio_test_hugetlb(folio)) {
-                if (isolate_hugetlb(folio, pagelist))
-                        err = 1;
-        } else {
-                if (!folio_isolate_lru(folio))
-                        goto out_putfolio;
-
-                err = 1;
-                list_add_tail(&folio->lru, pagelist);
-                node_stat_mod_folio(folio,
-                        NR_ISOLATED_ANON + folio_is_file_lru(folio),
-                        folio_nr_pages(folio));
+        if (vma && vma_migratable(vma)) {
+                folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
+                if (folio) {
+                        err = __add_folio_for_migration(folio, node, pagelist,
+                                                        migrate_all);
+                        folio_walk_end(&fw, vma);
+                } else {
+                        err = -ENOENT;
+                }
         }
-out_putfolio:
-        /*
-         * Either remove the duplicate refcount from folio_isolate_lru()
-         * or drop the folio ref if it was not isolated.
-         */
-        folio_put(folio);
-out:
         mmap_read_unlock(mm);
         return err;
 }
@@ -2274,8 +2343,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
                  * Errors in the page lookup or isolation are not fatal and we simply
                  * report them via status
                  */
-                err = add_page_for_migration(mm, p, current_node, &pagelist,
-                                             flags & MPOL_MF_MOVE_ALL);
+                err = add_folio_for_migration(mm, p, current_node, &pagelist,
+                                              flags & MPOL_MF_MOVE_ALL);
 
                 if (err > 0) {
                         /* The page is successfully queued for migration */
@@ -2331,28 +2400,26 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
         for (i = 0; i < nr_pages; i++) {
                 unsigned long addr = (unsigned long)(*pages);
                 struct vm_area_struct *vma;
-                struct page *page;
+                struct folio_walk fw;
+                struct folio *folio;
                 int err = -EFAULT;
 
                 vma = vma_lookup(mm, addr);
                 if (!vma)
                         goto set_status;
 
-                /* FOLL_DUMP to ignore special (like zero) pages */
-                page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
-                err = PTR_ERR(page);
-                if (IS_ERR(page))
-                        goto set_status;
-
-                err = -ENOENT;
-                if (!page)
-                        goto set_status;
-
-                if (!is_zone_device_page(page))
-                        err = page_to_nid(page);
-
-                put_page(page);
+                folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
+                if (folio) {
+                        if (is_zero_folio(folio) || is_huge_zero_folio(folio))
+                                err = -EFAULT;
+                        else if (folio_is_zone_device(folio))
+                                err = -ENOENT;
+                        else
+                                err = folio_nid(folio);
+                        folio_walk_end(&fw, vma);
+                } else {
+                        err = -ENOENT;
+                }
 set_status:
                 *status = err;
 
@@ -2432,25 +2499,19 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
                 return current->mm;
         }
 
-        /* Find the mm_struct */
-        rcu_read_lock();
-        task = find_task_by_vpid(pid);
+        task = find_get_task_by_vpid(pid);
         if (!task) {
-                rcu_read_unlock();
                 return ERR_PTR(-ESRCH);
         }
-        get_task_struct(task);
 
         /*
         * Check if this process has the right to modify the specified
         * process. Use the regular "ptrace_may_access()" checks.
         */
         if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
-                rcu_read_unlock();
                 mm = ERR_PTR(-EPERM);
                 goto out;
         }
-        rcu_read_unlock();
 
         mm = ERR_PTR(security_task_movememory(task));
         if (IS_ERR(mm))
@@ -2526,7 +2587,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
                 if (!zone_watermark_ok(zone, 0,
                                        high_wmark_pages(zone) +
                                        nr_migrate_pages,
-                                       ZONE_MOVABLE, 0))
+                                       ZONE_MOVABLE, ALLOC_CMA))
                         continue;
                 return true;
         }
@@ -2627,6 +2688,8 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
         int nr_remaining;
         unsigned int nr_succeeded;
         LIST_HEAD(migratepages);
+        struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
+        struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
 
         list_add(&folio->lru, &migratepages);
         nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
@@ -2636,10 +2699,13 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
                 putback_movable_pages(&migratepages);
         if (nr_succeeded) {
                 count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
+                count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
-                if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node))
-                        mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
-                                            nr_succeeded);
+                if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
+                    && !node_is_toptier(folio_nid(folio))
+                    && node_is_toptier(node))
+                        mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded);
         }
+        mem_cgroup_put(memcg);
         BUG_ON(!list_empty(&migratepages));
         return nr_remaining ? -EAGAIN : 0;
 }
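For readers tracing the userspace-visible behavior: add_folio_for_migration() and do_pages_stat_array() back the move_pages(2) syscall, and the switch from follow_page() with FOLL_DUMP to folio_walk_start() with FW_ZEROPAGE preserves the historical per-page status values. Below is a minimal userspace sketch, not part of the patch, assuming libnuma's <numaif.h> move_pages() wrapper, an anonymous private mapping, and linking with -lnuma. It exercises the three status values visible in the new code: the node id for a present page, -EFAULT for an address backed by the shared zeropage, and -ENOENT for an address with no page mapped.

/* Hedged demo of move_pages(2) status reporting; build: cc demo.c -lnuma */
#include <numaif.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long psz = sysconf(_SC_PAGESIZE);
        char *buf = mmap(NULL, 3 * psz, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        void *pages[3];
        int status[3];

        if (buf == MAP_FAILED) {
                perror("mmap");
                return 1;
        }
        pages[0] = buf;
        pages[1] = buf + psz;
        pages[2] = buf + 2 * psz;

        buf[0] = 1;                             /* write fault: real anon page */
        (void)*(volatile char *)pages[1];       /* read fault: shared zeropage */
        /* pages[2] stays untouched: no page table entry at all */

        /* nodes == NULL means "query placement"; nothing is migrated */
        if (move_pages(0, 3, pages, NULL, status, 0) < 0)
                perror("move_pages");

        printf("written:   %d (NUMA node id)\n", status[0]);
        printf("zeropage:  %d (expect -EFAULT, i.e. -14)\n", status[1]);
        printf("untouched: %d (expect -ENOENT, i.e. -2)\n", status[2]);

        munmap(buf, 3 * psz);
        return 0;
}

FW_ZEROPAGE makes folio_walk_start() return the zero folio rather than skipping it, so the explicit is_zero_folio()/is_huge_zero_folio() checks in the hunks above re-create the -EFAULT that the old FOLL_DUMP path produced for zeropage-backed addresses.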
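The other API change worth calling out: remove_migration_ptes() now takes a flags word instead of a bool. RMP_LOCKED replaces the old locked=true case, and RMP_USE_SHARED_ZEROPAGE opts in to try_to_map_unused_to_zeropage(), which is only valid when a folio is remapped over itself (src == dst), as the VM_BUG_ON_FOLIO() enforces. Here is a hedged, illustrative caller in the style of the THP-split remap path; the real caller lives in mm/huge_memory.c and may differ, and remap_split_folios() is a made-up name.

/*
 * Illustrative sketch only: remap migration entries over the folios
 * produced by a split, under the rmap lock (RMP_LOCKED), and let
 * zero-filled anonymous subpages be replaced by the shared zeropage.
 */
static void remap_split_folios(struct folio *folio, unsigned long nr)
{
        int flags = RMP_LOCKED;
        unsigned long i = 0;

        /* Only anonymous folios may use the shared zeropage. */
        if (folio_test_anon(folio))
                flags |= RMP_USE_SHARED_ZEROPAGE;

        for (;;) {
                /* src == dst: a precondition of RMP_USE_SHARED_ZEROPAGE */
                remove_migration_ptes(folio, folio, flags);
                i += folio_nr_pages(folio);
                if (i >= nr)
                        break;
                folio = folio_next(folio);
        }
}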