From acfac37851e01b40c30a7afd0d93ad8db8914f25 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 7 Oct 2022 12:59:20 -0700 Subject: mm/hugetlb.c: make __hugetlb_vma_unlock_write_put() static Reported-by: kernel test robot Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0ad53ad98e74..41d3aa077837 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6804,7 +6804,7 @@ void hugetlb_vma_lock_release(struct kref *kref) kfree(vma_lock); } -void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +static void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) { struct vm_area_struct *vma = vma_lock->vma; -- cgit v1.2.3 From 7efc3b7261030da79001c00d92bc3392fd6c664c Mon Sep 17 00:00:00 2001 From: Chuyi Zhou Date: Wed, 13 Jul 2022 14:20:09 +0800 Subject: mm/compaction: fix set skip in fast_find_migrateblock When we successfully find a pageblock in fast_find_migrateblock(), the block will be set skip-flag through set_pageblock_skip(). However, when entering isolate_migratepages_block(), the whole pageblock will be skipped due to the branch 'if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages))'. Eventually we will goto isolate_abort and isolate nothing. That makes fast_find_migrateblock useless. In this patch, when we find a suitable pageblock in fast_find_migrateblock, we do noting but let isolate_migratepages_block to set skip flag to the pageblock after scan it. Normally, we would isolate some pages from the fast-find block. I use mmtest/thpscale-madvhugepage test it. Here is the result: baseline patch Amean fault-both-1 1331.66 ( 0.00%) 1261.04 * 5.30%* Amean fault-both-3 1383.95 ( 0.00%) 1191.69 * 13.89%* Amean fault-both-5 1568.13 ( 0.00%) 1445.20 * 7.84%* Amean fault-both-7 1819.62 ( 0.00%) 1555.13 * 14.54%* Amean fault-both-12 1106.96 ( 0.00%) 1149.43 * -3.84%* Amean fault-both-18 2196.93 ( 0.00%) 1875.77 * 14.62%* Amean fault-both-24 2642.69 ( 0.00%) 2671.21 * -1.08%* Amean fault-both-30 2901.89 ( 0.00%) 2857.32 * 1.54%* Amean fault-both-32 3747.00 ( 0.00%) 3479.23 * 7.15%* Link: https://lkml.kernel.org/r/20220713062009.597255-1-zhouchuyi@bytedance.com Fixes: 70b44595eafe9 ("mm, compaction: use free lists to quickly locate a migration source") Signed-off-by: zhouchuyi Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/compaction.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index e2a9615f5fde..c4e4453187a2 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1851,7 +1851,6 @@ static unsigned long fast_find_migrateblock(struct compact_control *cc) pfn = cc->zone->zone_start_pfn; cc->fast_search_fail = 0; found_block = true; - set_pageblock_skip(freepage); break; } } -- cgit v1.2.3 From 92b7399695a5cc961c44fc6e4624d3bc3c699ee7 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 11 Oct 2022 20:36:51 +0000 Subject: mmap: fix copy_vma() failure path The anon vma was not unlinked and the file was not closed in the failure path when the machine runs out of memory during the maple tree modification. This caused a memory leak of the anon vma chain and vma since neither would be freed. Link: https://lkml.kernel.org/r/20221011203621.1446507-1-Liam.Howlett@oracle.com Fixes: 524e00b36e8c ("mm: remove rb tree") Signed-off-by: Liam R. Howlett Reported-by: Lukas Bulwahn Tested-by: Lukas Bulwahn Signed-off-by: Andrew Morton --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 6e447544f07d..fc8581cefef7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3240,6 +3240,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_vma_link: if (new_vma->vm_ops && new_vma->vm_ops->close) new_vma->vm_ops->close(new_vma); + + if (new_vma->vm_file) + fput(new_vma->vm_file); + + unlink_anon_vmas(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: -- cgit v1.2.3 From 28c5609fb236807910ca347ad3e26c4567998526 Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Tue, 11 Oct 2022 16:08:37 +0000 Subject: mm/mmap: preallocate maple nodes for brk vma expansion If the brk VMA is the last vma in a maple node and meets the rare criteria that it can be expanded, then preallocation is necessary to avoid a potential fs_reclaim circular lock issue on low resources. At the same time use the actual vma start address (unaligned) when calling vma_adjust_trans_huge(). Link: https://lkml.kernel.org/r/20221011160624.1253454-1-Liam.Howlett@oracle.com Fixes: 2e7ce7d354f2 (mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()) Signed-off-by: Liam R. Howlett Reported-by: Yu Zhao Signed-off-by: Andrew Morton --- mm/mmap.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index fc8581cefef7..5855f26639f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2942,17 +2942,18 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (vma && (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { - mas->index = vma->vm_start; - mas->last = addr + len - 1; - vma_adjust_trans_huge(vma, addr, addr + len, 0); + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + + vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0); if (vma->anon_vma) { anon_vma_lock_write(vma->anon_vma); anon_vma_interval_tree_pre_update_vma(vma); } vma->vm_end = addr + len; vma->vm_flags |= VM_SOFTDIRTY; - if (mas_store_gfp(mas, vma, GFP_KERNEL)) - goto mas_expand_failed; + mas_store_prealloc(mas, vma); if (vma->anon_vma) { anon_vma_interval_tree_post_update_vma(vma); @@ -2993,13 +2994,6 @@ mas_store_fail: vma_alloc_fail: vm_unacct_memory(len >> PAGE_SHIFT); return -ENOMEM; - -mas_expand_failed: - if (vma->anon_vma) { - anon_vma_interval_tree_post_update_vma(vma); - anon_vma_unlock_write(vma->anon_vma); - } - return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) -- cgit v1.2.3 From 515778e2d790652a38a24554fdb7f21420d91efc Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 30 Sep 2022 20:25:55 -0400 Subject: mm/uffd: fix warning without PTE_MARKER_UFFD_WP compiled in When PTE_MARKER_UFFD_WP not configured, it's still possible to reach pte marker code and trigger an warning. Add a few CONFIG_PTE_MARKER_UFFD_WP ifdefs to make sure the code won't be reached when not compiled in. Link: https://lkml.kernel.org/r/YzeR+R6b4bwBlBHh@x1n Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Reported-by: Cc: Axel Rasmussen Cc: Brian Geffon Cc: Edward Liaw Cc: Liu Shixin Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++++ mm/memory.c | 2 ++ mm/mprotect.c | 2 ++ 3 files changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 41d3aa077837..9a910612336d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5096,6 +5096,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct * unmapped and its refcount is dropped, so just clear pte here. */ if (unlikely(!pte_present(pte))) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* * If the pte was wr-protected by uffd-wp in any of the * swap forms, meanwhile the caller does not want to @@ -5107,6 +5108,7 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); else +#endif huge_pte_clear(mm, address, ptep, sz); spin_unlock(ptl); continue; @@ -5135,11 +5137,13 @@ static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct tlb_remove_huge_tlb_entry(h, tlb, ptep, address); if (huge_pte_dirty(pte)) set_page_dirty(page); +#ifdef CONFIG_PTE_MARKER_UFFD_WP /* Leave a uffd-wp pte marker if needed */ if (huge_pte_uffd_wp(pte) && !(zap_flags & ZAP_FLAG_DROP_MARKER)) set_huge_pte_at(mm, address, ptep, make_pte_marker(PTE_MARKER_UFFD_WP)); +#endif hugetlb_count_sub(pages_per_huge_page(h), mm); page_remove_rmap(page, vma, true); diff --git a/mm/memory.c b/mm/memory.c index df678fa30cdb..2c7723ea4371 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1393,10 +1393,12 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, unsigned long addr, pte_t *pte, struct zap_details *details, pte_t pteval) { +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (zap_drop_file_uffd_wp(details)) return; pte_install_uffd_wp_if_needed(vma, addr, pte, pteval); +#endif } static unsigned long zap_pte_range(struct mmu_gather *tlb, diff --git a/mm/mprotect.c b/mm/mprotect.c index 461dcbd4f21a..668bfaa6ed2a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -267,6 +267,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, } else { /* It must be an none page, or what else?.. */ WARN_ON_ONCE(!pte_none(oldpte)); +#ifdef CONFIG_PTE_MARKER_UFFD_WP if (unlikely(uffd_wp && !vma_is_anonymous(vma))) { /* * For file-backed mem, we need to be able to @@ -278,6 +279,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, make_pte_marker(PTE_MARKER_UFFD_WP)); pages++; } +#endif } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); -- cgit v1.2.3 From deb0f6562884b5b4beb883d73e66a7d3a1b96d99 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Fri, 30 Sep 2022 00:38:43 +0000 Subject: mm/mmap: undo ->mmap() when arch_validate_flags() fails Commit c462ac288f2c ("mm: Introduce arch_validate_flags()") added a late check in mmap_region() to let architectures validate vm_flags. The check needs to happen after calling ->mmap() as the flags can potentially be modified during this callback. If arch_validate_flags() check fails we unmap and free the vma. However, the error path fails to undo the ->mmap() call that previously succeeded and depending on the specific ->mmap() implementation this translates to reference increments, memory allocations and other operations what will not be cleaned up. There are several places (mainly device drivers) where this is an issue. However, one specific example is bpf_map_mmap() which keeps count of the mappings in map->writecnt. The count is incremented on ->mmap() and then decremented on vm_ops->close(). When arch_validate_flags() fails this count is off since bpf_map_mmap_close() is never called. One can reproduce this issue in arm64 devices with MTE support. Here the vm_flags are checked to only allow VM_MTE if VM_MTE_ALLOWED has been set previously. From userspace then is enough to pass the PROT_MTE flag to mmap() syscall to trigger the arch_validate_flags() failure. The following program reproduces this issue: #include #include #include #include #include int main(void) { union bpf_attr attr = { .map_type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(int), .value_size = sizeof(long long), .max_entries = 256, .map_flags = BPF_F_MMAPABLE, }; int fd; fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr)); mmap(NULL, 4096, PROT_WRITE | PROT_MTE, MAP_SHARED, fd, 0); return 0; } By manually adding some log statements to the vm_ops callbacks we can confirm that when passing PROT_MTE to mmap() the map->writecnt is off upon ->release(): With PROT_MTE flag: root@debian:~# ./bpf-test [ 111.263874] bpf_map_write_active_inc: map=9 writecnt=1 [ 111.288763] bpf_map_release: map=9 writecnt=1 Without PROT_MTE flag: root@debian:~# ./bpf-test [ 157.816912] bpf_map_write_active_inc: map=10 writecnt=1 [ 157.830442] bpf_map_write_active_dec: map=10 writecnt=0 [ 157.832396] bpf_map_release: map=10 writecnt=0 This patch fixes the above issue by calling vm_ops->close() when the arch_validate_flags() check fails, after this we can proceed to unmap and free the vma on the error path. Link: https://lkml.kernel.org/r/20220930003844.1210987-1-cmllamas@google.com Fixes: c462ac288f2c ("mm: Introduce arch_validate_flags()") Signed-off-by: Carlos Llamas Reviewed-by: Catalin Marinas Acked-by: Andrii Nakryiko Reviewed-by: Liam Howlett Cc: Christian Brauner (Microsoft) Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: [5.10+] Signed-off-by: Andrew Morton --- mm/mmap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 5855f26639f9..bf2122af94e7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2673,7 +2673,7 @@ cannot_expand: if (!arch_validate_flags(vma->vm_flags)) { error = -EINVAL; if (file) - goto unmap_and_free_vma; + goto close_and_free_vma; else goto free_vma; } @@ -2742,6 +2742,9 @@ expanded: validate_mm(mm); return addr; +close_and_free_vma: + if (vma->vm_ops && vma->vm_ops->close) + vma->vm_ops->close(vma); unmap_and_free_vma: fput(vma->vm_file); vma->vm_file = NULL; -- cgit v1.2.3 From ac801e7e252c5588325e3c983c7d4167fc68c024 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Mon, 5 Sep 2022 14:24:27 +0200 Subject: kmsan: unpoison @tlb in arch_tlb_gather_mmu() This is an optimization to reduce stackdepot pressure. struct mmu_gather contains 7 1-bit fields packed into a 32-bit unsigned int value. The remaining 25 bits remain uninitialized and are never used, but KMSAN updates the origin for them in zap_pXX_range() in mm/memory.c, thus creating very long origin chains. This is technically correct, but consumes too much memory. Unpoisoning the whole structure will prevent creating such chains. Link: https://lkml.kernel.org/r/20220905122452.2258262-20-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Liu Shixin Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/mmu_gather.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'mm') diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index a71924bd38c0..add4244e5790 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -1,6 +1,7 @@ #include #include #include +#include #include #include #include @@ -265,6 +266,15 @@ void tlb_flush_mmu(struct mmu_gather *tlb) static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) { + /* + * struct mmu_gather contains 7 1-bit fields packed into a 32-bit + * unsigned int value. The remaining 25 bits remain uninitialized + * and are never used, but KMSAN updates the origin for them in + * zap_pXX_range() in mm/memory.c, thus creating very long origin + * chains. This is technically correct, but consumes too much memory. + * Unpoisoning the whole structure will prevent creating such chains. + */ + kmsan_unpoison_memory(tlb, sizeof(*tlb)); tlb->mm = mm; tlb->fullmm = fullmm; -- cgit v1.2.3 From 652e04464d3944226052c827bdaaf5113b072870 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:45 +0800 Subject: mm/damon: move sz_damon_region to damon_sz_region Rename sz_damon_region() to damon_sz_region(), and move it to "include/linux/damon.h", because in many places, we can to use this func. Link: https://lkml.kernel.org/r/20220927001946.85375-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 6 ++++++ mm/damon/core.c | 9 ++------- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/damon.h b/include/linux/damon.h index ed5470f50bab..620ada094c3b 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -484,6 +484,12 @@ static inline struct damon_region *damon_first_region(struct damon_target *t) return list_first_entry(&t->regions_list, struct damon_region, list); } +static inline unsigned long damon_sz_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} + + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 4de8c7c52979..5b9e0d585aef 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -864,18 +864,13 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -static inline unsigned long sz_damon_region(struct damon_region *r) -{ - return r->ar.end - r->ar.start; -} - /* * Merge two adjacent regions into one region */ static void damon_merge_two_regions(struct damon_target *t, struct damon_region *l, struct damon_region *r) { - unsigned long sz_l = sz_damon_region(l), sz_r = sz_damon_region(r); + unsigned long sz_l = damon_sz_region(l), sz_r = damon_sz_region(r); l->nr_accesses = (l->nr_accesses * sz_l + r->nr_accesses * sz_r) / (sz_l + sz_r); @@ -904,7 +899,7 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, if (prev && prev->ar.end == r->ar.start && abs(prev->nr_accesses - r->nr_accesses) <= thres && - sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) + damon_sz_region(prev) + damon_sz_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else prev = r; -- cgit v1.2.3 From ab63f63f3885d492e62da55304b0483a2a9e6a7d Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 27 Sep 2022 08:19:46 +0800 Subject: mm/damon: use damon_sz_region() in appropriate place In many places we can use damon_sz_region() to instead of "r->ar.end - r->ar.start". Link: https://lkml.kernel.org/r/20220927001946.85375-2-xhao@linux.alibaba.com Signed-off-by: Xin Hao Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 17 ++++++++--------- mm/damon/vaddr.c | 4 ++-- 2 files changed, 10 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/damon/core.c b/mm/damon/core.c index 5b9e0d585aef..515ac4e52a11 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -490,7 +490,7 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - sz += r->ar.end - r->ar.start; + sz += damon_sz_region(r); } if (ctx->attrs.min_nr_regions) @@ -673,7 +673,7 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) { unsigned long sz; - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); return s->pattern.min_sz_region <= sz && sz <= s->pattern.max_sz_region && s->pattern.min_nr_accesses <= r->nr_accesses && @@ -701,7 +701,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_for_each_scheme(s, c) { struct damos_quota *quota = &s->quota; - unsigned long sz = r->ar.end - r->ar.start; + unsigned long sz = damon_sz_region(r); struct timespec64 begin, end; unsigned long sz_applied = 0; @@ -730,14 +730,14 @@ static void damon_do_apply_schemes(struct damon_ctx *c, sz = ALIGN_DOWN(quota->charge_addr_from - r->ar.start, DAMON_MIN_REGION); if (!sz) { - if (r->ar.end - r->ar.start <= - DAMON_MIN_REGION) + if (damon_sz_region(r) <= + DAMON_MIN_REGION) continue; sz = DAMON_MIN_REGION; } damon_split_region_at(t, r, sz); r = damon_next_region(r); - sz = r->ar.end - r->ar.start; + sz = damon_sz_region(r); } quota->charge_target_from = NULL; quota->charge_addr_from = 0; @@ -842,8 +842,7 @@ static void kdamond_apply_schemes(struct damon_ctx *c) continue; score = c->ops.get_scheme_score( c, t, r, s); - quota->histogram[score] += - r->ar.end - r->ar.start; + quota->histogram[score] += damon_sz_region(r); if (score > max_score) max_score = score; } @@ -957,7 +956,7 @@ static void damon_split_regions_of(struct damon_target *t, int nr_subs) int i; damon_for_each_region_safe(r, next, t) { - sz_region = r->ar.end - r->ar.start; + sz_region = damon_sz_region(r); for (i = 0; i < nr_subs - 1 && sz_region > 2 * DAMON_MIN_REGION; i++) { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index ea94e0b2c311..15f03df66db6 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -72,7 +72,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, return -EINVAL; orig_end = r->ar.end; - sz_orig = r->ar.end - r->ar.start; + sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); if (!sz_piece) @@ -618,7 +618,7 @@ static unsigned long damos_madvise(struct damon_target *target, { struct mm_struct *mm; unsigned long start = PAGE_ALIGN(r->ar.start); - unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long len = PAGE_ALIGN(damon_sz_region(r)); unsigned long applied; mm = damon_get_mm(target); -- cgit v1.2.3 From 16ce101db85db694a91380aa4c89b25530871d33 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:15 +1000 Subject: mm/memory.c: fix race when faulting a device private page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Fix several device private page reference counting issues", v2 This series aims to fix a number of page reference counting issues in drivers dealing with device private ZONE_DEVICE pages. These result in use-after-free type bugs, either from accessing a struct page which no longer exists because it has been removed or accessing fields within the struct page which are no longer valid because the page has been freed. During normal usage it is unlikely these will cause any problems. However without these fixes it is possible to crash the kernel from userspace. These crashes can be triggered either by unloading the kernel module or unbinding the device from the driver prior to a userspace task exiting. In modules such as Nouveau it is also possible to trigger some of these issues by explicitly closing the device file-descriptor prior to the task exiting and then accessing device private memory. This involves some minor changes to both PowerPC and AMD GPU code. Unfortunately I lack hardware to test either of those so any help there would be appreciated. The changes mimic what is done in for both Nouveau and hmm-tests though so I doubt they will cause problems. This patch (of 8): When the CPU tries to access a device private page the migrate_to_ram() callback associated with the pgmap for the page is called. However no reference is taken on the faulting page. Therefore a concurrent migration of the device private page can free the page and possibly the underlying pgmap. This results in a race which can crash the kernel due to the migrate_to_ram() function pointer becoming invalid. It also means drivers can't reliably read the zone_device_data field because the page may have been freed with memunmap_pages(). Close the race by getting a reference on the page while holding the ptl to ensure it has not been freed. Unfortunately the elevated reference count will cause the migration required to handle the fault to fail. To avoid this failure pass the faulting page into the migrate_vma functions so that if an elevated reference count is found it can be checked to see if it's expected or not. [mpe@ellerman.id.au: fix build] Link: https://lkml.kernel.org/r/87fsgbf3gh.fsf@mpe.ellerman.id.au Link: https://lkml.kernel.org/r/cover.60659b549d8509ddecafad4f498ee7f03bb23c69.1664366292.git-series.apopple@nvidia.com Link: https://lkml.kernel.org/r/d3e813178a59e565e8d78d9b9a4e2562f6494f90.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: John Hubbard Cc: Ralph Campbell Cc: Michael Ellerman Cc: Lyude Paul Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 19 ++++++++++-------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 17 +++++++++------- drivers/gpu/drm/amd/amdkfd/kfd_migrate.h | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_svm.c | 11 +++++++---- include/linux/migrate.h | 8 ++++++++ lib/test_hmm.c | 7 ++++--- mm/memory.c | 16 ++++++++++++++- mm/migrate.c | 34 +++++++++++++++++++------------- mm/migrate_device.c | 18 ++++++++++++----- 9 files changed, 89 insertions(+), 43 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 598006301620..965c9e9e500b 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -508,10 +508,10 @@ unsigned long kvmppc_h_svm_init_start(struct kvm *kvm) static int __kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, struct page *fault_page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *dpage, *spage; struct kvmppc_uvmem_page_pvt *pvt; unsigned long pfn; @@ -525,6 +525,7 @@ static int __kvmppc_svm_page_out(struct vm_area_struct *vma, mig.dst = &dst_pfn; mig.pgmap_owner = &kvmppc_uvmem_pgmap; mig.flags = MIGRATE_VMA_SELECT_DEVICE_PRIVATE; + mig.fault_page = fault_page; /* The requested page is already paged-out, nothing to do */ if (!kvmppc_gfn_is_uvmem_pfn(gpa >> page_shift, kvm, NULL)) @@ -580,12 +581,14 @@ out_finalize: static inline int kvmppc_svm_page_out(struct vm_area_struct *vma, unsigned long start, unsigned long end, unsigned long page_shift, - struct kvm *kvm, unsigned long gpa) + struct kvm *kvm, unsigned long gpa, + struct page *fault_page) { int ret; mutex_lock(&kvm->arch.uvmem_lock); - ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa); + ret = __kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, + fault_page); mutex_unlock(&kvm->arch.uvmem_lock); return ret; @@ -634,7 +637,7 @@ void kvmppc_uvmem_drop_pages(const struct kvm_memory_slot *slot, pvt->remove_gfn = true; if (__kvmppc_svm_page_out(vma, addr, addr + PAGE_SIZE, - PAGE_SHIFT, kvm, pvt->gpa)) + PAGE_SHIFT, kvm, pvt->gpa, NULL)) pr_err("Can't page out gpa:0x%lx addr:0x%lx\n", pvt->gpa, addr); } else { @@ -736,7 +739,7 @@ static int kvmppc_svm_page_in(struct vm_area_struct *vma, bool pagein) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma mig; + struct migrate_vma mig = { 0 }; struct page *spage; unsigned long pfn; struct page *dpage; @@ -994,7 +997,7 @@ static vm_fault_t kvmppc_uvmem_migrate_to_ram(struct vm_fault *vmf) if (kvmppc_svm_page_out(vmf->vma, vmf->address, vmf->address + PAGE_SIZE, PAGE_SHIFT, - pvt->kvm, pvt->gpa)) + pvt->kvm, pvt->gpa, vmf->page)) return VM_FAULT_SIGBUS; else return 0; @@ -1065,7 +1068,7 @@ kvmppc_h_svm_page_out(struct kvm *kvm, unsigned long gpa, if (!vma || vma->vm_start > start || vma->vm_end < end) goto out; - if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa)) + if (!kvmppc_svm_page_out(vma, start, end, page_shift, kvm, gpa, NULL)) ret = H_SUCCESS; out: mmap_read_unlock(kvm->mm); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index b059a77b6081..776448bd9fe4 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -409,7 +409,7 @@ svm_migrate_vma_to_vram(struct amdgpu_device *adev, struct svm_range *prange, uint64_t npages = (end - start) >> PAGE_SHIFT; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; unsigned long cpages = 0; dma_addr_t *scratch; void *buf; @@ -668,7 +668,7 @@ out_oom: static long svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, struct vm_area_struct *vma, uint64_t start, uint64_t end, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct kfd_process *p = container_of(prange->svms, struct kfd_process, svms); uint64_t npages = (end - start) >> PAGE_SHIFT; @@ -676,7 +676,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, unsigned long cpages = 0; struct kfd_process_device *pdd; struct dma_fence *mfence = NULL; - struct migrate_vma migrate; + struct migrate_vma migrate = { 0 }; dma_addr_t *scratch; void *buf; int r = -ENOMEM; @@ -699,6 +699,7 @@ svm_migrate_vma_to_ram(struct amdgpu_device *adev, struct svm_range *prange, migrate.src = buf; migrate.dst = migrate.src + npages; + migrate.fault_page = fault_page; scratch = (dma_addr_t *)(migrate.dst + npages); kfd_smi_event_migration_start(adev->kfd.dev, p->lead_thread->pid, @@ -766,7 +767,7 @@ out: * 0 - OK, otherwise error code */ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger) + uint32_t trigger, struct page *fault_page) { struct amdgpu_device *adev; struct vm_area_struct *vma; @@ -807,7 +808,8 @@ int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, } next = min(vma->vm_end, end); - r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger); + r = svm_migrate_vma_to_ram(adev, prange, vma, addr, next, trigger, + fault_page); if (r < 0) { pr_debug("failed %ld to migrate prange %p\n", r, prange); break; @@ -851,7 +853,7 @@ svm_migrate_vram_to_vram(struct svm_range *prange, uint32_t best_loc, pr_debug("from gpu 0x%x to gpu 0x%x\n", prange->actual_loc, best_loc); do { - r = svm_migrate_vram_to_ram(prange, mm, trigger); + r = svm_migrate_vram_to_ram(prange, mm, trigger, NULL); if (r) return r; } while (prange->actual_loc && --retries); @@ -938,7 +940,8 @@ static vm_fault_t svm_migrate_to_ram(struct vm_fault *vmf) goto out_unlock_prange; } - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU); + r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, + vmf->page); if (r) pr_debug("failed %d migrate 0x%p [0x%lx 0x%lx] to ram\n", r, prange, prange->start, prange->last); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h index b3f0754b32fa..a5d7e6d22264 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.h @@ -43,7 +43,7 @@ enum MIGRATION_COPY_DIR { int svm_migrate_to_vram(struct svm_range *prange, uint32_t best_loc, struct mm_struct *mm, uint32_t trigger); int svm_migrate_vram_to_ram(struct svm_range *prange, struct mm_struct *mm, - uint32_t trigger); + uint32_t trigger, struct page *fault_page); unsigned long svm_migrate_addr_to_pfn(struct amdgpu_device *adev, unsigned long addr); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c index 11074cc8c333..9139e5a0b2a0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c @@ -2913,13 +2913,15 @@ retry_write_locked: */ if (prange->actual_loc) r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); else r = 0; } } else { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU); + KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, + NULL); } if (r) { pr_debug("failed %d to migrate svms %p [0x%lx 0x%lx]\n", @@ -3242,7 +3244,8 @@ svm_range_trigger_migration(struct mm_struct *mm, struct svm_range *prange, return 0; if (!best_loc) { - r = svm_migrate_vram_to_ram(prange, mm, KFD_MIGRATE_TRIGGER_PREFETCH); + r = svm_migrate_vram_to_ram(prange, mm, + KFD_MIGRATE_TRIGGER_PREFETCH, NULL); *migrated = !r; return r; } @@ -3303,7 +3306,7 @@ static void svm_range_evict_svm_bo_worker(struct work_struct *work) mutex_lock(&prange->migrate_mutex); do { r = svm_migrate_vram_to_ram(prange, mm, - KFD_MIGRATE_TRIGGER_TTM_EVICTION); + KFD_MIGRATE_TRIGGER_TTM_EVICTION, NULL); } while (!r && prange->actual_loc && --retries); if (!r && prange->actual_loc) diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 704a04f5a074..52090d1f9230 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -62,6 +62,8 @@ extern const char *migrate_reason_names[MR_TYPES]; #ifdef CONFIG_MIGRATION extern void putback_movable_pages(struct list_head *l); +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode); extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, @@ -197,6 +199,12 @@ struct migrate_vma { */ void *pgmap_owner; unsigned long flags; + + /* + * Set to vmf->page if this is being called to migrate a page as part of + * a migrate_to_ram() callback. + */ + struct page *fault_page; }; int migrate_vma_setup(struct migrate_vma *args); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index 6a33f6b1b465..e566166b5571 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -907,7 +907,7 @@ static int dmirror_migrate_to_system(struct dmirror *dmirror, struct vm_area_struct *vma; unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -968,7 +968,7 @@ static int dmirror_migrate_to_device(struct dmirror *dmirror, unsigned long src_pfns[64] = { 0 }; unsigned long dst_pfns[64] = { 0 }; struct dmirror_bounce bounce; - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long next; int ret; @@ -1334,7 +1334,7 @@ static void dmirror_devmem_free(struct page *page) static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) { - struct migrate_vma args; + struct migrate_vma args = { 0 }; unsigned long src_pfns = 0; unsigned long dst_pfns = 0; struct page *rpage; @@ -1357,6 +1357,7 @@ static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) args.dst = &dst_pfns; args.pgmap_owner = dmirror->mdevice; args.flags = dmirror_select_device(dmirror); + args.fault_page = vmf->page; if (migrate_vma_setup(&args)) return VM_FAULT_SIGBUS; diff --git a/mm/memory.c b/mm/memory.c index 2c7723ea4371..4ad6077164cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3750,7 +3750,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ret = remove_device_exclusive_entry(vmf); } else if (is_device_private_entry(entry)) { vmf->page = pfn_swap_entry_to_page(entry); - ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) { + spin_unlock(vmf->ptl); + goto out; + } + + /* + * Get a page reference while we know the page can't be + * freed. + */ + get_page(vmf->page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + vmf->page->pgmap->ops->migrate_to_ram(vmf); + put_page(vmf->page); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else if (is_swapin_error_entry(entry)) { diff --git a/mm/migrate.c b/mm/migrate.c index c228afba0963..1379e1912772 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -625,6 +625,25 @@ EXPORT_SYMBOL(folio_migrate_copy); * Migration functions ***********************************************************/ +int migrate_folio_extra(struct address_space *mapping, struct folio *dst, + struct folio *src, enum migrate_mode mode, int extra_count) +{ + int rc; + + BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ + + rc = folio_migrate_mapping(mapping, dst, src, extra_count); + + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if (mode != MIGRATE_SYNC_NO_COPY) + folio_migrate_copy(dst, src); + else + folio_migrate_flags(dst, src); + return MIGRATEPAGE_SUCCESS; +} + /** * migrate_folio() - Simple folio migration. * @mapping: The address_space containing the folio. @@ -640,20 +659,7 @@ EXPORT_SYMBOL(folio_migrate_copy); int migrate_folio(struct address_space *mapping, struct folio *dst, struct folio *src, enum migrate_mode mode) { - int rc; - - BUG_ON(folio_test_writeback(src)); /* Writeback must be complete */ - - rc = folio_migrate_mapping(mapping, dst, src, 0); - - if (rc != MIGRATEPAGE_SUCCESS) - return rc; - - if (mode != MIGRATE_SYNC_NO_COPY) - folio_migrate_copy(dst, src); - else - folio_migrate_flags(dst, src); - return MIGRATEPAGE_SUCCESS; + return migrate_folio_extra(mapping, dst, src, mode, 0); } EXPORT_SYMBOL(migrate_folio); diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 5ab6ab9d2ed8..8dee38ffcda2 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -325,14 +325,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate) * folio_migrate_mapping(), except that here we allow migration of a * ZONE_DEVICE page. */ -static bool migrate_vma_check_page(struct page *page) +static bool migrate_vma_check_page(struct page *page, struct page *fault_page) { /* * One extra ref because caller holds an extra reference, either from * isolate_lru_page() for a regular page, or migrate_vma_collect() for * a device page. */ - int extra = 1; + int extra = 1 + (page == fault_page); /* * FIXME support THP (transparent huge page), it is bit more complex to @@ -405,7 +405,8 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) if (folio_mapped(folio)) try_to_migrate(folio, 0); - if (page_mapped(page) || !migrate_vma_check_page(page)) { + if (page_mapped(page) || + !migrate_vma_check_page(page, migrate->fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); @@ -517,6 +518,8 @@ int migrate_vma_setup(struct migrate_vma *args) return -EINVAL; if (!args->src || !args->dst) return -EINVAL; + if (args->fault_page && !is_device_private_page(args->fault_page)) + return -EINVAL; memset(args->src, 0, sizeof(*args->src) * nr_pages); args->cpages = 0; @@ -747,8 +750,13 @@ void migrate_vma_pages(struct migrate_vma *migrate) continue; } - r = migrate_folio(mapping, page_folio(newpage), - page_folio(page), MIGRATE_SYNC_NO_COPY); + if (migrate->fault_page == page) + r = migrate_folio_extra(mapping, page_folio(newpage), + page_folio(page), + MIGRATE_SYNC_NO_COPY, 1); + else + r = migrate_folio(mapping, page_folio(newpage), + page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; } -- cgit v1.2.3 From ef233450898f8893dafa193a9f3211fa077a3d05 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:16 +1000 Subject: mm: free device private pages have zero refcount MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") device private pages have no longer had an extra reference count when the page is in use. However before handing them back to the owning device driver we add an extra reference count such that free pages have a reference count of one. This makes it difficult to tell if a page is free or not because both free and in use pages will have a non-zero refcount. Instead we should return pages to the drivers page allocator with a zero reference count. Kernel code can then safely use kernel functions such as get_page_unless_zero(). Link: https://lkml.kernel.org/r/cf70cf6f8c0bdb8aaebdbfb0d790aea4c683c3c6.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Acked-by: Felix Kuehling Cc: Jason Gunthorpe Cc: Michael Ellerman Cc: Alex Deucher Cc: Christian König Cc: Ben Skeggs Cc: Lyude Paul Cc: Ralph Campbell Cc: Alex Sierra Cc: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- arch/powerpc/kvm/book3s_hv_uvmem.c | 2 +- drivers/gpu/drm/amd/amdkfd/kfd_migrate.c | 2 +- drivers/gpu/drm/nouveau/nouveau_dmem.c | 2 +- include/linux/memremap.h | 1 + lib/test_hmm.c | 2 +- mm/memremap.c | 9 +++++++++ mm/page_alloc.c | 8 ++++++++ 7 files changed, 22 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/kvm/book3s_hv_uvmem.c b/arch/powerpc/kvm/book3s_hv_uvmem.c index 965c9e9e500b..e2f11f9c3f2a 100644 --- a/arch/powerpc/kvm/book3s_hv_uvmem.c +++ b/arch/powerpc/kvm/book3s_hv_uvmem.c @@ -718,7 +718,7 @@ static struct page *kvmppc_uvmem_get_page(unsigned long gpa, struct kvm *kvm) dpage = pfn_to_page(uvmem_pfn); dpage->zone_device_data = pvt; - lock_page(dpage); + zone_device_page_init(dpage); return dpage; out_clear: spin_lock(&kvmppc_uvmem_bitmap_lock); diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c index 776448bd9fe4..97a684568ae0 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_migrate.c @@ -223,7 +223,7 @@ svm_migrate_get_vram_page(struct svm_range *prange, unsigned long pfn) page = pfn_to_page(pfn); svm_range_bo_ref(prange->svm_bo); page->zone_device_data = prange->svm_bo; - lock_page(page); + zone_device_page_init(page); } static void diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c index 16356611b5b9..b092988266a6 100644 --- a/drivers/gpu/drm/nouveau/nouveau_dmem.c +++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c @@ -326,7 +326,7 @@ nouveau_dmem_page_alloc_locked(struct nouveau_drm *drm) return NULL; } - lock_page(page); + zone_device_page_init(page); return page; } diff --git a/include/linux/memremap.h b/include/linux/memremap.h index c3b4cc84877b..7fcaf3180a5b 100644 --- a/include/linux/memremap.h +++ b/include/linux/memremap.h @@ -187,6 +187,7 @@ static inline bool folio_is_device_coherent(const struct folio *folio) } #ifdef CONFIG_ZONE_DEVICE +void zone_device_page_init(struct page *page); void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e566166b5571..bc2b94991165 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -627,8 +627,8 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) goto error; } + zone_device_page_init(dpage); dpage->zone_device_data = rpage; - lock_page(dpage); return dpage; error: diff --git a/mm/memremap.c b/mm/memremap.c index 25029a474d30..1c2c038f3410 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -505,8 +505,17 @@ void free_zone_device_page(struct page *page) /* * Reset the page count to 1 to prepare for handing out the page again. */ + if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && + page->pgmap->type != MEMORY_DEVICE_COHERENT) + set_page_count(page, 1); +} + +void zone_device_page_init(struct page *page) +{ set_page_count(page, 1); + lock_page(page); } +EXPORT_SYMBOL_GPL(zone_device_page_init); #ifdef CONFIG_FS_DAX bool __put_devmap_managed_page_refs(struct page *page, int refs) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 12b6184cbbed..059f6946832f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6819,6 +6819,14 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } + + /* + * ZONE_DEVICE pages are released directly to the driver page allocator + * which will set the page count to 1 when allocating the page. + */ + if (pgmap->type == MEMORY_DEVICE_PRIVATE || + pgmap->type == MEMORY_DEVICE_COHERENT) + set_page_count(page, 0); } /* -- cgit v1.2.3 From 0dc45ca1ce18900572282c4f054bbe78351cb6a7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:17 +1000 Subject: mm/memremap.c: take a pgmap reference on page allocation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ZONE_DEVICE pages have a struct dev_pagemap which is allocated by a driver. When the struct page is first allocated by the kernel in memremap_pages() a reference is taken on the associated pagemap to ensure it is not freed prior to the pages being freed. Prior to 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") pages were considered free and returned to the driver when the reference count dropped to one. However the pagemap reference was not dropped until the page reference count hit zero. This would occur as part of the final put_page() in memunmap_pages() which would wait for all pages to be freed prior to returning. When the extra refcount was removed the pagemap reference was no longer being dropped in put_page(). Instead memunmap_pages() was changed to explicitly drop the pagemap references. This means that memunmap_pages() can complete even though pages are still mapped by the kernel which can lead to kernel crashes, particularly if a driver frees the pagemap. To fix this drivers should take a pagemap reference when allocating the page. This reference can then be returned when the page is freed. Link: https://lkml.kernel.org/r/12d155ec727935ebfbb4d639a03ab374917ea51b.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Fixes: 27674ef6c73f ("mm: remove the extra ZONE_DEVICE struct page refcount") Cc: Jason Gunthorpe Cc: Felix Kuehling Cc: Alex Deucher Cc: Christian König Cc: Ben Skeggs Cc: Lyude Paul Cc: Ralph Campbell Cc: Alex Sierra Cc: John Hubbard Cc: Dan Williams Cc: David Hildenbrand Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Michael Ellerman Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/memremap.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memremap.c b/mm/memremap.c index 1c2c038f3410..421bec3a29ee 100644 --- a/mm/memremap.c +++ b/mm/memremap.c @@ -138,8 +138,11 @@ void memunmap_pages(struct dev_pagemap *pgmap) int i; percpu_ref_kill(&pgmap->ref); - for (i = 0; i < pgmap->nr_range; i++) - percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + for (i = 0; i < pgmap->nr_range; i++) + percpu_ref_put_many(&pgmap->ref, pfn_len(pgmap, i)); + wait_for_completion(&pgmap->done); for (i = 0; i < pgmap->nr_range; i++) @@ -264,7 +267,9 @@ static int pagemap_range(struct dev_pagemap *pgmap, struct mhp_params *params, memmap_init_zone_device(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], PHYS_PFN(range->start), PHYS_PFN(range_len(range)), pgmap); - percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); + if (pgmap->type != MEMORY_DEVICE_PRIVATE && + pgmap->type != MEMORY_DEVICE_COHERENT) + percpu_ref_get_many(&pgmap->ref, pfn_len(pgmap, range_id)); return 0; err_add_memory: @@ -502,16 +507,24 @@ void free_zone_device_page(struct page *page) page->mapping = NULL; page->pgmap->ops->page_free(page); - /* - * Reset the page count to 1 to prepare for handing out the page again. - */ if (page->pgmap->type != MEMORY_DEVICE_PRIVATE && page->pgmap->type != MEMORY_DEVICE_COHERENT) + /* + * Reset the page count to 1 to prepare for handing out the page + * again. + */ set_page_count(page, 1); + else + put_dev_pagemap(page->pgmap); } void zone_device_page_init(struct page *page) { + /* + * Drivers shouldn't be allocating pages after calling + * memunmap_pages(). + */ + WARN_ON_ONCE(!percpu_ref_tryget_live(&page->pgmap->ref)); set_page_count(page, 1); lock_page(page); } -- cgit v1.2.3 From 241f68859656836ae3e85179cc224cc4c5e4e6a7 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:18 +1000 Subject: mm/migrate_device.c: refactor migrate_vma and migrate_deivce_coherent_page() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit migrate_device_coherent_page() reuses the existing migrate_vma family of functions to migrate a specific page without providing a valid mapping or vma. This looks a bit odd because it means we are calling migrate_vma_*() without setting a valid vma, however it was considered acceptable at the time because the details were internal to migrate_device.c and there was only a single user. One of the reasons the details could be kept internal was that this was strictly for migrating device coherent memory. Such memory can be copied directly by the CPU without intervention from a driver. However this isn't true for device private memory, and a future change requires similar functionality for device private memory. So refactor the code into something more sensible for migrating device memory without a vma. Link: https://lkml.kernel.org/r/c7b2ff84e9b33d022cf4a40f87d051f281a16d8f.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: "Huang, Ying" Cc: Zi Yan Cc: Matthew Wilcox Cc: Yang Shi Cc: David Hildenbrand Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Lyude Paul Cc: Michael Ellerman Signed-off-by: Andrew Morton --- mm/migrate_device.c | 150 +++++++++++++++++++++++++++++----------------------- 1 file changed, 85 insertions(+), 65 deletions(-) (limited to 'mm') diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 8dee38ffcda2..7707c1d898f5 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -357,26 +357,20 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page) } /* - * migrate_vma_unmap() - replace page mapping with special migration pte entry - * @migrate: migrate struct containing all migration information - * - * Isolate pages from the LRU and replace mappings (CPU page table pte) with a - * special migration pte entry and check if it has been pinned. Pinned pages are - * restored because we cannot migrate them. - * - * This is the last step before we call the device driver callback to allocate - * destination memory and copy contents of original page over to new page. + * Unmaps pages for migration. Returns number of unmapped pages. */ -static void migrate_vma_unmap(struct migrate_vma *migrate) +static unsigned long migrate_device_unmap(unsigned long *src_pfns, + unsigned long npages, + struct page *fault_page) { - const unsigned long npages = migrate->npages; unsigned long i, restore = 0; bool allow_drain = true; + unsigned long unmapped = 0; lru_add_drain(); for (i = 0; i < npages; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; if (!page) @@ -391,8 +385,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } if (isolate_lru_page(page)) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } @@ -406,34 +399,54 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) try_to_migrate(folio, 0); if (page_mapped(page) || - !migrate_vma_check_page(page, migrate->fault_page)) { + !migrate_vma_check_page(page, fault_page)) { if (!is_zone_device_page(page)) { get_page(page); putback_lru_page(page); } - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; - migrate->cpages--; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; restore++; continue; } + + unmapped++; } for (i = 0; i < npages && restore; i++) { - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct folio *folio; - if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) + if (!page || (src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; folio = page_folio(page); remove_migration_ptes(folio, folio, false); - migrate->src[i] = 0; + src_pfns[i] = 0; folio_unlock(folio); folio_put(folio); restore--; } + + return unmapped; +} + +/* + * migrate_vma_unmap() - replace page mapping with special migration pte entry + * @migrate: migrate struct containing all migration information + * + * Isolate pages from the LRU and replace mappings (CPU page table pte) with a + * special migration pte entry and check if it has been pinned. Pinned pages are + * restored because we cannot migrate them. + * + * This is the last step before we call the device driver callback to allocate + * destination memory and copy contents of original page over to new page. + */ +static void migrate_vma_unmap(struct migrate_vma *migrate) +{ + migrate->cpages = migrate_device_unmap(migrate->src, migrate->npages, + migrate->fault_page); } /** @@ -680,41 +693,36 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -/** - * migrate_vma_pages() - migrate meta-data from src page to dst page - * @migrate: migrate struct containing all migration information - * - * This migrates struct page meta-data from source struct page to destination - * struct page. This effectively finishes the migration from source page to the - * destination page. - */ -void migrate_vma_pages(struct migrate_vma *migrate) +static void migrate_device_pages(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages, + struct migrate_vma *migrate) { - const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; struct mmu_notifier_range range; - unsigned long addr, i; + unsigned long i; bool notified = false; - for (i = 0, addr = start; i < npages; addr += PAGE_SIZE, i++) { - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + for (i = 0; i < npages; i++) { + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); struct address_space *mapping; int r; if (!newpage) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } if (!page) { + unsigned long addr; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't * called if the page could not be unmapped. */ - VM_BUG_ON(!migrate->vma); - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE)) + VM_BUG_ON(!migrate); + addr = migrate->start + i*PAGE_SIZE; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) continue; if (!notified) { notified = true; @@ -726,7 +734,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) mmu_notifier_invalidate_range_start(&range); } migrate_vma_insert_page(migrate, addr, newpage, - &migrate->src[i]); + &src_pfns[i]); continue; } @@ -739,18 +747,18 @@ void migrate_vma_pages(struct migrate_vma *migrate) * device private or coherent memory. */ if (mapping) { - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } } else if (is_zone_device_page(newpage)) { /* * Other types of ZONE_DEVICE page are not supported. */ - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; continue; } - if (migrate->fault_page == page) + if (migrate && migrate->fault_page == page) r = migrate_folio_extra(mapping, page_folio(newpage), page_folio(page), MIGRATE_SYNC_NO_COPY, 1); @@ -758,7 +766,7 @@ void migrate_vma_pages(struct migrate_vma *migrate) r = migrate_folio(mapping, page_folio(newpage), page_folio(page), MIGRATE_SYNC_NO_COPY); if (r != MIGRATEPAGE_SUCCESS) - migrate->src[i] &= ~MIGRATE_PFN_MIGRATE; + src_pfns[i] &= ~MIGRATE_PFN_MIGRATE; } /* @@ -769,28 +777,30 @@ void migrate_vma_pages(struct migrate_vma *migrate) if (notified) mmu_notifier_invalidate_range_only_end(&range); } -EXPORT_SYMBOL(migrate_vma_pages); /** - * migrate_vma_finalize() - restore CPU page table entry + * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information * - * This replaces the special migration pte entry with either a mapping to the - * new page if migration was successful for that page, or to the original page - * otherwise. - * - * This also unlocks the pages and puts them back on the lru, or drops the extra - * refcount, for device pages. + * This migrates struct page meta-data from source struct page to destination + * struct page. This effectively finishes the migration from source page to the + * destination page. */ -void migrate_vma_finalize(struct migrate_vma *migrate) +void migrate_vma_pages(struct migrate_vma *migrate) +{ + migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); +} +EXPORT_SYMBOL(migrate_vma_pages); + +static void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { - const unsigned long npages = migrate->npages; unsigned long i; for (i = 0; i < npages; i++) { struct folio *dst, *src; - struct page *newpage = migrate_pfn_to_page(migrate->dst[i]); - struct page *page = migrate_pfn_to_page(migrate->src[i]); + struct page *newpage = migrate_pfn_to_page(dst_pfns[i]); + struct page *page = migrate_pfn_to_page(src_pfns[i]); if (!page) { if (newpage) { @@ -800,7 +810,7 @@ void migrate_vma_finalize(struct migrate_vma *migrate) continue; } - if (!(migrate->src[i] & MIGRATE_PFN_MIGRATE) || !newpage) { + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) { if (newpage) { unlock_page(newpage); put_page(newpage); @@ -827,6 +837,22 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } } } + +/** + * migrate_vma_finalize() - restore CPU page table entry + * @migrate: migrate struct containing all migration information + * + * This replaces the special migration pte entry with either a mapping to the + * new page if migration was successful for that page, or to the original page + * otherwise. + * + * This also unlocks the pages and puts them back on the lru, or drops the extra + * refcount, for device pages. + */ +void migrate_vma_finalize(struct migrate_vma *migrate) +{ + migrate_device_finalize(migrate->src, migrate->dst, migrate->npages); +} EXPORT_SYMBOL(migrate_vma_finalize); /* @@ -837,25 +863,19 @@ EXPORT_SYMBOL(migrate_vma_finalize); int migrate_device_coherent_page(struct page *page) { unsigned long src_pfn, dst_pfn = 0; - struct migrate_vma args; struct page *dpage; WARN_ON_ONCE(PageCompound(page)); lock_page(page); src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE; - args.src = &src_pfn; - args.dst = &dst_pfn; - args.cpages = 1; - args.npages = 1; - args.vma = NULL; /* * We don't have a VMA and don't need to walk the page tables to find * the source page. So call migrate_vma_unmap() directly to unmap the * page as migrate_vma_setup() will fail if args.vma == NULL. */ - migrate_vma_unmap(&args); + migrate_device_unmap(&src_pfn, 1, NULL); if (!(src_pfn & MIGRATE_PFN_MIGRATE)) return -EBUSY; @@ -865,10 +885,10 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_vma_pages(&args); + migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); - migrate_vma_finalize(&args); + migrate_device_finalize(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) return 0; -- cgit v1.2.3 From e778406b40dbb1342a1888cd751ca9d2982a12e2 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 28 Sep 2022 22:01:19 +1000 Subject: mm/migrate_device.c: add migrate_device_range() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Device drivers can use the migrate_vma family of functions to migrate existing private anonymous mappings to device private pages. These pages are backed by memory on the device with drivers being responsible for copying data to and from device memory. Device private pages are freed via the pgmap->page_free() callback when they are unmapped and their refcount drops to zero. Alternatively they may be freed indirectly via migration back to CPU memory in response to a pgmap->migrate_to_ram() callback called whenever the CPU accesses an address mapped to a device private page. In other words drivers cannot control the lifetime of data allocated on the devices and must wait until these pages are freed from userspace. This causes issues when memory needs to reclaimed on the device, either because the device is going away due to a ->release() callback or because another user needs to use the memory. Drivers could use the existing migrate_vma functions to migrate data off the device. However this would require them to track the mappings of each page which is both complicated and not always possible. Instead drivers need to be able to migrate device pages directly so they can free up device memory. To allow that this patch introduces the migrate_device family of functions which are functionally similar to migrate_vma but which skips the initial lookup based on mapping. Link: https://lkml.kernel.org/r/868116aab70b0c8ee467d62498bb2cf0ef907295.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Cc: "Huang, Ying" Cc: Zi Yan Cc: Matthew Wilcox Cc: Yang Shi Cc: David Hildenbrand Cc: Ralph Campbell Cc: John Hubbard Cc: Alex Deucher Cc: Alex Sierra Cc: Ben Skeggs Cc: Christian König Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Lyude Paul Cc: Michael Ellerman Signed-off-by: Andrew Morton --- include/linux/migrate.h | 7 ++++ mm/migrate_device.c | 89 +++++++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 89 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 52090d1f9230..3ef77f52a4f0 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -210,6 +210,13 @@ struct migrate_vma { int migrate_vma_setup(struct migrate_vma *args); void migrate_vma_pages(struct migrate_vma *migrate); void migrate_vma_finalize(struct migrate_vma *migrate); +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages); +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages); +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages); + #endif /* CONFIG_MIGRATION */ #endif /* _LINUX_MIGRATE_H */ diff --git a/mm/migrate_device.c b/mm/migrate_device.c index 7707c1d898f5..6fa682eef7a0 100644 --- a/mm/migrate_device.c +++ b/mm/migrate_device.c @@ -693,7 +693,7 @@ abort: *src &= ~MIGRATE_PFN_MIGRATE; } -static void migrate_device_pages(unsigned long *src_pfns, +static void __migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, unsigned long npages, struct migrate_vma *migrate) { @@ -715,6 +715,9 @@ static void migrate_device_pages(unsigned long *src_pfns, if (!page) { unsigned long addr; + if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) + continue; + /* * The only time there is no vma is when called from * migrate_device_coherent_page(). However this isn't @@ -722,8 +725,6 @@ static void migrate_device_pages(unsigned long *src_pfns, */ VM_BUG_ON(!migrate); addr = migrate->start + i*PAGE_SIZE; - if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE)) - continue; if (!notified) { notified = true; @@ -778,6 +779,22 @@ static void migrate_device_pages(unsigned long *src_pfns, mmu_notifier_invalidate_range_only_end(&range); } +/** + * migrate_device_pages() - migrate meta-data from src page to dst page + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Equivalent to migrate_vma_pages(). This is called to migrate struct page + * meta-data from source struct page to destination. + */ +void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns, + unsigned long npages) +{ + __migrate_device_pages(src_pfns, dst_pfns, npages, NULL); +} +EXPORT_SYMBOL(migrate_device_pages); + /** * migrate_vma_pages() - migrate meta-data from src page to dst page * @migrate: migrate struct containing all migration information @@ -788,12 +805,22 @@ static void migrate_device_pages(unsigned long *src_pfns, */ void migrate_vma_pages(struct migrate_vma *migrate) { - migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); + __migrate_device_pages(migrate->src, migrate->dst, migrate->npages, migrate); } EXPORT_SYMBOL(migrate_vma_pages); -static void migrate_device_finalize(unsigned long *src_pfns, - unsigned long *dst_pfns, unsigned long npages) +/* + * migrate_device_finalize() - complete page migration + * @src_pfns: src_pfns returned from migrate_device_range() + * @dst_pfns: array of pfns allocated by the driver to migrate memory to + * @npages: number of pages in the range + * + * Completes migration of the page by removing special migration entries. + * Drivers must ensure copying of page data is complete and visible to the CPU + * before calling this. + */ +void migrate_device_finalize(unsigned long *src_pfns, + unsigned long *dst_pfns, unsigned long npages) { unsigned long i; @@ -837,6 +864,7 @@ static void migrate_device_finalize(unsigned long *src_pfns, } } } +EXPORT_SYMBOL(migrate_device_finalize); /** * migrate_vma_finalize() - restore CPU page table entry @@ -855,6 +883,53 @@ void migrate_vma_finalize(struct migrate_vma *migrate) } EXPORT_SYMBOL(migrate_vma_finalize); +/** + * migrate_device_range() - migrate device private pfns to normal memory. + * @src_pfns: array large enough to hold migrating source device private pfns. + * @start: starting pfn in the range to migrate. + * @npages: number of pages to migrate. + * + * migrate_vma_setup() is similar in concept to migrate_vma_setup() except that + * instead of looking up pages based on virtual address mappings a range of + * device pfns that should be migrated to system memory is used instead. + * + * This is useful when a driver needs to free device memory but doesn't know the + * virtual mappings of every page that may be in device memory. For example this + * is often the case when a driver is being unloaded or unbound from a device. + * + * Like migrate_vma_setup() this function will take a reference and lock any + * migrating pages that aren't free before unmapping them. Drivers may then + * allocate destination pages and start copying data from the device to CPU + * memory before calling migrate_device_pages(). + */ +int migrate_device_range(unsigned long *src_pfns, unsigned long start, + unsigned long npages) +{ + unsigned long i, pfn; + + for (pfn = start, i = 0; i < npages; pfn++, i++) { + struct page *page = pfn_to_page(pfn); + + if (!get_page_unless_zero(page)) { + src_pfns[i] = 0; + continue; + } + + if (!trylock_page(page)) { + src_pfns[i] = 0; + put_page(page); + continue; + } + + src_pfns[i] = migrate_pfn(pfn) | MIGRATE_PFN_MIGRATE; + } + + migrate_device_unmap(src_pfns, npages, NULL); + + return 0; +} +EXPORT_SYMBOL(migrate_device_range); + /* * Migrate a device coherent page back to normal memory. The caller should have * a reference on page which will be copied to the new page if migration is @@ -885,7 +960,7 @@ int migrate_device_coherent_page(struct page *page) dst_pfn = migrate_pfn(page_to_pfn(dpage)); } - migrate_device_pages(&src_pfn, &dst_pfn, 1, NULL); + migrate_device_pages(&src_pfn, &dst_pfn, 1); if (src_pfn & MIGRATE_PFN_MIGRATE) copy_highpage(dpage, page); migrate_device_finalize(&src_pfn, &dst_pfn, 1); -- cgit v1.2.3 From d6e5040bd8e53371fafd7e0c7c63b090b3a675db Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 26 Sep 2022 20:08:47 +0200 Subject: kasan: fix array-bounds warnings in tests GCC's -Warray-bounds option detects out-of-bounds accesses to statically-sized allocations in krealloc out-of-bounds tests. Use OPTIMIZER_HIDE_VAR to suppress the warning. Also change kmalloc_memmove_invalid_size to use OPTIMIZER_HIDE_VAR instead of a volatile variable. Link: https://lkml.kernel.org/r/e94399242d32e00bba6fd0d9ec4c897f188128e8.1664215688.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reported-by: kernel test robot Reviewed-by: Kees Cook Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- mm/kasan/kasan_test.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/kasan/kasan_test.c b/mm/kasan/kasan_test.c index f25692def781..57e4c72aa8bd 100644 --- a/mm/kasan/kasan_test.c +++ b/mm/kasan/kasan_test.c @@ -295,6 +295,9 @@ static void krealloc_more_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* All offsets up to size2 must be accessible. */ ptr2[size1 - 1] = 'x'; ptr2[size1] = 'x'; @@ -327,6 +330,9 @@ static void krealloc_less_oob_helper(struct kunit *test, ptr2 = krealloc(ptr1, size2, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + /* Suppress -Warray-bounds warnings. */ + OPTIMIZER_HIDE_VAR(ptr2); + /* Must be accessible for all modes. */ ptr2[size2 - 1] = 'x'; @@ -540,13 +546,14 @@ static void kmalloc_memmove_invalid_size(struct kunit *test) { char *ptr; size_t size = 64; - volatile size_t invalid_size = size; + size_t invalid_size = size; ptr = kmalloc(size, GFP_KERNEL); KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); memset((char *)ptr, 0, 64); OPTIMIZER_HIDE_VAR(ptr); + OPTIMIZER_HIDE_VAR(invalid_size); KUNIT_EXPECT_KASAN_FAIL(test, memmove((char *)ptr, (char *)ptr + 4, invalid_size)); kfree(ptr); -- cgit v1.2.3 From bce8cb3c04dc01d21b6b17baf1cb6c277e7e6848 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 29 Sep 2022 19:23:17 +0800 Subject: mm: use update_mmu_tlb() on the second thread As message in commit 7df676974359 ("mm/memory.c: Update local TLB if PTE entry exists") said, we should update local TLB only on the second thread. So in the do_anonymous_page() here, we should use update_mmu_tlb() instead of update_mmu_cache() on the second thread. As David pointed out, this is a performance improvement, not a correctness fix. Link: https://lkml.kernel.org/r/20220929112318.32393-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Bibo Mao Cc: Chris Zankel Cc: Huacai Chen Cc: Max Filippov Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 4ad6077164cd..f88c351aecd4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4134,7 +4134,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { - update_mmu_cache(vma, vmf->address, vmf->pte); + update_mmu_tlb(vma, vmf->address, vmf->pte); goto release; } -- cgit v1.2.3 From 2ea7ff1e39cbe3753d3c649beb70f2cf861dca75 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:33:58 -0400 Subject: mm/hugetlb: fix race condition of uffd missing/minor handling Patch series "mm/hugetlb: Fix selftest failures with write check", v3. Currently akpm mm-unstable fails with uffd hugetlb private mapping test randomly on a write check. The initial bisection of that points to the recent pmd unshare series, but it turns out there's no direction relationship with the series but only some timing change caused the race to start trigger. The race should be fixed in patch 1. Patch 2 is a trivial cleanup on the similar race with hugetlb migrations, patch 3 comment on the write check so when anyone read it again it'll be clear why it's there. This patch (of 3): After the recent rework patchset of hugetlb locking on pmd sharing, kselftest for userfaultfd sometimes fails on hugetlb private tests with unexpected write fault checks. It turns out there's nothing wrong within the locking series regarding this matter, but it could have changed the timing of threads so it can trigger an old bug. The real bug is when we call hugetlb_no_page() we're not with the pgtable lock. It means we're reading the pte values lockless. It's perfectly fine in most cases because before we do normal page allocations we'll take the lock and check pte_same() again. However before that, there are actually two paths on userfaultfd missing/minor handling that may directly move on with the fault process without checking the pte values. It means for these two paths we may be generating an uffd message based on an unstable pte, while an unstable pte can legally be anything as long as the modifier holds the pgtable lock. One example, which is also what happened in the failing kselftest and caused the test failure, is that for private mappings wr-protection changes can happen on one page. While hugetlb_change_protection() generally requires pte being cleared before being changed, then there can be a race condition like: thread 1 thread 2 -------- -------- UFFDIO_WRITEPROTECT hugetlb_fault hugetlb_change_protection pgtable_lock() huge_ptep_modify_prot_start pte==NULL hugetlb_no_page generate uffd missing event even if page existed!! huge_ptep_modify_prot_commit pgtable_unlock() Fix this by rechecking the pte after pgtable lock for both userfaultfd missing & minor fault paths. This bug should have been around starting from uffd hugetlb introduced, so attaching a Fixes to the commit. Also attach another Fixes to the minor support commit for easier tracking. Note that userfaultfd is actually fine with false positives (e.g. caused by pte changed), but not wrong logical events (e.g. caused by reading a pte during changing). The latter can confuse the userspace, so the strictness is very much preferred. E.g., MISSING event should never happen on the page after UFFDIO_COPY has correctly installed the page and returned. Link: https://lkml.kernel.org/r/20221004193400.110155-1-peterx@redhat.com Link: https://lkml.kernel.org/r/20221004193400.110155-2-peterx@redhat.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Fixes: 7677f7fd8be7 ("userfaultfd: add minor fault registration mode") Signed-off-by: Peter Xu Co-developed-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Nadav Amit Cc: David Hildenbrand Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/hugetlb.c | 59 ++++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a910612336d..bf9d8d04bf4f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5535,6 +5535,23 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, return handle_userfault(&vmf, reason); } +/* + * Recheck pte with pgtable lock. Returns true if pte didn't change, or + * false if pte changed or is changing. + */ +static bool hugetlb_pte_stable(struct hstate *h, struct mm_struct *mm, + pte_t *ptep, pte_t old_pte) +{ + spinlock_t *ptl; + bool same; + + ptl = huge_pte_lock(h, mm, ptep); + same = pte_same(huge_ptep_get(ptep), old_pte); + spin_unlock(ptl); + + return same; +} + static vm_fault_t hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct address_space *mapping, pgoff_t idx, @@ -5575,10 +5592,33 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MISSING); + if (userfaultfd_missing(vma)) { + /* + * Since hugetlb_no_page() was examining pte + * without pgtable lock, we need to re-test under + * lock because the pte may not be stable and could + * have changed from under us. Try to detect + * either changed or during-changing ptes and retry + * properly when needed. + * + * Note that userfaultfd is actually fine with + * false positives (e.g. caused by pte changed), + * but not wrong logical events (e.g. caused by + * reading a pte during changing). The latter can + * confuse the userspace, so the strictness is very + * much preferred. E.g., MISSING event should + * never happen on the page after UFFDIO_COPY has + * correctly installed the page and returned. + */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MISSING); + } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5644,9 +5684,14 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - return hugetlb_handle_userfault(vma, mapping, idx, - flags, haddr, address, - VM_UFFD_MINOR); + /* See comment in userfaultfd_missing() block above */ + if (!hugetlb_pte_stable(h, mm, ptep, old_pte)) { + ret = 0; + goto out; + } + return hugetlb_handle_userfault(vma, mapping, idx, flags, + haddr, address, + VM_UFFD_MINOR); } } -- cgit v1.2.3 From f9bf6c03eca1077cae8de0e6d86427656fa42a9b Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 4 Oct 2022 15:33:59 -0400 Subject: mm/hugetlb: use hugetlb_pte_stable in migration race check After hugetlb_pte_stable() introduced, we can also rewrite the migration race condition against page allocation to use the new helper too. Link: https://lkml.kernel.org/r/20221004193400.110155-3-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Mike Kravetz Reviewed-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Axel Rasmussen Cc: Mike Rapoport Cc: Nadav Amit Signed-off-by: Andrew Morton --- mm/hugetlb.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bf9d8d04bf4f..9b26055f3119 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5634,11 +5634,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * here. Before returning error, get ptl and make * sure there really is no pte entry. */ - ptl = huge_pte_lock(h, mm, ptep); - ret = 0; - if (huge_pte_none(huge_ptep_get(ptep))) + if (hugetlb_pte_stable(h, mm, ptep, old_pte)) ret = vmf_error(PTR_ERR(page)); - spin_unlock(ptl); + else + ret = 0; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); -- cgit v1.2.3 From 15cd90049d595e592d8860ee15a3f23491d54d17 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Thu, 6 Oct 2022 10:15:40 +0000 Subject: mm/page_alloc: fix incorrect PGFREE and PGALLOC for high-order page PGFREE and PGALLOC represent the number of freed and allocated pages. So the page order must be considered. Link: https://lkml.kernel.org/r/20221006101540.40686-1-laoar.shao@gmail.com Fixes: 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists") Signed-off-by: Yafang Shao Acked-by: Mel Gorman Reviewed-by: Miaohe Lin Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 059f6946832f..8e9b7f08a32c 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3446,7 +3446,7 @@ static void free_unref_page_commit(struct zone *zone, struct per_cpu_pages *pcp, int pindex; bool free_high; - __count_vm_event(PGFREE); + __count_vm_events(PGFREE, 1 << order); pindex = order_to_pindex(migratetype, order); list_add(&page->pcp_list, &pcp->lists[pindex]); pcp->count += 1 << order; @@ -3803,7 +3803,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, pcp_spin_unlock_irqrestore(pcp, flags); pcp_trylock_finish(UP_flags); if (page) { - __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); + __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone, 1); } return page; -- cgit v1.2.3 From ef6e06b2ef87077104d1145a0fd452ff8dbbc4b7 Mon Sep 17 00:00:00 2001 From: Ira Weiny Date: Wed, 5 Oct 2022 21:05:55 -0700 Subject: highmem: fix kmap_to_page() for kmap_local_page() addresses kmap_to_page() is used to get the page for a virtual address which may be kmap'ed. Unfortunately, kmap_local_page() stores mappings in a thread local array separate from kmap(). These mappings were not checked by the call. Check the kmap_local_page() mappings and return the page if found. Because it is intended to remove kmap_to_page() add a warn on once to the kmap checks to flag potential issues early. NOTE Due to 32bit x86 use of kmap local in iomap atmoic, KMAP_LOCAL does not require HIGHMEM to be set. Therefore the support calls required a new KMAP_LOCAL section to fix 0day build errors. [akpm@linux-foundation.org: fix warning] Link: https://lkml.kernel.org/r/20221006040555.1502679-1-ira.weiny@intel.com Signed-off-by: Ira Weiny Reported-by: Al Viro Reported-by: kernel test robot Cc: "Fabio M. De Francesco" Cc: Thomas Gleixner Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- mm/highmem.c | 43 +++++++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index c707d7202d5f..db251e77f98f 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -30,6 +30,17 @@ #include #include +#ifdef CONFIG_KMAP_LOCAL +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif +#endif /* CONFIG_KMAP_LOCAL */ + /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -142,12 +153,29 @@ pte_t *pkmap_page_table; struct page *__kmap_to_page(void *vaddr) { + unsigned long base = (unsigned long) vaddr & PAGE_MASK; + struct kmap_ctrl *kctrl = ¤t->kmap_ctrl; unsigned long addr = (unsigned long)vaddr; + int i; + + /* kmap() mappings */ + if (WARN_ON_ONCE(addr >= PKMAP_ADDR(0) && + addr < PKMAP_ADDR(LAST_PKMAP))) + return pte_page(pkmap_page_table[PKMAP_NR(addr)]); - if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { - int i = PKMAP_NR(addr); + /* kmap_local_page() mappings */ + if (WARN_ON_ONCE(base >= __fix_to_virt(FIX_KMAP_END) && + base < __fix_to_virt(FIX_KMAP_BEGIN))) { + for (i = 0; i < kctrl->idx; i++) { + unsigned long base_addr; + int idx; - return pte_page(pkmap_page_table[i]); + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + base_addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + + if (base_addr == base) + return pte_page(kctrl->pteval[i]); + } } return virt_to_page(vaddr); @@ -462,10 +490,6 @@ static inline void kmap_local_idx_pop(void) # define arch_kmap_local_post_unmap(vaddr) do { } while (0) #endif -#ifndef arch_kmap_local_map_idx -#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) -#endif - #ifndef arch_kmap_local_unmap_idx #define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) #endif @@ -494,11 +518,6 @@ static inline bool kmap_high_unmap_local(unsigned long vaddr) return false; } -static inline int kmap_local_calc_idx(int idx) -{ - return idx + KM_MAX_IDX * smp_processor_id(); -} - static pte_t *__kmap_pte; static pte_t *kmap_get_pte(unsigned long vaddr, int idx) -- cgit v1.2.3