From bc0c3357601e3ff1b006600530079bd246ef0d82 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Wed, 23 Aug 2023 19:05:56 +0200 Subject: mm: remove remnants of SPLIT_RSS_COUNTING The feature got retired in f1a7941243c1 ("mm: convert mm's rss stats into percpu_counter"), but the patch failed to fully clean it up. Link: https://lkml.kernel.org/r/20230823170556.2281747-1-mjguzik@gmail.com Signed-off-by: Mateusz Guzik Acked-by: Shakeel Butt Signed-off-by: Andrew Morton --- fs/exec.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 6518e33ea813..eb039041ad6a 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -986,8 +986,6 @@ static int exec_mmap(struct mm_struct *mm) tsk = current; old_mm = current->mm; exec_mm_release(tsk, old_mm); - if (old_mm) - sync_mm_rss(old_mm); ret = down_write_killable(&tsk->signal->exec_update_lock); if (ret) -- cgit v1.2.3 From 91e79d22be75fec88ae58d274a7c9e49d6215099 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 23 Aug 2023 00:13:14 +0100 Subject: mm: convert DAX lock/unlock page to lock/unlock folio The one caller of DAX lock/unlock page already calls compound_head(), so use page_folio() instead, then use a folio throughout the DAX code to remove uses of page->mapping and page->index. [jane.chu@oracle.com: add comment to mf_generic_kill_procss(), simplify mf_generic_kill_procs:folio initialization] Link: https://lkml.kernel.org/r/20230908222336.186313-1-jane.chu@oracle.com Link: https://lkml.kernel.org/r/20230822231314.349200-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Jane Chu Acked-by: Naoya Horiguchi Cc: Dan Williams Cc: Jane Chu Signed-off-by: Andrew Morton --- fs/dax.c | 24 ++++++++++++------------ include/linux/dax.h | 10 +++++----- mm/memory-failure.c | 29 ++++++++++++++++------------- 3 files changed, 33 insertions(+), 30 deletions(-) (limited to 'fs') diff --git a/fs/dax.c b/fs/dax.c index 8fafecbe42b1..3380b43cb6bb 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -412,23 +412,23 @@ static struct page *dax_busy_page(void *entry) return NULL; } -/* - * dax_lock_page - Lock the DAX entry corresponding to a page - * @page: The page whose entry we want to lock +/** + * dax_lock_folio - Lock the DAX entry corresponding to a folio + * @folio: The folio whose entry we want to lock * * Context: Process context. - * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could + * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could * not be locked. */ -dax_entry_t dax_lock_page(struct page *page) +dax_entry_t dax_lock_folio(struct folio *folio) { XA_STATE(xas, NULL, 0); void *entry; - /* Ensure page->mapping isn't freed while we look at it */ + /* Ensure folio->mapping isn't freed while we look at it */ rcu_read_lock(); for (;;) { - struct address_space *mapping = READ_ONCE(page->mapping); + struct address_space *mapping = READ_ONCE(folio->mapping); entry = NULL; if (!mapping || !dax_mapping(mapping)) @@ -447,11 +447,11 @@ dax_entry_t dax_lock_page(struct page *page) xas.xa = &mapping->i_pages; xas_lock_irq(&xas); - if (mapping != page->mapping) { + if (mapping != folio->mapping) { xas_unlock_irq(&xas); continue; } - xas_set(&xas, page->index); + xas_set(&xas, folio->index); entry = xas_load(&xas); if (dax_is_locked(entry)) { rcu_read_unlock(); @@ -467,10 +467,10 @@ dax_entry_t dax_lock_page(struct page *page) return (dax_entry_t)entry; } -void dax_unlock_page(struct page *page, dax_entry_t cookie) +void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { - struct address_space *mapping = page->mapping; - XA_STATE(xas, &mapping->i_pages, page->index); + struct address_space *mapping = folio->mapping; + XA_STATE(xas, &mapping->i_pages, folio->index); if (S_ISCHR(mapping->host->i_mode)) return; diff --git a/include/linux/dax.h b/include/linux/dax.h index 22cd9902345d..b463502b16e1 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -159,8 +159,8 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct page *dax_layout_busy_page(struct address_space *mapping); struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end); -dax_entry_t dax_lock_page(struct page *page); -void dax_unlock_page(struct page *page, dax_entry_t cookie); +dax_entry_t dax_lock_folio(struct folio *folio); +void dax_unlock_folio(struct folio *folio, dax_entry_t cookie); dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, unsigned long index, struct page **page); void dax_unlock_mapping_entry(struct address_space *mapping, @@ -182,14 +182,14 @@ static inline int dax_writeback_mapping_range(struct address_space *mapping, return -EOPNOTSUPP; } -static inline dax_entry_t dax_lock_page(struct page *page) +static inline dax_entry_t dax_lock_folio(struct folio *folio) { - if (IS_DAX(page->mapping->host)) + if (IS_DAX(folio->mapping->host)) return ~0UL; return 0; } -static inline void dax_unlock_page(struct page *page, dax_entry_t cookie) +static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie) { } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 4d6e43c88489..660c21859118 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1713,20 +1713,23 @@ static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, kill_procs(to_kill, flags & MF_MUST_KILL, false, pfn, flags); } +/* + * Only dev_pagemap pages get here, such as fsdax when the filesystem + * either do not claim or fails to claim a hwpoison event, or devdax. + * The fsdax pages are initialized per base page, and the devdax pages + * could be initialized either as base pages, or as compound pages with + * vmemmap optimization enabled. Devdax is simplistic in its dealing with + * hwpoison, such that, if a subpage of a compound page is poisoned, + * simply mark the compound head page is by far sufficient. + */ static int mf_generic_kill_procs(unsigned long long pfn, int flags, struct dev_pagemap *pgmap) { - struct page *page = pfn_to_page(pfn); + struct folio *folio = pfn_folio(pfn); LIST_HEAD(to_kill); dax_entry_t cookie; int rc = 0; - /* - * Pages instantiated by device-dax (not filesystem-dax) - * may be compound pages. - */ - page = compound_head(page); - /* * Prevent the inode from being freed while we are interrogating * the address_space, typically this would be handled by @@ -1734,11 +1737,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * also prevents changes to the mapping of this pfn until * poison signaling is complete. */ - cookie = dax_lock_page(page); + cookie = dax_lock_folio(folio); if (!cookie) return -EBUSY; - if (hwpoison_filter(page)) { + if (hwpoison_filter(&folio->page)) { rc = -EOPNOTSUPP; goto unlock; } @@ -1760,7 +1763,7 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * Use this flag as an indication that the dax page has been * remapped UC to prevent speculative consumption of poison. */ - SetPageHWPoison(page); + SetPageHWPoison(&folio->page); /* * Unlike System-RAM there is no possibility to swap in a @@ -1769,11 +1772,11 @@ static int mf_generic_kill_procs(unsigned long long pfn, int flags, * SIGBUS (i.e. MF_MUST_KILL) */ flags |= MF_ACTION_REQUIRED | MF_MUST_KILL; - collect_procs(page, &to_kill, true); + collect_procs(&folio->page, &to_kill, true); - unmap_and_kill(&to_kill, pfn, page->mapping, page->index, flags); + unmap_and_kill(&to_kill, pfn, folio->mapping, folio->index, flags); unlock: - dax_unlock_page(page, cookie); + dax_unlock_folio(folio, cookie); return rc; } -- cgit v1.2.3 From b1e5a3dee255a11cbdd5a0e814829276bd33a793 Mon Sep 17 00:00:00 2001 From: "Joel Fernandes (Google)" Date: Sun, 3 Sep 2023 15:13:23 +0000 Subject: mm/mremap: allow moves within the same VMA for stack moves For the stack move happening in shift_arg_pages(), the move is happening within the same VMA which spans the old and new ranges. In case the aligned address happens to fall within that VMA, allow such moves and don't abort the mremap alignment optimization. In the regular non-stack mremap case, we cannot allow any such moves as will end up destroying some part of the mapping (either the source of the move, or part of the existing mapping). So just avoid it for stack moves. Link: https://lkml.kernel.org/r/20230903151328.2981432-3-joel@joelfernandes.org Signed-off-by: Joel Fernandes (Google) Reviewed-by: Lorenzo Stoakes Cc: Kalesh Singh Cc: "Kirill A. Shutemov" Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Lokesh Gidra Cc: Michal Hocko Cc: Paul E. McKenney Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/exec.c | 2 +- include/linux/mm.h | 2 +- mm/mremap.c | 33 +++++++++++++++++++-------------- 3 files changed, 21 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index eb039041ad6a..4aa19b24f281 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -713,7 +713,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * process cleanup to remove whatever mess we made. */ if (length != move_page_tables(vma, old_start, - vma, new_start, length, false)) + vma, new_start, length, false, true)) return -ENOMEM; lru_add_drain(); diff --git a/include/linux/mm.h b/include/linux/mm.h index 7613150acab9..21c3d4e8a282 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2480,7 +2480,7 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen); extern unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks); + bool need_rmap_locks, bool for_stack); /* * Flags used by change_protection(). For now we make it a bitmap so diff --git a/mm/mremap.c b/mm/mremap.c index e2b65a17148e..ce8a23ef325a 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -490,12 +490,13 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, } /* - * A helper to check if a previous mapping exists. Required for - * move_page_tables() and realign_addr() to determine if a previous mapping - * exists before we can do realignment optimizations. + * A helper to check if aligning down is OK. The aligned address should fall + * on *no mapping*. For the stack moving down, that's a special move within + * the VMA that is created to span the source and destination of the move, + * so we make an exception for it. */ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_align, - unsigned long mask) + unsigned long mask, bool for_stack) { unsigned long addr_masked = addr_to_align & mask; @@ -504,9 +505,13 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali * of the corresponding VMA, we can't align down or we will destroy part * of the current mapping. */ - if (vma->vm_start != addr_to_align) + if (!for_stack && vma->vm_start != addr_to_align) return false; + /* In the stack case we explicitly permit in-VMA alignment. */ + if (for_stack && addr_masked >= vma->vm_start) + return true; + /* * Make sure the realignment doesn't cause the address to fall on an * existing mapping. @@ -517,7 +522,7 @@ static bool can_align_down(struct vm_area_struct *vma, unsigned long addr_to_ali /* Opportunistically realign to specified boundary for faster copy. */ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old_vma, unsigned long *new_addr, struct vm_area_struct *new_vma, - unsigned long mask) + unsigned long mask, bool for_stack) { /* Skip if the addresses are already aligned. */ if ((*old_addr & ~mask) == 0) @@ -528,8 +533,8 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old return; /* Ensure realignment doesn't cause overlap with existing mappings. */ - if (!can_align_down(old_vma, *old_addr, mask) || - !can_align_down(new_vma, *new_addr, mask)) + if (!can_align_down(old_vma, *old_addr, mask, for_stack) || + !can_align_down(new_vma, *new_addr, mask, for_stack)) return; *old_addr = *old_addr & mask; @@ -539,7 +544,7 @@ static void try_realign_addr(unsigned long *old_addr, struct vm_area_struct *old unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long old_addr, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long len, - bool need_rmap_locks) + bool need_rmap_locks, bool for_stack) { unsigned long extent, old_end; struct mmu_notifier_range range; @@ -559,9 +564,9 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * If possible, realign addresses to PMD boundary for faster copy. * Only realign if the mremap copying hits a PMD boundary. */ - if ((vma != new_vma) - && (len >= PMD_SIZE - (old_addr & ~PMD_MASK))) - try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK); + if (len >= PMD_SIZE - (old_addr & ~PMD_MASK)) + try_realign_addr(&old_addr, vma, &new_addr, new_vma, PMD_MASK, + for_stack); flush_cache_range(vma, old_addr, old_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma->vm_mm, @@ -708,7 +713,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, } moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len, - need_rmap_locks); + need_rmap_locks, false); if (moved_len < old_len) { err = -ENOMEM; } else if (vma->vm_ops && vma->vm_ops->mremap) { @@ -722,7 +727,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, * and then proceed to unmap new area instead of old. */ move_page_tables(new_vma, new_addr, vma, old_addr, moved_len, - true); + true, false); vma = new_vma; old_len = new_len; old_addr = new_addr; -- cgit v1.2.3 From e538a582097878c536c68002c79722e4a037c080 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Mon, 11 Sep 2023 14:21:14 +0300 Subject: proc/kcore: do not try to access unaccepted memory Support for unaccepted memory was added recently, refer commit dcdfdd40fa82 ("mm: Add support for unaccepted memory"), whereby a virtual machine may need to accept memory before it can be used. Do not try to access unaccepted memory because it can cause the guest to fail. For /proc/kcore, which is read-only and does not support mmap, this means a read of unaccepted memory will return zeros. Link: https://lkml.kernel.org/r/20230911112114.91323-3-adrian.hunter@intel.com Signed-off-by: Adrian Hunter Reviewed-by: David Hildenbrand Cc: Ard Biesheuvel Cc: Baoquan He Cc: Borislav Petkov (AMD) Cc: Dave Hansen Cc: Dave Young Cc: Kirill A. Shutemov Cc: Lorenzo Stoakes Cc: Mike Rapoport Cc: Tom Lendacky Cc: Vivek Goyal Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/proc/kcore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 23fc24d16b31..6422e569b080 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -546,7 +546,8 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) * and explicitly excluded physical ranges. */ if (!page || PageOffline(page) || - is_page_hwpoison(page) || !pfn_is_ram(pfn)) { + is_page_hwpoison(page) || !pfn_is_ram(pfn) || + pfn_is_unaccepted_memory(pfn)) { if (iov_iter_zero(tsz, iter) != tsz) { ret = -EFAULT; goto out; -- cgit v1.2.3 From 557936ee8dc6bd0d5e078f415441d9f2a3fcca23 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:05 +0800 Subject: erofs: dynamically allocate the erofs-shrinker Use new APIs to dynamically allocate the erofs-shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-7-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Chao Yu Reviewed-by: Gao Xiang Cc: Yue Hu Cc: Jeffle Xu Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Signed-off-by: Andrew Morton --- fs/erofs/utils.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index cc6fb9e98899..e9c25cd7b601 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -270,19 +270,24 @@ static unsigned long erofs_shrink_scan(struct shrinker *shrink, return freed; } -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *erofs_shrinker_info; int __init erofs_init_shrinker(void) { - return register_shrinker(&erofs_shrinker_info, "erofs-shrinker"); + erofs_shrinker_info = shrinker_alloc(0, "erofs-shrinker"); + if (!erofs_shrinker_info) + return -ENOMEM; + + erofs_shrinker_info->count_objects = erofs_shrink_count; + erofs_shrinker_info->scan_objects = erofs_shrink_scan; + + shrinker_register(erofs_shrinker_info); + + return 0; } void erofs_exit_shrinker(void) { - unregister_shrinker(&erofs_shrinker_info); + shrinker_free(erofs_shrinker_info); } #endif /* !CONFIG_EROFS_FS_ZIP */ -- cgit v1.2.3 From bfcba5ba39cbce8957b4908c9dd570dd6097fff3 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:06 +0800 Subject: f2fs: dynamically allocate the f2fs-shrinker Use new APIs to dynamically allocate the f2fs-shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-8-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Reviewed-by: Chao Yu Cc: Jaegeuk Kim Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/f2fs/super.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a8c8232852bb..fe25ff9cebbe 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -83,11 +83,26 @@ void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, #endif /* f2fs-wide shrinker description */ -static struct shrinker f2fs_shrinker_info = { - .scan_objects = f2fs_shrink_scan, - .count_objects = f2fs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *f2fs_shrinker_info; + +static int __init f2fs_init_shrinker(void) +{ + f2fs_shrinker_info = shrinker_alloc(0, "f2fs-shrinker"); + if (!f2fs_shrinker_info) + return -ENOMEM; + + f2fs_shrinker_info->count_objects = f2fs_shrink_count; + f2fs_shrinker_info->scan_objects = f2fs_shrink_scan; + + shrinker_register(f2fs_shrinker_info); + + return 0; +} + +static void f2fs_exit_shrinker(void) +{ + shrinker_free(f2fs_shrinker_info); +} enum { Opt_gc_background, @@ -4944,7 +4959,7 @@ static int __init init_f2fs_fs(void) err = f2fs_init_sysfs(); if (err) goto free_garbage_collection_cache; - err = register_shrinker(&f2fs_shrinker_info, "f2fs-shrinker"); + err = f2fs_init_shrinker(); if (err) goto free_sysfs; err = register_filesystem(&f2fs_fs_type); @@ -4989,7 +5004,7 @@ free_root_stats: f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); free_shrinker: - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); free_sysfs: f2fs_exit_sysfs(); free_garbage_collection_cache: @@ -5021,7 +5036,7 @@ static void __exit exit_f2fs_fs(void) f2fs_destroy_post_read_processing(); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); - unregister_shrinker(&f2fs_shrinker_info); + f2fs_exit_shrinker(); f2fs_exit_sysfs(); f2fs_destroy_garbage_collection_cache(); f2fs_destroy_extent_cache(); -- cgit v1.2.3 From a304c23cd6c96ecdf5f96df66e809993d5e96bf0 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:07 +0800 Subject: gfs2: dynamically allocate the gfs2-glock shrinker Use new APIs to dynamically allocate the gfs2-glock shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-9-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Bob Peterson Cc: Andreas Gruenbacher Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/gfs2/glock.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 4a280be229a6..482291bb08d8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -2041,11 +2041,7 @@ static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(atomic_read(&lru_count)); } -static struct shrinker glock_shrinker = { - .seeks = DEFAULT_SEEKS, - .count_objects = gfs2_glock_shrink_count, - .scan_objects = gfs2_glock_shrink_scan, -}; +static struct shrinker *glock_shrinker; /** * glock_hash_walk - Call a function for glock in a hash bucket @@ -2465,13 +2461,18 @@ int __init gfs2_glock_init(void) return -ENOMEM; } - ret = register_shrinker(&glock_shrinker, "gfs2-glock"); - if (ret) { + glock_shrinker = shrinker_alloc(0, "gfs2-glock"); + if (!glock_shrinker) { destroy_workqueue(glock_workqueue); rhashtable_destroy(&gl_hash_table); - return ret; + return -ENOMEM; } + glock_shrinker->count_objects = gfs2_glock_shrink_count; + glock_shrinker->scan_objects = gfs2_glock_shrink_scan; + + shrinker_register(glock_shrinker); + for (i = 0; i < GLOCK_WAIT_TABLE_SIZE; i++) init_waitqueue_head(glock_wait_table + i); @@ -2480,7 +2481,7 @@ int __init gfs2_glock_init(void) void gfs2_glock_exit(void) { - unregister_shrinker(&glock_shrinker); + shrinker_free(glock_shrinker); rhashtable_destroy(&gl_hash_table); destroy_workqueue(glock_workqueue); } -- cgit v1.2.3 From 8ee0fd9c1085d952ea62c1879620eaa01e4c6332 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:08 +0800 Subject: gfs2: dynamically allocate the gfs2-qd shrinker Use new APIs to dynamically allocate the gfs2-qd shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-10-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Bob Peterson Cc: Andreas Gruenbacher Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/gfs2/main.c | 6 +++--- fs/gfs2/quota.c | 25 +++++++++++++++++++------ fs/gfs2/quota.h | 3 ++- 3 files changed, 24 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index 66eb98b690a2..79be0cdc730c 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -147,7 +147,7 @@ static int __init init_gfs2_fs(void) if (!gfs2_trans_cachep) goto fail_cachep8; - error = register_shrinker(&gfs2_qd_shrinker, "gfs2-qd"); + error = gfs2_qd_shrinker_init(); if (error) goto fail_shrinker; @@ -196,7 +196,7 @@ fail_wq3: fail_wq2: destroy_workqueue(gfs2_recovery_wq); fail_wq1: - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); fail_shrinker: kmem_cache_destroy(gfs2_trans_cachep); fail_cachep8: @@ -229,7 +229,7 @@ fail_lru: static void __exit exit_gfs2_fs(void) { - unregister_shrinker(&gfs2_qd_shrinker); + gfs2_qd_shrinker_exit(); gfs2_glock_exit(); gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 171b2713d2e5..d3d013d1d5ac 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -196,13 +196,26 @@ static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc)); } -struct shrinker gfs2_qd_shrinker = { - .count_objects = gfs2_qd_shrink_count, - .scan_objects = gfs2_qd_shrink_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_NUMA_AWARE, -}; +static struct shrinker *gfs2_qd_shrinker; + +int __init gfs2_qd_shrinker_init(void) +{ + gfs2_qd_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "gfs2-qd"); + if (!gfs2_qd_shrinker) + return -ENOMEM; + + gfs2_qd_shrinker->count_objects = gfs2_qd_shrink_count; + gfs2_qd_shrinker->scan_objects = gfs2_qd_shrink_scan; + + shrinker_register(gfs2_qd_shrinker); + return 0; +} + +void gfs2_qd_shrinker_exit(void) +{ + shrinker_free(gfs2_qd_shrinker); +} static u64 qd2index(struct gfs2_quota_data *qd) { diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 1429945215a0..36f54b426b0c 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -60,7 +60,8 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip, } extern const struct quotactl_ops gfs2_quotactl_ops; -extern struct shrinker gfs2_qd_shrinker; +int __init gfs2_qd_shrinker_init(void); +void gfs2_qd_shrinker_exit(void); extern struct list_lru gfs2_qd_lru; extern void __init gfs2_quota_hash_init(void); -- cgit v1.2.3 From d5dad4929fb42e7f610cda5e1ea51cd76be998d4 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:09 +0800 Subject: NFSv4.2: dynamically allocate the nfs-xattr shrinkers Use new APIs to dynamically allocate the nfs-xattr shrinkers. Link: https://lkml.kernel.org/r/20230911094444.68966-11-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfs/nfs42xattr.c | 87 +++++++++++++++++++++++++++-------------------------- 1 file changed, 44 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index 911f634ba3da..2ad66a8922f4 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -796,28 +796,9 @@ static unsigned long nfs4_xattr_cache_scan(struct shrinker *shrink, static unsigned long nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc); -static struct shrinker nfs4_xattr_cache_shrinker = { - .count_objects = nfs4_xattr_cache_count, - .scan_objects = nfs4_xattr_cache_scan, - .seeks = DEFAULT_SEEKS, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = DEFAULT_SEEKS, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; - -static struct shrinker nfs4_xattr_large_entry_shrinker = { - .count_objects = nfs4_xattr_entry_count, - .scan_objects = nfs4_xattr_entry_scan, - .seeks = 1, - .batch = 512, - .flags = SHRINKER_MEMCG_AWARE, -}; +static struct shrinker *nfs4_xattr_cache_shrinker; +static struct shrinker *nfs4_xattr_entry_shrinker; +static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, @@ -943,7 +924,7 @@ nfs4_xattr_entry_scan(struct shrinker *shrink, struct shrink_control *sc) struct nfs4_xattr_entry *entry; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; freed = list_lru_shrink_walk(lru, sc, entry_lru_isolate, &dispose); @@ -971,7 +952,7 @@ nfs4_xattr_entry_count(struct shrinker *shrink, struct shrink_control *sc) unsigned long count; struct list_lru *lru; - lru = (shrink == &nfs4_xattr_large_entry_shrinker) ? + lru = (shrink == nfs4_xattr_large_entry_shrinker) ? &nfs4_xattr_large_entry_lru : &nfs4_xattr_entry_lru; count = list_lru_shrink_count(lru, sc); @@ -991,18 +972,34 @@ static void nfs4_xattr_cache_init_once(void *p) INIT_LIST_HEAD(&cache->dispose); } -static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, - struct list_lru *lru, const char *name) +typedef unsigned long (*count_objects_cb)(struct shrinker *s, + struct shrink_control *sc); +typedef unsigned long (*scan_objects_cb)(struct shrinker *s, + struct shrink_control *sc); + +static int __init nfs4_xattr_shrinker_init(struct shrinker **shrinker, + struct list_lru *lru, const char *name, + count_objects_cb count, + scan_objects_cb scan, long batch, int seeks) { - int ret = 0; + int ret; - ret = register_shrinker(shrinker, name); - if (ret) + *shrinker = shrinker_alloc(SHRINKER_MEMCG_AWARE, name); + if (!*shrinker) + return -ENOMEM; + + ret = list_lru_init_memcg(lru, *shrinker); + if (ret) { + shrinker_free(*shrinker); return ret; + } - ret = list_lru_init_memcg(lru, shrinker); - if (ret) - unregister_shrinker(shrinker); + (*shrinker)->count_objects = count; + (*shrinker)->scan_objects = scan; + (*shrinker)->batch = batch; + (*shrinker)->seeks = seeks; + + shrinker_register(*shrinker); return ret; } @@ -1010,7 +1007,7 @@ static int nfs4_xattr_shrinker_init(struct shrinker *shrinker, static void nfs4_xattr_shrinker_destroy(struct shrinker *shrinker, struct list_lru *lru) { - unregister_shrinker(shrinker); + shrinker_free(shrinker); list_lru_destroy(lru); } @@ -1026,27 +1023,31 @@ int __init nfs4_xattr_cache_init(void) return -ENOMEM; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_cache_shrinker, - &nfs4_xattr_cache_lru, - "nfs-xattr_cache"); + &nfs4_xattr_cache_lru, "nfs-xattr_cache", + nfs4_xattr_cache_count, + nfs4_xattr_cache_scan, 0, DEFAULT_SEEKS); if (ret) goto out1; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_entry_shrinker, - &nfs4_xattr_entry_lru, - "nfs-xattr_entry"); + &nfs4_xattr_entry_lru, "nfs-xattr_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, DEFAULT_SEEKS); if (ret) goto out2; ret = nfs4_xattr_shrinker_init(&nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru, - "nfs-xattr_large_entry"); + "nfs-xattr_large_entry", + nfs4_xattr_entry_count, + nfs4_xattr_entry_scan, 512, 1); if (!ret) return 0; - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); out2: - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); out1: kmem_cache_destroy(nfs4_xattr_cache_cachep); @@ -1056,11 +1057,11 @@ out1: void nfs4_xattr_cache_exit(void) { - nfs4_xattr_shrinker_destroy(&nfs4_xattr_large_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_large_entry_shrinker, &nfs4_xattr_large_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_entry_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_entry_shrinker, &nfs4_xattr_entry_lru); - nfs4_xattr_shrinker_destroy(&nfs4_xattr_cache_shrinker, + nfs4_xattr_shrinker_destroy(nfs4_xattr_cache_shrinker, &nfs4_xattr_cache_lru); kmem_cache_destroy(nfs4_xattr_cache_cachep); } -- cgit v1.2.3 From 777fc8f1b4b9a0c5a558f1ab49ddfa5137bfaeb3 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:10 +0800 Subject: nfs: dynamically allocate the nfs-acl shrinker Use new APIs to dynamically allocate the nfs-acl shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-12-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Trond Myklebust Cc: Anna Schumaker Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfs/super.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0d6473cb00cb..09ded7f63acf 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -129,11 +129,7 @@ static void nfs_ssc_unregister_ops(void) } #endif /* CONFIG_NFS_V4_2 */ -static struct shrinker acl_shrinker = { - .count_objects = nfs_access_cache_count, - .scan_objects = nfs_access_cache_scan, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *acl_shrinker; /* * Register the NFS filesystems @@ -153,9 +149,18 @@ int __init register_nfs_fs(void) ret = nfs_register_sysctl(); if (ret < 0) goto error_2; - ret = register_shrinker(&acl_shrinker, "nfs-acl"); - if (ret < 0) + + acl_shrinker = shrinker_alloc(0, "nfs-acl"); + if (!acl_shrinker) { + ret = -ENOMEM; goto error_3; + } + + acl_shrinker->count_objects = nfs_access_cache_count; + acl_shrinker->scan_objects = nfs_access_cache_scan; + + shrinker_register(acl_shrinker); + #ifdef CONFIG_NFS_V4_2 nfs_ssc_register_ops(); #endif @@ -175,7 +180,7 @@ error_0: */ void __exit unregister_nfs_fs(void) { - unregister_shrinker(&acl_shrinker); + shrinker_free(acl_shrinker); nfs_unregister_sysctl(); unregister_nfs4_fs(); #ifdef CONFIG_NFS_V4_2 -- cgit v1.2.3 From 856e594965414a1cc63d0f3e709ecb6869bc685e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:11 +0800 Subject: nfsd: dynamically allocate the nfsd-filecache shrinker Use new APIs to dynamically allocate the nfsd-filecache shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-13-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Jeff Layton Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfsd/filecache.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ee9c923192e0..9c62b4502539 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -521,11 +521,7 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) return ret; } -static struct shrinker nfsd_file_shrinker = { - .scan_objects = nfsd_file_lru_scan, - .count_objects = nfsd_file_lru_count, - .seeks = 1, -}; +static struct shrinker *nfsd_file_shrinker; /** * nfsd_file_cond_queue - conditionally unhash and queue a nfsd_file @@ -746,12 +742,19 @@ nfsd_file_cache_init(void) goto out_err; } - ret = register_shrinker(&nfsd_file_shrinker, "nfsd-filecache"); - if (ret) { - pr_err("nfsd: failed to register nfsd_file_shrinker: %d\n", ret); + nfsd_file_shrinker = shrinker_alloc(0, "nfsd-filecache"); + if (!nfsd_file_shrinker) { + ret = -ENOMEM; + pr_err("nfsd: failed to allocate nfsd_file_shrinker\n"); goto out_lru; } + nfsd_file_shrinker->count_objects = nfsd_file_lru_count; + nfsd_file_shrinker->scan_objects = nfsd_file_lru_scan; + nfsd_file_shrinker->seeks = 1; + + shrinker_register(nfsd_file_shrinker); + ret = lease_register_notifier(&nfsd_file_lease_notifier); if (ret) { pr_err("nfsd: unable to register lease notifier: %d\n", ret); @@ -774,7 +777,7 @@ out: out_notifier: lease_unregister_notifier(&nfsd_file_lease_notifier); out_shrinker: - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); out_lru: list_lru_destroy(&nfsd_file_lru); out_err: @@ -891,7 +894,7 @@ nfsd_file_cache_shutdown(void) return; lease_unregister_notifier(&nfsd_file_lease_notifier); - unregister_shrinker(&nfsd_file_shrinker); + shrinker_free(nfsd_file_shrinker); /* * make sure all callers of nfsd_file_lru_cb are done before * calling nfsd_file_cache_purge -- cgit v1.2.3 From eab477e883b52a82bd7b4437111bbc6264694349 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:12 +0800 Subject: quota: dynamically allocate the dquota-cache shrinker Use new APIs to dynamically allocate the dquota-cache shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-14-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Jan Kara Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/quota/dquot.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 9e72bfe8bbad..15030b0cd1c8 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -791,12 +791,6 @@ dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc) percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])); } -static struct shrinker dqcache_shrinker = { - .count_objects = dqcache_shrink_count, - .scan_objects = dqcache_shrink_scan, - .seeks = DEFAULT_SEEKS, -}; - /* * Safely release dquot and put reference to dquot. */ @@ -2956,6 +2950,7 @@ static int __init dquot_init(void) { int i, ret; unsigned long nr_hash, order; + struct shrinker *dqcache_shrinker; printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__); @@ -2990,8 +2985,14 @@ static int __init dquot_init(void) pr_info("VFS: Dquot-cache hash table entries: %ld (order %ld," " %ld bytes)\n", nr_hash, order, (PAGE_SIZE << order)); - if (register_shrinker(&dqcache_shrinker, "dquota-cache")) - panic("Cannot register dquot shrinker"); + dqcache_shrinker = shrinker_alloc(0, "dquota-cache"); + if (!dqcache_shrinker) + panic("Cannot allocate dquot shrinker"); + + dqcache_shrinker->count_objects = dqcache_shrink_count; + dqcache_shrinker->scan_objects = dqcache_shrink_scan; + + shrinker_register(dqcache_shrinker); return 0; } -- cgit v1.2.3 From 827a34f90e5a27b83f188b914512d81e36688dec Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:13 +0800 Subject: ubifs: dynamically allocate the ubifs-slab shrinker Use new APIs to dynamically allocate the ubifs-slab shrinker. Link: https://lkml.kernel.org/r/20230911094444.68966-15-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Richard Weinberger Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/ubifs/super.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b08fb28d16b5..96f6a9118207 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -54,11 +54,7 @@ module_param_cb(default_version, &ubifs_default_version_ops, &ubifs_default_vers static struct kmem_cache *ubifs_inode_slab; /* UBIFS TNC shrinker description */ -static struct shrinker ubifs_shrinker_info = { - .scan_objects = ubifs_shrink_scan, - .count_objects = ubifs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; +static struct shrinker *ubifs_shrinker_info; /** * validate_inode - validate inode. @@ -2373,7 +2369,7 @@ static void inode_slab_ctor(void *obj) static int __init ubifs_init(void) { - int err; + int err = -ENOMEM; BUILD_BUG_ON(sizeof(struct ubifs_ch) != 24); @@ -2439,10 +2435,15 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - err = register_shrinker(&ubifs_shrinker_info, "ubifs-slab"); - if (err) + ubifs_shrinker_info = shrinker_alloc(0, "ubifs-slab"); + if (!ubifs_shrinker_info) goto out_slab; + ubifs_shrinker_info->count_objects = ubifs_shrink_count; + ubifs_shrinker_info->scan_objects = ubifs_shrink_scan; + + shrinker_register(ubifs_shrinker_info); + err = ubifs_compressors_init(); if (err) goto out_shrinker; @@ -2467,7 +2468,7 @@ out_dbg: dbg_debugfs_exit(); ubifs_compressors_exit(); out_shrinker: - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; @@ -2483,7 +2484,7 @@ static void __exit ubifs_exit(void) dbg_debugfs_exit(); ubifs_sysfs_exit(); ubifs_compressors_exit(); - unregister_shrinker(&ubifs_shrinker_info); + shrinker_free(ubifs_shrinker_info); /* * Make sure all delayed rcu free inodes are flushed before we -- cgit v1.2.3 From 714e5bde00a58ba13b03b68547931e011f93de58 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:28 +0800 Subject: mbcache: dynamically allocate the mbcache shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the mbcache shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct mb_cache. Link: https://lkml.kernel.org/r/20230911094444.68966-30-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Alexander Viro Cc: Christian Brauner Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/mbcache.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/mbcache.c b/fs/mbcache.c index 2a4b8b549e93..82aa7a35db26 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -37,7 +37,7 @@ struct mb_cache { struct list_head c_list; /* Number of entries in cache */ unsigned long c_entry_count; - struct shrinker c_shrink; + struct shrinker *c_shrink; /* Work for shrinking when the cache has too many entries */ struct work_struct c_shrink_work; }; @@ -293,8 +293,7 @@ EXPORT_SYMBOL(mb_cache_entry_touch); static unsigned long mb_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return cache->c_entry_count; } @@ -333,8 +332,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct mb_cache *cache = container_of(shrink, struct mb_cache, - c_shrink); + struct mb_cache *cache = shrink->private_data; return mb_cache_shrink(cache, sc->nr_to_scan); } @@ -377,15 +375,19 @@ struct mb_cache *mb_cache_create(int bucket_bits) for (i = 0; i < bucket_count; i++) INIT_HLIST_BL_HEAD(&cache->c_hash[i]); - cache->c_shrink.count_objects = mb_cache_count; - cache->c_shrink.scan_objects = mb_cache_scan; - cache->c_shrink.seeks = DEFAULT_SEEKS; - if (register_shrinker(&cache->c_shrink, "mbcache-shrinker")) { + cache->c_shrink = shrinker_alloc(0, "mbcache-shrinker"); + if (!cache->c_shrink) { kfree(cache->c_hash); kfree(cache); goto err_out; } + cache->c_shrink->count_objects = mb_cache_count; + cache->c_shrink->scan_objects = mb_cache_scan; + cache->c_shrink->private_data = cache; + + shrinker_register(cache->c_shrink); + INIT_WORK(&cache->c_shrink_work, mb_cache_shrink_worker); return cache; @@ -406,7 +408,7 @@ void mb_cache_destroy(struct mb_cache *cache) { struct mb_cache_entry *entry, *next; - unregister_shrinker(&cache->c_shrink); + shrinker_free(cache->c_shrink); /* * We don't bother with any locking. Cache must not be used at this -- cgit v1.2.3 From 4d09d75d8b8cb3c8087b46585924777bc8737663 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:29 +0800 Subject: ext4: dynamically allocate the ext4-es shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the ext4-es shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct ext4_sb_info. Link: https://lkml.kernel.org/r/20230911094444.68966-31-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/ext4/ext4.h | 2 +- fs/ext4/extents_status.c | 23 +++++++++++++---------- 2 files changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9418359b1d9d..8eeff770992c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1653,7 +1653,7 @@ struct ext4_sb_info { __u32 s_csum_seed; /* Reclaim extents from extent status tree */ - struct shrinker s_es_shrinker; + struct shrinker *s_es_shrinker; struct list_head s_es_list; /* List of inodes with reclaimable extents */ long s_es_nr_inode; struct ext4_es_stats s_es_stats; diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 6f7de14c0fa8..deec7d1f4e50 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1606,7 +1606,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, unsigned long nr; struct ext4_sb_info *sbi; - sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); + sbi = shrink->private_data; nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; @@ -1615,8 +1615,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, static unsigned long ext4_es_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct ext4_sb_info *sbi = container_of(shrink, - struct ext4_sb_info, s_es_shrinker); + struct ext4_sb_info *sbi = shrink->private_data; int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; @@ -1700,13 +1699,17 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) if (err) goto err3; - sbi->s_es_shrinker.scan_objects = ext4_es_scan; - sbi->s_es_shrinker.count_objects = ext4_es_count; - sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - err = register_shrinker(&sbi->s_es_shrinker, "ext4-es:%s", - sbi->s_sb->s_id); - if (err) + sbi->s_es_shrinker = shrinker_alloc(0, "ext4-es:%s", sbi->s_sb->s_id); + if (!sbi->s_es_shrinker) { + err = -ENOMEM; goto err4; + } + + sbi->s_es_shrinker->scan_objects = ext4_es_scan; + sbi->s_es_shrinker->count_objects = ext4_es_count; + sbi->s_es_shrinker->private_data = sbi; + + shrinker_register(sbi->s_es_shrinker); return 0; err4: @@ -1726,7 +1729,7 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) percpu_counter_destroy(&sbi->s_es_stats.es_stats_cache_misses); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); - unregister_shrinker(&sbi->s_es_shrinker); + shrinker_free(sbi->s_es_shrinker); } /* -- cgit v1.2.3 From 4b403dfa8ea8dfbc9d317b25a0433c1ac6765f7f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:30 +0800 Subject: jbd2,ext4: dynamically allocate the jbd2-journal shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the jbd2-journal shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct journal_s. Link: https://lkml.kernel.org/r/20230911094444.68966-32-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: Jan Kara Cc: "Theodore Ts'o" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/jbd2/journal.c | 29 ++++++++++++++++++----------- include/linux/jbd2.h | 2 +- 2 files changed, 19 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 30dec2bd2ecc..ed53188472f9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1290,7 +1290,7 @@ static int jbd2_min_tag_size(void) static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long nr_to_scan = sc->nr_to_scan; unsigned long nr_shrunk; unsigned long count; @@ -1316,7 +1316,7 @@ static unsigned long jbd2_journal_shrink_scan(struct shrinker *shrink, static unsigned long jbd2_journal_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { - journal_t *journal = container_of(shrink, journal_t, j_shrinker); + journal_t *journal = shrink->private_data; unsigned long count; count = percpu_counter_read_positive(&journal->j_checkpoint_jh_count); @@ -1588,14 +1588,21 @@ static journal_t *journal_init_common(struct block_device *bdev, goto err_cleanup; journal->j_shrink_transaction = NULL; - journal->j_shrinker.scan_objects = jbd2_journal_shrink_scan; - journal->j_shrinker.count_objects = jbd2_journal_shrink_count; - journal->j_shrinker.seeks = DEFAULT_SEEKS; - journal->j_shrinker.batch = journal->j_max_transaction_buffers; - err = register_shrinker(&journal->j_shrinker, "jbd2-journal:(%u:%u)", - MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev)); - if (err) + + journal->j_shrinker = shrinker_alloc(0, "jbd2-journal:(%u:%u)", + MAJOR(bdev->bd_dev), + MINOR(bdev->bd_dev)); + if (!journal->j_shrinker) { + err = -ENOMEM; goto err_cleanup; + } + + journal->j_shrinker->scan_objects = jbd2_journal_shrink_scan; + journal->j_shrinker->count_objects = jbd2_journal_shrink_count; + journal->j_shrinker->batch = journal->j_max_transaction_buffers; + journal->j_shrinker->private_data = journal; + + shrinker_register(journal->j_shrinker); return journal; @@ -2172,9 +2179,9 @@ int jbd2_journal_destroy(journal_t *journal) brelse(journal->j_sb_buffer); } - if (journal->j_shrinker.flags & SHRINKER_REGISTERED) { + if (journal->j_shrinker) { percpu_counter_destroy(&journal->j_checkpoint_jh_count); - unregister_shrinker(&journal->j_shrinker); + shrinker_free(journal->j_shrinker); } if (journal->j_proc_entry) jbd2_stats_proc_exit(journal); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 52772c826c86..6dcbb4eb80fb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -886,7 +886,7 @@ struct journal_s * Journal head shrinker, reclaim buffer's journal head which * has been written back. */ - struct shrinker j_shrinker; + struct shrinker *j_shrinker; /** * @j_checkpoint_jh_count: -- cgit v1.2.3 From d17452aa33a6fd0e9cf2bd3d43237f2d2c294a7e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:31 +0800 Subject: nfsd: dynamically allocate the nfsd-client shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the nfsd-client shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct nfsd_net. Link: https://lkml.kernel.org/r/20230911094444.68966-33-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Chuck Lever Acked-by: Jeff Layton Cc: Jeff Layton Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfs4state.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ec49b200b797..f669444d5336 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -195,7 +195,7 @@ struct nfsd_net { int nfs4_max_clients; atomic_t nfsd_courtesy_clients; - struct shrinker nfsd_client_shrinker; + struct shrinker *nfsd_client_shrinker; struct work_struct nfsd_shrinker_work; }; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 8534693eb6a4..23b3b38c8cda 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -4400,8 +4400,7 @@ static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { int count; - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_client_shrinker); + struct nfsd_net *nn = shrink->private_data; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) @@ -8149,12 +8148,16 @@ static int nfs4_state_create_net(struct net *net) INIT_WORK(&nn->nfsd_shrinker_work, nfsd4_state_shrinker_worker); get_net(net); - nn->nfsd_client_shrinker.scan_objects = nfsd4_state_shrinker_scan; - nn->nfsd_client_shrinker.count_objects = nfsd4_state_shrinker_count; - nn->nfsd_client_shrinker.seeks = DEFAULT_SEEKS; - - if (register_shrinker(&nn->nfsd_client_shrinker, "nfsd-client")) + nn->nfsd_client_shrinker = shrinker_alloc(0, "nfsd-client"); + if (!nn->nfsd_client_shrinker) goto err_shrinker; + + nn->nfsd_client_shrinker->scan_objects = nfsd4_state_shrinker_scan; + nn->nfsd_client_shrinker->count_objects = nfsd4_state_shrinker_count; + nn->nfsd_client_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_client_shrinker); + return 0; err_shrinker: @@ -8252,7 +8255,7 @@ nfs4_state_shutdown_net(struct net *net) struct list_head *pos, *next, reaplist; struct nfsd_net *nn = net_generic(net, nfsd_net_id); - unregister_shrinker(&nn->nfsd_client_shrinker); + shrinker_free(nn->nfsd_client_shrinker); cancel_work(&nn->nfsd_shrinker_work); cancel_delayed_work_sync(&nn->laundromat_work); locks_end_grace(&nn->nfsd4_manager); -- cgit v1.2.3 From 8eea99a81c6f67c08fedd7db2ccd09aa93db69a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:32 +0800 Subject: nfsd: dynamically allocate the nfsd-reply shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the nfsd-reply shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct nfsd_net. Link: https://lkml.kernel.org/r/20230911094444.68966-34-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: Chuck Lever Acked-by: Jeff Layton Acked-by: Muchun Song Cc: Neil Brown Cc: Olga Kornievskaia Cc: Dai Ngo Cc: Tom Talpey Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Oleksandr Tyshchenko Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/nfsd/netns.h | 2 +- fs/nfsd/nfscache.c | 31 ++++++++++++++++--------------- 2 files changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index f669444d5336..ab303a8b77d5 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -177,7 +177,7 @@ struct nfsd_net { /* size of cache when we saw the longest hash chain */ unsigned int longest_chain_cachesize; - struct shrinker nfsd_reply_cache_shrinker; + struct shrinker *nfsd_reply_cache_shrinker; /* tracking server-to-server copy mounts */ spinlock_t nfsd_ssc_lock; diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 80621a709510..fd56a52aa5fb 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -201,26 +201,29 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) { unsigned int hashsize; unsigned int i; - int status = 0; nn->max_drc_entries = nfsd_cache_size_limit(); atomic_set(&nn->num_drc_entries, 0); hashsize = nfsd_hashsize(nn->max_drc_entries); nn->maskbits = ilog2(hashsize); - nn->nfsd_reply_cache_shrinker.scan_objects = nfsd_reply_cache_scan; - nn->nfsd_reply_cache_shrinker.count_objects = nfsd_reply_cache_count; - nn->nfsd_reply_cache_shrinker.seeks = 1; - status = register_shrinker(&nn->nfsd_reply_cache_shrinker, - "nfsd-reply:%s", nn->nfsd_name); - if (status) - return status; - nn->drc_hashtbl = kvzalloc(array_size(hashsize, sizeof(*nn->drc_hashtbl)), GFP_KERNEL); if (!nn->drc_hashtbl) + return -ENOMEM; + + nn->nfsd_reply_cache_shrinker = shrinker_alloc(0, "nfsd-reply:%s", + nn->nfsd_name); + if (!nn->nfsd_reply_cache_shrinker) goto out_shrinker; + nn->nfsd_reply_cache_shrinker->scan_objects = nfsd_reply_cache_scan; + nn->nfsd_reply_cache_shrinker->count_objects = nfsd_reply_cache_count; + nn->nfsd_reply_cache_shrinker->seeks = 1; + nn->nfsd_reply_cache_shrinker->private_data = nn; + + shrinker_register(nn->nfsd_reply_cache_shrinker); + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&nn->drc_hashtbl[i].lru_head); spin_lock_init(&nn->drc_hashtbl[i].cache_lock); @@ -229,7 +232,7 @@ int nfsd_reply_cache_init(struct nfsd_net *nn) return 0; out_shrinker: - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + kvfree(nn->drc_hashtbl); printk(KERN_ERR "nfsd: failed to allocate reply cache\n"); return -ENOMEM; } @@ -239,7 +242,7 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) struct nfsd_cacherep *rp; unsigned int i; - unregister_shrinker(&nn->nfsd_reply_cache_shrinker); + shrinker_free(nn->nfsd_reply_cache_shrinker); for (i = 0; i < nn->drc_hashsize; i++) { struct list_head *head = &nn->drc_hashtbl[i].lru_head; @@ -323,8 +326,7 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + struct nfsd_net *nn = shrink->private_data; return atomic_read(&nn->num_drc_entries); } @@ -343,8 +345,7 @@ nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - struct nfsd_net *nn = container_of(shrink, - struct nfsd_net, nfsd_reply_cache_shrinker); + struct nfsd_net *nn = shrink->private_data; unsigned long freed = 0; LIST_HEAD(dispose); unsigned int i; -- cgit v1.2.3 From 17e7a00e60c8fff31082d5216b5c7d06d156f972 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:33 +0800 Subject: xfs: dynamically allocate the xfs-buf shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-buf shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_buftarg. Link: https://lkml.kernel.org/r/20230911094444.68966-35-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/xfs/xfs_buf.c | 24 +++++++++++++----------- fs/xfs/xfs_buf.h | 2 +- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index c1ece4a08ff4..9e7ba04572db 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1913,8 +1913,7 @@ xfs_buftarg_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; LIST_HEAD(dispose); unsigned long freed; @@ -1936,8 +1935,7 @@ xfs_buftarg_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_buftarg *btp = container_of(shrink, - struct xfs_buftarg, bt_shrinker); + struct xfs_buftarg *btp = shrink->private_data; return list_lru_shrink_count(&btp->bt_lru, sc); } @@ -1947,7 +1945,7 @@ xfs_free_buftarg( { struct block_device *bdev = btp->bt_bdev; - unregister_shrinker(&btp->bt_shrinker); + shrinker_free(btp->bt_shrinker); ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); @@ -2031,13 +2029,17 @@ xfs_alloc_buftarg( if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) goto error_lru; - btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count; - btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan; - btp->bt_shrinker.seeks = DEFAULT_SEEKS; - btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE; - if (register_shrinker(&btp->bt_shrinker, "xfs-buf:%s", - mp->m_super->s_id)) + btp->bt_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-buf:%s", + mp->m_super->s_id); + if (!btp->bt_shrinker) goto error_pcpu; + + btp->bt_shrinker->count_objects = xfs_buftarg_shrink_count; + btp->bt_shrinker->scan_objects = xfs_buftarg_shrink_scan; + btp->bt_shrinker->private_data = btp; + + shrinker_register(btp->bt_shrinker); + return btp; error_pcpu: diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index df8f47953bb4..702e7d9ea2ac 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -108,7 +108,7 @@ typedef struct xfs_buftarg { size_t bt_logical_sectormask; /* LRU control structures */ - struct shrinker bt_shrinker; + struct shrinker *bt_shrinker; struct list_lru bt_lru; struct percpu_counter bt_io_count; -- cgit v1.2.3 From 1a86a53da46778a583cd866465e18d662d866f30 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:34 +0800 Subject: xfs: dynamically allocate the xfs-inodegc shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-inodegc shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_mount. Link: https://lkml.kernel.org/r/20230911094444.68966-36-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/xfs/xfs_icache.c | 26 +++++++++++++++----------- fs/xfs/xfs_mount.c | 4 ++-- fs/xfs/xfs_mount.h | 2 +- 3 files changed, 18 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 3c210ac83713..dba514a2c84d 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -2165,8 +2165,7 @@ xfs_inodegc_shrinker_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; @@ -2187,8 +2186,7 @@ xfs_inodegc_shrinker_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_mount *mp = container_of(shrink, struct xfs_mount, - m_inodegc_shrinker); + struct xfs_mount *mp = shrink->private_data; struct xfs_inodegc *gc; int cpu; bool no_items = true; @@ -2224,13 +2222,19 @@ int xfs_inodegc_register_shrinker( struct xfs_mount *mp) { - struct shrinker *shrink = &mp->m_inodegc_shrinker; + mp->m_inodegc_shrinker = shrinker_alloc(SHRINKER_NONSLAB, + "xfs-inodegc:%s", + mp->m_super->s_id); + if (!mp->m_inodegc_shrinker) + return -ENOMEM; + + mp->m_inodegc_shrinker->count_objects = xfs_inodegc_shrinker_count; + mp->m_inodegc_shrinker->scan_objects = xfs_inodegc_shrinker_scan; + mp->m_inodegc_shrinker->seeks = 0; + mp->m_inodegc_shrinker->batch = XFS_INODEGC_SHRINKER_BATCH; + mp->m_inodegc_shrinker->private_data = mp; - shrink->count_objects = xfs_inodegc_shrinker_count; - shrink->scan_objects = xfs_inodegc_shrinker_scan; - shrink->seeks = 0; - shrink->flags = SHRINKER_NONSLAB; - shrink->batch = XFS_INODEGC_SHRINKER_BATCH; + shrinker_register(mp->m_inodegc_shrinker); - return register_shrinker(shrink, "xfs-inodegc:%s", mp->m_super->s_id); + return 0; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0a0fd19573d8..aed5be5508fe 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1021,7 +1021,7 @@ xfs_mountfs( out_log_dealloc: xfs_log_mount_cancel(mp); out_inodegc_shrinker: - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); out_fail_wait: if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_buftarg_drain(mp->m_logdev_targp); @@ -1104,7 +1104,7 @@ xfs_unmountfs( #if defined(DEBUG) xfs_errortag_clearall(mp); #endif - unregister_shrinker(&mp->m_inodegc_shrinker); + shrinker_free(mp->m_inodegc_shrinker); xfs_free_perag(mp); xfs_errortag_del(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index d19cca099bc3..219681d29fbc 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -219,7 +219,7 @@ typedef struct xfs_mount { atomic_t m_agirotor; /* last ag dir inode alloced */ /* Memory shrinker to throttle and reprioritize inodegc */ - struct shrinker m_inodegc_shrinker; + struct shrinker *m_inodegc_shrinker; /* * Workqueue item so that we can coalesce multiple inode flush attempts * into a single flush. -- cgit v1.2.3 From fe88527852be8a6f1391c961db38d342cc46461d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:35 +0800 Subject: xfs: dynamically allocate the xfs-qm shrinker In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the xfs-qm shrinker, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct xfs_quotainfo. Link: https://lkml.kernel.org/r/20230911094444.68966-37-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Chandan Babu R Cc: "Darrick J. Wong" Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/xfs/xfs_qm.c | 27 ++++++++++++++------------- fs/xfs/xfs_qm.h | 2 +- 2 files changed, 15 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 086e78a6143a..94a7932ac570 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -504,8 +504,7 @@ xfs_qm_shrink_scan( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; struct xfs_qm_isolate isol; unsigned long freed; int error; @@ -539,8 +538,7 @@ xfs_qm_shrink_count( struct shrinker *shrink, struct shrink_control *sc) { - struct xfs_quotainfo *qi = container_of(shrink, - struct xfs_quotainfo, qi_shrinker); + struct xfs_quotainfo *qi = shrink->private_data; return list_lru_shrink_count(&qi->qi_lru, sc); } @@ -680,15 +678,18 @@ xfs_qm_init_quotainfo( if (XFS_IS_PQUOTA_ON(mp)) xfs_qm_set_defquota(mp, XFS_DQTYPE_PROJ, qinf); - qinf->qi_shrinker.count_objects = xfs_qm_shrink_count; - qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan; - qinf->qi_shrinker.seeks = DEFAULT_SEEKS; - qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE; - - error = register_shrinker(&qinf->qi_shrinker, "xfs-qm:%s", - mp->m_super->s_id); - if (error) + qinf->qi_shrinker = shrinker_alloc(SHRINKER_NUMA_AWARE, "xfs-qm:%s", + mp->m_super->s_id); + if (!qinf->qi_shrinker) { + error = -ENOMEM; goto out_free_inos; + } + + qinf->qi_shrinker->count_objects = xfs_qm_shrink_count; + qinf->qi_shrinker->scan_objects = xfs_qm_shrink_scan; + qinf->qi_shrinker->private_data = qinf; + + shrinker_register(qinf->qi_shrinker); return 0; @@ -718,7 +719,7 @@ xfs_qm_destroy_quotainfo( qi = mp->m_quotainfo; ASSERT(qi != NULL); - unregister_shrinker(&qi->qi_shrinker); + shrinker_free(qi->qi_shrinker); list_lru_destroy(&qi->qi_lru); xfs_qm_destroy_quotainos(qi); mutex_destroy(&qi->qi_tree_lock); diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h index 9683f0457d19..d5c9fc4ba591 100644 --- a/fs/xfs/xfs_qm.h +++ b/fs/xfs/xfs_qm.h @@ -63,7 +63,7 @@ struct xfs_quotainfo { struct xfs_def_quota qi_usr_default; struct xfs_def_quota qi_grp_default; struct xfs_def_quota qi_prj_default; - struct shrinker qi_shrinker; + struct shrinker *qi_shrinker; /* Minimum and maximum quota expiration timestamp values. */ time64_t qi_expiry_min; -- cgit v1.2.3 From 1720f5dd8d3af34d6023fb9e8c35e5e60e8b6643 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:37 +0800 Subject: fs: super: dynamically allocate the s_shrink In preparation for implementing lockless slab shrink, use new APIs to dynamically allocate the s_shrink, so that it can be freed asynchronously via RCU. Then it doesn't need to wait for RCU read-side critical section when releasing the struct super_block. Link: https://lkml.kernel.org/r/20230911094444.68966-39-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Sterba Cc: Chris Mason Cc: Josef Bacik Cc: Alexander Viro Cc: Christian Brauner Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Muchun Song Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- fs/btrfs/super.c | 2 +- fs/kernfs/mount.c | 2 +- fs/proc/root.c | 2 +- fs/super.c | 33 ++++++++++++++++++--------------- include/linux/fs.h | 2 +- 5 files changed, 22 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 1a093ec0f7e3..b1798bed68f2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1519,7 +1519,7 @@ static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, error = -EBUSY; } else { snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(&s->s_shrink, "sb-%s:%s", fs_type->name, + shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, s->s_id); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index c4bf26142eec..79b96e74a8a0 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -265,7 +265,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k sb->s_time_gran = 1; /* sysfs dentries and inodes don't require IO to create */ - sb->s_shrink.seeks = 0; + sb->s_shrink->seeks = 0; /* get root inode, initialize and unlock it */ down_read(&kf_root->kernfs_rwsem); diff --git a/fs/proc/root.c b/fs/proc/root.c index 9191248f2dac..b55dbc70287b 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -188,7 +188,7 @@ static int proc_fill_super(struct super_block *s, struct fs_context *fc) s->s_stack_depth = FILESYSTEM_MAX_STACK_DEPTH; /* procfs dentries and inodes don't require IO to create */ - s->s_shrink.seeks = 0; + s->s_shrink->seeks = 0; pde_get(&proc_root); root_inode = proc_get_inode(s, &proc_root); diff --git a/fs/super.c b/fs/super.c index 2d762ce67f6e..adadf6689611 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,7 +191,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, long dentries; long inodes; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * Deadlock avoidance. We may hold various FS locks, and we don't want @@ -244,7 +244,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, struct super_block *sb; long total_objects = 0; - sb = container_of(shrink, struct super_block, s_shrink); + sb = shrink->private_data; /* * We don't call super_trylock_shared() here as it is a scalability @@ -306,7 +306,7 @@ static void destroy_unused_super(struct super_block *s) security_sb_free(s); put_user_ns(s->s_user_ns); kfree(s->s_subtype); - free_prealloced_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); /* no delays needed */ destroy_super_work(&s->destroy_work); } @@ -383,16 +383,19 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; - s->s_shrink.seeks = DEFAULT_SEEKS; - s->s_shrink.scan_objects = super_cache_scan; - s->s_shrink.count_objects = super_cache_count; - s->s_shrink.batch = 1024; - s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE; - if (prealloc_shrinker(&s->s_shrink, "sb-%s", type->name)) + s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, + "sb-%s", type->name); + if (!s->s_shrink) goto fail; - if (list_lru_init_memcg(&s->s_dentry_lru, &s->s_shrink)) + + s->s_shrink->scan_objects = super_cache_scan; + s->s_shrink->count_objects = super_cache_count; + s->s_shrink->batch = 1024; + s->s_shrink->private_data = s; + + if (list_lru_init_memcg(&s->s_dentry_lru, s->s_shrink)) goto fail; - if (list_lru_init_memcg(&s->s_inode_lru, &s->s_shrink)) + if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink)) goto fail; return s; @@ -477,7 +480,7 @@ void deactivate_locked_super(struct super_block *s) { struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { - unregister_shrinker(&s->s_shrink); + shrinker_free(s->s_shrink); fs->kill_sb(s); kill_super_notify(s); @@ -818,7 +821,7 @@ retry: hlist_add_head(&s->s_instances, &s->s_type->fs_supers); spin_unlock(&sb_lock); get_filesystem(s->s_type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; share_extant_sb: @@ -901,7 +904,7 @@ retry: hlist_add_head(&s->s_instances, &type->fs_supers); spin_unlock(&sb_lock); get_filesystem(type); - register_shrinker_prepared(&s->s_shrink); + shrinker_register(s->s_shrink); return s; } EXPORT_SYMBOL(sget); @@ -1522,7 +1525,7 @@ int setup_bdev_super(struct super_block *sb, int sb_flags, mutex_unlock(&bdev->bd_fsfreeze_mutex); snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); - shrinker_debugfs_rename(&sb->s_shrink, "sb-%s:%s", sb->s_type->name, + shrinker_debugfs_rename(sb->s_shrink, "sb-%s:%s", sb->s_type->name, sb->s_id); sb_set_blocksize(sb, block_size(bdev)); return 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index b528f063e8ff..fd539c9fef8e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1265,7 +1265,7 @@ struct super_block { const struct dentry_operations *s_d_op; /* default d_op for dentries */ - struct shrinker s_shrink; /* per-sb shrinker handle */ + struct shrinker *s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; -- cgit v1.2.3 From 8a0e8bb112af28362514624fc46e20426b4bdf1f Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Mon, 11 Sep 2023 17:44:44 +0800 Subject: mm: shrinker: convert shrinker_rwsem to mutex Now there are no readers of shrinker_rwsem, so we can simply replace it with mutex lock. [akpm@linux-foundation.org: update the fix to alloc_shrinker_info()] Link: https://lkml.kernel.org/r/20230911094444.68966-46-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: Abhinav Kumar Cc: Alasdair Kergon Cc: Alexander Viro Cc: Alyssa Rosenzweig Cc: Andreas Dilger Cc: Andreas Gruenbacher Cc: Anna Schumaker Cc: Arnd Bergmann Cc: Bob Peterson Cc: Borislav Petkov Cc: Carlos Llamas Cc: Chandan Babu R Cc: Chao Yu Cc: Chris Mason Cc: Christian Brauner Cc: Christian Koenig Cc: Chuck Lever Cc: Coly Li Cc: Dai Ngo Cc: Daniel Vetter Cc: Daniel Vetter Cc: "Darrick J. Wong" Cc: Dave Chinner Cc: Dave Hansen Cc: David Airlie Cc: David Hildenbrand Cc: David Sterba Cc: Dmitry Baryshkov Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Huang Rui Cc: Ingo Molnar Cc: Jaegeuk Kim Cc: Jani Nikula Cc: Jan Kara Cc: Jason Wang Cc: Jeff Layton Cc: Jeffle Xu Cc: Joel Fernandes (Google) Cc: Joonas Lahtinen Cc: Josef Bacik Cc: Juergen Gross Cc: Kent Overstreet Cc: Kirill Tkhai Cc: Marijn Suijten Cc: "Michael S. Tsirkin" Cc: Mike Snitzer Cc: Minchan Kim Cc: Nadav Amit Cc: Neil Brown Cc: Oleksandr Tyshchenko Cc: Olga Kornievskaia Cc: Paul E. McKenney Cc: Richard Weinberger Cc: Rob Clark Cc: Rob Herring Cc: Rodrigo Vivi Cc: Roman Gushchin Cc: Sean Paul Cc: Sergey Senozhatsky Cc: Song Liu Cc: Stefano Stabellini Cc: Steven Price Cc: "Theodore Ts'o" Cc: Thomas Gleixner Cc: Tomeu Vizoso Cc: Tom Talpey Cc: Trond Myklebust Cc: Tvrtko Ursulin Cc: Vlastimil Babka Cc: Xuan Zhuo Cc: Yue Hu Signed-off-by: Andrew Morton --- drivers/md/dm-cache-metadata.c | 2 +- fs/super.c | 2 +- mm/shrinker.c | 30 +++++++++++++++--------------- mm/shrinker_debug.c | 14 +++++++------- 4 files changed, 24 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index acffed750e3e..9e0c69958587 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_rwsem without holding cmd->root_lock + * - must take shrinker_mutex without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/fs/super.c b/fs/super.c index adadf6689611..816a22a5cad1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -178,7 +178,7 @@ static void super_wake(struct super_block *sb, unsigned int flag) * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we + * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/mm/shrinker.c b/mm/shrinker.c index ef22f20a97cb..dd91eab43ed3 100644 --- a/mm/shrinker.c +++ b/mm/shrinker.c @@ -8,7 +8,7 @@ #include "internal.h" LIST_HEAD(shrinker_list); -DECLARE_RWSEM(shrinker_rwsem); +DEFINE_MUTEX(shrinker_mutex); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -80,7 +80,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, ret = 0; int array_size = 0; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); array_size = shrinker_unit_size(shrinker_nr_max); for_each_node(nid) { info = kvzalloc_node(sizeof(*info) + array_size, GFP_KERNEL, nid); @@ -91,12 +91,12 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) goto err; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; err: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); free_shrinker_info(memcg); return -ENOMEM; } @@ -105,7 +105,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, - lockdep_is_held(&shrinker_rwsem)); + lockdep_is_held(&shrinker_mutex)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, int new_size, @@ -155,7 +155,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); new_size = shrinker_unit_size(new_nr_max); old_size = shrinker_unit_size(shrinker_nr_max); @@ -218,7 +218,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -232,7 +232,7 @@ static int shrinker_memcg_alloc(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } @@ -242,7 +242,7 @@ static void shrinker_memcg_remove(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); idr_remove(&shrinker_idr, id); } @@ -293,7 +293,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -306,7 +306,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) } } } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); } #else static int shrinker_memcg_alloc(struct shrinker *shrinker) @@ -740,11 +740,11 @@ void shrinker_register(struct shrinker *shrinker) return; } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); init_completion(&shrinker->done); /* @@ -784,7 +784,7 @@ void shrinker_free(struct shrinker *shrinker) wait_for_completion(&shrinker->done); } - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); if (shrinker->flags & SHRINKER_REGISTERED) { /* * Now we can safely remove it from the shrinker_list and then @@ -799,7 +799,7 @@ void shrinker_free(struct shrinker *shrinker) if (shrinker->flags & SHRINKER_MEMCG_AWARE) shrinker_memcg_remove(shrinker); - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); if (debugfs_entry) shrinker_debugfs_remove(debugfs_entry, debugfs_id); diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 24aebe7c24cc..12ea5486a3e9 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -9,7 +9,7 @@ #include "internal.h" /* defined in vmscan.c */ -extern struct rw_semaphore shrinker_rwsem; +extern struct mutex shrinker_mutex; extern struct list_head shrinker_list; static DEFINE_IDA(shrinker_debugfs_ida); @@ -165,7 +165,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -208,7 +208,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); old = shrinker->name; shrinker->name = new; @@ -226,7 +226,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); kfree_const(old); @@ -239,7 +239,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_rwsem); + lockdep_assert_held(&shrinker_mutex); *debugfs_id = entry ? shrinker->debugfs_id : -1; shrinker->debugfs_entry = NULL; @@ -265,14 +265,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - down_write(&shrinker_rwsem); + mutex_lock(&shrinker_mutex); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - up_write(&shrinker_rwsem); + mutex_unlock(&shrinker_mutex); return ret; } -- cgit v1.2.3 From 2a41815784e029cbef571511384f00fa40f2a82e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:04 +0100 Subject: buffer: pass GFP flags to folio_alloc_buffers() Patch series "Add and use bdev_getblk()", v2. This patch series fixes a bug reported by Hui Zhu; see proposed patches v1 and v2: https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/ https://lore.kernel.org/linux-fsdevel/20230811071519.1094-1-teawaterz@linux.alibaba.com/ I decided to go in a rather different direction for this fix, and fix a related problem at the same time. I don't think there's any urgency to rush this into Linus' tree, nor have I marked it for stable. Reasonable people may disagree. This patch (of 8): Instead of creating entirely new flags, inherit them from grow_dev_page(). The other callers create the same flags that this function used to create. Link: https://lkml.kernel.org/r/20230914150011.843330-1-willy@infradead.org Link: https://lkml.kernel.org/r/20230914150011.843330-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 17 +++++++++-------- include/linux/buffer_head.h | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 12e9a71c693d..32338b7cfeb9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -915,16 +915,12 @@ int remove_inode_buffers(struct inode *inode) * which may not fail from ordinary buffer allocations. */ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, - bool retry) + gfp_t gfp) { struct buffer_head *bh, *head; - gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; long offset; struct mem_cgroup *memcg, *old_memcg; - if (retry) - gfp |= __GFP_NOFAIL; - /* The folio lock pins the memcg */ memcg = folio_memcg(folio); old_memcg = set_active_memcg(memcg); @@ -967,7 +963,11 @@ EXPORT_SYMBOL_GPL(folio_alloc_buffers); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry) { - return folio_alloc_buffers(page_folio(page), size, retry); + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT; + if (retry) + gfp |= __GFP_NOFAIL; + + return folio_alloc_buffers(page_folio(page), size, gfp); } EXPORT_SYMBOL_GPL(alloc_page_buffers); @@ -1069,7 +1069,7 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, true); + bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT); /* * Link the folio to the buffers and initialise them. Take the @@ -1644,8 +1644,9 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; + gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; - head = folio_alloc_buffers(folio, blocksize, true); + head = folio_alloc_buffers(folio, blocksize, gfp); bh = head; do { bh->b_state |= b_state; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 44e9de51eedf..67d94d2be475 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -198,7 +198,7 @@ void touch_buffer(struct buffer_head *bh); void folio_set_bh(struct buffer_head *bh, struct folio *folio, unsigned long offset); struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, - bool retry); + gfp_t gfp); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, -- cgit v1.2.3 From 3ed65f04aac4d1cd025f30ee3fac174bcbf2b018 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:05 +0100 Subject: buffer: hoist GFP flags from grow_dev_page() to __getblk_gfp() grow_dev_page() is only called by grow_buffers(). grow_buffers() is only called by __getblk_slow() and __getblk_slow() is only called from __getblk_gfp(), so it is safe to move the GFP flags setting all the way up. With that done, add a new bdev_getblk() entry point that leaves the GFP flags the way the caller specified them. [willy@infradead.org: fix grow_dev_page() error handling] Link: https://lkml.kernel.org/r/ZRREEIwqiy5DijKB@casper.infradead.org Link: https://lkml.kernel.org/r/20230914150011.843330-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Cc: Dan Carpenter Signed-off-by: Andrew Morton --- fs/buffer.c | 61 +++++++++++++++++++++++++++++---------------- include/linux/buffer_head.h | 2 ++ 2 files changed, 42 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 32338b7cfeb9..80e96c1fcd33 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1043,20 +1043,11 @@ grow_dev_page(struct block_device *bdev, sector_t block, struct buffer_head *bh; sector_t end_block; int ret = 0; - gfp_t gfp_mask; - - gfp_mask = mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS) | gfp; - - /* - * XXX: __getblk_slow() can not really deal with failure and - * will endlessly loop on improvised global reclaim. Prefer - * looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp_mask |= __GFP_NOFAIL; folio = __filemap_get_folio(inode->i_mapping, index, - FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp_mask); + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, gfp); + if (IS_ERR(folio)) + return PTR_ERR(folio); bh = folio_buffers(folio); if (bh) { @@ -1069,7 +1060,10 @@ grow_dev_page(struct block_device *bdev, sector_t block, goto failed; } - bh = folio_alloc_buffers(folio, size, gfp_mask | __GFP_ACCOUNT); + ret = -ENOMEM; + bh = folio_alloc_buffers(folio, size, gfp | __GFP_ACCOUNT); + if (!bh) + goto failed; /* * Link the folio to the buffers and initialise them. Take the @@ -1420,24 +1414,49 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) } EXPORT_SYMBOL(__find_get_block); +/** + * bdev_getblk - Get a buffer_head in a block device's buffer cache. + * @bdev: The block device. + * @block: The block number. + * @size: The size of buffer_heads for this @bdev. + * @gfp: The memory allocation flags to use. + * + * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags; + * they are not augmented with the mapping's GFP flags. + * + * Return: The buffer head, or NULL if memory could not be allocated. + */ +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) +{ + struct buffer_head *bh = __find_get_block(bdev, block, size); + + might_alloc(gfp); + if (bh) + return bh; + + return __getblk_slow(bdev, block, size, gfp); +} +EXPORT_SYMBOL(bdev_getblk); + /* * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. - * - * __getblk_gfp() will lock up the machine if grow_dev_page's - * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * __getblk_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __find_get_block(bdev, block, size); + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); - might_sleep(); - if (bh == NULL) - bh = __getblk_slow(bdev, block, size, gfp); - return bh; + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + return bdev_getblk(bdev, block, size, gfp); } EXPORT_SYMBOL(__getblk_gfp); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 67d94d2be475..7825bb3d63a7 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -227,6 +227,8 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); +struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp); struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); -- cgit v1.2.3 From e509ad4d77e6c8852f082104e388110d16fdfb97 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:06 +0100 Subject: ext4: use bdev_getblk() to avoid memory reclaim in readahead path sb_getblk_gfp adds __GFP_NOFAIL, which is unnecessary for readahead; we're quite comfortable with the possibility that we may not get a bh back. Switch to bdev_getblk() which does not include __GFP_NOFAIL. Link: https://lkml.kernel.org/r/20230914150011.843330-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reported-by: Hui Zhu Closes: https://lore.kernel.org/linux-fsdevel/20230811035705.3296-1-teawaterz@linux.alibaba.com/ Signed-off-by: Andrew Morton --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dbebd8b3127e..268d812b0add 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -255,7 +255,8 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { - struct buffer_head *bh = sb_getblk_gfp(sb, block, 0); + struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, + sb->s_blocksize, GFP_NOWAIT); if (likely(bh)) { if (trylock_buffer(bh)) -- cgit v1.2.3 From 775d9b10530a0a303dc08a9cdb2ed1f8667d9c5f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:07 +0100 Subject: buffer: use bdev_getblk() to avoid memory reclaim in readahead path __getblk() adds __GFP_NOFAIL, which is unnecessary for readahead; we're quite comfortable with the possibility that we may not get a bh back. Switch to bdev_getblk() which does not include __GFP_NOFAIL. Link: https://lkml.kernel.org/r/20230914150011.843330-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 80e96c1fcd33..edd118594565 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1465,7 +1465,9 @@ EXPORT_SYMBOL(__getblk_gfp); */ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = bdev_getblk(bdev, block, size, + GFP_NOWAIT | __GFP_MOVABLE); + if (likely(bh)) { bh_readahead(bh, REQ_RAHEAD); brelse(bh); -- cgit v1.2.3 From 8a83ac54940d27b8f56d766e1cb270d150fedd50 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:10 +0100 Subject: ext4: call bdev_getblk() from sb_getblk_gfp() Most of the callers of sb_getblk_gfp() already assumed that they were passing the entire GFP flags to use. Fix up the two callers that didn't, and remove the __GFP_NOFAIL from them since they both appear to correctly handle failure. Link: https://lkml.kernel.org/r/20230914150011.843330-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/ext4/super.c | 10 ++++++++-- include/linux/buffer_head.h | 6 +++--- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 268d812b0add..c00ec159dea5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -244,13 +244,19 @@ static struct buffer_head *__ext4_sb_bread_gfp(struct super_block *sb, struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags) { - return __ext4_sb_bread_gfp(sb, block, op_flags, __GFP_MOVABLE); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + ~__GFP_FS) | __GFP_MOVABLE; + + return __ext4_sb_bread_gfp(sb, block, op_flags, gfp); } struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block) { - return __ext4_sb_bread_gfp(sb, block, 0, 0); + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_inode->i_mapping, + ~__GFP_FS); + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); } void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index b294e2cccbae..22f13eece719 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -368,10 +368,10 @@ static inline struct buffer_head *sb_getblk(struct super_block *sb, return __getblk(sb->s_bdev, block, sb->s_blocksize); } -static inline struct buffer_head * -sb_getblk_gfp(struct super_block *sb, sector_t block, gfp_t gfp) +static inline struct buffer_head *sb_getblk_gfp(struct super_block *sb, + sector_t block, gfp_t gfp) { - return __getblk_gfp(sb->s_bdev, block, sb->s_blocksize, gfp); + return bdev_getblk(sb->s_bdev, block, sb->s_blocksize, gfp); } static inline struct buffer_head * -- cgit v1.2.3 From 93b13ecaa713e1fcbf23b7483eec065d300c5ad8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Thu, 14 Sep 2023 16:00:11 +0100 Subject: buffer: remove __getblk_gfp() Inline it into __bread_gfp(). Link: https://lkml.kernel.org/r/20230914150011.843330-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hui Zhu Signed-off-by: Andrew Morton --- fs/buffer.c | 36 +++++++++++------------------------- include/linux/buffer_head.h | 2 -- 2 files changed, 11 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index edd118594565..edec8652788c 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1421,9 +1421,6 @@ EXPORT_SYMBOL(__find_get_block); * @size: The size of buffer_heads for this @bdev. * @gfp: The memory allocation flags to use. * - * In contrast to __getblk_gfp(), the @gfp flags must be all of the flags; - * they are not augmented with the mapping's GFP flags. - * * Return: The buffer head, or NULL if memory could not be allocated. */ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, @@ -1439,27 +1436,6 @@ struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(bdev_getblk); -/* - * __getblk_gfp() will locate (and, if necessary, create) the buffer_head - * which corresponds to the passed block_device, block and size. The - * returned buffer has its reference count incremented. - */ -struct buffer_head * -__getblk_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp) -{ - gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); - - /* - * Prefer looping in the allocator rather than here, at least that - * code knows what it's doing. - */ - gfp |= __GFP_NOFAIL; - - return bdev_getblk(bdev, block, size, gfp); -} -EXPORT_SYMBOL(__getblk_gfp); - /* * Do async read-ahead on a buffer.. */ @@ -1491,7 +1467,17 @@ struct buffer_head * __bread_gfp(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); + struct buffer_head *bh; + + gfp |= mapping_gfp_constraint(bdev->bd_inode->i_mapping, ~__GFP_FS); + + /* + * Prefer looping in the allocator rather than here, at least that + * code knows what it's doing. + */ + gfp |= __GFP_NOFAIL; + + bh = bdev_getblk(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 22f13eece719..3dc4720e4773 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -229,8 +229,6 @@ struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); -struct buffer_head *__getblk_gfp(struct block_device *bdev, sector_t block, - unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); void __bforget(struct buffer_head *); void __breadahead(struct block_device *, sector_t block, unsigned int size); -- cgit v1.2.3 From 8db0ec791f7788cd21e7f91ee5ff42c1c458d0e7 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Wed, 13 Sep 2023 16:12:47 -0400 Subject: fs: use nth_page() in place of direct struct page manipulation When dealing with hugetlb pages, struct page is not guaranteed to be contiguous on SPARSEMEM without VMEMMAP. Use nth_page() to handle it properly. Without the fix, a wrong subpage might be checked for HWPoison, causing wrong number of bytes of a page copied to user space. No bug is reported. The fix comes from code inspection. Link: https://lkml.kernel.org/r/20230913201248.452081-5-zi.yan@sent.com Fixes: 38c1ddbde6c6 ("hugetlbfs: improve read HWPOISON hugepage") Signed-off-by: Zi Yan Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Mike Rapoport (IBM) Cc: Thomas Bogendoerfer Cc: Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 316c4cebd3f3..60fce26ff937 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -295,7 +295,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt size_t res = 0; /* First subpage to start the loop. */ - page += offset / PAGE_SIZE; + page = nth_page(page, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) @@ -309,7 +309,7 @@ static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t byt break; offset += n; if (offset == PAGE_SIZE) { - page++; + page = nth_page(page, 1); offset = 0; } } -- cgit v1.2.3 From a08c7193e4f18dc8508f2d07d0de2c5b94cb39a3 Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Tue, 26 Sep 2023 12:20:17 -0700 Subject: mm/filemap: remove hugetlb special casing in filemap.c Remove special cased hugetlb handling code within the page cache by changing the granularity of ->index to the base page size rather than the huge page size. The motivation of this patch is to reduce complexity within the filemap code while also increasing performance by removing branches that are evaluated on every page cache lookup. To support the change in index, new wrappers for hugetlb page cache interactions are added. These wrappers perform the conversion to a linear index which is now expected by the page cache for huge pages. ========================= PERFORMANCE ====================================== Perf was used to check the performance differences after the patch. Overall the performance is similar to mainline with a very small larger overhead that occurs in __filemap_add_folio() and hugetlb_add_to_page_cache(). This is because of the larger overhead that occurs in xa_load() and xa_store() as the xarray is now using more entries to store hugetlb folios in the page cache. Timing aarch64 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages]# time fallocate -l 700GB test.txt real 1m49.568s user 0m0.000s sys 1m49.461s 6.5-rc3: [root]# time fallocate -l 700GB test.txt real 1m47.495s user 0m0.000s sys 1m47.370s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m47.024s user 0m0.000s sys 1m46.921s 6.5-rc3: [root@sidhakum-ol9-1 hugepages1G]# time fallocate -l 700GB test.txt real 1m44.551s user 0m0.000s sys 1m44.438s x86 2MB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages]# time fallocate -l 100GB test.txt real 0m22.383s user 0m0.000s sys 0m22.255s 6.5-rc3: [opc@sidhakum-ol9-2 hugepages]$ time sudo fallocate -l 100GB /dev/hugepages/test.txt real 0m22.735s user 0m0.038s sys 0m22.567s 1GB Page Size 6.5-rc3 + this patch: [root@sidhakum-ol9-2 hugepages1GB]# time fallocate -l 100GB test.txt real 0m25.786s user 0m0.001s sys 0m25.589s 6.5-rc3: [root@sidhakum-ol9-2 hugepages1G]# time fallocate -l 100GB test.txt real 0m33.454s user 0m0.001s sys 0m33.193s aarch64: workload - fallocate a 700GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--95.04%--__pi_clear_page | |--3.57%--clear_huge_page | | | |--2.63%--rcu_all_qs | | | --0.91%--__cond_resched | --0.67%--__cond_resched 0.17% 0.00% 0 fallocate [kernel.vmlinux] [k] hugetlb_add_to_page_cache 0.14% 0.10% 11 fallocate [kernel.vmlinux] [k] __filemap_add_folio 6.5-rc3 2MB Page Size: --100.00%--__arm64_sys_fallocate ksys_fallocate vfs_fallocate hugetlbfs_fallocate | |--94.91%--__pi_clear_page | |--4.11%--clear_huge_page | | | |--3.00%--rcu_all_qs | | | --1.10%--__cond_resched | --0.59%--__cond_resched 0.08% 0.01% 1 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.05% 0.03% 3 fallocate [kernel.kallsyms] [k] __filemap_add_folio x86 workload - fallocate a 100GB file backed by huge pages 6.5-rc3 + this patch: 2MB Page Size: hugetlbfs_fallocate | --99.57%--clear_huge_page | --98.47%--clear_page_erms | --0.53%--asm_sysvec_apic_timer_interrupt 0.04% 0.04% 1 fallocate [kernel.kallsyms] [k] xa_load 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] hugetlb_add_to_page_cache 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] __filemap_add_folio 0.04% 0.00% 0 fallocate [kernel.kallsyms] [k] xas_store 6.5-rc3 2MB Page Size: --99.93%--__x64_sys_fallocate vfs_fallocate hugetlbfs_fallocate | --99.38%--clear_huge_page | |--98.40%--clear_page_erms | --0.59%--__cond_resched 0.03% 0.03% 1 fallocate [kernel.kallsyms] [k] __filemap_add_folio ========================= TESTING ====================================== This patch passes libhugetlbfs tests and LTP hugetlb tests ********** TEST SUMMARY * 2M * 32-bit 64-bit * Total testcases: 110 113 * Skipped: 0 0 * PASS: 107 113 * FAIL: 0 0 * Killed by signal: 3 0 * Bad configuration: 0 0 * Expected FAIL: 0 0 * Unexpected PASS: 0 0 * Test not present: 0 0 * Strange test result: 0 0 ********** Done executing testcases. LTP Version: 20220527-178-g2761a81c4 page migration was also tested using Mike Kravetz's test program.[8] [dan.carpenter@linaro.org: fix an NULL vs IS_ERR() bug] Link: https://lkml.kernel.org/r/1772c296-1417-486f-8eef-171af2192681@moroto.mountain Link: https://lkml.kernel.org/r/20230926192017.98183-1-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Kumar Signed-off-by: Dan Carpenter Reported-and-tested-by: syzbot+c225dea486da4d5592bd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c225dea486da4d5592bd Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 37 +++++++++++++++++++------------------ include/linux/hugetlb.h | 12 ++++++++++++ include/linux/pagemap.h | 32 ++------------------------------ mm/filemap.c | 34 ++++++++++------------------------ mm/hugetlb.c | 32 ++++++-------------------------- mm/migrate.c | 6 +++--- 6 files changed, 52 insertions(+), 101 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 60fce26ff937..926d01c493fb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -334,7 +334,7 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) ssize_t retval = 0; while (iov_iter_count(to)) { - struct page *page; + struct folio *folio; size_t nr, copied, want; /* nr is the maximum number of bytes to copy from this page */ @@ -352,18 +352,18 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) } nr = nr - offset; - /* Find the page */ - page = find_lock_page(mapping, index); - if (unlikely(page == NULL)) { + /* Find the folio */ + folio = filemap_lock_hugetlb_folio(h, mapping, index); + if (IS_ERR(folio)) { /* * We have a HOLE, zero out the user-buffer for the * length of the hole or request. */ copied = iov_iter_zero(nr, to); } else { - unlock_page(page); + folio_unlock(folio); - if (!PageHWPoison(page)) + if (!folio_test_has_hwpoisoned(folio)) want = nr; else { /* @@ -371,19 +371,19 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) * touching the 1st raw HWPOISON subpage after * offset. */ - want = adjust_range_hwpoison(page, offset, nr); + want = adjust_range_hwpoison(&folio->page, offset, nr); if (want == 0) { - put_page(page); + folio_put(folio); retval = -EIO; break; } } /* - * We have the page, copy it to user space buffer. + * We have the folio, copy it to user space buffer. */ - copied = copy_page_to_iter(page, offset, want, to); - put_page(page); + copied = copy_folio_to_iter(folio, offset, want, to); + folio_put(folio); } offset += copied; retval += copied; @@ -661,21 +661,20 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, { struct hstate *h = hstate_inode(inode); struct address_space *mapping = &inode->i_data; - const pgoff_t start = lstart >> huge_page_shift(h); - const pgoff_t end = lend >> huge_page_shift(h); + const pgoff_t end = lend >> PAGE_SHIFT; struct folio_batch fbatch; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); folio_batch_init(&fbatch); - next = start; + next = lstart >> PAGE_SHIFT; while (filemap_get_folios(mapping, &next, end - 1, &fbatch)) { for (i = 0; i < folio_batch_count(&fbatch); ++i) { struct folio *folio = fbatch.folios[i]; u32 hash = 0; - index = folio->index; + index = folio->index >> huge_page_order(h); hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -693,7 +692,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } if (truncate_op) - (void)hugetlb_unreserve_pages(inode, start, LONG_MAX, freed); + (void)hugetlb_unreserve_pages(inode, + lstart >> huge_page_shift(h), + LONG_MAX, freed); } static void hugetlbfs_evict_inode(struct inode *inode) @@ -741,7 +742,7 @@ static void hugetlbfs_zero_partial_page(struct hstate *h, pgoff_t idx = start >> huge_page_shift(h); struct folio *folio; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) return; @@ -886,7 +887,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* See if already present in mapping to avoid alloc/free */ - folio = filemap_get_folio(mapping, index); + folio = filemap_get_folio(mapping, index << huge_page_order(h)); if (!IS_ERR(folio)) { folio_put(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index a30686e649f7..d935b0584f1d 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -812,6 +812,12 @@ static inline unsigned int blocks_per_huge_page(struct hstate *h) return huge_page_size(h) / 512; } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, + struct address_space *mapping, pgoff_t idx) +{ + return filemap_lock_folio(mapping, idx << huge_page_order(h)); +} + #include #ifndef is_hugepage_only_range @@ -1008,6 +1014,12 @@ static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio return NULL; } +static inline struct folio *filemap_lock_hugetlb_folio(struct hstate *h, + struct address_space *mapping, pgoff_t idx) +{ + return NULL; +} + static inline int isolate_or_dissolve_huge_page(struct page *page, struct list_head *list) { diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 351c3b7f93a1..759b29d9a69a 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -789,9 +789,6 @@ static inline pgoff_t folio_next_index(struct folio *folio) */ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) { - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (folio_test_hugetlb(folio)) - return &folio->page; return folio_page(folio, index & (folio_nr_pages(folio) - 1)); } @@ -807,9 +804,6 @@ static inline struct page *folio_file_page(struct folio *folio, pgoff_t index) */ static inline bool folio_contains(struct folio *folio, pgoff_t index) { - /* HugeTLBfs indexes the page cache in units of hpage_size */ - if (folio_test_hugetlb(folio)) - return folio->index == index; return index - folio_index(folio) < folio_nr_pages(folio); } @@ -867,10 +861,9 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping, } /* - * Get index of the page within radix-tree (but not for hugetlb pages). - * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) + * Get the offset in PAGE_SIZE (even for hugetlb pages). */ -static inline pgoff_t page_to_index(struct page *page) +static inline pgoff_t page_to_pgoff(struct page *page) { struct page *head; @@ -885,19 +878,6 @@ static inline pgoff_t page_to_index(struct page *page) return head->index + page - head; } -extern pgoff_t hugetlb_basepage_index(struct page *page); - -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). - * (TODO: hugetlb pages should have ->index in PAGE_SIZE) - */ -static inline pgoff_t page_to_pgoff(struct page *page) -{ - if (unlikely(PageHuge(page))) - return hugetlb_basepage_index(page); - return page_to_index(page); -} - /* * Return byte-offset into filesystem object for page. */ @@ -934,24 +914,16 @@ static inline loff_t folio_file_pos(struct folio *folio) /* * Get the offset in PAGE_SIZE (even for hugetlb folios). - * (TODO: hugetlb folios should have ->index in PAGE_SIZE) */ static inline pgoff_t folio_pgoff(struct folio *folio) { - if (unlikely(folio_test_hugetlb(folio))) - return hugetlb_basepage_index(&folio->page); return folio->index; } -extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address); - static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; - if (unlikely(is_vm_hugetlb_page(vma))) - return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; diff --git a/mm/filemap.c b/mm/filemap.c index 9481ffaf24e6..340c693112a9 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -131,11 +131,8 @@ static void page_cache_delete(struct address_space *mapping, mapping_set_update(&xas, mapping); - /* hugetlb pages are represented by a single entry in the xarray */ - if (!folio_test_hugetlb(folio)) { - xas_set_order(&xas, folio->index, folio_order(folio)); - nr = folio_nr_pages(folio); - } + xas_set_order(&xas, folio->index, folio_order(folio)); + nr = folio_nr_pages(folio); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); @@ -234,7 +231,7 @@ void filemap_free_folio(struct address_space *mapping, struct folio *folio) if (free_folio) free_folio(folio); - if (folio_test_large(folio) && !folio_test_hugetlb(folio)) + if (folio_test_large(folio)) refs = folio_nr_pages(folio); folio_put_refs(folio, refs); } @@ -855,14 +852,15 @@ noinline int __filemap_add_folio(struct address_space *mapping, if (!huge) { int error = mem_cgroup_charge(folio, NULL, gfp); - VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); if (error) return error; charged = true; - xas_set_order(&xas, index, folio_order(folio)); - nr = folio_nr_pages(folio); } + VM_BUG_ON_FOLIO(index & (folio_nr_pages(folio) - 1), folio); + xas_set_order(&xas, index, folio_order(folio)); + nr = folio_nr_pages(folio); + gfp &= GFP_RECLAIM_MASK; folio_ref_add(folio, nr); folio->mapping = mapping; @@ -2040,7 +2038,7 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start, int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2104,7 +2102,7 @@ put: int idx = folio_batch_count(fbatch) - 1; folio = fbatch->folios[idx]; - if (!xa_is_value(folio) && !folio_test_hugetlb(folio)) + if (!xa_is_value(folio)) nr = folio_nr_pages(folio); *start = indices[idx] + nr; } @@ -2145,9 +2143,6 @@ unsigned filemap_get_folios(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2213,9 +2208,6 @@ unsigned filemap_get_folios_contig(struct address_space *mapping, if (!folio_batch_add(fbatch, folio)) { nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } @@ -2232,10 +2224,7 @@ update_start: if (nr) { folio = fbatch->folios[nr - 1]; - if (folio_test_hugetlb(folio)) - *start = folio->index + 1; - else - *start = folio_next_index(folio); + *start = folio->index + folio_nr_pages(folio); } out: rcu_read_unlock(); @@ -2273,9 +2262,6 @@ unsigned filemap_get_folios_tag(struct address_space *mapping, pgoff_t *start, continue; if (!folio_batch_add(fbatch, folio)) { unsigned long nr = folio_nr_pages(folio); - - if (folio_test_hugetlb(folio)) - nr = 1; *start = folio->index + nr; goto out; } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index de220e3ff8be..0aaf28ce32e5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -952,7 +952,7 @@ static long region_count(struct resv_map *resv, long f, long t) /* * Convert the address within this vma to the page offset within - * the mapping, in pagecache page units; huge pages here. + * the mapping, huge page units here. */ static pgoff_t vma_hugecache_offset(struct hstate *h, struct vm_area_struct *vma, unsigned long address) @@ -961,13 +961,6 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } -pgoff_t linear_hugepage_index(struct vm_area_struct *vma, - unsigned long address) -{ - return vma_hugecache_offset(hstate_vma(vma), vma, address); -} -EXPORT_SYMBOL_GPL(linear_hugepage_index); - /** * vma_kernel_pagesize - Page size granularity for this VMA. * @vma: The user mapping. @@ -2074,20 +2067,6 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t hugetlb_basepage_index(struct page *page) -{ - struct page *page_head = compound_head(page); - pgoff_t index = page_index(page_head); - unsigned long compound_idx; - - if (compound_order(page_head) > MAX_ORDER) - compound_idx = page_to_pfn(page) - page_to_pfn(page_head); - else - compound_idx = page - page_head; - - return (index << compound_order(page_head)) + compound_idx; -} - static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, nodemask_t *node_alloc_noretry) @@ -5772,7 +5751,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx = vma_hugecache_offset(h, vma, address); + pgoff_t idx = linear_page_index(vma, address); struct folio *folio; folio = filemap_get_folio(mapping, idx); @@ -5789,6 +5768,7 @@ int hugetlb_add_to_page_cache(struct folio *folio, struct address_space *mapping struct hstate *h = hstate_inode(inode); int err; + idx <<= huge_page_order(h); __folio_set_locked(folio); err = __filemap_add_folio(mapping, folio, idx, GFP_KERNEL, NULL); @@ -5896,7 +5876,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, * before we get page_table_lock. */ new_folio = false; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) { size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -6205,7 +6185,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - pagecache_folio = filemap_lock_folio(mapping, idx); + pagecache_folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(pagecache_folio)) pagecache_folio = NULL; } @@ -6338,7 +6318,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte, if (is_continue) { ret = -EFAULT; - folio = filemap_lock_folio(mapping, idx); + folio = filemap_lock_hugetlb_folio(h, mapping, idx); if (IS_ERR(folio)) goto out; folio_in_pagecache = true; diff --git a/mm/migrate.c b/mm/migrate.c index 7d1804c4a5d9..942f8c9cd068 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -524,7 +524,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, int expected_count; xas_lock_irq(&xas); - expected_count = 2 + folio_has_private(src); + expected_count = folio_expected_refs(mapping, src); if (!folio_ref_freeze(src, expected_count)) { xas_unlock_irq(&xas); return -EAGAIN; @@ -533,11 +533,11 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, dst->index = src->index; dst->mapping = src->mapping; - folio_get(dst); + folio_ref_add(dst, folio_nr_pages(dst)); xas_store(&xas, dst); - folio_ref_unfreeze(src, expected_count - 1); + folio_ref_unfreeze(src, expected_count - folio_nr_pages(src)); xas_unlock_irq(&xas); -- cgit v1.2.3 From d61ea1cb009532dcbd77a9d44b812704cec60146 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Mon, 21 Aug 2023 19:15:13 +0500 Subject: userfaultfd: UFFD_FEATURE_WP_ASYNC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Implement IOCTL to get and optionally clear info about PTEs", v33. *Motivation* The real motivation for adding PAGEMAP_SCAN IOCTL is to emulate Windows GetWriteWatch() and ResetWriteWatch() syscalls [1]. The GetWriteWatch() retrieves the addresses of the pages that are written to in a region of virtual memory. This syscall is used in Windows applications and games etc. This syscall is being emulated in pretty slow manner in userspace. Our purpose is to enhance the kernel such that we translate it efficiently in a better way. Currently some out of tree hack patches are being used to efficiently emulate it in some kernels. We intend to replace those with these patches. So the whole gaming on Linux can effectively get benefit from this. It means there would be tons of users of this code. CRIU use case [2] was mentioned by Andrei and Danylo: > Use cases for migrating sparse VMAs are binaries sanitized with ASAN, > MSAN or TSAN [3]. All of these sanitizers produce sparse mappings of > shadow memory [4]. Being able to migrate such binaries allows to highly > reduce the amount of work needed to identify and fix post-migration > crashes, which happen constantly. Andrei defines the following uses of this code: * it is more granular and allows us to track changed pages more effectively. The current interface can clear dirty bits for the entire process only. In addition, reading info about pages is a separate operation. It means we must freeze the process to read information about all its pages, reset dirty bits, only then we can start dumping pages. The information about pages becomes more and more outdated, while we are processing pages. The new interface solves both these downsides. First, it allows us to read pte bits and clear the soft-dirty bit atomically. It means that CRIU will not need to freeze processes to pre-dump their memory. Second, it clears soft-dirty bits for a specified region of memory. It means CRIU will have actual info about pages to the moment of dumping them. * The new interface has to be much faster because basic page filtering is happening in the kernel. With the old interface, we have to read pagemap for each page. *Implementation Evolution (Short Summary)* From the definition of GetWriteWatch(), we feel like kernel's soft-dirty feature can be used under the hood with some additions like: * reset soft-dirty flag for only a specific region of memory instead of clearing the flag for the entire process * get and clear soft-dirty flag for a specific region atomically So we decided to use ioctl on pagemap file to read or/and reset soft-dirty flag. But using soft-dirty flag, sometimes we get extra pages which weren't even written. They had become soft-dirty because of VMA merging and VM_SOFTDIRTY flag. This breaks the definition of GetWriteWatch(). We were able to by-pass this short coming by ignoring VM_SOFTDIRTY until David reported that mprotect etc messes up the soft-dirty flag while ignoring VM_SOFTDIRTY [5]. This wasn't happening until [6] got introduced. We discussed if we can revert these patches. But we could not reach to any conclusion. So at this point, I made couple of tries to solve this whole VM_SOFTDIRTY issue by correcting the soft-dirty implementation: * [7] Correct the bug fixed wrongly back in 2014. It had potential to cause regression. We left it behind. * [8] Keep a list of soft-dirty part of a VMA across splits and merges. I got the reply don't increase the size of the VMA by 8 bytes. At this point, we left soft-dirty considering it is too much delicate and userfaultfd [9] seemed like the only way forward. From there onward, we have been basing soft-dirty emulation on userfaultfd wp feature where kernel resolves the faults itself when WP_ASYNC feature is used. It was straight forward to add WP_ASYNC feature in userfautlfd. Now we get only those pages dirty or written-to which are really written in reality. (PS There is another WP_UNPOPULATED userfautfd feature is required which is needed to avoid pre-faulting memory before write-protecting [9].) All the different masks were added on the request of CRIU devs to create interface more generic and better. [1] https://learn.microsoft.com/en-us/windows/win32/api/memoryapi/nf-memoryapi-getwritewatch [2] https://lore.kernel.org/all/20221014134802.1361436-1-mdanylo@google.com [3] https://github.com/google/sanitizers [4] https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm#64-bit [5] https://lore.kernel.org/all/bfcae708-db21-04b4-0bbe-712badd03071@redhat.com [6] https://lore.kernel.org/all/20220725142048.30450-1-peterx@redhat.com/ [7] https://lore.kernel.org/all/20221122115007.2787017-1-usama.anjum@collabora.com [8] https://lore.kernel.org/all/20221220162606.1595355-1-usama.anjum@collabora.com [9] https://lore.kernel.org/all/20230306213925.617814-1-peterx@redhat.com [10] https://lore.kernel.org/all/20230125144529.1630917-1-mdanylo@google.com This patch (of 6): Add a new userfaultfd-wp feature UFFD_FEATURE_WP_ASYNC, that allows userfaultfd wr-protect faults to be resolved by the kernel directly. It can be used like a high accuracy version of soft-dirty, without vma modifications during tracking, and also with ranged support by default rather than for a whole mm when reset the protections due to existence of ioctl(UFFDIO_WRITEPROTECT). Several goals of such a dirty tracking interface: 1. All types of memory should be supported and tracable. This is nature for soft-dirty but should mention when the context is userfaultfd, because it used to only support anon/shmem/hugetlb. The problem is for a dirty tracking purpose these three types may not be enough, and it's legal to track anything e.g. any page cache writes from mmap. 2. Protections can be applied to partial of a memory range, without vma split/merge fuss. The hope is that the tracking itself should not affect any vma layout change. It also helps when reset happens because the reset will not need mmap write lock which can block the tracee. 3. Accuracy needs to be maintained. This means we need pte markers to work on any type of VMA. One could question that, the whole concept of async dirty tracking is not really close to fundamentally what userfaultfd used to be: it's not "a fault to be serviced by userspace" anymore. However, using userfaultfd-wp here as a framework is convenient for us in at least: 1. VM_UFFD_WP vma flag, which has a very good name to suite something like this, so we don't need VM_YET_ANOTHER_SOFT_DIRTY. Just use a new feature bit to identify from a sync version of uffd-wp registration. 2. PTE markers logic can be leveraged across the whole kernel to maintain the uffd-wp bit as long as an arch supports, this also applies to this case where uffd-wp bit will be a hint to dirty information and it will not go lost easily (e.g. when some page cache ptes got zapped). 3. Reuse ioctl(UFFDIO_WRITEPROTECT) interface for either starting or resetting a range of memory, while there's no counterpart in the old soft-dirty world, hence if this is wanted in a new design we'll need a new interface otherwise. We can somehow understand that commonality because uffd-wp was fundamentally a similar idea of write-protecting pages just like soft-dirty. This implementation allows WP_ASYNC to imply WP_UNPOPULATED, because so far WP_ASYNC seems to not usable if without WP_UNPOPULATE. This also gives us chance to modify impl of WP_ASYNC just in case it could be not depending on WP_UNPOPULATED anymore in the future kernels. It's also fine to imply that because both features will rely on PTE_MARKER_UFFD_WP config option, so they'll show up together (or both missing) in an UFFDIO_API probe. vma_can_userfault() now allows any VMA if the userfaultfd registration is only about async uffd-wp. So we can track dirty for all kinds of memory including generic file systems (like XFS, EXT4 or BTRFS). One trick worth mention in do_wp_page() is that we need to manually update vmf->orig_pte here because it can be used later with a pte_same() check - this path always has FAULT_FLAG_ORIG_PTE_VALID set in the flags. The major defect of this approach of dirty tracking is we need to populate the pgtables when tracking starts. Soft-dirty doesn't do it like that. It's unwanted in the case where the range of memory to track is huge and unpopulated (e.g., tracking updates on a 10G file with mmap() on top, without having any page cache installed yet). One way to improve this is to allow pte markers exist for larger than PTE level for PMD+. That will not change the interface if to implemented, so we can leave that for later. Link: https://lkml.kernel.org/r/20230821141518.870589-1-usama.anjum@collabora.com Link: https://lkml.kernel.org/r/20230821141518.870589-2-usama.anjum@collabora.com Signed-off-by: Peter Xu Co-developed-by: Muhammad Usama Anjum Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Cc: Michał Mirosław Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/userfaultfd.rst | 35 ++++++++++++++++++++++++++++ fs/userfaultfd.c | 26 +++++++++++++++++---- include/linux/userfaultfd_k.h | 21 ++++++++++++++++- include/uapi/linux/userfaultfd.h | 9 ++++++- mm/hugetlb.c | 32 ++++++++++++++----------- mm/memory.c | 28 +++++++++++++++++++--- 6 files changed, 129 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 4349a8c2b978..203e26da5f92 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -244,6 +244,41 @@ write-protected (so future writes will also result in a WP fault). These ioctls support a mode flag (``UFFDIO_COPY_MODE_WP`` or ``UFFDIO_CONTINUE_MODE_WP`` respectively) to configure the mapping this way. +If the userfaultfd context has ``UFFD_FEATURE_WP_ASYNC`` feature bit set, +any vma registered with write-protection will work in async mode rather +than the default sync mode. + +In async mode, there will be no message generated when a write operation +happens, meanwhile the write-protection will be resolved automatically by +the kernel. It can be seen as a more accurate version of soft-dirty +tracking and it can be different in a few ways: + + - The dirty result will not be affected by vma changes (e.g. vma + merging) because the dirty is only tracked by the pte. + + - It supports range operations by default, so one can enable tracking on + any range of memory as long as page aligned. + + - Dirty information will not get lost if the pte was zapped due to + various reasons (e.g. during split of a shmem transparent huge page). + + - Due to a reverted meaning of soft-dirty (page clean when uffd-wp bit + set; dirty when uffd-wp bit cleared), it has different semantics on + some of the memory operations. For example: ``MADV_DONTNEED`` on + anonymous (or ``MADV_REMOVE`` on a file mapping) will be treated as + dirtying of memory by dropping uffd-wp bit during the procedure. + +The user app can collect the "written/dirty" status by looking up the +uffd-wp bit for the pages being interested in /proc/pagemap. + +The page will not be under track of uffd-wp async mode until the page is +explicitly write-protected by ``ioctl(UFFDIO_WRITEPROTECT)`` with the mode +flag ``UFFDIO_WRITEPROTECT_MODE_WP`` set. Trying to resolve a page fault +that was tracked by async mode userfaultfd-wp is invalid. + +When userfaultfd-wp async mode is used alone, it can be applied to all +kinds of memory. + Memory Poisioning Emulation --------------------------- diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 56eaae9dac1a..a7c6ef764e63 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -123,6 +123,11 @@ static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx) return ctx->features & UFFD_FEATURE_INITIALIZED; } +static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx) +{ + return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC); +} + /* * Whether WP_UNPOPULATED is enabled on the uffd context. It is only * meaningful when userfaultfd_wp()==true on the vma and when it's @@ -1325,6 +1330,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool basic_ioctls; unsigned long start, end, vma_end; struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1399,7 +1405,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, /* check not compatible vmas */ ret = -EINVAL; - if (!vma_can_userfault(cur, vm_flags)) + if (!vma_can_userfault(cur, vm_flags, wp_async)) goto out_unlock; /* @@ -1460,7 +1466,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vm_flags)); + BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async)); BUG_ON(vma->vm_userfaultfd_ctx.ctx && vma->vm_userfaultfd_ctx.ctx != ctx); WARN_ON(!(vma->vm_flags & VM_MAYWRITE)); @@ -1561,6 +1567,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; struct vma_iterator vmi; + bool wp_async = userfaultfd_wp_async_ctx(ctx); pgoff_t pgoff; ret = -EFAULT; @@ -1615,7 +1622,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, * provides for more strict behavior to notice * unregistration errors. */ - if (!vma_can_userfault(cur, cur->vm_flags)) + if (!vma_can_userfault(cur, cur->vm_flags, wp_async)) goto out_unlock; found = true; @@ -1631,7 +1638,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, for_each_vma_range(vmi, vma, end) { cond_resched(); - BUG_ON(!vma_can_userfault(vma, vma->vm_flags)); + BUG_ON(!vma_can_userfault(vma, vma->vm_flags, wp_async)); /* * Nothing to do: this vma is already registered into this @@ -2018,6 +2025,11 @@ out: return ret; } +bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx); +} + static inline unsigned int uffd_ctx_features(__u64 user_features) { /* @@ -2051,6 +2063,11 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, ret = -EPERM; if ((features & UFFD_FEATURE_EVENT_FORK) && !capable(CAP_SYS_PTRACE)) goto err_out; + + /* WP_ASYNC relies on WP_UNPOPULATED, choose it unconditionally */ + if (features & UFFD_FEATURE_WP_ASYNC) + features |= UFFD_FEATURE_WP_UNPOPULATED; + /* report all available features and ioctls to userland */ uffdio_api.features = UFFD_API_FEATURES; #ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR @@ -2063,6 +2080,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, #ifndef CONFIG_PTE_MARKER_UFFD_WP uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; #endif uffdio_api.ioctls = UFFD_API_IOCTLS; ret = -EFAULT; diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index ac8c6854097c..c98df391bfd8 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -161,11 +161,22 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) } static inline bool vma_can_userfault(struct vm_area_struct *vma, - unsigned long vm_flags) + unsigned long vm_flags, + bool wp_async) { + vm_flags &= __VM_UFFD_FLAGS; + if ((vm_flags & VM_UFFD_MINOR) && (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) return false; + + /* + * If wp async enabled, and WP is the only mode enabled, allow any + * memory type. + */ + if (wp_async && (vm_flags == VM_UFFD_WP)) + return true; + #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for @@ -175,6 +186,8 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma, if ((vm_flags & VM_UFFD_WP) && !vma_is_anonymous(vma)) return false; #endif + + /* By default, allow any of anon|shmem|hugetlb */ return vma_is_anonymous(vma) || is_vm_hugetlb_page(vma) || vma_is_shmem(vma); } @@ -197,6 +210,7 @@ extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma); +extern bool userfaultfd_wp_async(struct vm_area_struct *vma); #else /* CONFIG_USERFAULTFD */ @@ -297,6 +311,11 @@ static inline bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma) return false; } +static inline bool userfaultfd_wp_async(struct vm_area_struct *vma) +{ + return false; +} + #endif /* CONFIG_USERFAULTFD */ static inline bool userfaultfd_wp_use_markers(struct vm_area_struct *vma) diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 62151706c5a3..0dbc81015018 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -40,7 +40,8 @@ UFFD_FEATURE_EXACT_ADDRESS | \ UFFD_FEATURE_WP_HUGETLBFS_SHMEM | \ UFFD_FEATURE_WP_UNPOPULATED | \ - UFFD_FEATURE_POISON) + UFFD_FEATURE_POISON | \ + UFFD_FEATURE_WP_ASYNC) #define UFFD_API_IOCTLS \ ((__u64)1 << _UFFDIO_REGISTER | \ (__u64)1 << _UFFDIO_UNREGISTER | \ @@ -216,6 +217,11 @@ struct uffdio_api { * (i.e. empty ptes). This will be the default behavior for shmem * & hugetlbfs, so this flag only affects anonymous memory behavior * when userfault write-protection mode is registered. + * + * UFFD_FEATURE_WP_ASYNC indicates that userfaultfd write-protection + * asynchronous mode is supported in which the write fault is + * automatically resolved and write-protection is un-set. + * It implies UFFD_FEATURE_WP_UNPOPULATED. */ #define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0) #define UFFD_FEATURE_EVENT_FORK (1<<1) @@ -232,6 +238,7 @@ struct uffdio_api { #define UFFD_FEATURE_WP_HUGETLBFS_SHMEM (1<<12) #define UFFD_FEATURE_WP_UNPOPULATED (1<<13) #define UFFD_FEATURE_POISON (1<<14) +#define UFFD_FEATURE_WP_ASYNC (1<<15) __u64 features; __u64 ioctls; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dfb4534834f5..bc654b36df9f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6247,21 +6247,27 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Handle userfault-wp first, before trying to lock more pages */ if (userfaultfd_wp(vma) && huge_pte_uffd_wp(huge_ptep_get(ptep)) && (flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - struct vm_fault vmf = { - .vma = vma, - .address = haddr, - .real_address = address, - .flags = flags, - }; + if (!userfaultfd_wp_async(vma)) { + struct vm_fault vmf = { + .vma = vma, + .address = haddr, + .real_address = address, + .flags = flags, + }; - spin_unlock(ptl); - if (pagecache_folio) { - folio_unlock(pagecache_folio); - folio_put(pagecache_folio); + spin_unlock(ptl); + if (pagecache_folio) { + folio_unlock(pagecache_folio); + folio_put(pagecache_folio); + } + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return handle_userfault(&vmf, VM_UFFD_WP); } - hugetlb_vma_unlock_read(vma); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - return handle_userfault(&vmf, VM_UFFD_WP); + + entry = huge_pte_clear_uffd_wp(entry); + set_huge_pte_at(mm, haddr, ptep, entry); + /* Fallthrough to CoW */ } /* diff --git a/mm/memory.c b/mm/memory.c index 2d209bc4d18c..d0f61d729fb4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,3 +1,4 @@ + // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c @@ -3349,11 +3350,28 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; struct folio *folio = NULL; + pte_t pte; if (likely(!unshare)) { if (userfaultfd_pte_wp(vma, ptep_get(vmf->pte))) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return handle_userfault(vmf, VM_UFFD_WP); + if (!userfaultfd_wp_async(vma)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_WP); + } + + /* + * Nothing needed (cache flush, TLB invalidations, + * etc.) because we're only removing the uffd-wp bit, + * which is completely invisible to the user. + */ + pte = pte_clear_uffd_wp(ptep_get(vmf->pte)); + + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + /* + * Update this to be prepared for following up CoW + * handling + */ + vmf->orig_pte = pte; } /* @@ -4879,8 +4897,11 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) if (vma_is_anonymous(vma)) { if (likely(!unshare) && - userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) + userfaultfd_huge_pmd_wp(vma, vmf->orig_pmd)) { + if (userfaultfd_wp_async(vmf->vma)) + goto split; return handle_userfault(vmf, VM_UFFD_WP); + } return do_huge_pmd_wp_page(vmf); } @@ -4892,6 +4913,7 @@ static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf) } } +split: /* COW or write-notify handled on pte level: split pmd. */ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); -- cgit v1.2.3 From 52526ca7fdb905a768a93f8faa418e9b988fc34b Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:14 +0500 Subject: fs/proc/task_mmu: implement IOCTL to get and optionally clear info about PTEs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PAGEMAP_SCAN IOCTL on the pagemap file can be used to get or optionally clear the info about page table entries. The following operations are supported in this IOCTL: - Scan the address range and get the memory ranges matching the provided criteria. This is performed when the output buffer is specified. - Write-protect the pages. The PM_SCAN_WP_MATCHING is used to write-protect the pages of interest. The PM_SCAN_CHECK_WPASYNC aborts the operation if non-Async Write Protected pages are found. The ``PM_SCAN_WP_MATCHING`` can be used with or without PM_SCAN_CHECK_WPASYNC. - Both of those operations can be combined into one atomic operation where we can get and write protect the pages as well. Following flags about pages are currently supported: - PAGE_IS_WPALLOWED - Page has async-write-protection enabled - PAGE_IS_WRITTEN - Page has been written to from the time it was write protected - PAGE_IS_FILE - Page is file backed - PAGE_IS_PRESENT - Page is present in the memory - PAGE_IS_SWAPPED - Page is in swapped - PAGE_IS_PFNZERO - Page has zero PFN - PAGE_IS_HUGE - Page is THP or Hugetlb backed This IOCTL can be extended to get information about more PTE bits. The entire address range passed by user [start, end) is scanned until either the user provided buffer is full or max_pages have been found. [akpm@linux-foundation.org: update it for "mm: hugetlb: add huge page size param to set_huge_pte_at()"] [akpm@linux-foundation.org: fix CONFIG_HUGETLB_PAGE=n warning] [arnd@arndb.de: hide unused pagemap_scan_backout_range() function] Link: https://lkml.kernel.org/r/20230927060257.2975412-1-arnd@kernel.org [sfr@canb.auug.org.au: fix "fs/proc/task_mmu: hide unused pagemap_scan_backout_range() function"] Link: https://lkml.kernel.org/r/20230928092223.0625c6bf@canb.auug.org.au Link: https://lkml.kernel.org/r/20230821141518.870589-3-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Signed-off-by: Michał Mirosław Signed-off-by: Arnd Bergmann Signed-off-by: Stephen Rothwell Reviewed-by: Andrei Vagin Reviewed-by: Michał Mirosław Cc: Alex Sierra Cc: Al Viro Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 692 ++++++++++++++++++++++++++++++++++++++++++ include/linux/hugetlb.h | 1 + include/linux/userfaultfd_k.h | 7 + include/uapi/linux/fs.h | 59 ++++ mm/hugetlb.c | 5 +- 5 files changed, 762 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3dd5be96691b..d4ef9a2bf95d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include #include #include @@ -1761,11 +1763,701 @@ static int pagemap_release(struct inode *inode, struct file *file) return 0; } +#define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ + PAGE_IS_FILE | PAGE_IS_PRESENT | \ + PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ + PAGE_IS_HUGE) +#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) + +struct pagemap_scan_private { + struct pm_scan_arg arg; + unsigned long masks_of_interest, cur_vma_category; + struct page_region *vec_buf; + unsigned long vec_buf_len, vec_buf_index, found_pages; + struct page_region __user *vec_out; +}; + +static unsigned long pagemap_page_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pte_t pte) +{ + unsigned long categories = 0; + + if (pte_present(pte)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page(vma, addr, pte); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pte_to_swp_entry(pte); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ + pte_t ptent = ptep_get(pte); + + if (pte_present(ptent)) { + pte_t old_pte; + + old_pte = ptep_modify_prot_start(vma, addr, pte); + ptent = pte_mkuffd_wp(ptent); + ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); + } else if (is_swap_pte(ptent)) { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); + } else { + set_pte_at(vma->vm_mm, addr, pte, + make_pte_marker(PTE_MARKER_UFFD_WP)); + } +} + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, + struct vm_area_struct *vma, + unsigned long addr, pmd_t pmd) +{ + unsigned long categories = PAGE_IS_HUGE; + + if (pmd_present(pmd)) { + struct page *page; + + categories |= PAGE_IS_PRESENT; + if (!pmd_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + page = vm_normal_page_pmd(vma, addr, pmd); + if (page && !PageAnon(page)) + categories |= PAGE_IS_FILE; + } + + if (is_zero_pfn(pmd_pfn(pmd))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pmd(pmd)) { + swp_entry_t swp; + + categories |= PAGE_IS_SWAPPED; + if (!pmd_swp_uffd_wp(pmd)) + categories |= PAGE_IS_WRITTEN; + + if (p->masks_of_interest & PAGE_IS_FILE) { + swp = pmd_to_swp_entry(pmd); + if (is_pfn_swap_entry(swp) && + !PageAnon(pfn_swap_entry_to_page(swp))) + categories |= PAGE_IS_FILE; + } + } + + return categories; +} + +static void make_uffd_wp_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t old, pmd = *pmdp; + + if (pmd_present(pmd)) { + old = pmdp_invalidate_ad(vma, addr, pmdp); + pmd = pmd_mkuffd_wp(old); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + pmd = pmd_swp_mkuffd_wp(pmd); + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long pagemap_hugetlb_category(pte_t pte) +{ + unsigned long categories = PAGE_IS_HUGE; + + /* + * According to pagemap_hugetlb_range(), file-backed HugeTLB + * page cannot be swapped. So PAGE_IS_FILE is not checked for + * swapped pages. + */ + if (pte_present(pte)) { + categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) + categories |= PAGE_IS_WRITTEN; + if (!PageAnon(pte_page(pte))) + categories |= PAGE_IS_FILE; + if (is_zero_pfn(pte_pfn(pte))) + categories |= PAGE_IS_PFNZERO; + } else if (is_swap_pte(pte)) { + categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) + categories |= PAGE_IS_WRITTEN; + } + + return categories; +} + +static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep, + pte_t ptent) +{ + unsigned long psize; + + if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + return; + + psize = huge_page_size(hstate_vma(vma)); + + if (is_hugetlb_entry_migration(ptent)) + set_huge_pte_at(vma->vm_mm, addr, ptep, + pte_swp_mkuffd_wp(ptent), psize); + else if (!huge_pte_none(ptent)) + huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, + huge_pte_mkuffd_wp(ptent)); + else + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); +} +#endif /* CONFIG_HUGETLB_PAGE */ + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLB_PAGE) +static void pagemap_scan_backout_range(struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + if (cur_buf->start != addr) + cur_buf->end = addr; + else + cur_buf->start = cur_buf->end = 0; + + p->found_pages -= (end - addr) / PAGE_SIZE; +} +#endif + +static bool pagemap_scan_is_interesting_page(unsigned long categories, + const struct pagemap_scan_private *p) +{ + categories ^= p->arg.category_inverted; + if ((categories & p->arg.category_mask) != p->arg.category_mask) + return false; + if (p->arg.category_anyof_mask && !(categories & p->arg.category_anyof_mask)) + return false; + + return true; +} + +static bool pagemap_scan_is_interesting_vma(unsigned long categories, + const struct pagemap_scan_private *p) +{ + unsigned long required = p->arg.category_mask & PAGE_IS_WPALLOWED; + + categories ^= p->arg.category_inverted; + if ((categories & required) != required) + return false; + + return true; +} + +static int pagemap_scan_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long vma_category = 0; + + if (userfaultfd_wp_async(vma) && userfaultfd_wp_use_markers(vma)) + vma_category |= PAGE_IS_WPALLOWED; + else if (p->arg.flags & PM_SCAN_CHECK_WPASYNC) + return -EPERM; + + if (vma->vm_flags & VM_PFNMAP) + return 1; + + if (!pagemap_scan_is_interesting_vma(vma_category, p)) + return 1; + + p->cur_vma_category = vma_category; + + return 0; +} + +static bool pagemap_scan_push_range(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long end) +{ + struct page_region *cur_buf = &p->vec_buf[p->vec_buf_index]; + + /* + * When there is no output buffer provided at all, the sentinel values + * won't match here. There is no other way for `cur_buf->end` to be + * non-zero other than it being non-empty. + */ + if (addr == cur_buf->end && categories == cur_buf->categories) { + cur_buf->end = end; + return true; + } + + if (cur_buf->end) { + if (p->vec_buf_index >= p->vec_buf_len - 1) + return false; + + cur_buf = &p->vec_buf[++p->vec_buf_index]; + } + + cur_buf->start = addr; + cur_buf->end = end; + cur_buf->categories = categories; + + return true; +} + +static int pagemap_scan_output(unsigned long categories, + struct pagemap_scan_private *p, + unsigned long addr, unsigned long *end) +{ + unsigned long n_pages, total_pages; + int ret = 0; + + if (!p->vec_buf) + return 0; + + categories &= p->arg.return_mask; + + n_pages = (*end - addr) / PAGE_SIZE; + if (check_add_overflow(p->found_pages, n_pages, &total_pages) || + total_pages > p->arg.max_pages) { + size_t n_too_much = total_pages - p->arg.max_pages; + *end -= n_too_much * PAGE_SIZE; + n_pages -= n_too_much; + ret = -ENOSPC; + } + + if (!pagemap_scan_push_range(categories, p, addr, *end)) { + *end = addr; + n_pages = 0; + ret = -ENOSPC; + } + + p->found_pages += n_pages; + if (ret) + p->arg.walk_end = *end; + + return ret; +} + +static int pagemap_scan_thp_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) + return -ENOENT; + + categories = p->cur_vma_category | + pagemap_thp_category(p, vma, start, *pmd); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + goto out_unlock; + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + /* + * Break huge page into small pages if the WP operation + * needs to be performed on a portion of the huge page. + */ + if (end != start + HPAGE_SIZE) { + spin_unlock(ptl); + split_huge_pmd(vma, pmd, start); + pagemap_scan_backout_range(p, start, end); + /* Report as if there was no THP */ + return -ENOENT; + } + + make_uffd_wp_pmd(vma, start, pmd); + flush_tlb_range(vma, start, end); +out_unlock: + spin_unlock(ptl); + return ret; +#else /* !CONFIG_TRANSPARENT_HUGEPAGE */ + return -ENOENT; +#endif +} + +static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long addr, flush_end = 0; + pte_t *pte, *start_pte; + spinlock_t *ptl; + int ret; + + arch_enter_lazy_mmu_mode(); + + ret = pagemap_scan_thp_entry(pmd, start, end, walk); + if (ret != -ENOENT) { + arch_leave_lazy_mmu_mode(); + return ret; + } + + ret = 0; + start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); + if (!pte) { + arch_leave_lazy_mmu_mode(); + walk->action = ACTION_AGAIN; + return 0; + } + + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + unsigned long categories = p->cur_vma_category | + pagemap_page_category(p, vma, addr, ptep_get(pte)); + unsigned long next = addr + PAGE_SIZE; + + if (!pagemap_scan_is_interesting_page(categories, p)) + continue; + + ret = pagemap_scan_output(categories, p, addr, &next); + if (next == addr) + break; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + if (~categories & PAGE_IS_WRITTEN) + continue; + + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + + if (flush_end) + flush_tlb_range(vma, start, addr); + + pte_unmap_unlock(start_pte, ptl); + arch_leave_lazy_mmu_mode(); + + cond_resched(); + return ret; +} + +#ifdef CONFIG_HUGETLB_PAGE +static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask, + unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + unsigned long categories; + spinlock_t *ptl; + int ret = 0; + pte_t pte; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) { + /* Go the short route when not write-protecting pages. */ + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + return 0; + + return pagemap_scan_output(categories, p, start, &end); + } + + i_mmap_lock_write(vma->vm_file->f_mapping); + ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep); + + pte = huge_ptep_get(ptep); + categories = p->cur_vma_category | pagemap_hugetlb_category(pte); + + if (!pagemap_scan_is_interesting_page(categories, p)) + goto out_unlock; + + ret = pagemap_scan_output(categories, p, start, &end); + if (start == end) + goto out_unlock; + + if (~categories & PAGE_IS_WRITTEN) + goto out_unlock; + + if (end != start + HPAGE_SIZE) { + /* Partial HugeTLB page WP isn't possible. */ + pagemap_scan_backout_range(p, start, end); + p->arg.walk_end = start; + ret = 0; + goto out_unlock; + } + + make_uffd_wp_huge_pte(vma, start, ptep, pte); + flush_hugetlb_tlb_range(vma, start, end); + +out_unlock: + spin_unlock(ptl); + i_mmap_unlock_write(vma->vm_file->f_mapping); + + return ret; +} +#else +#define pagemap_scan_hugetlb_entry NULL +#endif + +static int pagemap_scan_pte_hole(unsigned long addr, unsigned long end, + int depth, struct mm_walk *walk) +{ + struct pagemap_scan_private *p = walk->private; + struct vm_area_struct *vma = walk->vma; + int ret, err; + + if (!vma || !pagemap_scan_is_interesting_page(p->cur_vma_category, p)) + return 0; + + ret = pagemap_scan_output(p->cur_vma_category, p, addr, &end); + if (addr == end) + return ret; + + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + return ret; + + err = uffd_wp_range(vma, addr, end - addr, true); + if (err < 0) + ret = err; + + return ret; +} + +static const struct mm_walk_ops pagemap_scan_ops = { + .test_walk = pagemap_scan_test_walk, + .pmd_entry = pagemap_scan_pmd_entry, + .pte_hole = pagemap_scan_pte_hole, + .hugetlb_entry = pagemap_scan_hugetlb_entry, +}; + +static int pagemap_scan_get_args(struct pm_scan_arg *arg, + unsigned long uarg) +{ + if (copy_from_user(arg, (void __user *)uarg, sizeof(*arg))) + return -EFAULT; + + if (arg->size != sizeof(struct pm_scan_arg)) + return -EINVAL; + + /* Validate requested features */ + if (arg->flags & ~PM_SCAN_FLAGS) + return -EINVAL; + if ((arg->category_inverted | arg->category_mask | + arg->category_anyof_mask | arg->return_mask) & ~PM_SCAN_CATEGORIES) + return -EINVAL; + + arg->start = untagged_addr((unsigned long)arg->start); + arg->end = untagged_addr((unsigned long)arg->end); + arg->vec = untagged_addr((unsigned long)arg->vec); + + /* Validate memory pointers */ + if (!IS_ALIGNED(arg->start, PAGE_SIZE)) + return -EINVAL; + if (!access_ok((void __user *)(long)arg->start, arg->end - arg->start)) + return -EFAULT; + if (!arg->vec && arg->vec_len) + return -EINVAL; + if (arg->vec && !access_ok((void __user *)(long)arg->vec, + arg->vec_len * sizeof(struct page_region))) + return -EFAULT; + + /* Fixup default values */ + arg->end = ALIGN(arg->end, PAGE_SIZE); + arg->walk_end = 0; + if (!arg->max_pages) + arg->max_pages = ULONG_MAX; + + return 0; +} + +static int pagemap_scan_writeback_args(struct pm_scan_arg *arg, + unsigned long uargl) +{ + struct pm_scan_arg __user *uarg = (void __user *)uargl; + + if (copy_to_user(&uarg->walk_end, &arg->walk_end, sizeof(arg->walk_end))) + return -EFAULT; + + return 0; +} + +static int pagemap_scan_init_bounce_buffer(struct pagemap_scan_private *p) +{ + if (!p->arg.vec_len) + return 0; + + p->vec_buf_len = min_t(size_t, PAGEMAP_WALK_SIZE >> PAGE_SHIFT, + p->arg.vec_len); + p->vec_buf = kmalloc_array(p->vec_buf_len, sizeof(*p->vec_buf), + GFP_KERNEL); + if (!p->vec_buf) + return -ENOMEM; + + p->vec_buf->start = p->vec_buf->end = 0; + p->vec_out = (struct page_region __user *)(long)p->arg.vec; + + return 0; +} + +static long pagemap_scan_flush_buffer(struct pagemap_scan_private *p) +{ + const struct page_region *buf = p->vec_buf; + long n = p->vec_buf_index; + + if (!p->vec_buf) + return 0; + + if (buf[n].end != buf[n].start) + n++; + + if (!n) + return 0; + + if (copy_to_user(p->vec_out, buf, n * sizeof(*buf))) + return -EFAULT; + + p->arg.vec_len -= n; + p->vec_out += n; + + p->vec_buf_index = 0; + p->vec_buf_len = min_t(size_t, p->vec_buf_len, p->arg.vec_len); + p->vec_buf->start = p->vec_buf->end = 0; + + return n; +} + +static long do_pagemap_scan(struct mm_struct *mm, unsigned long uarg) +{ + struct mmu_notifier_range range; + struct pagemap_scan_private p = {0}; + unsigned long walk_start; + size_t n_ranges_out = 0; + int ret; + + ret = pagemap_scan_get_args(&p.arg, uarg); + if (ret) + return ret; + + p.masks_of_interest = p.arg.category_mask | p.arg.category_anyof_mask | + p.arg.return_mask; + ret = pagemap_scan_init_bounce_buffer(&p); + if (ret) + return ret; + + /* Protection change for the range is going to happen. */ + if (p.arg.flags & PM_SCAN_WP_MATCHING) { + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_VMA, 0, + mm, p.arg.start, p.arg.end); + mmu_notifier_invalidate_range_start(&range); + } + + for (walk_start = p.arg.start; walk_start < p.arg.end; + walk_start = p.arg.walk_end) { + long n_out; + + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + + ret = mmap_read_lock_killable(mm); + if (ret) + break; + ret = walk_page_range(mm, walk_start, p.arg.end, + &pagemap_scan_ops, &p); + mmap_read_unlock(mm); + + n_out = pagemap_scan_flush_buffer(&p); + if (n_out < 0) + ret = n_out; + else + n_ranges_out += n_out; + + if (ret != -ENOSPC) + break; + + if (p.arg.vec_len == 0 || p.found_pages == p.arg.max_pages) + break; + } + + /* ENOSPC signifies early stop (buffer full) from the walk. */ + if (!ret || ret == -ENOSPC) + ret = n_ranges_out; + + /* The walk_end isn't set when ret is zero */ + if (!p.arg.walk_end) + p.arg.walk_end = p.arg.end; + if (pagemap_scan_writeback_args(&p.arg, uarg)) + ret = -EFAULT; + + if (p.arg.flags & PM_SCAN_WP_MATCHING) + mmu_notifier_invalidate_range_end(&range); + + kfree(p.vec_buf); + return ret; +} + +static long do_pagemap_cmd(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct mm_struct *mm = file->private_data; + + switch (cmd) { + case PAGEMAP_SCAN: + return do_pagemap_scan(mm, arg); + + default: + return -EINVAL; + } +} + const struct file_operations proc_pagemap_operations = { .llseek = mem_lseek, /* borrow this */ .read = pagemap_read, .open = pagemap_open, .release = pagemap_release, + .unlocked_ioctl = do_pagemap_cmd, + .compat_ioctl = do_pagemap_cmd, }; #endif /* CONFIG_PROC_PAGE_MONITOR */ diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e5b9f7e62eeb..205469aa0613 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -280,6 +280,7 @@ long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long cp_flags); bool is_hugetlb_entry_migration(pte_t pte); +bool is_hugetlb_entry_hwpoisoned(pte_t pte); void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index c98df391bfd8..f2dc19f40d05 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -221,6 +221,13 @@ static inline vm_fault_t handle_userfault(struct vm_fault *vmf, return VM_FAULT_SIGBUS; } +static inline long uffd_wp_range(struct vm_area_struct *vma, + unsigned long start, unsigned long len, + bool enable_wp) +{ + return false; +} + static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, struct vm_userfaultfd_ctx vm_ctx) { diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b7b56871029c..da43810b7485 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -305,4 +305,63 @@ typedef int __bitwise __kernel_rwf_t; #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND) +/* Pagemap ioctl */ +#define PAGEMAP_SCAN _IOWR('f', 16, struct pm_scan_arg) + +/* Bitmasks provided in pm_scan_args masks and reported in page_region.categories. */ +#define PAGE_IS_WPALLOWED (1 << 0) +#define PAGE_IS_WRITTEN (1 << 1) +#define PAGE_IS_FILE (1 << 2) +#define PAGE_IS_PRESENT (1 << 3) +#define PAGE_IS_SWAPPED (1 << 4) +#define PAGE_IS_PFNZERO (1 << 5) +#define PAGE_IS_HUGE (1 << 6) + +/* + * struct page_region - Page region with flags + * @start: Start of the region + * @end: End of the region (exclusive) + * @categories: PAGE_IS_* category bitmask for the region + */ +struct page_region { + __u64 start; + __u64 end; + __u64 categories; +}; + +/* Flags for PAGEMAP_SCAN ioctl */ +#define PM_SCAN_WP_MATCHING (1 << 0) /* Write protect the pages matched. */ +#define PM_SCAN_CHECK_WPASYNC (1 << 1) /* Abort the scan when a non-WP-enabled page is found. */ + +/* + * struct pm_scan_arg - Pagemap ioctl argument + * @size: Size of the structure + * @flags: Flags for the IOCTL + * @start: Starting address of the region + * @end: Ending address of the region + * @walk_end Address where the scan stopped (written by kernel). + * walk_end == end (address tags cleared) informs that the scan completed on entire range. + * @vec: Address of page_region struct array for output + * @vec_len: Length of the page_region struct array + * @max_pages: Optional limit for number of returned pages (0 = disabled) + * @category_inverted: PAGE_IS_* categories which values match if 0 instead of 1 + * @category_mask: Skip pages for which any category doesn't match + * @category_anyof_mask: Skip pages for which no category matches + * @return_mask: PAGE_IS_* categories that are to be reported in `page_region`s returned + */ +struct pm_scan_arg { + __u64 size; + __u64 flags; + __u64 start; + __u64 end; + __u64 walk_end; + __u64 vec; + __u64 vec_len; + __u64 max_pages; + __u64 category_inverted; + __u64 category_mask; + __u64 category_anyof_mask; + __u64 return_mask; +}; + #endif /* _UAPI_LINUX_FS_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bc654b36df9f..2878e0e6bac5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5044,7 +5044,7 @@ bool is_hugetlb_entry_migration(pte_t pte) return false; } -static bool is_hugetlb_entry_hwpoisoned(pte_t pte) +bool is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -6266,7 +6266,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } entry = huge_pte_clear_uffd_wp(entry); - set_huge_pte_at(mm, haddr, ptep, entry); + set_huge_pte_at(mm, haddr, ptep, entry, + huge_page_size(hstate_vma(vma))); /* Fallthrough to CoW */ } -- cgit v1.2.3 From 12f6b01a0bcbeeab8cc9305673314adb3adf80f7 Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 21 Aug 2023 19:15:15 +0500 Subject: fs/proc/task_mmu: add fast paths to get/clear PAGE_IS_WRITTEN flag MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adding fast code paths to handle specifically only get and/or clear operation of PAGE_IS_WRITTEN, increases its performance by 0-35%. The results of some test cases are given below: Test-case-1 t1 = (Get + WP) time t2 = WP time t1 t2 Without this patch: 140-170mcs 90-115mcs With this patch: 110mcs 80mcs Worst case diff: 35% faster 30% faster Test-case-2 t3 = atomic Get and WP t3 Without this patch: 120-140mcs With this patch: 100-110mcs Worst case diff: 21% faster Link: https://lkml.kernel.org/r/20230821141518.870589-4-usama.anjum@collabora.com Signed-off-by: Muhammad Usama Anjum Cc: Alex Sierra Cc: Al Viro Cc: Andrei Vagin Cc: Axel Rasmussen Cc: Christian Brauner Cc: Cyrill Gorcunov Cc: Dan Williams Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Gustavo A. R. Silva Cc: "Liam R. Howlett" Cc: Matthew Wilcox Cc: Michal Miroslaw Cc: Michał Mirosław Cc: Mike Rapoport (IBM) Cc: Nadav Amit Cc: Pasha Tatashin Cc: Paul Gofman Cc: Peter Xu Cc: Shuah Khan Cc: Suren Baghdasaryan Cc: Vlastimil Babka Cc: Yang Shi Cc: Yun Zhou Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index d4ef9a2bf95d..1d994505805b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2145,6 +2145,41 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, return 0; } + if (!p->vec_out) { + /* Fast path for performing exclusive WP */ + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { + if (pte_uffd_wp(ptep_get(pte))) + continue; + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = addr + PAGE_SIZE; + } + goto flush_and_return; + } + + if (!p->arg.category_anyof_mask && !p->arg.category_inverted && + p->arg.category_mask == PAGE_IS_WRITTEN && + p->arg.return_mask == PAGE_IS_WRITTEN) { + for (addr = start; addr < end; pte++, addr += PAGE_SIZE) { + unsigned long next = addr + PAGE_SIZE; + + if (pte_uffd_wp(ptep_get(pte))) + continue; + ret = pagemap_scan_output(p->cur_vma_category | PAGE_IS_WRITTEN, + p, addr, &next); + if (next == addr) + break; + if (~p->arg.flags & PM_SCAN_WP_MATCHING) + continue; + make_uffd_wp_pte(vma, addr, pte); + if (!flush_end) + start = addr; + flush_end = next; + } + goto flush_and_return; + } + for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { unsigned long categories = p->cur_vma_category | pagemap_page_category(p, vma, addr, ptep_get(pte)); @@ -2168,6 +2203,7 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, flush_end = next; } +flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); -- cgit v1.2.3 From 279d5fc3227f04ef2c6125e5c440e7952173a89a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:01 +0100 Subject: iomap: hold state_lock over call to ifs_set_range_uptodate() Patch series "Add folio_end_read", v2. The core of this patchset is the new folio_end_read() call which filesystems can use when finishing a page cache read instead of separate calls to mark the folio uptodate and unlock it. As an illustration of its use, I converted ext4, iomap & mpage; more can be converted. I think that's useful by itself, but the interesting optimisation is that we can implement that with a single XOR instruction that sets the uptodate bit, clears the lock bit, tests the waiter bit and provides a write memory barrier. That removes one memory barrier and one atomic instruction from each page read, which seems worth doing. That's in patch 15. The last two patches could be a separate series, but basically we can do the same thing with the writeback flag that we do with the unlock flag; clear it and test the waiters bit at the same time. This patch (of 17): This is really preparation for the next patch, but it lets us call folio_mark_uptodate() in just one place instead of two. Link: https://lkml.kernel.org/r/20231004165317.1061855-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231004165317.1061855-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Matthew Wilcox (Oracle) Cc: Nicholas Piggin Cc: "Theodore Ts'o" Cc: Andreas Dilger Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Thomas Bogendoerfer Cc: Michael Ellerman Cc: Christophe Leroy Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Cc: Christian Borntraeger Cc: Sven Schnelle Cc: Geert Uytterhoeven Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 5db54ca29a35..6e780ca64ce3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -57,30 +57,32 @@ static inline bool ifs_block_is_uptodate(struct iomap_folio_state *ifs, return test_bit(block, ifs->state); } -static void ifs_set_range_uptodate(struct folio *folio, +static bool ifs_set_range_uptodate(struct folio *folio, struct iomap_folio_state *ifs, size_t off, size_t len) { struct inode *inode = folio->mapping->host; unsigned int first_blk = off >> inode->i_blkbits; unsigned int last_blk = (off + len - 1) >> inode->i_blkbits; unsigned int nr_blks = last_blk - first_blk + 1; - unsigned long flags; - spin_lock_irqsave(&ifs->state_lock, flags); bitmap_set(ifs->state, first_blk, nr_blks); - if (ifs_is_fully_uptodate(folio, ifs)) - folio_mark_uptodate(folio); - spin_unlock_irqrestore(&ifs->state_lock, flags); + return ifs_is_fully_uptodate(folio, ifs); } static void iomap_set_range_uptodate(struct folio *folio, size_t off, size_t len) { struct iomap_folio_state *ifs = folio->private; + unsigned long flags; + bool uptodate = true; - if (ifs) - ifs_set_range_uptodate(folio, ifs, off, len); - else + if (ifs) { + spin_lock_irqsave(&ifs->state_lock, flags); + uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + spin_unlock_irqrestore(&ifs->state_lock, flags); + } + + if (uptodate) folio_mark_uptodate(folio); } -- cgit v1.2.3 From f45b494e2a24d86afd79cab7c343b414c5213447 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:02 +0100 Subject: iomap: protect read_bytes_pending with the state_lock Perform one atomic operation (acquiring the spinlock) instead of two (spinlock & atomic_sub) per read completion. Link: https://lkml.kernel.org/r/20231004165317.1061855-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 6e780ca64ce3..4a996c5327ef 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -29,9 +29,9 @@ typedef int (*iomap_punch_t)(struct inode *inode, loff_t offset, loff_t length); * and I/O completions. */ struct iomap_folio_state { - atomic_t read_bytes_pending; - atomic_t write_bytes_pending; spinlock_t state_lock; + unsigned int read_bytes_pending; + atomic_t write_bytes_pending; /* * Each block has two bits in this bitmap: @@ -183,7 +183,7 @@ static void ifs_free(struct folio *folio) if (!ifs) return; - WARN_ON_ONCE(atomic_read(&ifs->read_bytes_pending)); + WARN_ON_ONCE(ifs->read_bytes_pending != 0); WARN_ON_ONCE(atomic_read(&ifs->write_bytes_pending)); WARN_ON_ONCE(ifs_is_fully_uptodate(folio, ifs) != folio_test_uptodate(folio)); @@ -250,19 +250,29 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, *lenp = plen; } -static void iomap_finish_folio_read(struct folio *folio, size_t offset, +static void iomap_finish_folio_read(struct folio *folio, size_t off, size_t len, int error) { struct iomap_folio_state *ifs = folio->private; + bool uptodate = !error; + bool finished = true; - if (unlikely(error)) { - folio_clear_uptodate(folio); - folio_set_error(folio); - } else { - iomap_set_range_uptodate(folio, offset, len); + if (ifs) { + unsigned long flags; + + spin_lock_irqsave(&ifs->state_lock, flags); + if (!error) + uptodate = ifs_set_range_uptodate(folio, ifs, off, len); + ifs->read_bytes_pending -= len; + finished = !ifs->read_bytes_pending; + spin_unlock_irqrestore(&ifs->state_lock, flags); } - if (!ifs || atomic_sub_and_test(len, &ifs->read_bytes_pending)) + if (error) + folio_set_error(folio); + if (uptodate) + folio_mark_uptodate(folio); + if (finished) folio_unlock(folio); } @@ -360,8 +370,11 @@ static loff_t iomap_readpage_iter(const struct iomap_iter *iter, } ctx->cur_folio_in_bio = true; - if (ifs) - atomic_add(plen, &ifs->read_bytes_pending); + if (ifs) { + spin_lock_irq(&ifs->state_lock); + ifs->read_bytes_pending += plen; + spin_unlock_irq(&ifs->state_lock); + } sector = iomap_sector(iomap, pos); if (!ctx->bio || -- cgit v1.2.3 From f8174a1181220d24d6b4332216112318f5905729 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:04 +0100 Subject: ext4: use folio_end_read() folio_end_read() is the perfect fit for ext4. Link: https://lkml.kernel.org/r/20231004165317.1061855-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/ext4/readpage.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 3e7d160f543f..21e8f0aebb3c 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -70,15 +70,8 @@ static void __read_end_io(struct bio *bio) { struct folio_iter fi; - bio_for_each_folio_all(fi, bio) { - struct folio *folio = fi.folio; - - if (bio->bi_status) - folio_clear_uptodate(folio); - else - folio_mark_uptodate(folio); - folio_unlock(folio); - } + bio_for_each_folio_all(fi, bio) + folio_end_read(fi.folio, bio->bi_status == 0); if (bio->bi_private) mempool_free(bio->bi_private, bio_post_read_ctx_pool); bio_put(bio); @@ -336,8 +329,7 @@ int ext4_mpage_readpages(struct inode *inode, if (ext4_need_verity(inode, folio->index) && !fsverity_verify_folio(folio)) goto set_error_page; - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, true); continue; } } else if (fully_mapped) { -- cgit v1.2.3 From 6ba924d341c24027d95352ae8802c9cd1c308559 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:05 +0100 Subject: buffer: use folio_end_read() There are two places that we can use this new helper. Link: https://lkml.kernel.org/r/20231004165317.1061855-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/buffer.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index edec8652788c..86ddfa52c699 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -282,13 +282,7 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) } while (tmp != bh); spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - /* - * If all of the buffers are uptodate then we can set the page - * uptodate. - */ - if (folio_uptodate) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, folio_uptodate); return; still_busy: @@ -2433,12 +2427,10 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (!nr) { /* - * All buffers are uptodate - we can set the folio uptodate - * as well. But not if get_block() returned an error. + * All buffers are uptodate or get_block() returned an + * error when trying to map them - we can finish the read. */ - if (!page_error) - folio_mark_uptodate(folio); - folio_unlock(folio); + folio_end_read(folio, !page_error); return 0; } -- cgit v1.2.3 From 7a4847e54cc1889d109ce2a6ebed19aafc4a4af8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 4 Oct 2023 17:53:06 +0100 Subject: iomap: use folio_end_read() Combine the setting of the uptodate flag with the clearing of the locked flag. Link: https://lkml.kernel.org/r/20231004165317.1061855-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Albert Ou Cc: Alexander Gordeev Cc: Andreas Dilger Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Geert Uytterhoeven Cc: Heiko Carstens Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Nicholas Piggin Cc: Palmer Dabbelt Cc: Paul Walmsley Cc: Richard Henderson Cc: Sven Schnelle Cc: "Theodore Ts'o" Cc: Thomas Bogendoerfer Cc: Vasily Gorbik Signed-off-by: Andrew Morton --- fs/iomap/buffered-io.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 4a996c5327ef..5d19a2b47b6a 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -270,10 +270,8 @@ static void iomap_finish_folio_read(struct folio *folio, size_t off, if (error) folio_set_error(folio); - if (uptodate) - folio_mark_uptodate(folio); if (finished) - folio_unlock(folio); + folio_end_read(folio, uptodate); } static void iomap_read_end_io(struct bio *bio) -- cgit v1.2.3 From 94d7d923395129b9248777e575c877e40007f9dc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 11 Oct 2023 18:04:28 +0100 Subject: mm: abstract the vma_merge()/split_vma() pattern for mprotect() et al. mprotect() and other functions which change VMA parameters over a range each employ a pattern of:- 1. Attempt to merge the range with adjacent VMAs. 2. If this fails, and the range spans a subset of the VMA, split it accordingly. This is open-coded and duplicated in each case. Also in each case most of the parameters passed to vma_merge() remain the same. Create a new function, vma_modify(), which abstracts this operation, accepting only those parameters which can be changed. To avoid the mess of invoking each function call with unnecessary parameters, create inline wrapper functions for each of the modify operations, parameterised only by what is required to perform the action. We can also significantly simplify the logic - by returning the VMA if we split (or merged VMA if we do not) we no longer need specific handling for merge/split cases in any of the call sites. Note that the userfaultfd_release() case works even though it does not split VMAs - since start is set to vma->vm_start and end is set to vma->vm_end, the split logic does not trigger. In addition, since we calculate pgoff to be equal to vma->vm_pgoff + (start - vma->vm_start) >> PAGE_SHIFT, and start - vma->vm_start will be 0 in this instance, this invocation will remain unchanged. We eliminate a VM_WARN_ON() in mprotect_fixup() as this simply asserts that vma_merge() correctly ensures that flags remain the same, something that is already checked in is_mergeable_vma() and elsewhere, and in any case is not specific to mprotect(). Link: https://lkml.kernel.org/r/0dfa9368f37199a423674bf0ee312e8ea0619044.1697043508.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Alexander Viro Cc: Christian Brauner Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 70 ++++++++++++++---------------------------------------- include/linux/mm.h | 60 ++++++++++++++++++++++++++++++++++++++++++++++ mm/madvise.c | 26 ++++---------------- mm/mempolicy.c | 26 +++----------------- mm/mlock.c | 25 ++++--------------- mm/mmap.c | 48 +++++++++++++++++++++++++++++++++++++ mm/mprotect.c | 29 ++++------------------ 7 files changed, 141 insertions(+), 143 deletions(-) (limited to 'fs') diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a7c6ef764e63..ac616cfbacf5 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -927,20 +927,15 @@ static int userfaultfd_release(struct inode *inode, struct file *file) continue; } new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - prev = vma_merge(&vmi, mm, prev, vma->vm_start, vma->vm_end, - new_flags, vma->anon_vma, - vma->vm_file, vma->vm_pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - } else { - prev = vma; - } + vma = vma_modify_flags_uffd(&vmi, prev, vma, vma->vm_start, + vma->vm_end, new_flags, + NULL_VM_UFFD_CTX); vma_start_write(vma); userfaultfd_set_vm_flags(vma, new_flags); vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; + + prev = vma; } mmap_write_unlock(mm); mmput(mm); @@ -1331,7 +1326,6 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, unsigned long start, end, vma_end; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); - pgoff_t pgoff; user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1484,28 +1478,14 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, vma_end = min(end, vma->vm_end); new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx }), - anon_vma_name(vma)); - if (prev) { - /* vma_merge() invalidated the mas */ - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, + (struct vm_userfaultfd_ctx){ctx}); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and @@ -1568,7 +1548,6 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, const void __user *buf = (void __user *)arg; struct vma_iterator vmi; bool wp_async = userfaultfd_wp_async_ctx(ctx); - pgoff_t pgoff; ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1671,26 +1650,13 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, uffd_wp_range(vma, start, vma_end - start, false); new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - prev = vma_merge(&vmi, mm, prev, start, vma_end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, - vma_policy(vma), - NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) { - vma = prev; - goto next; - } - if (vma->vm_start < start) { - ret = split_vma(&vmi, vma, start, 1); - if (ret) - break; - } - if (vma->vm_end > end) { - ret = split_vma(&vmi, vma, end, 0); - if (ret) - break; + vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end, + new_flags, NULL_VM_UFFD_CTX); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + break; } - next: + /* * In the vma_merge() successful mprotect-like case 8: * the next vma was merged into the current one and diff --git a/include/linux/mm.h b/include/linux/mm.h index fa608cba041f..895feb814620 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3251,6 +3251,66 @@ extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, bool *need_rmap_locks); extern void exit_mmap(struct mm_struct *); +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name); + +/* We are about to modify the VMA's flags. */ +static inline struct vm_area_struct +*vma_modify_flags(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), vma->vm_userfaultfd_ctx, + anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or anon_name. */ +static inline struct vm_area_struct +*vma_modify_flags_name(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + unsigned long new_flags, + struct anon_vma_name *new_name) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), vma->vm_userfaultfd_ctx, new_name); +} + +/* We are about to modify the VMA's memory policy. */ +static inline struct vm_area_struct +*vma_modify_policy(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct mempolicy *new_pol) +{ + return vma_modify(vmi, prev, vma, start, end, vma->vm_flags, + new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); +} + +/* We are about to modify the VMA's flags and/or uffd context. */ +static inline struct vm_area_struct +*vma_modify_flags_uffd(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long new_flags, + struct vm_userfaultfd_ctx new_ctx) +{ + return vma_modify(vmi, prev, vma, start, end, new_flags, + vma_policy(vma), new_ctx, anon_vma_name(vma)); +} static inline int check_data_rlimit(unsigned long rlim, unsigned long new, diff --git a/mm/madvise.c b/mm/madvise.c index 59e8860c86af..a6b48d4796ba 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -141,7 +141,6 @@ static int madvise_update_vma(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; int error; - pgoff_t pgoff; VMA_ITERATOR(vmi, mm, start); if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) { @@ -149,30 +148,13 @@ static int madvise_update_vma(struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(&vmi, mm, *prev, start, end, new_flags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_name); - if (*prev) { - vma = *prev; - goto success; - } + vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags, + anon_name); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; - if (start != vma->vm_start) { - error = split_vma(&vmi, vma, start, 1); - if (error) - return error; - } - - if (end != vma->vm_end) { - error = split_vma(&vmi, vma, end, 0); - if (error) - return error; - } - -success: /* vm_flags is protected by the mmap_lock held in write mode. */ vma_start_write(vma); vm_flags_reset(vma, new_flags); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 38a47fa33ef4..8bb5b4b4b395 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -811,10 +811,7 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, struct mempolicy *new_pol) { - struct vm_area_struct *merged; unsigned long vmstart, vmend; - pgoff_t pgoff; - int err; vmend = min(end, vma->vm_end); if (start > vma->vm_start) { @@ -829,26 +826,9 @@ static int mbind_range(struct vma_iterator *vmi, struct vm_area_struct *vma, return 0; } - pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); - merged = vma_merge(vmi, vma->vm_mm, *prev, vmstart, vmend, vma->vm_flags, - vma->anon_vma, vma->vm_file, pgoff, new_pol, - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (merged) { - *prev = merged; - return vma_replace_policy(merged, new_pol); - } - - if (vma->vm_start != vmstart) { - err = split_vma(vmi, vma, vmstart, 1); - if (err) - return err; - } - - if (vma->vm_end != vmend) { - err = split_vma(vmi, vma, vmend, 0); - if (err) - return err; - } + vma = vma_modify_policy(vmi, *prev, vma, vmstart, vmend, new_pol); + if (IS_ERR(vma)) + return PTR_ERR(vma); *prev = vma; return vma_replace_policy(vma, new_pol); diff --git a/mm/mlock.c b/mm/mlock.c index 42b6865f8f82..aa44456200e3 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -476,7 +476,6 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, unsigned long end, vm_flags_t newflags) { struct mm_struct *mm = vma->vm_mm; - pgoff_t pgoff; int nr_pages; int ret = 0; vm_flags_t oldflags = vma->vm_flags; @@ -487,28 +486,12 @@ static int mlock_fixup(struct vma_iterator *vmi, struct vm_area_struct *vma, /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *prev = vma_merge(vmi, mm, *prev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*prev) { - vma = *prev; - goto success; - } - - if (start != vma->vm_start) { - ret = split_vma(vmi, vma, start, 1); - if (ret) - goto out; - } - - if (end != vma->vm_end) { - ret = split_vma(vmi, vma, end, 0); - if (ret) - goto out; + vma = vma_modify_flags(vmi, *prev, vma, start, end, newflags); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; } -success: /* * Keep track of amount of locked VM. */ diff --git a/mm/mmap.c b/mm/mmap.c index 673429ee8a9e..bca685820763 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2437,6 +2437,54 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } +/* + * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd + * context and anonymous VMA name within the range [start, end). + * + * As a result, we might be able to merge the newly modified VMA range with an + * adjacent VMA with identical properties. + * + * If no merge is possible and the range does not span the entirety of the VMA, + * we then need to split the VMA to accommodate the change. + * + * The function returns either the merged VMA, the original VMA if a split was + * required instead, or an error if the split failed. + */ +struct vm_area_struct *vma_modify(struct vma_iterator *vmi, + struct vm_area_struct *prev, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + unsigned long vm_flags, + struct mempolicy *policy, + struct vm_userfaultfd_ctx uffd_ctx, + struct anon_vma_name *anon_name) +{ + pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + struct vm_area_struct *merged; + + merged = vma_merge(vmi, vma->vm_mm, prev, start, end, vm_flags, + vma->anon_vma, vma->vm_file, pgoff, policy, + uffd_ctx, anon_name); + if (merged) + return merged; + + if (vma->vm_start < start) { + int err = split_vma(vmi, vma, start, 1); + + if (err) + return ERR_PTR(err); + } + + if (vma->vm_end > end) { + int err = split_vma(vmi, vma, end, 0); + + if (err) + return ERR_PTR(err); + } + + return vma; +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator diff --git a/mm/mprotect.c b/mm/mprotect.c index 03e2cec3e669..f1dc8f8c84ef 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -581,7 +581,6 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, long nrpages = (end - start) >> PAGE_SHIFT; unsigned int mm_cp_flags = 0; unsigned long charged = 0; - pgoff_t pgoff; int error; if (newflags == oldflags) { @@ -631,34 +630,14 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb, newflags &= ~VM_ACCOUNT; } - /* - * First try to merge with previous and/or next vma. - */ - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); - *pprev = vma_merge(vmi, mm, *pprev, start, end, newflags, - vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx, anon_vma_name(vma)); - if (*pprev) { - vma = *pprev; - VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); - goto success; + vma = vma_modify_flags(vmi, *pprev, vma, start, end, newflags); + if (IS_ERR(vma)) { + error = PTR_ERR(vma); + goto fail; } *pprev = vma; - if (start != vma->vm_start) { - error = split_vma(vmi, vma, start, 1); - if (error) - goto fail; - } - - if (end != vma->vm_end) { - error = split_vma(vmi, vma, end, 0); - if (error) - goto fail; - } - -success: /* * vm_flags and vm_page_prot are protected by the mmap_lock * held in write mode. -- cgit v1.2.3 From 28464bbb2ddc199433383994bcb9600c8034afa1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 12 Oct 2023 18:04:29 +0100 Subject: mm: update memfd seal write check to include F_SEAL_WRITE The seal_check_future_write() function is called by shmem_mmap() or hugetlbfs_file_mmap() to disallow any future writable mappings of an memfd sealed this way. The F_SEAL_WRITE flag is not checked here, as that is handled via the mapping->i_mmap_writable mechanism and so any attempt at a mapping would fail before this could be run. However we intend to change this, meaning this check can be performed for F_SEAL_WRITE mappings also. The logic here is equally applicable to both flags, so update this function to accommodate both and rename it accordingly. Link: https://lkml.kernel.org/r/913628168ce6cce77df7d13a63970bae06a526e0.1697116581.git.lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jan Kara Cc: Alexander Viro Cc: Andy Lutomirski Cc: Christian Brauner Cc: Hugh Dickins Cc: Matthew Wilcox (Oracle) Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 +- include/linux/mm.h | 15 ++++++++------- mm/shmem.c | 2 +- 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 926d01c493fb..85cd418d2340 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -135,7 +135,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); vma->vm_ops = &hugetlb_vm_ops; - ret = seal_check_future_write(info->seals, vma); + ret = seal_check_write(info->seals, vma); if (ret) return ret; diff --git a/include/linux/mm.h b/include/linux/mm.h index c3b5749ede9d..86e040e886e4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4076,25 +4076,26 @@ static inline void mem_dump_obj(void *object) {} #endif /** - * seal_check_future_write - Check for F_SEAL_FUTURE_WRITE flag and handle it + * seal_check_write - Check for F_SEAL_WRITE or F_SEAL_FUTURE_WRITE flags and + * handle them. * @seals: the seals to check * @vma: the vma to operate on * - * Check whether F_SEAL_FUTURE_WRITE is set; if so, do proper check/handling on - * the vma flags. Return 0 if check pass, or <0 for errors. + * Check whether F_SEAL_WRITE or F_SEAL_FUTURE_WRITE are set; if so, do proper + * check/handling on the vma flags. Return 0 if check pass, or <0 for errors. */ -static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) +static inline int seal_check_write(int seals, struct vm_area_struct *vma) { - if (seals & F_SEAL_FUTURE_WRITE) { + if (seals & (F_SEAL_WRITE | F_SEAL_FUTURE_WRITE)) { /* * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. + * write seals are active. */ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) return -EPERM; /* - * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as + * Since an F_SEAL_[FUTURE_]WRITE sealed memfd can be mapped as * MAP_SHARED and read-only, take care to not allow mprotect to * revert protections on such mappings. Do this only for shared * mappings. For private mappings, don't need to mask diff --git a/mm/shmem.c b/mm/shmem.c index 61b170324e5c..bcbe9dbb9f5f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2378,7 +2378,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) struct shmem_inode_info *info = SHMEM_I(inode); int ret; - ret = seal_check_future_write(info->seals, vma); + ret = seal_check_write(info->seals, vma); if (ret) return ret; -- cgit v1.2.3 From 3decb8564eff88a2533f83b01cec2cf9259c3eaf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:49 +0100 Subject: buffer: make folio_create_empty_buffers() return a buffer_head Patch series "Finish the create_empty_buffers() transition", v2. Pankaj recently added folio_create_empty_buffers() as the folio equivalent to create_empty_buffers(). This patch set finishes the conversion by first converting all remaining filesystems to call folio_create_empty_buffers(), then renaming it back to create_empty_buffers(). I took the opportunity to make a few simplifications like making folio_create_empty_buffers() return the head buffer and extracting get_nth_bh() from nilfs2. A few of the patches in this series aren't directly related to create_empty_buffers(), but I saw them while I was working on this and thought they'd be easy enough to add to this series. Compile-tested only, other than ext4. This patch (of 26): Almost all callers want to know the first BH that was allocated for this folio. We already have that handy, so return it. Link: https://lkml.kernel.org/r/20231016201114.1928083-1-willy@infradead.org Link: https://lkml.kernel.org/r/20231016201114.1928083-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 24 +++++++++++++----------- include/linux/buffer_head.h | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 86ddfa52c699..bfa7604d1891 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1641,8 +1641,8 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, - unsigned long b_state) +struct buffer_head *folio_create_empty_buffers(struct folio *folio, + unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; gfp_t gfp = GFP_NOFS | __GFP_ACCOUNT | __GFP_NOFAIL; @@ -1669,6 +1669,8 @@ void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, } folio_attach_private(folio, head); spin_unlock(&folio->mapping->private_lock); + + return head; } EXPORT_SYMBOL(folio_create_empty_buffers); @@ -1770,13 +1772,15 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, struct inode *inode, unsigned int b_state) { + struct buffer_head *bh; + BUG_ON(!folio_test_locked(folio)); - if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, - 1 << READ_ONCE(inode->i_blkbits), - b_state); - return folio_buffers(folio); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, + 1 << READ_ONCE(inode->i_blkbits), b_state); + return bh; } /* @@ -2676,10 +2680,8 @@ int block_truncate_page(struct address_space *mapping, return PTR_ERR(folio); bh = folio_buffers(folio); - if (!bh) { - folio_create_empty_buffers(folio, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = folio_create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3dc4720e4773..1001244a8941 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -203,8 +203,8 @@ struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); void create_empty_buffers(struct page *, unsigned long, unsigned long b_state); -void folio_create_empty_buffers(struct folio *folio, unsigned long blocksize, - unsigned long b_state); +struct buffer_head *folio_create_empty_buffers(struct folio *folio, + unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); void end_buffer_async_write(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 4f05f139e3f8938c4c456bd886355c9bbcc3190d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:50 +0100 Subject: mpage: convert map_buffer_to_folio() to folio_create_empty_buffers() Saves a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20231016201114.1928083-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/mpage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/mpage.c b/fs/mpage.c index 242e213ee064..964a6efe594d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -119,8 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, folio_mark_uptodate(folio); return; } - create_empty_buffers(&folio->page, i_blocksize(inode), 0); - head = folio_buffers(folio); + head = folio_create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; -- cgit v1.2.3 From d4059993674b533798a551859042957bb5578332 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:51 +0100 Subject: ext4: convert to folio_create_empty_buffers Remove an unnecessary folio->page->folio conversion and take advantage of the new return value from folio_create_empty_buffers(). Link: https://lkml.kernel.org/r/20231016201114.1928083-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Pankaj Raghav Cc: Andreas Gruenbacher Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ext4/inode.c | 14 +++++--------- fs/ext4/move_extent.c | 11 +++++------ 2 files changed, 10 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4ce35f1c8b0a..8e431ff2fd95 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1020,10 +1020,8 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, BUG_ON(from > to); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); @@ -1153,7 +1151,7 @@ retry_grab: * starting the handle. */ if (!folio_buffers(folio)) - create_empty_buffers(&folio->page, inode->i_sb->s_blocksize, 0); + folio_create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -3643,10 +3641,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); - if (!bh) { - create_empty_buffers(&folio->page, blocksize, 0); - bh = folio_buffers(folio); - } + if (!bh) + bh = folio_create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 18a9e7c47975..7fe448fb948b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -183,10 +183,8 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) blocksize = i_blocksize(inode); head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, blocksize, 0); - head = folio_buffers(folio); - } + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; @@ -380,9 +378,10 @@ data_copy: } /* Perform all necessary steps similar write_begin()/write_end() * but keeping in mind that i_size will not change */ - if (!folio_buffers(folio[0])) - create_empty_buffers(&folio[0]->page, 1 << orig_inode->i_blkbits, 0); bh = folio_buffers(folio[0]); + if (!bh) + bh = folio_create_empty_buffers(folio[0], + 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { -- cgit v1.2.3 From 0217fbb0271a7c78e16629cf45375916ec2eb35f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:52 +0100 Subject: buffer: add get_nth_bh() Extract this useful helper from nilfs_page_get_nth_block() Link: https://lkml.kernel.org/r/20231016201114.1928083-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.h | 7 +------ include/linux/buffer_head.h | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 21ddcdd4d63e..344d71942d36 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -55,12 +55,7 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, static inline struct buffer_head * nilfs_page_get_nth_block(struct page *page, unsigned int count) { - struct buffer_head *bh = page_buffers(page); - - while (count-- > 0) - bh = bh->b_this_page; - get_bh(bh); - return bh; + return get_nth_bh(page_buffers(page), count); } #endif /* _NILFS_PAGE_H */ diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 1001244a8941..3d85a0cf0ca5 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -457,6 +457,28 @@ __bread(struct block_device *bdev, sector_t block, unsigned size) return __bread_gfp(bdev, block, size, __GFP_MOVABLE); } +/** + * get_nth_bh - Get a reference on the n'th buffer after this one. + * @bh: The buffer to start counting from. + * @count: How many buffers to skip. + * + * This is primarily useful for finding the nth buffer in a folio; in + * that case you pass the head buffer and the byte offset in the folio + * divided by the block size. It can be used for other purposes, but + * it will wrap at the end of the folio rather than returning NULL or + * proceeding to the next folio for you. + * + * Return: The requested buffer with an elevated refcount. + */ +static inline __must_check +struct buffer_head *get_nth_bh(struct buffer_head *bh, unsigned int count) +{ + while (count--) + bh = bh->b_this_page; + get_bh(bh); + return bh; +} + bool block_dirty_folio(struct address_space *mapping, struct folio *folio); #ifdef CONFIG_BUFFER_HEAD -- cgit v1.2.3 From 81cb277ebdfd73b4b4cbfcdf377b4b514e90520a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:53 +0100 Subject: gfs2: convert inode unstuffing to use a folio Use the folio APIs, removing numerous hidden calls to compound_head(). Also remove the stale comment about the page being looked up if it's NULL. Link: https://lkml.kernel.org/r/20231016201114.1928083-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/bmap.c | 48 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index ef7017fb6951..247d2c16593c 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -43,53 +43,51 @@ struct metapath { static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length); /** - * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page + * gfs2_unstuffer_folio - unstuff a stuffed inode into a block cached by a folio * @ip: the inode * @dibh: the dinode buffer * @block: the block number that was allocated - * @page: The (optional) page. This is looked up if @page is NULL + * @folio: The folio. * * Returns: errno */ - -static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, - u64 block, struct page *page) +static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, + u64 block, struct folio *folio) { struct inode *inode = &ip->i_inode; - if (!PageUptodate(page)) { - void *kaddr = kmap(page); + if (!folio_test_uptodate(folio)) { + void *kaddr = kmap_local_folio(folio, 0); u64 dsize = i_size_read(inode); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize); - memset(kaddr + dsize, 0, PAGE_SIZE - dsize); - kunmap(page); + memset(kaddr + dsize, 0, folio_size(folio) - dsize); + kunmap_local(kaddr); - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (gfs2_is_jdata(ip)) { - struct buffer_head *bh; + struct buffer_head *bh = folio_buffers(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, BIT(inode->i_blkbits), - BIT(BH_Uptodate)); + if (!bh) + bh = folio_create_empty_buffers(folio, + BIT(inode->i_blkbits), BIT(BH_Uptodate)); - bh = page_buffers(page); if (!buffer_mapped(bh)) map_bh(bh, inode->i_sb, block); set_buffer_uptodate(bh); gfs2_trans_add_data(ip->i_gl, bh); } else { - set_page_dirty(page); + folio_mark_dirty(folio); gfs2_ordered_add_inode(ip); } return 0; } -static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) +static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct folio *folio) { struct buffer_head *bh, *dibh; struct gfs2_dinode *di; @@ -118,7 +116,7 @@ static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page) dibh, sizeof(struct gfs2_dinode)); brelse(bh); } else { - error = gfs2_unstuffer_page(ip, dibh, block, page); + error = gfs2_unstuffer_folio(ip, dibh, block, folio); if (error) goto out_brelse; } @@ -157,17 +155,17 @@ out_brelse: int gfs2_unstuff_dinode(struct gfs2_inode *ip) { struct inode *inode = &ip->i_inode; - struct page *page; + struct folio *folio; int error; down_write(&ip->i_rw_mutex); - page = grab_cache_page(inode->i_mapping, 0); - error = -ENOMEM; - if (!page) + folio = filemap_grab_folio(inode->i_mapping, 0); + error = PTR_ERR(folio); + if (IS_ERR(folio)) goto out; - error = __gfs2_unstuff_inode(ip, page); - unlock_page(page); - put_page(page); + error = __gfs2_unstuff_inode(ip, folio); + folio_unlock(folio); + folio_put(folio); out: up_write(&ip->i_rw_mutex); return error; -- cgit v1.2.3 From 0eb751791df86dc05a87fff1ada8d37044267a7e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:54 +0100 Subject: gfs2: convert gfs2_getbuf() to folios Remove several folio->page->folio conversions. Also use __GFP_NOFAIL instead of calling yield() and the new get_nth_bh(). Link: https://lkml.kernel.org/r/20231016201114.1928083-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/meta_io.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 924361fa510b..f1fac1b45059 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -115,7 +115,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) { struct address_space *mapping = gfs2_glock2aspace(gl); struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct page *page; + struct folio *folio; struct buffer_head *bh; unsigned int shift; unsigned long index; @@ -129,36 +129,31 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) bufnum = blkno - (index << shift); /* block buf index within page */ if (create) { - for (;;) { - page = grab_cache_page(mapping, index); - if (page) - break; - yield(); - } - if (!page_has_buffers(page)) - create_empty_buffers(page, sdp->sd_sb.sb_bsize, 0); + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_mask(mapping) | __GFP_NOFAIL); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, + sdp->sd_sb.sb_bsize, 0); } else { - page = find_get_page_flags(mapping, index, - FGP_LOCK|FGP_ACCESSED); - if (!page) + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED, 0); + if (IS_ERR(folio)) return NULL; - if (!page_has_buffers(page)) { - bh = NULL; - goto out_unlock; - } + bh = folio_buffers(folio); } - /* Locate header for our buffer within our page */ - for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) - /* Do nothing */; - get_bh(bh); + if (!bh) + goto out_unlock; + bh = get_nth_bh(bh, bufnum); if (!buffer_mapped(bh)) map_bh(bh, sdp->sd_vfs, blkno); out_unlock: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return bh; } -- cgit v1.2.3 From c646e573729bfe885280c343203c775f1f5d71c2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:55 +0100 Subject: gfs2: convert gfs2_getjdatabuf to use a folio Use the folio APIs, saving four hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/meta_io.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f1fac1b45059..f6d40d51f5ed 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -400,26 +400,20 @@ static struct buffer_head *gfs2_getjdatabuf(struct gfs2_inode *ip, u64 blkno) { struct address_space *mapping = ip->i_inode.i_mapping; struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct page *page; + struct folio *folio; struct buffer_head *bh; unsigned int shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; unsigned long index = blkno >> shift; /* convert block to page */ unsigned int bufnum = blkno - (index << shift); - page = find_get_page_flags(mapping, index, FGP_LOCK|FGP_ACCESSED); - if (!page) - return NULL; - if (!page_has_buffers(page)) { - unlock_page(page); - put_page(page); + folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED, 0); + if (IS_ERR(folio)) return NULL; - } - /* Locate header for our buffer within our page */ - for (bh = page_buffers(page); bufnum--; bh = bh->b_this_page) - /* Do nothing */; - get_bh(bh); - unlock_page(page); - put_page(page); + bh = folio_buffers(folio); + if (bh) + bh = get_nth_bh(bh, bufnum); + folio_unlock(folio); + folio_put(folio); return bh; } -- cgit v1.2.3 From 4064a0aa8a6a341b14fb9ee8cf0f0bb86c575e3b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:56 +0100 Subject: gfs2: convert gfs2_write_buf_to_page() to use a folio Remove several folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/gfs2/quota.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index d3d013d1d5ac..25354278cecb 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -749,7 +749,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); struct inode *inode = &ip->i_inode; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; struct buffer_head *bh; u64 blk; unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; @@ -758,15 +758,15 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, blk = index << (PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift); boff = off % bsize; - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); + bh = folio_buffers(folio); + if (!bh) + bh = folio_create_empty_buffers(folio, bsize, 0); - bh = page_buffers(page); - for(;;) { - /* Find the beginning block within the page */ + for (;;) { + /* Find the beginning block within the folio */ if (pg_off >= ((bnum * bsize) + bsize)) { bh = bh->b_this_page; bnum++; @@ -779,9 +779,10 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, goto unlock_out; /* If it's a newly allocated disk block, zero it */ if (buffer_new(bh)) - zero_user(page, bnum * bsize, bh->b_size); + folio_zero_range(folio, bnum * bsize, + bh->b_size); } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (bh_read(bh, REQ_META | REQ_PRIO) < 0) goto unlock_out; @@ -797,17 +798,17 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, break; } - /* Write to the page, now that we have setup the buffer(s) */ - memcpy_to_page(page, off, buf, bytes); - flush_dcache_page(page); - unlock_page(page); - put_page(page); + /* Write to the folio, now that we have setup the buffer(s) */ + memcpy_to_folio(folio, off, buf, bytes); + flush_dcache_folio(folio); + folio_unlock(folio); + folio_put(folio); return 0; unlock_out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return -EIO; } -- cgit v1.2.3 From 6c346be91dcf2b588510e41e4e04c0638af3d536 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:57 +0100 Subject: nilfs2: convert nilfs_mdt_freeze_buffer to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 19c8158605ed..db2260d6e44d 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -560,17 +560,19 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen; - struct page *page; + struct folio *folio; int blkbits = inode->i_blkbits; - page = grab_cache_page(shadow->inode->i_mapping, bh->b_folio->index); - if (!page) - return -ENOMEM; + folio = filemap_grab_folio(shadow->inode->i_mapping, + bh->b_folio->index); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << blkbits, 0); + bh_frozen = folio_buffers(folio); + if (!bh_frozen) + bh_frozen = folio_create_empty_buffers(folio, 1 << blkbits, 0); - bh_frozen = nilfs_page_get_nth_block(page, bh_offset(bh) >> blkbits); + bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits); if (!buffer_uptodate(bh_frozen)) nilfs_copy_buffer(bh_frozen, bh); @@ -582,8 +584,8 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) brelse(bh_frozen); /* already frozen */ } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } -- cgit v1.2.3 From c5521c7689b892c8ea684a284722b5584ba1ce8f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:58 +0100 Subject: nilfs2: convert nilfs_grab_buffer() to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index b4e54d079b7d..1c075bd906c9 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -25,19 +25,19 @@ (BIT(BH_Uptodate) | BIT(BH_Mapped) | BIT(BH_NILFS_Node) | \ BIT(BH_NILFS_Volatile) | BIT(BH_NILFS_Checked)) -static struct buffer_head * -__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, - int blkbits, unsigned long b_state) +static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, + unsigned long block, pgoff_t index, int blkbits, + unsigned long b_state) { unsigned long first_block; - struct buffer_head *bh; + struct buffer_head *bh = folio_buffers(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << blkbits, b_state); + if (!bh) + bh = folio_create_empty_buffers(folio, 1 << blkbits, b_state); first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); - bh = nilfs_page_get_nth_block(page, block - first_block); + bh = get_nth_bh(bh, block - first_block); touch_buffer(bh); wait_on_buffer(bh); @@ -51,17 +51,17 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, { int blkbits = inode->i_blkbits; pgoff_t index = blkoff >> (PAGE_SHIFT - blkbits); - struct page *page; + struct folio *folio; struct buffer_head *bh; - page = grab_cache_page(mapping, index); - if (unlikely(!page)) + folio = filemap_grab_folio(mapping, index); + if (IS_ERR(folio)) return NULL; - bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state); + bh = __nilfs_get_folio_block(folio, blkoff, index, blkbits, b_state); if (unlikely(!bh)) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return NULL; } return bh; -- cgit v1.2.3 From 4093602d6bbb2c82b922d1b442a022a069cdb59e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:10:59 +0100 Subject: nilfs2: convert nilfs_copy_page() to nilfs_copy_folio() Both callers already have a folio, so pass it in and use it directly. Removes a lot of hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.c | 50 ++++++++++++++++++++++++++------------------------ mm/util.c | 1 + 2 files changed, 27 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 1c075bd906c9..696215d899bf 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -184,30 +184,32 @@ void nilfs_page_bug(struct page *page) } /** - * nilfs_copy_page -- copy the page with buffers - * @dst: destination page - * @src: source page - * @copy_dirty: flag whether to copy dirty states on the page's buffer heads. + * nilfs_copy_folio -- copy the folio with buffers + * @dst: destination folio + * @src: source folio + * @copy_dirty: flag whether to copy dirty states on the folio's buffer heads. * - * This function is for both data pages and btnode pages. The dirty flag - * should be treated by caller. The page must not be under i/o. - * Both src and dst page must be locked + * This function is for both data folios and btnode folios. The dirty flag + * should be treated by caller. The folio must not be under i/o. + * Both src and dst folio must be locked */ -static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) +static void nilfs_copy_folio(struct folio *dst, struct folio *src, + bool copy_dirty) { struct buffer_head *dbh, *dbufs, *sbh; unsigned long mask = NILFS_BUFFER_INHERENT_BITS; - BUG_ON(PageWriteback(dst)); + BUG_ON(folio_test_writeback(dst)); - sbh = page_buffers(src); - if (!page_has_buffers(dst)) - create_empty_buffers(dst, sbh->b_size, 0); + sbh = folio_buffers(src); + dbh = folio_buffers(dst); + if (!dbh) + dbh = folio_create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) mask |= BIT(BH_Dirty); - dbh = dbufs = page_buffers(dst); + dbufs = dbh; do { lock_buffer(sbh); lock_buffer(dbh); @@ -218,16 +220,16 @@ static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty) dbh = dbh->b_this_page; } while (dbh != dbufs); - copy_highpage(dst, src); + folio_copy(dst, src); - if (PageUptodate(src) && !PageUptodate(dst)) - SetPageUptodate(dst); - else if (!PageUptodate(src) && PageUptodate(dst)) - ClearPageUptodate(dst); - if (PageMappedToDisk(src) && !PageMappedToDisk(dst)) - SetPageMappedToDisk(dst); - else if (!PageMappedToDisk(src) && PageMappedToDisk(dst)) - ClearPageMappedToDisk(dst); + if (folio_test_uptodate(src) && !folio_test_uptodate(dst)) + folio_mark_uptodate(dst); + else if (!folio_test_uptodate(src) && folio_test_uptodate(dst)) + folio_clear_uptodate(dst); + if (folio_test_mappedtodisk(src) && !folio_test_mappedtodisk(dst)) + folio_set_mappedtodisk(dst); + else if (!folio_test_mappedtodisk(src) && folio_test_mappedtodisk(dst)) + folio_clear_mappedtodisk(dst); do { unlock_buffer(sbh); @@ -269,7 +271,7 @@ repeat: NILFS_PAGE_BUG(&folio->page, "found empty page in dat page cache"); - nilfs_copy_page(&dfolio->page, &folio->page, 1); + nilfs_copy_folio(dfolio, folio, true); filemap_dirty_folio(folio_mapping(dfolio), dfolio); folio_unlock(dfolio); @@ -314,7 +316,7 @@ repeat: if (!IS_ERR(dfolio)) { /* overwrite existing folio in the destination cache */ WARN_ON(folio_test_dirty(dfolio)); - nilfs_copy_page(&dfolio->page, &folio->page, 0); + nilfs_copy_folio(dfolio, folio, false); folio_unlock(dfolio); folio_put(dfolio); /* Do we not need to remove folio from smap here? */ diff --git a/mm/util.c b/mm/util.c index 8cbbfd3a3d59..eefa0336d38c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -799,6 +799,7 @@ void folio_copy(struct folio *dst, struct folio *src) cond_resched(); } } +EXPORT_SYMBOL(folio_copy); int sysctl_overcommit_memory __read_mostly = OVERCOMMIT_GUESS; int sysctl_overcommit_ratio __read_mostly = 50; -- cgit v1.2.3 From 1a846bf3884694b2f9ecbd70011927c2fb40f224 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:00 +0100 Subject: nilfs2: convert nilfs_mdt_forget_block() to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index db2260d6e44d..11b7cf4acc92 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -356,30 +356,28 @@ int nilfs_mdt_delete_block(struct inode *inode, unsigned long block) */ int nilfs_mdt_forget_block(struct inode *inode, unsigned long block) { - pgoff_t index = (pgoff_t)block >> - (PAGE_SHIFT - inode->i_blkbits); - struct page *page; - unsigned long first_block; + pgoff_t index = block >> (PAGE_SHIFT - inode->i_blkbits); + struct folio *folio; + struct buffer_head *bh; int ret = 0; int still_dirty; - page = find_lock_page(inode->i_mapping, index); - if (!page) + folio = filemap_lock_folio(inode->i_mapping, index); + if (IS_ERR(folio)) return -ENOENT; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - first_block = (unsigned long)index << - (PAGE_SHIFT - inode->i_blkbits); - if (page_has_buffers(page)) { - struct buffer_head *bh; - - bh = nilfs_page_get_nth_block(page, block - first_block); + bh = folio_buffers(folio); + if (bh) { + unsigned long first_block = index << + (PAGE_SHIFT - inode->i_blkbits); + bh = get_nth_bh(bh, block - first_block); nilfs_forget_buffer(bh); } - still_dirty = PageDirty(page); - unlock_page(page); - put_page(page); + still_dirty = folio_test_dirty(folio); + folio_unlock(folio); + folio_put(folio); if (still_dirty || invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0) -- cgit v1.2.3 From 664c87b75ef6ec3ebb97383891a6787b63925547 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:01 +0100 Subject: nilfs2: convert nilfs_mdt_get_frozen_buffer to use a folio Remove a number of folio->page->folio conversions. Link: https://lkml.kernel.org/r/20231016201114.1928083-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/mdt.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 11b7cf4acc92..7b754e6494d7 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -592,17 +592,19 @@ nilfs_mdt_get_frozen_buffer(struct inode *inode, struct buffer_head *bh) { struct nilfs_shadow_map *shadow = NILFS_MDT(inode)->mi_shadow; struct buffer_head *bh_frozen = NULL; - struct page *page; + struct folio *folio; int n; - page = find_lock_page(shadow->inode->i_mapping, bh->b_folio->index); - if (page) { - if (page_has_buffers(page)) { + folio = filemap_lock_folio(shadow->inode->i_mapping, + bh->b_folio->index); + if (!IS_ERR(folio)) { + bh_frozen = folio_buffers(folio); + if (bh_frozen) { n = bh_offset(bh) >> inode->i_blkbits; - bh_frozen = nilfs_page_get_nth_block(page, n); + bh_frozen = get_nth_bh(bh_frozen, n); } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); } return bh_frozen; } -- cgit v1.2.3 From 73c32e07a3977f8470b29889d30d2f808e6fee4a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:02 +0100 Subject: nilfs2: remove nilfs_page_get_nth_block All users have now been converted to get_nth_block(). Link: https://lkml.kernel.org/r/20231016201114.1928083-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/page.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 344d71942d36..d249ea1cefff 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -52,10 +52,4 @@ unsigned long nilfs_find_uncommitted_extent(struct inode *inode, #define NILFS_PAGE_BUG(page, m, a...) \ do { nilfs_page_bug(page); BUG(); } while (0) -static inline struct buffer_head * -nilfs_page_get_nth_block(struct page *page, unsigned int count) -{ - return get_nth_bh(page_buffers(page), count); -} - #endif /* _NILFS_PAGE_H */ -- cgit v1.2.3 From 922b12eff0b293fc13ae4045c7d7264e18938878 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:03 +0100 Subject: nilfs2: convert nilfs_lookup_dirty_data_buffers to use folio_create_empty_buffers This function was already using a folio, so this update to the new API removes a single folio->page->folio conversion. Link: https://lkml.kernel.org/r/20231016201114.1928083-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Ryusuke Konishi Cc: Andreas Gruenbacher Cc: Pankaj Raghav Signed-off-by: Andrew Morton --- fs/nilfs2/segment.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 7ec16879756e..94388fe83cf8 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -731,10 +731,9 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, continue; } head = folio_buffers(folio); - if (!head) { - create_empty_buffers(&folio->page, i_blocksize(inode), 0); - head = folio_buffers(folio); - } + if (!head) + head = folio_create_empty_buffers(folio, + i_blocksize(inode), 0); folio_unlock(folio); bh = head; -- cgit v1.2.3 From a2da3afce96ce747ec0e1670bfc73fec85d20f95 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:04 +0100 Subject: ntfs: convert ntfs_read_block() to use a folio The caller already has the folio, so pass it in and use the folio API throughout saving five hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs/aops.c | 44 +++++++++++++++++++------------------------- 1 file changed, 19 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index 4e158bce4192..d66a9f5ffde9 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -145,13 +145,12 @@ still_busy: } /** - * ntfs_read_block - fill a @page of an address space with data - * @page: page cache page to fill with data + * ntfs_read_block - fill a @folio of an address space with data + * @folio: page cache folio to fill with data * - * Fill the page @page of the address space belonging to the @page->host inode. * We read each buffer asynchronously and when all buffers are read in, our io * completion handler ntfs_end_buffer_read_async(), if required, automatically - * applies the mst fixups to the page before finally marking it uptodate and + * applies the mst fixups to the folio before finally marking it uptodate and * unlocking it. * * We only enforce allocated_size limit because i_size is checked for in @@ -161,7 +160,7 @@ still_busy: * * Contains an adapted version of fs/buffer.c::block_read_full_folio(). */ -static int ntfs_read_block(struct page *page) +static int ntfs_read_block(struct folio *folio) { loff_t i_size; VCN vcn; @@ -178,7 +177,7 @@ static int ntfs_read_block(struct page *page) int i, nr; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = folio->mapping->host; ni = NTFS_I(vi); vol = ni->vol; @@ -188,15 +187,10 @@ static int ntfs_read_block(struct page *page) blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, 0); - if (unlikely(!page_has_buffers(page))) { - unlock_page(page); - return -ENOMEM; - } - } - bh = head = page_buffers(page); - BUG_ON(!bh); + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); + bh = head; /* * We may be racing with truncate. To avoid some of the problems we @@ -205,11 +199,11 @@ static int ntfs_read_block(struct page *page) * may leave some buffers unmapped which are now allocated. This is * not a problem since these buffers will just get mapped when a write * occurs. In case of a shrinking truncate, we will detect this later - * on due to the runlist being incomplete and if the page is being + * on due to the runlist being incomplete and if the folio is being * fully truncated, truncate will throw it away as soon as we unlock * it so no need to worry what we do with it. */ - iblock = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + iblock = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); lblock = (ni->allocated_size + blocksize - 1) >> blocksize_bits; init_size = ni->initialized_size; @@ -221,7 +215,7 @@ static int ntfs_read_block(struct page *page) } zblock = (init_size + blocksize - 1) >> blocksize_bits; - /* Loop through all the buffers in the page. */ + /* Loop through all the buffers in the folio. */ rl = NULL; nr = i = 0; do { @@ -299,7 +293,7 @@ lock_retry_remap: if (!err) err = -EIO; bh->b_blocknr = -1; - SetPageError(page); + folio_set_error(folio); ntfs_error(vol->sb, "Failed to read from inode 0x%lx, " "attribute type 0x%x, vcn 0x%llx, " "offset 0x%x because its location on " @@ -312,13 +306,13 @@ lock_retry_remap: /* * Either iblock was outside lblock limits or * ntfs_rl_vcn_to_lcn() returned error. Just zero that portion - * of the page and set the buffer uptodate. + * of the folio and set the buffer uptodate. */ handle_hole: bh->b_blocknr = -1UL; clear_buffer_mapped(bh); handle_zblock: - zero_user(page, i * blocksize, blocksize); + folio_zero_range(folio, i * blocksize, blocksize); if (likely(!err)) set_buffer_uptodate(bh); } while (i++, iblock++, (bh = bh->b_this_page) != head); @@ -349,11 +343,11 @@ handle_zblock: return 0; } /* No i/o was scheduled on any of the buffers. */ - if (likely(!PageError(page))) - SetPageUptodate(page); + if (likely(!folio_test_error(folio))) + folio_mark_uptodate(folio); else /* Signal synchronous i/o error. */ nr = -EIO; - unlock_page(page); + folio_unlock(folio); return nr; } @@ -433,7 +427,7 @@ retry_readpage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* Normal, non-resident data stream. */ - return ntfs_read_block(page); + return ntfs_read_block(folio); } /* * Attribute is resident, implying it is not compressed or encrypted. -- cgit v1.2.3 From a04eb7cb186b4d537eefc2d68dba8a7e5eb7e6d7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:05 +0100 Subject: ntfs: convert ntfs_writepage to use a folio Use folio APIs throughout. Saves many hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs/aops.c | 211 +++++++++++++++++++++++++++------------------------------ 1 file changed, 100 insertions(+), 111 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index d66a9f5ffde9..c4426992a2ee 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -501,28 +501,29 @@ err_out: #ifdef NTFS_RW /** - * ntfs_write_block - write a @page to the backing store - * @page: page cache page to write out + * ntfs_write_block - write a @folio to the backing store + * @folio: page cache folio to write out * @wbc: writeback control structure * - * This function is for writing pages belonging to non-resident, non-mst + * This function is for writing folios belonging to non-resident, non-mst * protected attributes to their backing store. * - * For a page with buffers, map and write the dirty buffers asynchronously - * under page writeback. For a page without buffers, create buffers for the - * page, then proceed as above. + * For a folio with buffers, map and write the dirty buffers asynchronously + * under folio writeback. For a folio without buffers, create buffers for the + * folio, then proceed as above. * - * If a page doesn't have buffers the page dirty state is definitive. If a page - * does have buffers, the page dirty state is just a hint, and the buffer dirty - * state is definitive. (A hint which has rules: dirty buffers against a clean - * page is illegal. Other combinations are legal and need to be handled. In - * particular a dirty page containing clean buffers for example.) + * If a folio doesn't have buffers the folio dirty state is definitive. If + * a folio does have buffers, the folio dirty state is just a hint, + * and the buffer dirty state is definitive. (A hint which has rules: + * dirty buffers against a clean folio is illegal. Other combinations are + * legal and need to be handled. In particular a dirty folio containing + * clean buffers for example.) * * Return 0 on success and -errno on error. * * Based on ntfs_read_block() and __block_write_full_folio(). */ -static int ntfs_write_block(struct page *page, struct writeback_control *wbc) +static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) { VCN vcn; LCN lcn; @@ -540,41 +541,29 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) bool need_end_writeback; unsigned char blocksize_bits; - vi = page->mapping->host; + vi = folio->mapping->host; ni = NTFS_I(vi); vol = ni->vol; ntfs_debug("Entering for inode 0x%lx, attribute type 0x%x, page index " - "0x%lx.", ni->mft_no, ni->type, page->index); + "0x%lx.", ni->mft_no, ni->type, folio->index); BUG_ON(!NInoNonResident(ni)); BUG_ON(NInoMstProtected(ni)); blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - if (!page_has_buffers(page)) { - BUG_ON(!PageUptodate(page)); - create_empty_buffers(page, blocksize, + head = folio_buffers(folio); + if (!head) { + BUG_ON(!folio_test_uptodate(folio)); + head = folio_create_empty_buffers(folio, blocksize, (1 << BH_Uptodate) | (1 << BH_Dirty)); - if (unlikely(!page_has_buffers(page))) { - ntfs_warning(vol->sb, "Error allocating page " - "buffers. Redirtying page so we try " - "again later."); - /* - * Put the page back on mapping->dirty_pages, but leave - * its buffers' dirty state as-is. - */ - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } } - bh = head = page_buffers(page); - BUG_ON(!bh); + bh = head; /* NOTE: Different naming scheme to ntfs_read_block()! */ - /* The first block in the page. */ - block = (s64)page->index << (PAGE_SHIFT - blocksize_bits); + /* The first block in the folio. */ + block = (s64)folio->index << (PAGE_SHIFT - blocksize_bits); read_lock_irqsave(&ni->size_lock, flags); i_size = i_size_read(vi); @@ -591,14 +580,14 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) * Be very careful. We have no exclusion from block_dirty_folio * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it - * then we just miss that fact, and the page stays dirty. + * then we just miss that fact, and the folio stays dirty. * * Buffers outside i_size may be dirtied by block_dirty_folio; * handle that here by just cleaning them. */ /* - * Loop through all the buffers in the page, mapping all the dirty + * Loop through all the buffers in the folio, mapping all the dirty * buffers to disk addresses and handling any aliases from the * underlying block device's mapping. */ @@ -610,13 +599,13 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) if (unlikely(block >= dblock)) { /* * Mapped buffers outside i_size will occur, because - * this page can be outside i_size when there is a + * this folio can be outside i_size when there is a * truncate in progress. The contents of such buffers * were zeroed by ntfs_writepage(). * * FIXME: What about the small race window where * ntfs_writepage() has not done any clearing because - * the page was within i_size but before we get here, + * the folio was within i_size but before we get here, * vmtruncate() modifies i_size? */ clear_buffer_dirty(bh); @@ -632,38 +621,38 @@ static int ntfs_write_block(struct page *page, struct writeback_control *wbc) if (unlikely((block >= iblock) && (initialized_size < i_size))) { /* - * If this page is fully outside initialized - * size, zero out all pages between the current - * initialized size and the current page. Just + * If this folio is fully outside initialized + * size, zero out all folios between the current + * initialized size and the current folio. Just * use ntfs_read_folio() to do the zeroing * transparently. */ if (block > iblock) { // TODO: - // For each page do: - // - read_cache_page() - // Again for each page do: - // - wait_on_page_locked() - // - Check (PageUptodate(page) && - // !PageError(page)) + // For each folio do: + // - read_cache_folio() + // Again for each folio do: + // - wait_on_folio_locked() + // - Check (folio_test_uptodate(folio) && + // !folio_test_error(folio)) // Update initialized size in the attribute and // in the inode. - // Again, for each page do: + // Again, for each folio do: // block_dirty_folio(); - // put_page() + // folio_put() // We don't need to wait on the writes. // Update iblock. } /* - * The current page straddles initialized size. Zero + * The current folio straddles initialized size. Zero * all non-uptodate buffers and set them uptodate (and * dirty?). Note, there aren't any non-uptodate buffers - * if the page is uptodate. - * FIXME: For an uptodate page, the buffers may need to + * if the folio is uptodate. + * FIXME: For an uptodate folio, the buffers may need to * be written out because they were not initialized on * disk before. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { // TODO: // Zero any non-uptodate buffers up to i_size. // Set them uptodate and dirty. @@ -721,14 +710,14 @@ lock_retry_remap: unsigned long *bpos, *bend; /* Check if the buffer is zero. */ - kaddr = kmap_atomic(page); - bpos = (unsigned long *)(kaddr + bh_offset(bh)); - bend = (unsigned long *)((u8*)bpos + blocksize); + kaddr = kmap_local_folio(folio, bh_offset(bh)); + bpos = (unsigned long *)kaddr; + bend = (unsigned long *)(kaddr + blocksize); do { if (unlikely(*bpos)) break; } while (likely(++bpos < bend)); - kunmap_atomic(kaddr); + kunmap_local(kaddr); if (bpos == bend) { /* * Buffer is zero and sparse, no need to write @@ -768,7 +757,7 @@ lock_retry_remap: if (err == -ENOENT || lcn == LCN_ENOENT) { bh->b_blocknr = -1; clear_buffer_dirty(bh); - zero_user(page, bh_offset(bh), blocksize); + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); err = 0; continue; @@ -795,7 +784,7 @@ lock_retry_remap: bh = head; /* Just an optimization, so ->read_folio() is not called later. */ - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { int uptodate = 1; do { if (!buffer_uptodate(bh)) { @@ -805,7 +794,7 @@ lock_retry_remap: } } while ((bh = bh->b_this_page) != head); if (uptodate) - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* Setup all mapped, dirty buffers for async write i/o. */ @@ -820,7 +809,7 @@ lock_retry_remap: } else if (unlikely(err)) { /* * For the error case. The buffer may have been set - * dirty during attachment to a dirty page. + * dirty during attachment to a dirty folio. */ if (err != -ENOMEM) clear_buffer_dirty(bh); @@ -833,20 +822,20 @@ lock_retry_remap: err = 0; else if (err == -ENOMEM) { ntfs_warning(vol->sb, "Error allocating memory. " - "Redirtying page so we try again " + "Redirtying folio so we try again " "later."); /* - * Put the page back on mapping->dirty_pages, but + * Put the folio back on mapping->dirty_pages, but * leave its buffer's dirty state as-is. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); err = 0; } else - SetPageError(page); + folio_set_error(folio); } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); /* Keeps try_to_free_buffers() away. */ + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); /* Keeps try_to_free_buffers() away. */ /* Submit the prepared buffers for i/o. */ need_end_writeback = true; @@ -858,11 +847,11 @@ lock_retry_remap: } bh = next; } while (bh != head); - unlock_page(page); + folio_unlock(folio); - /* If no i/o was started, need to end_page_writeback(). */ + /* If no i/o was started, need to end writeback here. */ if (unlikely(need_end_writeback)) - end_page_writeback(page); + folio_end_writeback(folio); ntfs_debug("Done."); return err; @@ -1331,8 +1320,9 @@ done: */ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); loff_t i_size; - struct inode *vi = page->mapping->host; + struct inode *vi = folio->mapping->host; ntfs_inode *base_ni = NULL, *ni = NTFS_I(vi); char *addr; ntfs_attr_search_ctx *ctx = NULL; @@ -1341,14 +1331,13 @@ static int ntfs_writepage(struct page *page, struct writeback_control *wbc) int err; retry_writepage: - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); i_size = i_size_read(vi); - /* Is the page fully outside i_size? (truncate in progress) */ - if (unlikely(page->index >= (i_size + PAGE_SIZE - 1) >> + /* Is the folio fully outside i_size? (truncate in progress) */ + if (unlikely(folio->index >= (i_size + PAGE_SIZE - 1) >> PAGE_SHIFT)) { - struct folio *folio = page_folio(page); /* - * The page may have dirty, unmapped buffers. Make them + * The folio may have dirty, unmapped buffers. Make them * freeable here, so the page does not leak. */ block_invalidate_folio(folio, 0, folio_size(folio)); @@ -1367,7 +1356,7 @@ retry_writepage: if (ni->type != AT_INDEX_ALLOCATION) { /* If file is encrypted, deny access, just like NT4. */ if (NInoEncrypted(ni)) { - unlock_page(page); + folio_unlock(folio); BUG_ON(ni->type != AT_DATA); ntfs_debug("Denying write access to encrypted file."); return -EACCES; @@ -1378,14 +1367,14 @@ retry_writepage: BUG_ON(ni->name_len); // TODO: Implement and replace this with // return ntfs_write_compressed_block(page); - unlock_page(page); + folio_unlock(folio); ntfs_error(vi->i_sb, "Writing to compressed files is " "not supported yet. Sorry."); return -EOPNOTSUPP; } // TODO: Implement and remove this check. if (NInoNonResident(ni) && NInoSparse(ni)) { - unlock_page(page); + folio_unlock(folio); ntfs_error(vi->i_sb, "Writing to sparse files is not " "supported yet. Sorry."); return -EOPNOTSUPP; @@ -1394,34 +1383,34 @@ retry_writepage: /* NInoNonResident() == NInoIndexAllocPresent() */ if (NInoNonResident(ni)) { /* We have to zero every time due to mmap-at-end-of-file. */ - if (page->index >= (i_size >> PAGE_SHIFT)) { - /* The page straddles i_size. */ - unsigned int ofs = i_size & ~PAGE_MASK; - zero_user_segment(page, ofs, PAGE_SIZE); + if (folio->index >= (i_size >> PAGE_SHIFT)) { + /* The folio straddles i_size. */ + unsigned int ofs = i_size & (folio_size(folio) - 1); + folio_zero_segment(folio, ofs, folio_size(folio)); } /* Handle mst protected attributes. */ if (NInoMstProtected(ni)) return ntfs_write_mst_block(page, wbc); /* Normal, non-resident data stream. */ - return ntfs_write_block(page, wbc); + return ntfs_write_block(folio, wbc); } /* * Attribute is resident, implying it is not compressed, encrypted, or * mst protected. This also means the attribute is smaller than an mft - * record and hence smaller than a page, so can simply return error on - * any pages with index above 0. Note the attribute can actually be + * record and hence smaller than a folio, so can simply return error on + * any folios with index above 0. Note the attribute can actually be * marked compressed but if it is resident the actual data is not * compressed so we are ok to ignore the compressed flag here. */ - BUG_ON(page_has_buffers(page)); - BUG_ON(!PageUptodate(page)); - if (unlikely(page->index > 0)) { - ntfs_error(vi->i_sb, "BUG()! page->index (0x%lx) > 0. " - "Aborting write.", page->index); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + BUG_ON(folio_buffers(folio)); + BUG_ON(!folio_test_uptodate(folio)); + if (unlikely(folio->index > 0)) { + ntfs_error(vi->i_sb, "BUG()! folio->index (0x%lx) > 0. " + "Aborting write.", folio->index); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); return -EIO; } if (!NInoAttr(ni)) @@ -1454,12 +1443,12 @@ retry_writepage: if (unlikely(err)) goto err_out; /* - * Keep the VM happy. This must be done otherwise the radix-tree tag - * PAGECACHE_TAG_DIRTY remains set even though the page is clean. + * Keep the VM happy. This must be done otherwise + * PAGECACHE_TAG_DIRTY remains set even though the folio is clean. */ - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); attr_len = le32_to_cpu(ctx->attr->data.resident.value_length); i_size = i_size_read(vi); if (unlikely(attr_len > i_size)) { @@ -1474,18 +1463,18 @@ retry_writepage: /* Shrinking cannot fail. */ BUG_ON(err); } - addr = kmap_atomic(page); - /* Copy the data from the page to the mft record. */ + addr = kmap_local_folio(folio, 0); + /* Copy the data from the folio to the mft record. */ memcpy((u8*)ctx->attr + le16_to_cpu(ctx->attr->data.resident.value_offset), addr, attr_len); - /* Zero out of bounds area in the page cache page. */ - memset(addr + attr_len, 0, PAGE_SIZE - attr_len); - kunmap_atomic(addr); - flush_dcache_page(page); + /* Zero out of bounds area in the page cache folio. */ + memset(addr + attr_len, 0, folio_size(folio) - attr_len); + kunmap_local(addr); + flush_dcache_folio(folio); flush_dcache_mft_record_page(ctx->ntfs_ino); - /* We are done with the page. */ - end_page_writeback(page); + /* We are done with the folio. */ + folio_end_writeback(folio); /* Finally, mark the mft record dirty, so it gets written back. */ mark_mft_record_dirty(ctx->ntfs_ino); ntfs_attr_put_search_ctx(ctx); @@ -1496,18 +1485,18 @@ err_out: ntfs_warning(vi->i_sb, "Error allocating memory. Redirtying " "page so we try again later."); /* - * Put the page back on mapping->dirty_pages, but leave its + * Put the folio back on mapping->dirty_pages, but leave its * buffers' dirty state as-is. */ - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); err = 0; } else { ntfs_error(vi->i_sb, "Resident attribute write failed with " "error %i.", err); - SetPageError(page); + folio_set_error(folio); NVolSetErrors(ni->vol); } - unlock_page(page); + folio_unlock(folio); if (ctx) ntfs_attr_put_search_ctx(ctx); if (m) -- cgit v1.2.3 From 24a7b35285c5514393e7ed46407111c68f4602ba Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:06 +0100 Subject: ntfs: convert ntfs_prepare_pages_for_non_resident_write() to folios Convert each element of the pages array to a folio before using it. This in no way renders the function large-folio safe, but it does remove a lot of hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs/file.c | 89 +++++++++++++++++++++++++++------------------------------- 1 file changed, 41 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index cbc545999cfe..099141d20db6 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -567,7 +567,7 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, LCN lcn; s64 bh_pos, vcn_len, end, initialized_size; sector_t lcn_block; - struct page *page; + struct folio *folio; struct inode *vi; ntfs_inode *ni, *base_ni = NULL; ntfs_volume *vol; @@ -601,20 +601,6 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, (long long)pos, bytes); blocksize = vol->sb->s_blocksize; blocksize_bits = vol->sb->s_blocksize_bits; - u = 0; - do { - page = pages[u]; - BUG_ON(!page); - /* - * create_empty_buffers() will create uptodate/dirty buffers if - * the page is uptodate/dirty. - */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, blocksize, 0); - if (unlikely(!page_has_buffers(page))) - return -ENOMEM; - } - } while (++u < nr_pages); rl_write_locked = false; rl = NULL; err = 0; @@ -626,14 +612,21 @@ static int ntfs_prepare_pages_for_non_resident_write(struct page **pages, end = pos + bytes; cend = (end + vol->cluster_size - 1) >> vol->cluster_size_bits; /* - * Loop over each page and for each page over each buffer. Use goto to + * Loop over each buffer in each folio. Use goto to * reduce indentation. */ u = 0; -do_next_page: - page = pages[u]; - bh_pos = (s64)page->index << PAGE_SHIFT; - bh = head = page_buffers(page); +do_next_folio: + folio = page_folio(pages[u]); + bh_pos = folio_pos(folio); + head = folio_buffers(folio); + if (!head) + /* + * create_empty_buffers() will create uptodate/dirty + * buffers if the folio is uptodate/dirty. + */ + head = folio_create_empty_buffers(folio, blocksize, 0); + bh = head; do { VCN cdelta; s64 bh_end; @@ -653,15 +646,15 @@ do_next_page: if (buffer_uptodate(bh)) continue; /* - * The buffer is not uptodate. If the page is uptodate + * The buffer is not uptodate. If the folio is uptodate * set the buffer uptodate and otherwise ignore it. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); continue; } /* - * Neither the page nor the buffer are uptodate. If + * Neither the folio nor the buffer are uptodate. If * the buffer is only partially being written to, we * need to read it in before the write, i.e. now. */ @@ -679,7 +672,7 @@ do_next_page: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -706,7 +699,7 @@ map_buffer_cached: (bh_cofs >> blocksize_bits); set_buffer_mapped(bh); /* - * If the page is uptodate so is the buffer. If the + * If the folio is uptodate so is the buffer. If the * buffer is fully outside the write, we ignore it if * it was already allocated and we mark it dirty so it * gets written out if we allocated it. On the other @@ -714,7 +707,7 @@ map_buffer_cached: * marking it dirty we set buffer_new so we can do * error recovery. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); if (unlikely(was_hole)) { @@ -754,7 +747,8 @@ map_buffer_cached: ntfs_submit_bh_for_read(bh); *wait_bh++ = bh; } else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, + bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -773,7 +767,7 @@ map_buffer_cached: */ if (bh_end <= pos || bh_pos >= end) { if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -786,7 +780,7 @@ map_buffer_cached: u8 *kaddr; unsigned pofs; - kaddr = kmap_atomic(page); + kaddr = kmap_local_folio(folio, 0); if (bh_pos < pos) { pofs = bh_pos & ~PAGE_MASK; memset(kaddr + pofs, 0, pos - bh_pos); @@ -795,8 +789,8 @@ map_buffer_cached: pofs = end & ~PAGE_MASK; memset(kaddr + pofs, 0, bh_end - end); } - kunmap_atomic(kaddr); - flush_dcache_page(page); + kunmap_local(kaddr); + flush_dcache_folio(folio); } continue; } @@ -809,11 +803,12 @@ map_buffer_cached: initialized_size = ni->allocated_size; read_unlock_irqrestore(&ni->size_lock, flags); if (bh_pos > initialized_size) { - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), blocksize); + folio_zero_range(folio, bh_offset(bh), + blocksize); set_buffer_uptodate(bh); } continue; @@ -927,17 +922,17 @@ rl_not_mapped_enoent: bh->b_blocknr = -1; /* * If the buffer is uptodate we skip it. If it - * is not but the page is uptodate, we can set - * the buffer uptodate. If the page is not + * is not but the folio is uptodate, we can set + * the buffer uptodate. If the folio is not * uptodate, we can clear the buffer and set it * uptodate. Whether this is worthwhile is * debatable and this could be removed. */ - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { if (!buffer_uptodate(bh)) set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh)) { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } @@ -1167,7 +1162,7 @@ rl_not_mapped_enoent: } while (bh_pos += blocksize, (bh = bh->b_this_page) != head); /* If there are no errors, do the next page. */ if (likely(!err && ++u < nr_pages)) - goto do_next_page; + goto do_next_folio; /* If there are no errors, release the runlist lock if we took it. */ if (likely(!err)) { if (unlikely(rl_write_locked)) { @@ -1185,9 +1180,8 @@ rl_not_mapped_enoent: bh = *--wait_bh; wait_on_buffer(bh); if (likely(buffer_uptodate(bh))) { - page = bh->b_page; - bh_pos = ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh); + folio = bh->b_folio; + bh_pos = folio_pos(folio) + bh_offset(bh); /* * If the buffer overflows the initialized size, need * to zero the overflowing region. @@ -1197,7 +1191,7 @@ rl_not_mapped_enoent: if (likely(bh_pos < initialized_size)) ofs = initialized_size - bh_pos; - zero_user_segment(page, bh_offset(bh) + ofs, + folio_zero_segment(folio, bh_offset(bh) + ofs, blocksize); } } else /* if (unlikely(!buffer_uptodate(bh))) */ @@ -1324,21 +1318,20 @@ rl_not_mapped_enoent: u = 0; end = bh_cpos << vol->cluster_size_bits; do { - page = pages[u]; - bh = head = page_buffers(page); + folio = page_folio(pages[u]); + bh = head = folio_buffers(folio); do { if (u == nr_pages && - ((s64)page->index << PAGE_SHIFT) + - bh_offset(bh) >= end) + folio_pos(folio) + bh_offset(bh) >= end) break; if (!buffer_new(bh)) continue; clear_buffer_new(bh); if (!buffer_uptodate(bh)) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); else { - zero_user(page, bh_offset(bh), + folio_zero_range(folio, bh_offset(bh), blocksize); set_buffer_uptodate(bh); } -- cgit v1.2.3 From c3f4200ac61ae98ea29d28519ea5076c04f22e06 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:07 +0100 Subject: ntfs3: convert ntfs_zero_range() to use a folio Use the folio API throughout, saving six hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ntfs3/file.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 962f12ce6c0a..a003a69091a2 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -187,7 +187,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) struct buffer_head *head, *bh; u32 bh_next, bh_off, to; sector_t iblock; - struct page *page; + struct folio *folio; for (; idx < idx_end; idx += 1, from = 0) { page_off = (loff_t)idx << PAGE_SHIFT; @@ -195,16 +195,17 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) PAGE_SIZE; iblock = page_off >> inode->i_blkbits; - page = find_or_create_page(mapping, idx, - mapping_gfp_constraint(mapping, - ~__GFP_FS)); - if (!page) - return -ENOMEM; + folio = __filemap_get_folio(mapping, idx, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, blocksize, 0); - bh = head = page_buffers(page); + bh = head; bh_off = 0; do { bh_next = bh_off + blocksize; @@ -220,14 +221,14 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } /* Ok, it's mapped. Make sure it's up-to-date. */ - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); if (!buffer_uptodate(bh)) { err = bh_read(bh, 0); if (err < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } } @@ -237,10 +238,10 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) } while (bh_off = bh_next, iblock += 1, head != (bh = bh->b_this_page)); - zero_user_segment(page, from, to); + folio_zero_segment(folio, from, to); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); cond_resched(); } out: -- cgit v1.2.3 From 414ae0a440333143dcac39faaef00f098cceeff5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:08 +0100 Subject: ocfs2: convert ocfs2_map_page_blocks to use a folio Convert the page argument to a folio and then use the folio APIs throughout. Replaces three hidden calls to compound_head() with one explicit one. Link: https://lkml.kernel.org/r/20231016201114.1928083-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ocfs2/aops.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 0fdba30740ab..95d1e70b4401 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -568,10 +568,10 @@ static void ocfs2_clear_page_regions(struct page *page, * read-in the blocks at the tail of our file. Avoid reading them by * testing i_size against each block offset. */ -static int ocfs2_should_read_blk(struct inode *inode, struct page *page, +static int ocfs2_should_read_blk(struct inode *inode, struct folio *folio, unsigned int block_start) { - u64 offset = page_offset(page) + block_start; + u64 offset = folio_pos(folio) + block_start; if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) return 1; @@ -593,15 +593,16 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, struct inode *inode, unsigned int from, unsigned int to, int new) { + struct folio *folio = page_folio(page); int ret = 0; struct buffer_head *head, *bh, *wait[2], **wait_bh = wait; unsigned int block_end, block_start; unsigned int bsize = i_blocksize(inode); - if (!page_has_buffers(page)) - create_empty_buffers(page, bsize, 0); + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, bsize, 0); - head = page_buffers(page); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { block_end = block_start + bsize; @@ -613,7 +614,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, * they may belong to unallocated clusters. */ if (block_start >= to || block_end <= from) { - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) set_buffer_uptodate(bh); continue; } @@ -630,11 +631,11 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, clean_bdev_bh_alias(bh); } - if (PageUptodate(page)) { + if (folio_test_uptodate(folio)) { set_buffer_uptodate(bh); } else if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_new(bh) && - ocfs2_should_read_blk(inode, page, block_start) && + ocfs2_should_read_blk(inode, folio, block_start) && (block_start < from || block_end > to)) { bh_read_nowait(bh, 0); *wait_bh++=bh; @@ -668,7 +669,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (block_start >= to) break; - zero_user(page, block_start, bh->b_size); + folio_zero_range(folio, block_start, bh->b_size); set_buffer_uptodate(bh); mark_buffer_dirty(bh); -- cgit v1.2.3 From 44f68575267ecfc439d11bb782c69ba2598b3ed0 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:09 +0100 Subject: reiserfs: convert writepage to use a folio Convert the incoming page to a folio and then use it throughout the writeback path. This definitely isn't enough to support large folios, but I don't expect reiserfs to gain support for those before it is removed. Link: https://lkml.kernel.org/r/20231016201114.1928083-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/reiserfs/inode.c | 80 ++++++++++++++++++++++++++--------------------------- 1 file changed, 40 insertions(+), 40 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 86e55d4bb10d..d9737235b8e0 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2507,10 +2507,10 @@ out: * start/recovery path as __block_write_full_folio, along with special * code to handle reiserfs tails. */ -static int reiserfs_write_full_page(struct page *page, +static int reiserfs_write_full_folio(struct folio *folio, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; unsigned long end_index = inode->i_size >> PAGE_SHIFT; int error = 0; unsigned long block; @@ -2518,7 +2518,7 @@ static int reiserfs_write_full_page(struct page *page, struct buffer_head *head, *bh; int partial = 0; int nr = 0; - int checked = PageChecked(page); + int checked = folio_test_checked(folio); struct reiserfs_transaction_handle th; struct super_block *s = inode->i_sb; int bh_per_page = PAGE_SIZE / s->s_blocksize; @@ -2526,47 +2526,46 @@ static int reiserfs_write_full_page(struct page *page, /* no logging allowed when nonblocking or from PF_MEMALLOC */ if (checked && (current->flags & PF_MEMALLOC)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); return 0; } /* - * The page dirty bit is cleared before writepage is called, which + * The folio dirty bit is cleared before writepage is called, which * means we have to tell create_empty_buffers to make dirty buffers - * The page really should be up to date at this point, so tossing + * The folio really should be up to date at this point, so tossing * in the BH_Uptodate is just a sanity check. */ - if (!page_has_buffers(page)) { - create_empty_buffers(page, s->s_blocksize, + head = folio_buffers(folio); + if (!head) + head = folio_create_empty_buffers(folio, s->s_blocksize, (1 << BH_Dirty) | (1 << BH_Uptodate)); - } - head = page_buffers(page); /* - * last page in the file, zero out any contents past the + * last folio in the file, zero out any contents past the * last byte in the file */ - if (page->index >= end_index) { + if (folio->index >= end_index) { unsigned last_offset; last_offset = inode->i_size & (PAGE_SIZE - 1); - /* no file contents in this page */ - if (page->index >= end_index + 1 || !last_offset) { - unlock_page(page); + /* no file contents in this folio */ + if (folio->index >= end_index + 1 || !last_offset) { + folio_unlock(folio); return 0; } - zero_user_segment(page, last_offset, PAGE_SIZE); + folio_zero_segment(folio, last_offset, folio_size(folio)); } bh = head; - block = page->index << (PAGE_SHIFT - s->s_blocksize_bits); + block = folio->index << (PAGE_SHIFT - s->s_blocksize_bits); last_block = (i_size_read(inode) - 1) >> inode->i_blkbits; /* first map all the buffers, logging any direct items we find */ do { if (block > last_block) { /* * This can happen when the block size is less than - * the page size. The corresponding bytes in the page + * the folio size. The corresponding bytes in the folio * were zero filled above */ clear_buffer_dirty(bh); @@ -2593,7 +2592,7 @@ static int reiserfs_write_full_page(struct page *page, * blocks we're going to log */ if (checked) { - ClearPageChecked(page); + folio_clear_checked(folio); reiserfs_write_lock(s); error = journal_begin(&th, s, bh_per_page + 1); if (error) { @@ -2602,7 +2601,7 @@ static int reiserfs_write_full_page(struct page *page, } reiserfs_update_inode_transaction(inode); } - /* now go through and lock any dirty buffers on the page */ + /* now go through and lock any dirty buffers on the folio */ do { get_bh(bh); if (!buffer_mapped(bh)) @@ -2623,7 +2622,7 @@ static int reiserfs_write_full_page(struct page *page, lock_buffer(bh); } else { if (!trylock_buffer(bh)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); continue; } } @@ -2640,13 +2639,13 @@ static int reiserfs_write_full_page(struct page *page, if (error) goto fail; } - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); /* - * since any buffer might be the only dirty buffer on the page, - * the first submit_bh can bring the page out of writeback. + * since any buffer might be the only dirty buffer on the folio, + * the first submit_bh can bring the folio out of writeback. * be careful with the buffers. */ do { @@ -2663,10 +2662,10 @@ static int reiserfs_write_full_page(struct page *page, done: if (nr == 0) { /* - * if this page only had a direct item, it is very possible for + * if this folio only had a direct item, it is very possible for * no io to be required without there being an error. Or, * someone else could have locked them and sent them down the - * pipe without locking the page + * pipe without locking the folio */ bh = head; do { @@ -2677,18 +2676,18 @@ done: bh = bh->b_this_page; } while (bh != head); if (!partial) - SetPageUptodate(page); - end_page_writeback(page); + folio_mark_uptodate(folio); + folio_end_writeback(folio); } return error; fail: /* * catches various errors, we need to make sure any valid dirty blocks - * get to the media. The page is currently locked and not marked for + * get to the media. The folio is currently locked and not marked for * writeback */ - ClearPageUptodate(page); + folio_clear_uptodate(folio); bh = head; do { get_bh(bh); @@ -2698,16 +2697,16 @@ fail: } else { /* * clear any dirty bits that might have come from - * getting attached to a dirty page + * getting attached to a dirty folio */ clear_buffer_dirty(bh); } bh = bh->b_this_page; } while (bh != head); - SetPageError(page); - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); + folio_set_error(folio); + BUG_ON(folio_test_writeback(folio)); + folio_start_writeback(folio); + folio_unlock(folio); do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { @@ -2728,9 +2727,10 @@ static int reiserfs_read_folio(struct file *f, struct folio *folio) static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(page); + struct inode *inode = folio->mapping->host; reiserfs_wait_on_write_block(inode->i_sb); - return reiserfs_write_full_page(page, wbc); + return reiserfs_write_full_folio(folio, wbc); } static void reiserfs_truncate_failed_write(struct inode *inode) -- cgit v1.2.3 From 5fb7bd50b3516a9d5fea5775e7ebd43f0c96bb3d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:10 +0100 Subject: ufs: add ufs_get_locked_folio and ufs_put_locked_folio Convert the _page variants to call them. Saves a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/util.c | 43 +++++++++++++++++++++++++------------------ fs/ufs/util.h | 13 +++++++++---- 2 files changed, 34 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 08ddf41eaaad..151b400cb3b6 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -229,43 +229,50 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); } +struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) +{ + struct folio *folio = ufs_get_locked_folio(mapping, index); + + if (folio) + return folio_file_page(folio, index); + return NULL; +} + /** - * ufs_get_locked_page() - locate, pin and lock a pagecache page, if not exist + * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist * read it from disk. * @mapping: the address_space to search * @index: the page index * - * Locates the desired pagecache page, if not exist we'll read it, + * Locates the desired pagecache folio, if not exist we'll read it, * locks it, increments its reference * count and returns its address. * */ - -struct page *ufs_get_locked_page(struct address_space *mapping, +struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index) { struct inode *inode = mapping->host; - struct page *page = find_lock_page(mapping, index); - if (!page) { - page = read_mapping_page(mapping, index, NULL); + struct folio *folio = filemap_lock_folio(mapping, index); + if (!folio) { + folio = read_mapping_folio(mapping, index, NULL); - if (IS_ERR(page)) { - printk(KERN_ERR "ufs_change_blocknr: " - "read_mapping_page error: ino %lu, index: %lu\n", + if (IS_ERR(folio)) { + printk(KERN_ERR "ufs_change_blocknr: read_mapping_folio error: ino %lu, index: %lu\n", mapping->host->i_ino, index); - return page; + return folio; } - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping == NULL)) { + if (unlikely(folio->mapping == NULL)) { /* Truncate got there first */ - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return NULL; } } - if (!page_has_buffers(page)) - create_empty_buffers(page, 1 << inode->i_blkbits, 0); - return page; + if (!folio_buffers(folio)) + folio_create_empty_buffers(folio, 1 << inode->i_blkbits, 0); + return folio; } diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 89247193d96d..62542561d150 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -273,12 +273,17 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ -extern struct page *ufs_get_locked_page(struct address_space *mapping, - pgoff_t index); +struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index); +struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); +static inline void ufs_put_locked_folio(struct folio *folio) +{ + folio_unlock(folio); + folio_put(folio); +} + static inline void ufs_put_locked_page(struct page *page) { - unlock_page(page); - put_page(page); + ufs_put_locked_folio(page_folio(page)); } -- cgit v1.2.3 From e7ca7f1725b3b0b276dd7c5119326b5e098abb53 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:11 +0100 Subject: ufs: use ufs_get_locked_folio() in ufs_alloc_lastblock() Switch to the folio APIs, saving one folio->page->folio conversion. Link: https://lkml.kernel.org/r/20231016201114.1928083-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/inode.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 21a4779a2de5..5b289374d4fd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1057,7 +1057,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned i, end; sector_t lastfrag; - struct page *lastpage; + struct folio *folio; struct buffer_head *bh; u64 phys64; @@ -1068,18 +1068,17 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) lastfrag--; - lastpage = ufs_get_locked_page(mapping, lastfrag >> + folio = ufs_get_locked_folio(mapping, lastfrag >> (PAGE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; + if (IS_ERR(folio)) { + err = -EIO; + goto out; + } + end = lastfrag & ((1 << (PAGE_SHIFT - inode->i_blkbits)) - 1); + bh = folio_buffers(folio); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; err = ufs_getfrag_block(inode, lastfrag, bh, 1); @@ -1095,7 +1094,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) */ set_buffer_uptodate(bh); mark_buffer_dirty(bh); - set_page_dirty(lastpage); + folio_mark_dirty(folio); } if (lastfrag >= UFS_IND_FRAGMENT) { @@ -1113,7 +1112,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) } } out_unlock: - ufs_put_locked_page(lastpage); + ufs_put_locked_folio(folio); out: return err; } -- cgit v1.2.3 From c7e8812ce5cfcb16c69faa86d38b36ddd3141ba9 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:12 +0100 Subject: ufs: convert ufs_change_blocknr() to use folios Convert the locked_page argument to a folio, then use folios throughout. Saves three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20231016201114.1928083-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/balloc.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 2436e3f82147..53c11be2b2c1 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -240,6 +240,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, unsigned int count, sector_t oldb, sector_t newb, struct page *locked_page) { + struct folio *folio, *locked_folio = page_folio(locked_page); const unsigned blks_per_page = 1 << (PAGE_SHIFT - inode->i_blkbits); const unsigned mask = blks_per_page - 1; @@ -247,42 +248,39 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, pgoff_t index, cur_index, last_index; unsigned pos, j, lblock; sector_t end, i; - struct page *page; struct buffer_head *head, *bh; UFSD("ENTER, ino %lu, count %u, oldb %llu, newb %llu\n", inode->i_ino, count, (unsigned long long)oldb, (unsigned long long)newb); - BUG_ON(!locked_page); - BUG_ON(!PageLocked(locked_page)); + BUG_ON(!folio_test_locked(locked_folio)); - cur_index = locked_page->index; + cur_index = locked_folio->index; end = count + beg; last_index = end >> (PAGE_SHIFT - inode->i_blkbits); for (i = beg; i < end; i = (i | mask) + 1) { index = i >> (PAGE_SHIFT - inode->i_blkbits); if (likely(cur_index != index)) { - page = ufs_get_locked_page(mapping, index); - if (!page)/* it was truncated */ + folio = ufs_get_locked_folio(mapping, index); + if (!folio) /* it was truncated */ continue; - if (IS_ERR(page)) {/* or EIO */ + if (IS_ERR(folio)) {/* or EIO */ ufs_error(inode->i_sb, __func__, "read of page %llu failed\n", (unsigned long long)index); continue; } } else - page = locked_page; + folio = locked_folio; - head = page_buffers(page); + head = folio_buffers(folio); bh = head; pos = i & mask; for (j = 0; j < pos; ++j) bh = bh->b_this_page; - if (unlikely(index == last_index)) lblock = end & mask; else @@ -313,7 +311,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, } while (bh != head); if (likely(cur_index != index)) - ufs_put_locked_page(page); + ufs_put_locked_folio(folio); } UFSD("EXIT\n"); } -- cgit v1.2.3 From c9f2480ed7b221baf738edf431757e5ca4115bca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:13 +0100 Subject: ufs: remove ufs_get_locked_page() Both callers are now converted to ufs_get_locked_folio(). Link: https://lkml.kernel.org/r/20231016201114.1928083-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/ufs/util.c | 9 --------- fs/ufs/util.h | 7 ------- 2 files changed, 16 deletions(-) (limited to 'fs') diff --git a/fs/ufs/util.c b/fs/ufs/util.c index 151b400cb3b6..d32de30009a0 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -229,15 +229,6 @@ ufs_set_inode_dev(struct super_block *sb, struct ufs_inode_info *ufsi, dev_t dev ufsi->i_u1.i_data[0] = cpu_to_fs32(sb, fs32); } -struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index) -{ - struct folio *folio = ufs_get_locked_folio(mapping, index); - - if (folio) - return folio_file_page(folio, index); - return NULL; -} - /** * ufs_get_locked_folio() - locate, pin and lock a pagecache folio, if not exist * read it from disk. diff --git a/fs/ufs/util.h b/fs/ufs/util.h index 62542561d150..0ecd2ed792f5 100644 --- a/fs/ufs/util.h +++ b/fs/ufs/util.h @@ -273,7 +273,6 @@ extern void _ubh_ubhcpymem_(struct ufs_sb_private_info *, unsigned char *, struc extern void _ubh_memcpyubh_(struct ufs_sb_private_info *, struct ufs_buffer_head *, unsigned char *, unsigned); /* This functions works with cache pages*/ -struct page *ufs_get_locked_page(struct address_space *mapping, pgoff_t index); struct folio *ufs_get_locked_folio(struct address_space *mapping, pgoff_t index); static inline void ufs_put_locked_folio(struct folio *folio) { @@ -281,12 +280,6 @@ static inline void ufs_put_locked_folio(struct folio *folio) folio_put(folio); } -static inline void ufs_put_locked_page(struct page *page) -{ - ufs_put_locked_folio(page_folio(page)); -} - - /* * macros and inline function to get important structures from ufs_sb_private_info */ -- cgit v1.2.3 From 0a88810d9b76e6ecfd234f5728e27344a39e3ff8 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 16 Oct 2023 21:11:14 +0100 Subject: buffer: remove folio_create_empty_buffers() With all users converted, remove the old create_empty_buffers() and rename folio_create_empty_buffers() to create_empty_buffers(). Link: https://lkml.kernel.org/r/20231016201114.1928083-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Andreas Gruenbacher Cc: Pankaj Raghav Cc: Ryusuke Konishi Signed-off-by: Andrew Morton --- fs/buffer.c | 13 +++---------- fs/ext4/inode.c | 6 +++--- fs/ext4/move_extent.c | 4 ++-- fs/gfs2/aops.c | 2 +- fs/gfs2/bmap.c | 2 +- fs/gfs2/meta_io.c | 2 +- fs/gfs2/quota.c | 2 +- fs/mpage.c | 2 +- fs/nilfs2/mdt.c | 2 +- fs/nilfs2/page.c | 4 ++-- fs/nilfs2/segment.c | 2 +- fs/ntfs/aops.c | 4 ++-- fs/ntfs/file.c | 2 +- fs/ntfs3/file.c | 2 +- fs/ocfs2/aops.c | 2 +- fs/reiserfs/inode.c | 2 +- fs/ufs/util.c | 2 +- include/linux/buffer_head.h | 4 +--- 18 files changed, 25 insertions(+), 34 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index bfa7604d1891..657a62bab73d 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1641,7 +1641,7 @@ EXPORT_SYMBOL(block_invalidate_folio); * block_dirty_folio() via private_lock. try_to_free_buffers * is already excluded via the folio lock. */ -struct buffer_head *folio_create_empty_buffers(struct folio *folio, +struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state) { struct buffer_head *bh, *head, *tail; @@ -1672,13 +1672,6 @@ struct buffer_head *folio_create_empty_buffers(struct folio *folio, return head; } -EXPORT_SYMBOL(folio_create_empty_buffers); - -void create_empty_buffers(struct page *page, - unsigned long blocksize, unsigned long b_state) -{ - folio_create_empty_buffers(page_folio(page), blocksize, b_state); -} EXPORT_SYMBOL(create_empty_buffers); /** @@ -1778,7 +1771,7 @@ static struct buffer_head *folio_create_buffers(struct folio *folio, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, 1 << READ_ONCE(inode->i_blkbits), b_state); return bh; } @@ -2681,7 +2674,7 @@ int block_truncate_page(struct address_space *mapping, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, blocksize, 0); + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ offset = offset_in_folio(folio, from); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8e431ff2fd95..347fc8986e93 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1021,7 +1021,7 @@ static int ext4_block_write_begin(struct folio *folio, loff_t pos, unsigned len, head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bbits = ilog2(blocksize); block = (sector_t)folio->index << (PAGE_SHIFT - bbits); @@ -1151,7 +1151,7 @@ retry_grab: * starting the handle. */ if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); + create_empty_buffers(folio, inode->i_sb->s_blocksize, 0); folio_unlock(folio); @@ -3642,7 +3642,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, blocksize, 0); + bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ pos = blocksize; diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 7fe448fb948b..3aa57376d9c2 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -184,7 +184,7 @@ mext_page_mkuptodate(struct folio *folio, unsigned from, unsigned to) blocksize = i_blocksize(inode); head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); block = (sector_t)folio->index << (PAGE_SHIFT - inode->i_blkbits); for (bh = head, block_start = 0; bh != head || !block_start; @@ -380,7 +380,7 @@ data_copy: * but keeping in mind that i_size will not change */ bh = folio_buffers(folio[0]); if (!bh) - bh = folio_create_empty_buffers(folio[0], + bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); for (i = 0; i < data_offset_in_page; i++) bh = bh->b_this_page; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index c26d48355cc2..6b060fc9e260 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -130,7 +130,7 @@ static int __gfs2_jdata_write_folio(struct folio *folio, if (folio_test_checked(folio)) { folio_clear_checked(folio); if (!folio_buffers(folio)) { - folio_create_empty_buffers(folio, + create_empty_buffers(folio, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 247d2c16593c..f1eee3f4704b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -71,7 +71,7 @@ static int gfs2_unstuffer_folio(struct gfs2_inode *ip, struct buffer_head *dibh, struct buffer_head *bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, BIT(inode->i_blkbits), BIT(BH_Uptodate)); if (!buffer_mapped(bh)) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index f6d40d51f5ed..25ceb0805df2 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -134,7 +134,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) mapping_gfp_mask(mapping) | __GFP_NOFAIL); bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, + bh = create_empty_buffers(folio, sdp->sd_sb.sb_bsize, 0); } else { folio = __filemap_get_folio(mapping, index, diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 25354278cecb..2f1328af34f4 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -763,7 +763,7 @@ static int gfs2_write_buf_to_page(struct gfs2_sbd *sdp, unsigned long index, return PTR_ERR(folio); bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, bsize, 0); + bh = create_empty_buffers(folio, bsize, 0); for (;;) { /* Find the beginning block within the folio */ diff --git a/fs/mpage.c b/fs/mpage.c index 964a6efe594d..ffb064ed9d04 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -119,7 +119,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, folio_mark_uptodate(folio); return; } - head = folio_create_empty_buffers(folio, i_blocksize(inode), 0); + head = create_empty_buffers(folio, i_blocksize(inode), 0); } page_bh = head; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 7b754e6494d7..c97c77a39668 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -568,7 +568,7 @@ int nilfs_mdt_freeze_buffer(struct inode *inode, struct buffer_head *bh) bh_frozen = folio_buffers(folio); if (!bh_frozen) - bh_frozen = folio_create_empty_buffers(folio, 1 << blkbits, 0); + bh_frozen = create_empty_buffers(folio, 1 << blkbits, 0); bh_frozen = get_nth_bh(bh_frozen, bh_offset(bh) >> blkbits); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 696215d899bf..06b04758f289 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -34,7 +34,7 @@ static struct buffer_head *__nilfs_get_folio_block(struct folio *folio, struct buffer_head *bh = folio_buffers(folio); if (!bh) - bh = folio_create_empty_buffers(folio, 1 << blkbits, b_state); + bh = create_empty_buffers(folio, 1 << blkbits, b_state); first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = get_nth_bh(bh, block - first_block); @@ -204,7 +204,7 @@ static void nilfs_copy_folio(struct folio *dst, struct folio *src, sbh = folio_buffers(src); dbh = folio_buffers(dst); if (!dbh) - dbh = folio_create_empty_buffers(dst, sbh->b_size, 0); + dbh = create_empty_buffers(dst, sbh->b_size, 0); if (copy_dirty) mask |= BIT(BH_Dirty); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 94388fe83cf8..55e31cc903d1 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -732,7 +732,7 @@ static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode, } head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, + head = create_empty_buffers(folio, i_blocksize(inode), 0); folio_unlock(folio); diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index c4426992a2ee..71e31e789b29 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -189,7 +189,7 @@ static int ntfs_read_block(struct folio *folio) head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bh = head; /* @@ -555,7 +555,7 @@ static int ntfs_write_block(struct folio *folio, struct writeback_control *wbc) head = folio_buffers(folio); if (!head) { BUG_ON(!folio_test_uptodate(folio)); - head = folio_create_empty_buffers(folio, blocksize, + head = create_empty_buffers(folio, blocksize, (1 << BH_Uptodate) | (1 << BH_Dirty)); } bh = head; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 099141d20db6..297c0b9db621 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -625,7 +625,7 @@ do_next_folio: * create_empty_buffers() will create uptodate/dirty * buffers if the folio is uptodate/dirty. */ - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bh = head; do { VCN cdelta; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index a003a69091a2..66fd4ac28395 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -203,7 +203,7 @@ static int ntfs_zero_range(struct inode *inode, u64 vbo, u64 vbo_to) head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, blocksize, 0); + head = create_empty_buffers(folio, blocksize, 0); bh = head; bh_off = 0; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 95d1e70b4401..a6405dd5df09 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -601,7 +601,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, bsize, 0); + head = create_empty_buffers(folio, bsize, 0); for (bh = head, block_start = 0; bh != head || !block_start; bh = bh->b_this_page, block_start += bsize) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index d9737235b8e0..a9075c4843ed 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -2539,7 +2539,7 @@ static int reiserfs_write_full_folio(struct folio *folio, */ head = folio_buffers(folio); if (!head) - head = folio_create_empty_buffers(folio, s->s_blocksize, + head = create_empty_buffers(folio, s->s_blocksize, (1 << BH_Dirty) | (1 << BH_Uptodate)); /* diff --git a/fs/ufs/util.c b/fs/ufs/util.c index d32de30009a0..13ba34e6d64f 100644 --- a/fs/ufs/util.c +++ b/fs/ufs/util.c @@ -264,6 +264,6 @@ struct folio *ufs_get_locked_folio(struct address_space *mapping, } } if (!folio_buffers(folio)) - folio_create_empty_buffers(folio, 1 << inode->i_blkbits, 0); + create_empty_buffers(folio, 1 << inode->i_blkbits, 0); return folio; } diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3d85a0cf0ca5..5f23ee599889 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -201,9 +201,7 @@ struct buffer_head *folio_alloc_buffers(struct folio *folio, unsigned long size, gfp_t gfp); struct buffer_head *alloc_page_buffers(struct page *page, unsigned long size, bool retry); -void create_empty_buffers(struct page *, unsigned long, - unsigned long b_state); -struct buffer_head *folio_create_empty_buffers(struct folio *folio, +struct buffer_head *create_empty_buffers(struct folio *folio, unsigned long blocksize, unsigned long b_state); void end_buffer_read_sync(struct buffer_head *bh, int uptodate); void end_buffer_write_sync(struct buffer_head *bh, int uptodate); -- cgit v1.2.3 From 1cbf0a58847b30507611f92ad69964ef37264d14 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 23 Oct 2023 23:26:08 -0700 Subject: ext4: add __GFP_NOWARN to GFP_NOWAIT in readahead Since commit e509ad4d77e6 ("ext4: use bdev_getblk() to avoid memory reclaim in readahead path") rightly replaced GFP_NOFAIL allocations by GFP_NOWAIT allocations, I've occasionally been seeing "page allocation failure: order:0" warnings under load: all with ext4_sb_breadahead_unmovable() in the stack. I don't think those warnings are of any interest: suppress them with __GFP_NOWARN. Link: https://lkml.kernel.org/r/7bc6ad16-9a4d-dd90-202e-47d6cbb5a136@google.com Fixes: e509ad4d77e6 ("ext4: use bdev_getblk() to avoid memory reclaim in readahead path") Signed-off-by: Hugh Dickins Reviewed-by: Jan Kara Cc: Hui Zhu Cc: Matthew Wilcox (Oracle) Cc: Theodore Ts'o Signed-off-by: Andrew Morton --- fs/ext4/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c00ec159dea5..56a08fc5c5d5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -262,7 +262,7 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, - sb->s_blocksize, GFP_NOWAIT); + sb->s_blocksize, GFP_NOWAIT | __GFP_NOWARN); if (likely(bh)) { if (trylock_buffer(bh)) -- cgit v1.2.3 From 10969b5571387047cd461a9c701b8b9cd007f6c7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:15:09 -0700 Subject: hugetlbfs: drop shared NUMA mempolicy pretence Patch series "mempolicy: cleanups leading to NUMA mpol without vma", v2. Mostly cleanups in mm/mempolicy.c, but finally removing the pseudo-vma from shmem folio allocation, and removing the mmap_lock around folio migration for mbind and migrate_pages syscalls. This patch (of 12): hugetlbfs_fallocate() goes through the motions of pasting a shared NUMA mempolicy onto its pseudo-vma, but how could there ever be a shared NUMA mempolicy for this file? hugetlb_vm_ops has never offered a set_policy method, and hugetlbfs_parse_param() has never supported any mpol options for a mount-wide default policy. It's just an illusion: clean it away so as not to confuse others, giving us more freedom to adjust shmem's set_policy/get_policy implementation. But hugetlbfs_inode_info is still required, just to accommodate seals. Yes, shared NUMA mempolicy support could be added to hugetlbfs, with a set_policy method and/or mpol mount option (Andi's first posting did include an admitted-unsatisfactory hugetlb_set_policy()); but it seems that nobody has bothered to add that in the nineteen years since v2.6.7 made it possible, and there is at least one company that has invested enough into hugetlbfs, that I guess they have learnt well enough how to manage its NUMA, without needing shared mempolicy. Remove linux/mempolicy.h from linux/hugetlb.h: include linux/pagemap.h in its place, because hugetlb.h's recently added use of filemap_lock_folio() requires that (although most .configs and .c's get it in some other way). Link: https://lkml.kernel.org/r/ebc0987e-beff-8bfb-9283-234c2cbd17c5@google.com Link: https://lkml.kernel.org/r/cae82d4b-904a-faaf-282a-34fcc188c81f@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Nhat Pham Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 41 +---------------------------------------- include/linux/hugetlb.h | 3 +-- 2 files changed, 2 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 85cd418d2340..3ad5fc3cb8db 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -83,29 +83,6 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { {} }; -#ifdef CONFIG_NUMA -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ - vma->vm_policy = mpol_shared_policy_lookup(&HUGETLBFS_I(inode)->policy, - index); -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ - mpol_cond_put(vma->vm_policy); -} -#else -static inline void hugetlb_set_vma_policy(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) -{ -} - -static inline void hugetlb_drop_vma_policy(struct vm_area_struct *vma) -{ -} -#endif - /* * Mask used when checking the page offset value passed in via system * calls. This value will be converted to a loff_t which is signed. @@ -853,8 +830,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* * Initialize a pseudo vma as this is required by the huge page - * allocation routines. If NUMA is configured, use page index - * as input to create an allocation policy. + * allocation routines. */ vma_init(&pseudo_vma, mm); vm_flags_init(&pseudo_vma, VM_HUGETLB | VM_MAYSHARE | VM_SHARED); @@ -902,9 +878,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * folios in these areas, we need to consume the reserves * to keep reservation accounting consistent. */ - hugetlb_set_vma_policy(&pseudo_vma, inode, index); folio = alloc_hugetlb_folio(&pseudo_vma, addr, 0); - hugetlb_drop_vma_policy(&pseudo_vma); if (IS_ERR(folio)) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); error = PTR_ERR(folio); @@ -1283,18 +1257,6 @@ static struct inode *hugetlbfs_alloc_inode(struct super_block *sb) hugetlbfs_inc_free_inodes(sbinfo); return NULL; } - - /* - * Any time after allocation, hugetlbfs_destroy_inode can be called - * for the inode. mpol_free_shared_policy is unconditionally called - * as part of hugetlbfs_destroy_inode. So, initialize policy here - * in case of a quick call to destroy. - * - * Note that the policy is initialized even if we are creating a - * private inode. This simplifies hugetlbfs_destroy_inode. - */ - mpol_shared_policy_init(&p->policy, NULL); - return &p->vfs_inode; } @@ -1306,7 +1268,6 @@ static void hugetlbfs_free_inode(struct inode *inode) static void hugetlbfs_destroy_inode(struct inode *inode) { hugetlbfs_inc_free_inodes(HUGETLBFS_SB(inode->i_sb)); - mpol_free_shared_policy(&HUGETLBFS_I(inode)->policy); } static const struct address_space_operations hugetlbfs_aops = { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 205469aa0613..158ff156710b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -30,7 +30,7 @@ void free_huge_folio(struct folio *folio); #ifdef CONFIG_HUGETLB_PAGE -#include +#include #include #include @@ -545,7 +545,6 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) } struct hugetlbfs_inode_info { - struct shared_policy policy; struct inode vfs_inode; unsigned int seals; }; -- cgit v1.2.3 From 4b981bc1aa73c204c2aa7f99b5f4f74d03b0e381 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2023 02:16:29 -0700 Subject: kernfs: drop shared NUMA mempolicy hooks It seems strange that kernfs should be an outlier with a set_policy and get_policy in its kernfs_vm_ops. Ah, it dates back to v2.6.30's commit 095160aee954 ("sysfs: fix some bin_vm_ops errors"), when I had crashed on powerpc's pci_mmap_legacy_page_range() fallback to shmem_zero_setup(). Well, that was commendably thorough, to give sysfs-bin a set_policy and get_policy, just to avoid the way it was coded resulting in EINVAL from mmap when CONFIG_NUMA; but somehow feels a bit over-the-top to me now. It's easier to say that nobody should expect to manage a shmem object's shared NUMA mempolicy via some kernfs backdoor to that object: delete that code (and there's no longer an EINVAL from mmap in the NUMA case). This then leaves set_policy/get_policy as implemented only by shmem - though importantly also by SysV SHM, which has to interface with shmem which implements them, and with SHM_HUGETLB which does not. Link: https://lkml.kernel.org/r/302164-a760-4a9e-879b-6870c9b4013@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: "Huang, Ying" Cc: Kefeng Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- fs/kernfs/file.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) (limited to 'fs') diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 180906c36f51..aaa76410e550 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -429,60 +429,11 @@ static int kernfs_vma_access(struct vm_area_struct *vma, unsigned long addr, return ret; } -#ifdef CONFIG_NUMA -static int kernfs_vma_set_policy(struct vm_area_struct *vma, - struct mempolicy *new) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - int ret; - - if (!of->vm_ops) - return 0; - - if (!kernfs_get_active(of->kn)) - return -EINVAL; - - ret = 0; - if (of->vm_ops->set_policy) - ret = of->vm_ops->set_policy(vma, new); - - kernfs_put_active(of->kn); - return ret; -} - -static struct mempolicy *kernfs_vma_get_policy(struct vm_area_struct *vma, - unsigned long addr) -{ - struct file *file = vma->vm_file; - struct kernfs_open_file *of = kernfs_of(file); - struct mempolicy *pol; - - if (!of->vm_ops) - return vma->vm_policy; - - if (!kernfs_get_active(of->kn)) - return vma->vm_policy; - - pol = vma->vm_policy; - if (of->vm_ops->get_policy) - pol = of->vm_ops->get_policy(vma, addr); - - kernfs_put_active(of->kn); - return pol; -} - -#endif - static const struct vm_operations_struct kernfs_vm_ops = { .open = kernfs_vma_open, .fault = kernfs_vma_fault, .page_mkwrite = kernfs_vma_page_mkwrite, .access = kernfs_vma_access, -#ifdef CONFIG_NUMA - .set_policy = kernfs_vma_set_policy, - .get_policy = kernfs_vma_get_policy, -#endif }; static int kernfs_fop_mmap(struct file *file, struct vm_area_struct *vma) -- cgit v1.2.3 From ddc1a5cbc05dc62743a2f409b96faa5cf95ba064 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 19 Oct 2023 13:39:08 -0700 Subject: mempolicy: alloc_pages_mpol() for NUMA policy without vma Shrink shmem's stack usage by eliminating the pseudo-vma from its folio allocation. alloc_pages_mpol(gfp, order, pol, ilx, nid) becomes the principal actor for passing mempolicy choice down to __alloc_pages(), rather than vma_alloc_folio(gfp, order, vma, addr, hugepage). vma_alloc_folio() and alloc_pages() remain, but as wrappers around alloc_pages_mpol(). alloc_pages_bulk_*() untouched, except to provide the additional args to policy_nodemask(), which subsumes policy_node(). Cleanup throughout, cutting out some unhelpful "helpers". It would all be much simpler without MPOL_INTERLEAVE, but that adds a dynamic to the constant mpol: complicated by v3.6 commit 09c231cb8bfd ("tmpfs: distribute interleave better across nodes"), which added ino bias to the interleave, hidden from mm/mempolicy.c until this commit. Hence "ilx" throughout, the "interleave index". Originally I thought it could be done just with nid, but that's wrong: the nodemask may come from the shared policy layer below a shmem vma, or it may come from the task layer above a shmem vma; and without the final nodemask then nodeid cannot be decided. And how ilx is applied depends also on page order. The interleave index is almost always irrelevant unless MPOL_INTERLEAVE: with one exception in alloc_pages_mpol(), where the NO_INTERLEAVE_INDEX passed down from vma-less alloc_pages() is also used as hint not to use THP-style hugepage allocation - to avoid the overhead of a hugepage arg (though I don't understand why we never just added a GFP bit for THP - if it actually needs a different allocation strategy from other pages of the same order). vma_alloc_folio() still carries its hugepage arg here, but it is not used, and should be removed when agreed. get_vma_policy() no longer allows a NULL vma: over time I believe we've eradicated all the places which used to need it e.g. swapoff and madvise used to pass NULL vma to read_swap_cache_async(), but now know the vma. [hughd@google.com: handle NULL mpol being passed to __read_swap_cache_async()] Link: https://lkml.kernel.org/r/ea419956-4751-0102-21f7-9c93cb957892@google.com Link: https://lkml.kernel.org/r/74e34633-6060-f5e3-aee-7040d43f2e93@google.com Link: https://lkml.kernel.org/r/1738368e-bac0-fd11-ed7f-b87142a939fe@google.com Signed-off-by: Hugh Dickins Cc: Andi Kleen Cc: Christoph Lameter Cc: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Huang Ying Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Mel Gorman Cc: Michal Hocko Cc: Mike Kravetz Cc: Nhat Pham Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Tejun heo Cc: Vishal Moola (Oracle) Cc: Yang Shi Cc: Yosry Ahmed Cc: Domenico Cerasuolo Cc: Johannes Weiner Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 5 +- include/linux/gfp.h | 10 +- include/linux/mempolicy.h | 20 ++- include/linux/mm.h | 2 +- ipc/shm.c | 21 +-- mm/mempolicy.c | 379 +++++++++++++++++++--------------------------- mm/shmem.c | 92 ++++++----- mm/swap.h | 9 +- mm/swap_state.c | 86 +++++++---- mm/zswap.c | 7 +- 10 files changed, 308 insertions(+), 323 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 1d994505805b..66ae1c265da3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -2673,8 +2673,9 @@ static int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mempolicy *pol; char buffer[64]; + struct mempolicy *pol; + pgoff_t ilx; int nid; if (!mm) @@ -2683,7 +2684,7 @@ static int show_numa_map(struct seq_file *m, void *v) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - pol = __get_vma_policy(vma, vma->vm_start); + pol = __get_vma_policy(vma, vma->vm_start, &ilx); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); mpol_cond_put(pol); diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 5b917e5b9350..de292a007138 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -8,6 +8,7 @@ #include struct vm_area_struct; +struct mempolicy; /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) @@ -262,7 +263,9 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, #ifdef CONFIG_NUMA struct page *alloc_pages(gfp_t gfp, unsigned int order); -struct folio *folio_alloc(gfp_t gfp, unsigned order); +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid); +struct folio *folio_alloc(gfp_t gfp, unsigned int order); struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, bool hugepage); #else @@ -270,6 +273,11 @@ static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } +static inline struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *mpol, pgoff_t ilx, int nid) +{ + return alloc_pages(gfp, order); +} static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index acdb12fcb6cd..931b118336f4 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -17,6 +17,8 @@ struct mm_struct; +#define NO_INTERLEAVE_INDEX (-1UL) /* use task il_prev for interleaving */ + #ifdef CONFIG_NUMA /* @@ -126,7 +128,9 @@ struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); @@ -140,8 +144,6 @@ extern int huge_node(struct vm_area_struct *vma, extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); -extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -179,6 +181,11 @@ extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); struct mempolicy {}; +static inline struct mempolicy *get_task_policy(struct task_struct *p) +{ + return NULL; +} + static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; @@ -213,6 +220,13 @@ mpol_shared_policy_lookup(struct shared_policy *sp, pgoff_t idx) return NULL; } +static inline struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} + static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 0e496a8b8e50..14d5aaff96d0 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -619,7 +619,7 @@ struct vm_operations_struct { * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, - unsigned long addr); + unsigned long addr, pgoff_t *ilx); #endif /* * Called by vm_normal_page() for special PTEs to find the diff --git a/ipc/shm.c b/ipc/shm.c index 576a543b7cff..222aaf035afb 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -562,30 +562,25 @@ static unsigned long shm_pagesize(struct vm_area_struct *vma) } #ifdef CONFIG_NUMA -static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *new) +static int shm_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); + struct shm_file_data *sfd = shm_file_data(vma->vm_file); int err = 0; if (sfd->vm_ops->set_policy) - err = sfd->vm_ops->set_policy(vma, new); + err = sfd->vm_ops->set_policy(vma, mpol); return err; } static struct mempolicy *shm_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct file *file = vma->vm_file; - struct shm_file_data *sfd = shm_file_data(file); - struct mempolicy *pol = NULL; + struct shm_file_data *sfd = shm_file_data(vma->vm_file); + struct mempolicy *mpol = vma->vm_policy; if (sfd->vm_ops->get_policy) - pol = sfd->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy) - pol = vma->vm_policy; - - return pol; + mpol = sfd->vm_ops->get_policy(vma, addr, ilx); + return mpol; } #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 596d580f92d1..ded5508f0972 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -898,6 +898,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, } if (flags & MPOL_F_ADDR) { + pgoff_t ilx; /* ignored here */ /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We @@ -909,10 +910,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, mmap_read_unlock(mm); return -EFAULT; } - if (vma->vm_ops && vma->vm_ops->get_policy) - pol = vma->vm_ops->get_policy(vma, addr); - else - pol = vma->vm_policy; + pol = __get_vma_policy(vma, addr, &ilx); } else if (addr) return -EINVAL; @@ -1170,6 +1168,15 @@ static struct folio *new_folio(struct folio *src, unsigned long start) break; } + /* + * __get_vma_policy() now expects a genuine non-NULL vma. Return NULL + * when the page can no longer be located in a vma: that is not ideal + * (migrate_pages() will give up early, presuming ENOMEM), but good + * enough to avoid a crash by syzkaller or concurrent holepunch. + */ + if (!vma) + return NULL; + if (folio_test_hugetlb(src)) { return alloc_hugetlb_folio_vma(folio_hstate(src), vma, address); @@ -1178,9 +1185,6 @@ static struct folio *new_folio(struct folio *src, unsigned long start) if (folio_test_large(src)) gfp = GFP_TRANSHUGE; - /* - * if !vma, vma_alloc_folio() will use task or system default policy - */ return vma_alloc_folio(gfp, folio_order(src), vma, address, folio_test_large(src)); } @@ -1690,34 +1694,19 @@ bool vma_migratable(struct vm_area_struct *vma) } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { - struct mempolicy *pol = NULL; - - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - pol = vma->vm_ops->get_policy(vma, addr); - } else if (vma->vm_policy) { - pol = vma->vm_policy; - - /* - * shmem_alloc_page() passes MPOL_F_SHARED policy with - * a pseudo vma whose vma->vm_ops=NULL. Take a reference - * count on these policies which will be dropped by - * mpol_cond_put() later - */ - if (mpol_needs_cond_ref(pol)) - mpol_get(pol); - } - } - - return pol; + *ilx = 0; + return (vma->vm_ops && vma->vm_ops->get_policy) ? + vma->vm_ops->get_policy(vma, addr, ilx) : vma->vm_policy; } /* - * get_vma_policy(@vma, @addr) + * get_vma_policy(@vma, @addr, @order, @ilx) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup + * @order: 0, or appropriate huge_page_order for interleaving + * @ilx: interleave index (output), for use only when MPOL_INTERLEAVE * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. @@ -1726,14 +1715,18 @@ struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ -static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, - unsigned long addr) +struct mempolicy *get_vma_policy(struct vm_area_struct *vma, + unsigned long addr, int order, pgoff_t *ilx) { - struct mempolicy *pol = __get_vma_policy(vma, addr); + struct mempolicy *pol; + pol = __get_vma_policy(vma, addr, ilx); if (!pol) pol = get_task_policy(current); - + if (pol->mode == MPOL_INTERLEAVE) { + *ilx += vma->vm_pgoff >> order; + *ilx += (addr - vma->vm_start) >> (PAGE_SHIFT + order); + } return pol; } @@ -1743,8 +1736,9 @@ bool vma_policy_mof(struct vm_area_struct *vma) if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; + pgoff_t ilx; /* ignored here */ - pol = vma->vm_ops->get_policy(vma, vma->vm_start); + pol = vma->vm_ops->get_policy(vma, vma->vm_start, &ilx); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); @@ -1779,54 +1773,6 @@ bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) return zone >= dynamic_policy_zone; } -/* - * Return a nodemask representing a mempolicy for filtering nodes for - * page allocation - */ -nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) -{ - int mode = policy->mode; - - /* Lower zones don't get a nodemask applied for MPOL_BIND */ - if (unlikely(mode == MPOL_BIND) && - apply_policy_zone(policy, gfp_zone(gfp)) && - cpuset_nodemask_valid_mems_allowed(&policy->nodes)) - return &policy->nodes; - - if (mode == MPOL_PREFERRED_MANY) - return &policy->nodes; - - return NULL; -} - -/* - * Return the preferred node id for 'prefer' mempolicy, and return - * the given id for all other policies. - * - * policy_node() is always coupled with policy_nodemask(), which - * secures the nodemask limit for 'bind' and 'prefer-many' policy. - */ -static int policy_node(gfp_t gfp, struct mempolicy *policy, int nid) -{ - if (policy->mode == MPOL_PREFERRED) { - nid = first_node(policy->nodes); - } else { - /* - * __GFP_THISNODE shouldn't even be used with the bind policy - * because we might easily break the expectation to stay on the - * requested node and not break the policy. - */ - WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); - } - - if ((policy->mode == MPOL_BIND || - policy->mode == MPOL_PREFERRED_MANY) && - policy->home_node != NUMA_NO_NODE) - return policy->home_node; - - return nid; -} - /* Do dynamic interleaving for a process */ static unsigned int interleave_nodes(struct mempolicy *policy) { @@ -1886,11 +1832,11 @@ unsigned int mempolicy_slab_node(void) } /* - * Do static interleaving for a VMA with known offset @n. Returns the n'th - * node in pol->nodes (starting from n=0), wrapping around if n exceeds the - * number of present nodes. + * Do static interleaving for interleave index @ilx. Returns the ilx'th + * node in pol->nodes (starting from ilx=0), wrapping around if ilx + * exceeds the number of present nodes. */ -static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) +static unsigned int interleave_nid(struct mempolicy *pol, pgoff_t ilx) { nodemask_t nodemask = pol->nodes; unsigned int target, nnodes; @@ -1908,33 +1854,54 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) nnodes = nodes_weight(nodemask); if (!nnodes) return numa_node_id(); - target = (unsigned int)n % nnodes; + target = ilx % nnodes; nid = first_node(nodemask); for (i = 0; i < target; i++) nid = next_node(nid, nodemask); return nid; } -/* Determine a node number for interleave */ -static inline unsigned interleave_nid(struct mempolicy *pol, - struct vm_area_struct *vma, unsigned long addr, int shift) +/* + * Return a nodemask representing a mempolicy for filtering nodes for + * page allocation, together with preferred node id (or the input node id). + */ +static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *pol, + pgoff_t ilx, int *nid) { - if (vma) { - unsigned long off; + nodemask_t *nodemask = NULL; + switch (pol->mode) { + case MPOL_PREFERRED: + /* Override input node id */ + *nid = first_node(pol->nodes); + break; + case MPOL_PREFERRED_MANY: + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; + break; + case MPOL_BIND: + /* Restrict to nodemask (but not on lower zones) */ + if (apply_policy_zone(pol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&pol->nodes)) + nodemask = &pol->nodes; + if (pol->home_node != NUMA_NO_NODE) + *nid = pol->home_node; /* - * for small pages, there is no difference between - * shift and PAGE_SHIFT, so the bit-shift is safe. - * for huge pages, since vm_pgoff is in units of small - * pages, we need to shift off the always 0 bits to get - * a useful offset. + * __GFP_THISNODE shouldn't even be used with the bind policy + * because we might easily break the expectation to stay on the + * requested node and not break the policy. */ - BUG_ON(shift < PAGE_SHIFT); - off = vma->vm_pgoff >> (shift - PAGE_SHIFT); - off += (addr - vma->vm_start) >> shift; - return offset_il_node(pol, off); - } else - return interleave_nodes(pol); + WARN_ON_ONCE(gfp & __GFP_THISNODE); + break; + case MPOL_INTERLEAVE: + /* Override input node id */ + *nid = (ilx == NO_INTERLEAVE_INDEX) ? + interleave_nodes(pol) : interleave_nid(pol, ilx); + break; + } + + return nodemask; } #ifdef CONFIG_HUGETLBFS @@ -1950,27 +1917,16 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'bind' or 'prefer-many', returns a pointer * to the mempolicy's @nodemask for filtering the zonelist. - * - * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, - struct mempolicy **mpol, nodemask_t **nodemask) + struct mempolicy **mpol, nodemask_t **nodemask) { + pgoff_t ilx; int nid; - int mode; - *mpol = get_vma_policy(vma, addr); - *nodemask = NULL; - mode = (*mpol)->mode; - - if (unlikely(mode == MPOL_INTERLEAVE)) { - nid = interleave_nid(*mpol, vma, addr, - huge_page_shift(hstate_vma(vma))); - } else { - nid = policy_node(gfp_flags, *mpol, numa_node_id()); - if (mode == MPOL_BIND || mode == MPOL_PREFERRED_MANY) - *nodemask = &(*mpol)->nodes; - } + nid = numa_node_id(); + *mpol = get_vma_policy(vma, addr, hstate_vma(vma)->order, &ilx); + *nodemask = policy_nodemask(gfp_flags, *mpol, ilx, &nid); return nid; } @@ -2048,27 +2004,8 @@ bool mempolicy_in_oom_domain(struct task_struct *tsk, return ret; } -/* Allocate a page in interleaved policy. - Own path because it needs to do special accounting. */ -static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, - unsigned nid) -{ - struct page *page; - - page = __alloc_pages(gfp, order, nid, NULL); - /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ - if (!static_branch_likely(&vm_numa_stat_key)) - return page; - if (page && page_to_nid(page) == nid) { - preempt_disable(); - __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); - preempt_enable(); - } - return page; -} - static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, - int nid, struct mempolicy *pol) + int nid, nodemask_t *nodemask) { struct page *page; gfp_t preferred_gfp; @@ -2081,7 +2018,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, */ preferred_gfp = gfp | __GFP_NOWARN; preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); + page = __alloc_pages(preferred_gfp, order, nid, nodemask); if (!page) page = __alloc_pages(gfp, order, nid, NULL); @@ -2089,55 +2026,29 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, } /** - * vma_alloc_folio - Allocate a folio for a VMA. + * alloc_pages_mpol - Allocate pages according to NUMA mempolicy. * @gfp: GFP flags. - * @order: Order of the folio. - * @vma: Pointer to VMA or NULL if not available. - * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: For hugepages try only the preferred node if possible. - * - * Allocate a folio for a specific address in @vma, using the appropriate - * NUMA policy. When @vma is not NULL the caller must hold the mmap_lock - * of the mm_struct of the VMA to prevent it from going away. Should be - * used for all allocations for folios that will be mapped into user space. + * @order: Order of the page allocation. + * @pol: Pointer to the NUMA mempolicy. + * @ilx: Index for interleave mempolicy (also distinguishes alloc_pages()). + * @nid: Preferred node (usually numa_node_id() but @mpol may override it). * - * Return: The folio on success or NULL if allocation fails. + * Return: The page on success or NULL if allocation fails. */ -struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) +struct page *alloc_pages_mpol(gfp_t gfp, unsigned int order, + struct mempolicy *pol, pgoff_t ilx, int nid) { - struct mempolicy *pol; - int node = numa_node_id(); - struct folio *folio; - int preferred_nid; - nodemask_t *nmask; - - pol = get_vma_policy(vma, addr); - - if (pol->mode == MPOL_INTERLEAVE) { - struct page *page; - unsigned nid; - - nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); - mpol_cond_put(pol); - gfp |= __GFP_COMP; - page = alloc_page_interleave(gfp, order, nid); - return page_rmappable_folio(page); - } - - if (pol->mode == MPOL_PREFERRED_MANY) { - struct page *page; + nodemask_t *nodemask; + struct page *page; - node = policy_node(gfp, pol, node); - gfp |= __GFP_COMP; - page = alloc_pages_preferred_many(gfp, order, node, pol); - mpol_cond_put(pol); - return page_rmappable_folio(page); - } + nodemask = policy_nodemask(gfp, pol, ilx, &nid); - if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { - int hpage_node = node; + if (pol->mode == MPOL_PREFERRED_MANY) + return alloc_pages_preferred_many(gfp, order, nid, nodemask); + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + /* filter "hugepage" allocation, unless from alloc_pages() */ + order == HPAGE_PMD_ORDER && ilx != NO_INTERLEAVE_INDEX) { /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred @@ -2148,39 +2059,68 @@ struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, * If the policy is interleave or does not allow the current * node in its nodemask, we allocate the standard way. */ - if (pol->mode == MPOL_PREFERRED) - hpage_node = first_node(pol->nodes); - - nmask = policy_nodemask(gfp, pol); - if (!nmask || node_isset(hpage_node, *nmask)) { - mpol_cond_put(pol); + if (pol->mode != MPOL_INTERLEAVE && + (!nodemask || node_isset(nid, *nodemask))) { /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ - folio = __folio_alloc_node(gfp | __GFP_THISNODE | - __GFP_NORETRY, order, hpage_node); - + page = __alloc_pages_node(nid, + gfp | __GFP_THISNODE | __GFP_NORETRY, order); + if (page || !(gfp & __GFP_DIRECT_RECLAIM)) + return page; /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ - if (!folio && (gfp & __GFP_DIRECT_RECLAIM)) - folio = __folio_alloc(gfp, order, hpage_node, - nmask); + } + } - goto out; + page = __alloc_pages(gfp, order, nid, nodemask); + + if (unlikely(pol->mode == MPOL_INTERLEAVE) && page) { + /* skip NUMA_INTERLEAVE_HIT update if numa stats is disabled */ + if (static_branch_likely(&vm_numa_stat_key) && + page_to_nid(page) == nid) { + preempt_disable(); + __count_numa_event(page_zone(page), NUMA_INTERLEAVE_HIT); + preempt_enable(); } } - nmask = policy_nodemask(gfp, pol); - preferred_nid = policy_node(gfp, pol, node); - folio = __folio_alloc(gfp, order, preferred_nid, nmask); + return page; +} + +/** + * vma_alloc_folio - Allocate a folio for a VMA. + * @gfp: GFP flags. + * @order: Order of the folio. + * @vma: Pointer to VMA. + * @addr: Virtual address of the allocation. Must be inside @vma. + * @hugepage: Unused (was: For hugepages try only preferred node if possible). + * + * Allocate a folio for a specific address in @vma, using the appropriate + * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the + * VMA to prevent it from going away. Should be used for all allocations + * for folios that will be mapped into user space, excepting hugetlbfs, and + * excepting where direct use of alloc_pages_mpol() is more appropriate. + * + * Return: The folio on success or NULL if allocation fails. + */ +struct folio *vma_alloc_folio(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr, bool hugepage) +{ + struct mempolicy *pol; + pgoff_t ilx; + struct page *page; + + pol = get_vma_policy(vma, addr, order, &ilx); + page = alloc_pages_mpol(gfp | __GFP_COMP, order, + pol, ilx, numa_node_id()); mpol_cond_put(pol); -out: - return folio; + return page_rmappable_folio(page); } EXPORT_SYMBOL(vma_alloc_folio); @@ -2198,33 +2138,23 @@ EXPORT_SYMBOL(vma_alloc_folio); * flags are used. * Return: The page on success or NULL if allocation fails. */ -struct page *alloc_pages(gfp_t gfp, unsigned order) +struct page *alloc_pages(gfp_t gfp, unsigned int order) { struct mempolicy *pol = &default_policy; - struct page *page; - - if (!in_interrupt() && !(gfp & __GFP_THISNODE)) - pol = get_task_policy(current); /* * No reference counting needed for current->mempolicy * nor system default_policy */ - if (pol->mode == MPOL_INTERLEAVE) - page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); - else if (pol->mode == MPOL_PREFERRED_MANY) - page = alloc_pages_preferred_many(gfp, order, - policy_node(gfp, pol, numa_node_id()), pol); - else - page = __alloc_pages(gfp, order, - policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol)); + if (!in_interrupt() && !(gfp & __GFP_THISNODE)) + pol = get_task_policy(current); - return page; + return alloc_pages_mpol(gfp, order, + pol, NO_INTERLEAVE_INDEX, numa_node_id()); } EXPORT_SYMBOL(alloc_pages); -struct folio *folio_alloc(gfp_t gfp, unsigned order) +struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return page_rmappable_folio(alloc_pages(gfp | __GFP_COMP, order)); } @@ -2295,6 +2225,8 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, unsigned long nr_pages, struct page **page_array) { struct mempolicy *pol = &default_policy; + nodemask_t *nodemask; + int nid; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); @@ -2307,9 +2239,10 @@ unsigned long alloc_pages_bulk_array_mempolicy(gfp_t gfp, return alloc_pages_bulk_array_preferred_many(gfp, numa_node_id(), pol, nr_pages, page_array); - return __alloc_pages_bulk(gfp, policy_node(gfp, pol, numa_node_id()), - policy_nodemask(gfp, pol), nr_pages, NULL, - page_array); + nid = numa_node_id(); + nodemask = policy_nodemask(gfp, pol, NO_INTERLEAVE_INDEX, &nid); + return __alloc_pages_bulk(gfp, nid, nodemask, + nr_pages, NULL, page_array); } int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) @@ -2496,23 +2429,21 @@ int mpol_misplaced(struct folio *folio, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; + pgoff_t ilx; struct zoneref *z; int curnid = folio_nid(folio); - unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = NUMA_NO_NODE; - pol = get_vma_policy(vma, addr); + pol = get_vma_policy(vma, addr, folio_order(folio), &ilx); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: - pgoff = vma->vm_pgoff; - pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; - polnid = offset_il_node(pol, pgoff); + polnid = interleave_nid(pol, ilx); break; case MPOL_PREFERRED: diff --git a/mm/shmem.c b/mm/shmem.c index bcbe9dbb9f5f..a314a25aea8c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1544,38 +1544,20 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) return NULL; } #endif /* CONFIG_NUMA && CONFIG_TMPFS */ -#ifndef CONFIG_NUMA -#define vm_policy vm_private_data -#endif - -static void shmem_pseudo_vma_init(struct vm_area_struct *vma, - struct shmem_inode_info *info, pgoff_t index) -{ - /* Create a pseudo vma that just contains the policy */ - vma_init(vma, NULL); - /* Bias interleave by inode number to distribute better across nodes */ - vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); -} -static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) -{ - /* Drop reference taken by mpol_shared_policy_lookup() */ - mpol_cond_put(vma->vm_policy); -} +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx); -static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin_cluster(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; + struct mempolicy *mpol; + pgoff_t ilx; struct page *page; - struct vm_fault vmf = { - .vma = &pvma, - }; - shmem_pseudo_vma_init(&pvma, info, index); - page = swap_cluster_readahead(swap, gfp, &vmf); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = swap_cluster_readahead(swap, gfp, mpol, ilx); + mpol_cond_put(mpol); if (!page) return NULL; @@ -1609,27 +1591,29 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) static struct folio *shmem_alloc_hugefolio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, HPAGE_PMD_ORDER, &ilx); + page = alloc_pages_mpol(gfp, HPAGE_PMD_ORDER, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return page_rmappable_folio(page); } static struct folio *shmem_alloc_folio(gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct vm_area_struct pvma; - struct folio *folio; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - shmem_pseudo_vma_init(&pvma, info, index); - folio = vma_alloc_folio(gfp, 0, &pvma, 0, false); - shmem_pseudo_vma_destroy(&pvma); + mpol = shmem_get_pgoff_policy(info, index, 0, &ilx); + page = alloc_pages_mpol(gfp, 0, mpol, ilx, numa_node_id()); + mpol_cond_put(mpol); - return folio; + return (struct folio *)page; } static struct folio *shmem_alloc_and_add_folio(gfp_t gfp, @@ -1883,7 +1867,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(fault_mm, PGMAJFAULT); } /* Here we actually start the io */ - folio = shmem_swapin(swap, gfp, info, index); + folio = shmem_swapin_cluster(swap, gfp, info, index); if (!folio) { error = -ENOMEM; goto failed; @@ -2334,15 +2318,41 @@ static int shmem_set_policy(struct vm_area_struct *vma, struct mempolicy *mpol) } static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, - unsigned long addr) + unsigned long addr, pgoff_t *ilx) { struct inode *inode = file_inode(vma->vm_file); pgoff_t index; + /* + * Bias interleave by inode number to distribute better across nodes; + * but this interface is independent of which page order is used, so + * supplies only that bias, letting caller apply the offset (adjusted + * by page order, as in shmem_get_pgoff_policy() and get_vma_policy()). + */ + *ilx = inode->i_ino; index = ((addr - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; return mpol_shared_policy_lookup(&SHMEM_I(inode)->policy, index); } -#endif + +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + struct mempolicy *mpol; + + /* Bias interleave by inode number to distribute better across nodes */ + *ilx = info->vfs_inode.i_ino + (index >> order); + + mpol = mpol_shared_policy_lookup(&info->policy, index); + return mpol ? mpol : get_task_policy(current); +} +#else +static struct mempolicy *shmem_get_pgoff_policy(struct shmem_inode_info *info, + pgoff_t index, unsigned int order, pgoff_t *ilx) +{ + *ilx = 0; + return NULL; +} +#endif /* CONFIG_NUMA */ int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { diff --git a/mm/swap.h b/mm/swap.h index 8a3c7a0ace4f..73c332ee4d91 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -2,6 +2,8 @@ #ifndef _MM_SWAP_H #define _MM_SWAP_H +struct mempolicy; + #ifdef CONFIG_SWAP #include /* for bio_end_io_t */ @@ -48,11 +50,10 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, unsigned long addr, struct swap_iocb **plug); struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, - unsigned long addr, + struct mempolicy *mpol, pgoff_t ilx, bool *new_page_allocated); struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag, - struct vm_fault *vmf); + struct mempolicy *mpol, pgoff_t ilx); struct page *swapin_readahead(swp_entry_t entry, gfp_t flag, struct vm_fault *vmf); @@ -80,7 +81,7 @@ static inline void show_swap_cache_info(void) } static inline struct page *swap_cluster_readahead(swp_entry_t entry, - gfp_t gfp_mask, struct vm_fault *vmf) + gfp_t gfp_mask, struct mempolicy *mpol, pgoff_t ilx) { return NULL; } diff --git a/mm/swap_state.c b/mm/swap_state.c index ab79ffb71736..85d9e5806a6a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -410,8 +411,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping, } struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, - struct vm_area_struct *vma, unsigned long addr, - bool *new_page_allocated) + struct mempolicy *mpol, pgoff_t ilx, + bool *new_page_allocated) { struct swap_info_struct *si; struct folio *folio; @@ -453,7 +454,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + folio = (struct folio *)alloc_pages_mpol(gfp_mask, 0, + mpol, ilx, numa_node_id()); if (!folio) goto fail_put_swap; @@ -528,14 +530,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, struct vm_area_struct *vma, unsigned long addr, struct swap_iocb **plug) { - bool page_was_allocated; - struct page *retpage = __read_swap_cache_async(entry, gfp_mask, - vma, addr, &page_was_allocated); + bool page_allocated; + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; - if (page_was_allocated) - swap_readpage(retpage, false, plug); + mpol = get_vma_policy(vma, addr, 0, &ilx); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + mpol_cond_put(mpol); - return retpage; + if (page_allocated) + swap_readpage(page, false, plug); + return page; } static unsigned int __swapin_nr_pages(unsigned long prev_offset, @@ -603,7 +610,8 @@ static unsigned long swapin_nr_pages(unsigned long offset) * swap_cluster_readahead - swap in pages in hope we need them soon * @entry: swap entry of this memory * @gfp_mask: memory allocation flags - * @vmf: fault information + * @mpol: NUMA memory allocation policy to be applied + * @ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * * Returns the struct page for entry and addr, after queueing swapin. * @@ -612,13 +620,12 @@ static unsigned long swapin_nr_pages(unsigned long offset) * because it doesn't cost us any seek time. We also make sure to queue * the 'original' request together with the readahead ones... * - * This has been extended to use the NUMA policies from the mm triggering - * the readahead. - * - * Caller must hold read mmap_lock if vmf->vma is not NULL. + * Note: it is intentional that the same NUMA policy and interleave index + * are used for every page of the readahead: neighbouring pages on swap + * are fairly likely to have been swapped out from the same node. */ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, - struct vm_fault *vmf) + struct mempolicy *mpol, pgoff_t ilx) { struct page *page; unsigned long entry_offset = swp_offset(entry); @@ -629,8 +636,6 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, struct blk_plug plug; struct swap_iocb *splug = NULL; bool page_allocated; - struct vm_area_struct *vma = vmf->vma; - unsigned long addr = vmf->address; mask = swapin_nr_pages(offset) - 1; if (!mask) @@ -648,8 +653,8 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = __read_swap_cache_async( - swp_entry(swp_type(entry), offset), - gfp_mask, vma, addr, &page_allocated); + swp_entry(swp_type(entry), offset), + gfp_mask, mpol, ilx, &page_allocated); if (!page) continue; if (page_allocated) { @@ -663,11 +668,14 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask, } blk_finish_plug(&plug); swap_read_unplug(splug); - lru_add_drain(); /* Push any new pages onto the LRU now */ skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(entry, gfp_mask, vma, addr, NULL); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } int init_swap_address_space(unsigned int type, unsigned long nr_pages) @@ -765,8 +773,10 @@ static void swap_ra_info(struct vm_fault *vmf, /** * swap_vma_readahead - swap in pages in hope we need them soon - * @fentry: swap entry of this memory + * @targ_entry: swap entry of the targeted memory * @gfp_mask: memory allocation flags + * @mpol: NUMA memory allocation policy to be applied + * @targ_ilx: NUMA interleave index, for use only when MPOL_INTERLEAVE * @vmf: fault information * * Returns the struct page for entry and addr, after queueing swapin. @@ -777,16 +787,17 @@ static void swap_ra_info(struct vm_fault *vmf, * Caller must hold read mmap_lock if vmf->vma is not NULL. * */ -static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, +static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask, + struct mempolicy *mpol, pgoff_t targ_ilx, struct vm_fault *vmf) { struct blk_plug plug; struct swap_iocb *splug = NULL; - struct vm_area_struct *vma = vmf->vma; struct page *page; pte_t *pte = NULL, pentry; unsigned long addr; swp_entry_t entry; + pgoff_t ilx; unsigned int i; bool page_allocated; struct vma_swap_readahead ra_info = { @@ -798,9 +809,10 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, goto skip; addr = vmf->address - (ra_info.offset * PAGE_SIZE); + ilx = targ_ilx - ra_info.offset; blk_start_plug(&plug); - for (i = 0; i < ra_info.nr_pte; i++, addr += PAGE_SIZE) { + for (i = 0; i < ra_info.nr_pte; i++, ilx++, addr += PAGE_SIZE) { if (!pte++) { pte = pte_offset_map(vmf->pmd, addr); if (!pte) @@ -814,8 +826,8 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, continue; pte_unmap(pte); pte = NULL; - page = __read_swap_cache_async(entry, gfp_mask, vma, - addr, &page_allocated); + page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx, + &page_allocated); if (!page) continue; if (page_allocated) { @@ -834,8 +846,11 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask, lru_add_drain(); skip: /* The page was likely read above, so no need for plugging here */ - return read_swap_cache_async(fentry, gfp_mask, vma, vmf->address, - NULL); + page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx, + &page_allocated); + if (unlikely(page_allocated)) + swap_readpage(page, false, NULL); + return page; } /** @@ -853,9 +868,16 @@ skip: struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, struct vm_fault *vmf) { - return swap_use_vma_readahead() ? - swap_vma_readahead(entry, gfp_mask, vmf) : - swap_cluster_readahead(entry, gfp_mask, vmf); + struct mempolicy *mpol; + pgoff_t ilx; + struct page *page; + + mpol = get_vma_policy(vmf->vma, vmf->address, 0, &ilx); + page = swap_use_vma_readahead() ? + swap_vma_readahead(entry, gfp_mask, mpol, ilx, vmf) : + swap_cluster_readahead(entry, gfp_mask, mpol, ilx); + mpol_cond_put(mpol); + return page; } #ifdef CONFIG_SYSFS diff --git a/mm/zswap.c b/mm/zswap.c index 37d2b1cb2ecb..060857adca76 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -1057,6 +1058,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, { swp_entry_t swpentry = entry->swpentry; struct page *page; + struct mempolicy *mpol; struct scatterlist input, output; struct crypto_acomp_ctx *acomp_ctx; struct zpool *pool = zswap_find_zpool(entry); @@ -1075,8 +1077,9 @@ static int zswap_writeback_entry(struct zswap_entry *entry, } /* try to allocate swap cache page */ - page = __read_swap_cache_async(swpentry, GFP_KERNEL, NULL, 0, - &page_was_allocated); + mpol = get_task_policy(current); + page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, + NO_INTERLEAVE_INDEX, &page_was_allocated); if (!page) { ret = -ENOMEM; goto fail; -- cgit v1.2.3