From 606c812eb1d5b5fb0dd9e330ca94b52d7c227830 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Sat, 17 Jun 2023 20:47:08 -0400 Subject: mm/mmap: Fix error path in do_vmi_align_munmap() The error unrolling was leaving the VMAs detached in many cases and leaving the locked_vm statistic altered, and skipping the unrolling entirely in the case of the vma tree write failing. Fix the error path by re-attaching the detached VMAs and adding the necessary goto for the failed vma tree write, and fix the locked_vm statistic by only updating after the vma tree write succeeds. Fixes: 763ecb035029 ("mm: remove the vma linked list") Reported-by: Vegard Nossum Signed-off-by: Liam R. Howlett Signed-off-by: Linus Torvalds --- mm/mmap.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 13678edaa22c..d600404580b2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2318,21 +2318,6 @@ int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma, return __split_vma(vmi, vma, addr, new_below); } -static inline int munmap_sidetree(struct vm_area_struct *vma, - struct ma_state *mas_detach) -{ - vma_start_write(vma); - mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); - if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) - return -ENOMEM; - - vma_mark_detached(vma, true); - if (vma->vm_flags & VM_LOCKED) - vma->vm_mm->locked_vm -= vma_pages(vma); - - return 0; -} - /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2354,6 +2339,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct maple_tree mt_detach; int count = 0; int error = -ENOMEM; + unsigned long locked_vm = 0; MA_STATE(mas_detach, &mt_detach, 0, 0); mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); mt_set_external_lock(&mt_detach, &mm->mmap_lock); @@ -2399,9 +2385,13 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, if (error) goto end_split_failed; } - error = munmap_sidetree(next, &mas_detach); - if (error) - goto munmap_sidetree_failed; + vma_start_write(next); + mas_set_range(&mas_detach, next->vm_start, next->vm_end - 1); + if (mas_store_gfp(&mas_detach, next, GFP_KERNEL)) + goto munmap_gather_failed; + vma_mark_detached(next, true); + if (next->vm_flags & VM_LOCKED) + locked_vm += vma_pages(next); count++; #ifdef CONFIG_DEBUG_VM_MAPLE_TREE @@ -2447,10 +2437,12 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, } #endif /* Point of no return */ + error = -ENOMEM; vma_iter_set(vmi, start); if (vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL)) - return -ENOMEM; + goto clear_tree_failed; + mm->locked_vm -= locked_vm; mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or @@ -2480,9 +2472,14 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, validate_mm(mm); return downgrade ? 1 : 0; +clear_tree_failed: userfaultfd_error: -munmap_sidetree_failed: +munmap_gather_failed: end_split_failed: + mas_set(&mas_detach, 0); + mas_for_each(&mas_detach, next, end) + vma_mark_detached(next, false); + __mt_destroy(&mt_detach); start_split_failed: map_count_exceeded: -- cgit v1.2.3 From 77795f900e2a07c1cbedc375789aefb43843b6c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Jun 2023 14:29:12 -0400 Subject: mm/mprotect: fix do_mprotect_pkey() limit check The return of do_mprotect_pkey() can still be incorrectly returned as success if there is a gap that spans to or beyond the end address passed in. Update the check to ensure that the end address has indeed been seen. Link: https://lore.kernel.org/all/CABi2SkXjN+5iFoBhxk71t3cmunTk-s=rB4T7qo0UQRh17s49PQ@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230606182912.586576-1-Liam.Howlett@oracle.com Fixes: 82f951340f25 ("mm/mprotect: fix do_mprotect_pkey() return on error") Signed-off-by: Liam R. Howlett Reported-by: Jeff Xu Reviewed-by: Lorenzo Stoakes Acked-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Signed-off-by: Andrew Morton --- mm/mprotect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index 92d3d3ca390a..c59e7561698c 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -867,7 +867,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, } tlb_finish_mmu(&tlb); - if (!error && vma_iter_end(&vmi) < end) + if (!error && tmp < end) error = -ENOMEM; out: -- cgit v1.2.3 From 95a301eefa82057571207edd06ea36218985a75e Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 5 Jun 2023 21:11:07 +0100 Subject: mm/vmalloc: do not output a spurious warning when huge vmalloc() fails In __vmalloc_area_node() we always warn_alloc() when an allocation performed by vm_area_alloc_pages() fails unless it was due to a pending fatal signal. However, huge page allocations instigated either by vmalloc_huge() or __vmalloc_node_range() (or a caller that invokes this like kvmalloc() or kvmalloc_node()) always falls back to order-0 allocations if the huge page allocation fails. This renders the warning useless and noisy, especially as all callers appear to be aware that this may fallback. This has already resulted in at least one bug report from a user who was confused by this (see link). Therefore, simply update the code to only output this warning for order-0 pages when no fatal signal is pending. Link: https://bugzilla.suse.com/show_bug.cgi?id=1211410 Link: https://lkml.kernel.org/r/20230605201107.83298-1-lstoakes@gmail.com Fixes: 80b1d8fdfad1 ("mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node()") Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Baoquan He Acked-by: Michal Hocko Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: David Hildenbrand Cc: Christoph Hellwig Cc: Signed-off-by: Andrew Morton --- mm/vmalloc.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 9683573f1225..1d13d71687d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3098,11 +3098,20 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, * allocation request, free them via vfree() if any. */ if (area->nr_pages != nr_small_pages) { - /* vm_area_alloc_pages() can also fail due to a fatal signal */ - if (!fatal_signal_pending(current)) + /* + * vm_area_alloc_pages() can fail due to insufficient memory but + * also:- + * + * - a pending fatal signal + * - insufficient huge page-order pages + * + * Since we always retry allocations at order-0 in the huge page + * case a warning for either is spurious. + */ + if (!fatal_signal_pending(current) && page_order == 0) warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, page order %u, failed to allocate pages", - area->nr_pages * PAGE_SIZE, page_order); + "vmalloc error: size %lu, failed to allocate pages", + area->nr_pages * PAGE_SIZE); goto fail; } -- cgit v1.2.3 From 935d44acf621aa0688fef8312dec3e5940f38f4e Mon Sep 17 00:00:00 2001 From: Roberto Sassu Date: Wed, 7 Jun 2023 15:24:27 +0200 Subject: memfd: check for non-NULL file_seals in memfd_create() syscall Ensure that file_seals is non-NULL before using it in the memfd_create() syscall. One situation in which memfd_file_seals_ptr() could return a NULL pointer when CONFIG_SHMEM=n, oopsing the kernel. Link: https://lkml.kernel.org/r/20230607132427.2867435-1-roberto.sassu@huaweicloud.com Fixes: 47b9012ecdc7 ("shmem: add sealing support to hugetlb-backed memfd") Signed-off-by: Roberto Sassu Cc: Marc-Andr Lureau Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/memfd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memfd.c b/mm/memfd.c index 69b90c31d38c..e763e76f1106 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -371,12 +371,15 @@ SYSCALL_DEFINE2(memfd_create, inode->i_mode &= ~0111; file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - *file_seals |= F_SEAL_EXEC; + if (file_seals) { + *file_seals &= ~F_SEAL_SEAL; + *file_seals |= F_SEAL_EXEC; + } } else if (flags & MFD_ALLOW_SEALING) { /* MFD_EXEC and MFD_ALLOW_SEALING are set */ file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; + if (file_seals) + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); -- cgit v1.2.3 From c8a8f3b4a95ace7683b615ad9c9aa0eac59013ae Mon Sep 17 00:00:00 2001 From: David Stevens Date: Wed, 7 Jun 2023 14:31:35 +0900 Subject: mm/khugepaged: fix iteration in collapse_file Remove an unnecessary call to xas_set(index) when iterating over the target range in collapse_file. The extra call to xas_set reset the xas cursor to the top of the tree, causing the xas_next call on the next iteration to walk the tree to index instead of advancing to index+1. This returned the same page again, which would cause collapse_file to fail because the page is already locked. This bug was hidden when CONFIG_DEBUG_VM was set. When that config was used, the xas_load in a subsequent VM_BUG_ON assert would walk xas from the top of the tree to index, causing the xas_next call on the next loop iteration to advance the cursor as expected. Link: https://lkml.kernel.org/r/20230607053135.2087354-1-stevensd@google.com Fixes: a2e17cc2efc7 ("mm/khugepaged: maintain page cache uptodate flag") Signed-off-by: David Stevens Reviewed-by: Peter Xu Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jiaqi Yan Cc: Kirill A . Shutemov Cc: Matthew Wilcox Cc: Yang Shi Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/khugepaged.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 6b9d39d65b73..2d0d58fb4e7f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2070,7 +2070,6 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr, TTU_IGNORE_MLOCK | TTU_BATCH_FLUSH); xas_lock_irq(&xas); - xas_set(&xas, index); VM_BUG_ON_PAGE(page != xas_load(&xas), page); -- cgit v1.2.3 From 47a7c01c3efc6581f5dcca40928baeb38e1e40c2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:12 +0000 Subject: Revert "mm: shrinkers: convert shrinker_rwsem to mutex" Patch series "revert shrinker_srcu related changes". This patch (of 7): This reverts commit cf2e309ebca7bb0916771839f9b580b06c778530. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_mutex back to shrinker_rwsem first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-1-qi.zheng@linux.dev Link: https://lkml.kernel.org/r/20230609081518.3039120-2-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Cc: Yujie Liu Signed-off-by: Andrew Morton --- drivers/md/dm-cache-metadata.c | 2 +- drivers/md/dm-thin-metadata.c | 2 +- fs/super.c | 2 +- mm/shrinker_debug.c | 14 +++++++------- mm/vmscan.c | 34 +++++++++++++++++----------------- 5 files changed, 27 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/drivers/md/dm-cache-metadata.c b/drivers/md/dm-cache-metadata.c index 9e0c69958587..acffed750e3e 100644 --- a/drivers/md/dm-cache-metadata.c +++ b/drivers/md/dm-cache-metadata.c @@ -1828,7 +1828,7 @@ int dm_cache_metadata_abort(struct dm_cache_metadata *cmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * cmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs cmd root_lock). - * - must take shrinker_mutex without holding cmd->root_lock + * - must take shrinker_rwsem without holding cmd->root_lock */ new_bm = dm_block_manager_create(cmd->bdev, DM_CACHE_METADATA_BLOCK_SIZE << SECTOR_SHIFT, CACHE_MAX_CONCURRENT_LOCKS); diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 9f5cb52c5763..fd464fb024c3 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c @@ -1887,7 +1887,7 @@ int dm_pool_abort_metadata(struct dm_pool_metadata *pmd) * Replacement block manager (new_bm) is created and old_bm destroyed outside of * pmd root_lock to avoid ABBA deadlock that would result (due to life-cycle of * shrinker associated with the block manager's bufio client vs pmd root_lock). - * - must take shrinker_mutex without holding pmd->root_lock + * - must take shrinker_rwsem without holding pmd->root_lock */ new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT, THIN_MAX_CONCURRENT_LOCKS); diff --git a/fs/super.c b/fs/super.c index 34afe411cf2b..04bc62ab7dfe 100644 --- a/fs/super.c +++ b/fs/super.c @@ -54,7 +54,7 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = { * One thing we have to be careful of with a per-sb shrinker is that we don't * drop the last active reference to the superblock from within the shrinker. * If that happens we could trigger unregistering the shrinker from within the - * shrinker path and that leads to deadlock on the shrinker_mutex. Hence we + * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we * take a passive reference to the superblock to avoid this from occurring. */ static unsigned long super_cache_scan(struct shrinker *shrink, diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index fe10436d9911..2be15b8a6d0b 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -8,7 +8,7 @@ #include /* defined in vmscan.c */ -extern struct mutex shrinker_mutex; +extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; extern struct srcu_struct shrinker_srcu; @@ -168,7 +168,7 @@ int shrinker_debugfs_add(struct shrinker *shrinker) char buf[128]; int id; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); /* debugfs isn't initialized yet, add debugfs entries later. */ if (!shrinker_debugfs_root) @@ -211,7 +211,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) if (!new) return -ENOMEM; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); old = shrinker->name; shrinker->name = new; @@ -229,7 +229,7 @@ int shrinker_debugfs_rename(struct shrinker *shrinker, const char *fmt, ...) shrinker->debugfs_entry = entry; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); kfree_const(old); @@ -242,7 +242,7 @@ struct dentry *shrinker_debugfs_detach(struct shrinker *shrinker, { struct dentry *entry = shrinker->debugfs_entry; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); kfree_const(shrinker->name); shrinker->name = NULL; @@ -271,14 +271,14 @@ static int __init shrinker_debugfs_init(void) shrinker_debugfs_root = dentry; /* Create debugfs entries for shrinkers registered at boot */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_for_each_entry(shrinker, &shrinker_list, list) if (!shrinker->debugfs_entry) { ret = shrinker_debugfs_add(shrinker); if (ret) break; } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 6d0cd2840cf0..4730dba253c8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include #include #include @@ -190,7 +190,7 @@ struct scan_control { int vm_swappiness = 60; LIST_HEAD(shrinker_list); -DEFINE_MUTEX(shrinker_mutex); +DECLARE_RWSEM(shrinker_rwsem); DEFINE_SRCU(shrinker_srcu); static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); @@ -213,7 +213,7 @@ static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, { return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, &shrinker_srcu, - lockdep_is_held(&shrinker_mutex)); + lockdep_is_held(&shrinker_rwsem)); } static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, @@ -292,7 +292,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) int nid, size, ret = 0; int map_size, defer_size = 0; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); map_size = shrinker_map_size(shrinker_nr_max); defer_size = shrinker_defer_size(shrinker_nr_max); size = map_size + defer_size; @@ -308,7 +308,7 @@ int alloc_shrinker_info(struct mem_cgroup *memcg) info->map_nr_max = shrinker_nr_max; rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_info, info); } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -324,7 +324,7 @@ static int expand_shrinker_info(int new_id) if (!root_mem_cgroup) goto out; - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); map_size = shrinker_map_size(new_nr_max); defer_size = shrinker_defer_size(new_nr_max); @@ -374,7 +374,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) if (mem_cgroup_disabled()) return -ENOSYS; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -388,7 +388,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) shrinker->id = id; ret = 0; unlock: - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return ret; } @@ -398,7 +398,7 @@ static void unregister_memcg_shrinker(struct shrinker *shrinker) BUG_ON(id < 0); - lockdep_assert_held(&shrinker_mutex); + lockdep_assert_held(&shrinker_rwsem); idr_remove(&shrinker_idr, id); } @@ -433,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +442,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) @@ -743,9 +743,9 @@ void free_prealloced_shrinker(struct shrinker *shrinker) shrinker->name = NULL; #endif if (shrinker->flags & SHRINKER_MEMCG_AWARE) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); unregister_memcg_shrinker(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); return; } @@ -755,11 +755,11 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_add_tail_rcu(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); } static int __register_shrinker(struct shrinker *shrinker) @@ -810,13 +810,13 @@ void unregister_shrinker(struct shrinker *shrinker) if (!(shrinker->flags & SHRINKER_REGISTERED)) return; - mutex_lock(&shrinker_mutex); + down_write(&shrinker_rwsem); list_del_rcu(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); - mutex_unlock(&shrinker_mutex); + up_write(&shrinker_rwsem); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); -- cgit v1.2.3 From 07252b0f97150b03050f74f42250f275566d70b9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:13 +0000 Subject: Revert "mm: vmscan: remove shrinker_rwsem from synchronize_shrinkers()" This reverts commit 1643db98d9b314e0a592d152603094fbf7ab906e. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So we still need shrinker_rwsem in synchronize_shrinkers() after reverting the shrinker_srcu related changes. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-3-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4730dba253c8..0ba0e1180f3f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -831,11 +831,15 @@ EXPORT_SYMBOL(unregister_shrinker); /** * synchronize_shrinkers - Wait for all running shrinkers to complete. * - * This is useful to guarantee that all shrinker invocations have seen an - * update, before freeing memory. + * This is equivalent to calling unregister_shrink() and register_shrinker(), + * but atomically and with less overhead. This is useful to guarantee that all + * shrinker invocations have seen an update, before freeing memory, similar to + * rcu. */ void synchronize_shrinkers(void) { + down_write(&shrinker_rwsem); + up_write(&shrinker_rwsem); atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); } -- cgit v1.2.3 From c534f7cca6b9b1c0dc97d6e9c5587858d4330cd9 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:14 +0000 Subject: Revert "mm: vmscan: hold write lock to reparent shrinker nr_deferred" This reverts commit b3cabea3c9153fd42fe5cb851ac58b51ea2b32b8. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. Because there will be other readers after reverting the shrinker_srcu related changes, so it is better to restore to hold read lock to reparent shrinker nr_deferred. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-4-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0ba0e1180f3f..d1d309fc3212 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -433,7 +433,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) parent = root_mem_cgroup; /* Prevent from concurrent shrinker_info expand */ - down_write(&shrinker_rwsem); + down_read(&shrinker_rwsem); for_each_node(nid) { child_info = shrinker_info_protected(memcg, nid); parent_info = shrinker_info_protected(parent, nid); @@ -442,7 +442,7 @@ void reparent_shrinker_deferred(struct mem_cgroup *memcg) atomic_long_add(nr, &parent_info->nr_deferred[i]); } } - up_write(&shrinker_rwsem); + up_read(&shrinker_rwsem); } static bool cgroup_reclaim(struct scan_control *sc) -- cgit v1.2.3 From 1a554ecc971406e291cea867112f7f2e377e810e Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:15 +0000 Subject: Revert "mm: shrinkers: make count and scan in shrinker debugfs lockless" This reverts commit 20cd1892fcc3efc10a7ac327cc3790494bec46b5. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-5-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/shrinker_debug.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c index 2be15b8a6d0b..3ab53fad8876 100644 --- a/mm/shrinker_debug.c +++ b/mm/shrinker_debug.c @@ -5,12 +5,10 @@ #include #include #include -#include /* defined in vmscan.c */ extern struct rw_semaphore shrinker_rwsem; extern struct list_head shrinker_list; -extern struct srcu_struct shrinker_srcu; static DEFINE_IDA(shrinker_debugfs_ida); static struct dentry *shrinker_debugfs_root; @@ -51,13 +49,18 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) struct mem_cgroup *memcg; unsigned long total; bool memcg_aware; - int ret = 0, nid, srcu_idx; + int ret, nid; count_per_node = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL); if (!count_per_node) return -ENOMEM; - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + kfree(count_per_node); + return ret; + } + rcu_read_lock(); memcg_aware = shrinker->flags & SHRINKER_MEMCG_AWARE; @@ -88,7 +91,8 @@ static int shrinker_debugfs_count_show(struct seq_file *m, void *v) } } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); + up_read(&shrinker_rwsem); kfree(count_per_node); return ret; @@ -111,8 +115,9 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, .gfp_mask = GFP_KERNEL, }; struct mem_cgroup *memcg = NULL; - int nid, srcu_idx; + int nid; char kbuf[72]; + ssize_t ret; read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1); if (copy_from_user(kbuf, buf, read_len)) @@ -141,7 +146,11 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, return -EINVAL; } - srcu_idx = srcu_read_lock(&shrinker_srcu); + ret = down_read_killable(&shrinker_rwsem); + if (ret) { + mem_cgroup_put(memcg); + return ret; + } sc.nid = nid; sc.memcg = memcg; @@ -150,7 +159,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file, shrinker->scan_objects(shrinker, &sc); - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); mem_cgroup_put(memcg); return size; -- cgit v1.2.3 From d6ecbcd70fffcffe79d45f3be7b8fcf5d8e04a3d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:16 +0000 Subject: Revert "mm: vmscan: add shrinker_srcu_generation" This reverts commit 475733dda5aedba9e086379aafe6b5ffd53e8f5e. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. We will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-6-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 24 ++++-------------------- 1 file changed, 4 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index d1d309fc3212..50775b73d0c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -192,7 +192,6 @@ int vm_swappiness = 60; LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); DEFINE_SRCU(shrinker_srcu); -static atomic_t shrinker_srcu_generation = ATOMIC_INIT(0); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -818,7 +817,6 @@ void unregister_shrinker(struct shrinker *shrinker) debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); up_write(&shrinker_rwsem); - atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); shrinker_debugfs_remove(debugfs_entry, debugfs_id); @@ -840,7 +838,6 @@ void synchronize_shrinkers(void) { down_write(&shrinker_rwsem); up_write(&shrinker_rwsem); - atomic_inc(&shrinker_srcu_generation); synchronize_srcu(&shrinker_srcu); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -950,20 +947,18 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx, generation; - int i = 0; + int srcu_idx; + int i; if (!mem_cgroup_online(memcg)) return 0; -again: srcu_idx = srcu_read_lock(&shrinker_srcu); info = shrinker_info_srcu(memcg, nid); if (unlikely(!info)) goto unlock; - generation = atomic_read(&shrinker_srcu_generation); - for_each_set_bit_from(i, info->map, info->map_nr_max) { + for_each_set_bit(i, info->map, info->map_nr_max) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1009,11 +1004,6 @@ again: set_shrinker_bit(memcg, nid, i); } freed += ret; - if (atomic_read(&shrinker_srcu_generation) != generation) { - srcu_read_unlock(&shrinker_srcu, srcu_idx); - i++; - goto again; - } } unlock: srcu_read_unlock(&shrinker_srcu, srcu_idx); @@ -1053,7 +1043,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx, generation; + int srcu_idx; /* * The root memcg might be allocated even though memcg is disabled @@ -1067,7 +1057,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, srcu_idx = srcu_read_lock(&shrinker_srcu); - generation = atomic_read(&shrinker_srcu_generation); list_for_each_entry_srcu(shrinker, &shrinker_list, list, srcu_read_lock_held(&shrinker_srcu)) { struct shrink_control sc = { @@ -1080,11 +1069,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; - - if (atomic_read(&shrinker_srcu_generation) != generation) { - freed = freed ? : 1; - break; - } } srcu_read_unlock(&shrinker_srcu, srcu_idx); -- cgit v1.2.3 From 7cee3603192a198b23c034a7372b599081cbee88 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:17 +0000 Subject: Revert "mm: vmscan: make memcg slab shrink lockless" This reverts commit caa05325c9126c77ebf114edce51536a0d0a9a08. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-7-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 50775b73d0c7..a008d7f2d0fc 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -210,21 +210,8 @@ static inline int shrinker_defer_size(int nr_items) static struct shrinker_info *shrinker_info_protected(struct mem_cgroup *memcg, int nid) { - return srcu_dereference_check(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu, - lockdep_is_held(&shrinker_rwsem)); -} - -static struct shrinker_info *shrinker_info_srcu(struct mem_cgroup *memcg, - int nid) -{ - return srcu_dereference(memcg->nodeinfo[nid]->shrinker_info, - &shrinker_srcu); -} - -static void free_shrinker_info_rcu(struct rcu_head *head) -{ - kvfree(container_of(head, struct shrinker_info, rcu)); + return rcu_dereference_protected(memcg->nodeinfo[nid]->shrinker_info, + lockdep_is_held(&shrinker_rwsem)); } static int expand_one_shrinker_info(struct mem_cgroup *memcg, @@ -265,7 +252,7 @@ static int expand_one_shrinker_info(struct mem_cgroup *memcg, defer_size - old_defer_size); rcu_assign_pointer(pn->shrinker_info, new); - call_srcu(&shrinker_srcu, &old->rcu, free_shrinker_info_rcu); + kvfree_rcu(old, rcu); } return 0; @@ -351,16 +338,15 @@ void set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) { struct shrinker_info *info; - int srcu_idx; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + rcu_read_lock(); + info = rcu_dereference(memcg->nodeinfo[nid]->shrinker_info); if (!WARN_ON_ONCE(shrinker_id >= info->map_nr_max)) { /* Pairs with smp mb in shrink_slab() */ smp_mb__before_atomic(); set_bit(shrinker_id, info->map); } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + rcu_read_unlock(); } } @@ -374,6 +360,7 @@ static int prealloc_memcg_shrinker(struct shrinker *shrinker) return -ENOSYS; down_write(&shrinker_rwsem); + /* This may call shrinker, so it must use down_read_trylock() */ id = idr_alloc(&shrinker_idr, shrinker, 0, 0, GFP_KERNEL); if (id < 0) goto unlock; @@ -407,7 +394,7 @@ static long xchg_nr_deferred_memcg(int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_xchg(&info->nr_deferred[shrinker->id], 0); } @@ -416,7 +403,7 @@ static long add_nr_deferred_memcg(long nr, int nid, struct shrinker *shrinker, { struct shrinker_info *info; - info = shrinker_info_srcu(memcg, nid); + info = shrinker_info_protected(memcg, nid); return atomic_long_add_return(nr, &info->nr_deferred[shrinker->id]); } @@ -947,14 +934,15 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, { struct shrinker_info *info; unsigned long ret, freed = 0; - int srcu_idx; int i; if (!mem_cgroup_online(memcg)) return 0; - srcu_idx = srcu_read_lock(&shrinker_srcu); - info = shrinker_info_srcu(memcg, nid); + if (!down_read_trylock(&shrinker_rwsem)) + return 0; + + info = shrinker_info_protected(memcg, nid); if (unlikely(!info)) goto unlock; @@ -1004,9 +992,14 @@ static unsigned long shrink_slab_memcg(gfp_t gfp_mask, int nid, set_shrinker_bit(memcg, nid, i); } freed += ret; + + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } unlock: - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); return freed; } #else /* CONFIG_MEMCG */ -- cgit v1.2.3 From 71c3ad65fabec9620d3f548b2da948c79c7ad9d5 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Fri, 9 Jun 2023 08:15:18 +0000 Subject: Revert "mm: vmscan: make global slab shrink lockless" This reverts commit f95bdb700bc6bb74e1199b1f5f90c613e152cfa7. Kernel test robot reports -88.8% regression in stress-ng.ramfs.ops_per_sec test case [1], which is caused by commit f95bdb700bc6 ("mm: vmscan: make global slab shrink lockless"). The root cause is that SRCU has to be careful to not frequently check for SRCU read-side critical section exits. Therefore, even if no one is currently in the SRCU read-side critical section, synchronize_srcu() cannot return quickly. That's why unregister_shrinker() has become slower. After discussion, we will try to use the refcount+RCU method [2] proposed by Dave Chinner to continue to re-implement the lockless slab shrink. So revert the shrinker_srcu related changes first. [1]. https://lore.kernel.org/lkml/202305230837.db2c233f-yujie.liu@intel.com/ [2]. https://lore.kernel.org/lkml/ZIJhou1d55d4H1s0@dread.disaster.area/ Link: https://lkml.kernel.org/r/20230609081518.3039120-8-qi.zheng@linux.dev Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305230837.db2c233f-yujie.liu@intel.com Signed-off-by: Qi Zheng Cc: Dave Chinner Cc: Kirill Tkhai Cc: Muchun Song Cc: Roman Gushchin Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/vmscan.c | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a008d7f2d0fc..5bf98d0a22c9 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include @@ -191,7 +190,6 @@ int vm_swappiness = 60; LIST_HEAD(shrinker_list); DECLARE_RWSEM(shrinker_rwsem); -DEFINE_SRCU(shrinker_srcu); #ifdef CONFIG_MEMCG static int shrinker_nr_max; @@ -742,7 +740,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker) { down_write(&shrinker_rwsem); - list_add_tail_rcu(&shrinker->list, &shrinker_list); + list_add_tail(&shrinker->list, &shrinker_list); shrinker->flags |= SHRINKER_REGISTERED; shrinker_debugfs_add(shrinker); up_write(&shrinker_rwsem); @@ -797,15 +795,13 @@ void unregister_shrinker(struct shrinker *shrinker) return; down_write(&shrinker_rwsem); - list_del_rcu(&shrinker->list); + list_del(&shrinker->list); shrinker->flags &= ~SHRINKER_REGISTERED; if (shrinker->flags & SHRINKER_MEMCG_AWARE) unregister_memcg_shrinker(shrinker); debugfs_entry = shrinker_debugfs_detach(shrinker, &debugfs_id); up_write(&shrinker_rwsem); - synchronize_srcu(&shrinker_srcu); - shrinker_debugfs_remove(debugfs_entry, debugfs_id); kfree(shrinker->nr_deferred); @@ -825,7 +821,6 @@ void synchronize_shrinkers(void) { down_write(&shrinker_rwsem); up_write(&shrinker_rwsem); - synchronize_srcu(&shrinker_srcu); } EXPORT_SYMBOL(synchronize_shrinkers); @@ -1036,7 +1031,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, { unsigned long ret, freed = 0; struct shrinker *shrinker; - int srcu_idx; /* * The root memcg might be allocated even though memcg is disabled @@ -1048,10 +1042,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) return shrink_slab_memcg(gfp_mask, nid, memcg, priority); - srcu_idx = srcu_read_lock(&shrinker_srcu); + if (!down_read_trylock(&shrinker_rwsem)) + goto out; - list_for_each_entry_srcu(shrinker, &shrinker_list, list, - srcu_read_lock_held(&shrinker_srcu)) { + list_for_each_entry(shrinker, &shrinker_list, list) { struct shrink_control sc = { .gfp_mask = gfp_mask, .nid = nid, @@ -1062,9 +1056,19 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (ret == SHRINK_EMPTY) ret = 0; freed += ret; + /* + * Bail out if someone want to register a new shrinker to + * prevent the registration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } - srcu_read_unlock(&shrinker_srcu, srcu_idx); + up_read(&shrinker_rwsem); +out: cond_resched(); return freed; } -- cgit v1.2.3