diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-09-01 15:44:45 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-09-01 15:44:45 -0700 |
commit | e987af4546ac5de50e514182c1d0ca33843fa665 (patch) | |
tree | ed2fa9e146d4cd58ce35130e6dd08a00b2cb7a38 | |
parent | 0fe2b86c21253bb365947ceed3531eb214d4c5b5 (diff) | |
parent | 14ef95be6f5558fb9e43aaf06ef9a1d6e0cae6c8 (diff) |
Merge tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu
Pull percpu updates from Dennis Zhou:
"One bigger change to percpu_counter's api allowing for init and
destroy of multiple counters via percpu_counter_init_many() and
percpu_counter_destroy_many(). This is used to help begin remediating
a performance regression with percpu rss stats.
Additionally, it seems larger core count machines are feeling the
burden of the single threaded allocation of percpu. Mateusz is
thinking about it and I will spend some time on it too.
percpu:
- A couple cleanups by Baoquan He and Bibo Mao. The only behavior
change is to start printing messages if we're under the warn limit
for failed atomic allocations.
percpu_counter:
- Shakeel introduced percpu counters into mm_struct which caused
percpu allocations to be on the hot path [1]. Originally I spent some
time trying to improve the percpu allocator, but instead preferred
what Mateusz Guzik proposed: grouping at the allocation site via
percpu_counter_init_many(). This allows a single percpu allocation
to be shared by the counters. I like this approach because it
creates a shared lifetime for the allocations. Additionally, I
believe many inits have higher level synchronization requirements,
like percpu_counter does against HOTPLUG_CPU. Therefore we can
group these optimizations together"
Link: https://lore.kernel.org/linux-mm/20221024052841.3291983-1-shakeelb@google.com/ [1]
* tag 'percpu-for-6.6' of git://git.kernel.org/pub/scm/linux/kernel/git/dennis/percpu:
kernel/fork: group allocation/free of per-cpu counters for mm struct
pcpcntr: add group allocation/free
mm/percpu.c: print error message too if atomic alloc failed
mm/percpu.c: optimize the code in pcpu_setup_first_chunk() a little bit
mm/percpu.c: remove redundant check
mm/percpu: Remove some local variables in pcpu_populate_pte
-rw-r--r-- | include/linux/percpu_counter.h | 41 | ||||
-rw-r--r-- | kernel/fork.c | 15 | ||||
-rw-r--r-- | lib/percpu_counter.c | 62 | ||||
-rw-r--r-- | mm/percpu.c | 69 |
4 files changed, 109 insertions, 78 deletions
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h index 75b73c83bc9d..d01351b1526f 100644 --- a/include/linux/percpu_counter.h +++ b/include/linux/percpu_counter.h @@ -30,17 +30,28 @@ struct percpu_counter { extern int percpu_counter_batch; -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, - struct lock_class_key *key); +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, + gfp_t gfp, u32 nr_counters, + struct lock_class_key *key); -#define percpu_counter_init(fbc, value, gfp) \ +#define percpu_counter_init_many(fbc, value, gfp, nr_counters) \ ({ \ static struct lock_class_key __key; \ \ - __percpu_counter_init(fbc, value, gfp, &__key); \ + __percpu_counter_init_many(fbc, value, gfp, nr_counters,\ + &__key); \ }) -void percpu_counter_destroy(struct percpu_counter *fbc); + +#define percpu_counter_init(fbc, value, gfp) \ + percpu_counter_init_many(fbc, value, gfp, 1) + +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters); +static inline void percpu_counter_destroy(struct percpu_counter *fbc) +{ + percpu_counter_destroy_many(fbc, 1); +} + void percpu_counter_set(struct percpu_counter *fbc, s64 amount); void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch); @@ -116,11 +127,27 @@ struct percpu_counter { s64 count; }; +static inline int percpu_counter_init_many(struct percpu_counter *fbc, + s64 amount, gfp_t gfp, + u32 nr_counters) +{ + u32 i; + + for (i = 0; i < nr_counters; i++) + fbc[i].count = amount; + + return 0; +} + static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp) { - fbc->count = amount; - return 0; + return percpu_counter_init_many(fbc, amount, gfp, 1); +} + +static inline void percpu_counter_destroy_many(struct percpu_counter *fbc, + u32 nr_counters) +{ } static inline void percpu_counter_destroy(struct percpu_counter *fbc) diff --git a/kernel/fork.c b/kernel/fork.c index 
a9c18d480dc5..3b6d20dfb9a8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -909,8 +909,6 @@ static void cleanup_lazy_tlbs(struct mm_struct *mm) */ void __mmdrop(struct mm_struct *mm) { - int i; - BUG_ON(mm == &init_mm); WARN_ON_ONCE(mm == current->mm); @@ -925,9 +923,8 @@ void __mmdrop(struct mm_struct *mm) put_user_ns(mm->user_ns); mm_pasid_drop(mm); mm_destroy_cid(mm); + percpu_counter_destroy_many(mm->rss_stat, NR_MM_COUNTERS); - for (i = 0; i < NR_MM_COUNTERS; i++) - percpu_counter_destroy(&mm->rss_stat[i]); free_mm(mm); } EXPORT_SYMBOL_GPL(__mmdrop); @@ -1260,8 +1257,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - int i; - mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); @@ -1309,17 +1304,15 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, if (mm_alloc_cid(mm)) goto fail_cid; - for (i = 0; i < NR_MM_COUNTERS; i++) - if (percpu_counter_init(&mm->rss_stat[i], 0, GFP_KERNEL_ACCOUNT)) - goto fail_pcpu; + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; mm->user_ns = get_user_ns(user_ns); lru_gen_init_mm(mm); return mm; fail_pcpu: - while (i > 0) - percpu_counter_destroy(&mm->rss_stat[--i]); mm_destroy_cid(mm); fail_cid: destroy_context(mm); diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c index 5004463c4f9f..9073430dc865 100644 --- a/lib/percpu_counter.c +++ b/lib/percpu_counter.c @@ -151,48 +151,72 @@ s64 __percpu_counter_sum(struct percpu_counter *fbc) } EXPORT_SYMBOL(__percpu_counter_sum); -int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, - struct lock_class_key *key) +int __percpu_counter_init_many(struct percpu_counter *fbc, s64 amount, + gfp_t gfp, u32 nr_counters, + struct lock_class_key *key) { unsigned long flags __maybe_unused; - - 
raw_spin_lock_init(&fbc->lock); - lockdep_set_class(&fbc->lock, key); - fbc->count = amount; - fbc->counters = alloc_percpu_gfp(s32, gfp); - if (!fbc->counters) + size_t counter_size; + s32 __percpu *counters; + u32 i; + + counter_size = ALIGN(sizeof(*counters), __alignof__(*counters)); + counters = __alloc_percpu_gfp(nr_counters * counter_size, + __alignof__(*counters), gfp); + if (!counters) { + fbc[0].counters = NULL; return -ENOMEM; + } - debug_percpu_counter_activate(fbc); + for (i = 0; i < nr_counters; i++) { + raw_spin_lock_init(&fbc[i].lock); + lockdep_set_class(&fbc[i].lock, key); +#ifdef CONFIG_HOTPLUG_CPU + INIT_LIST_HEAD(&fbc[i].list); +#endif + fbc[i].count = amount; + fbc[i].counters = (void *)counters + (i * counter_size); + + debug_percpu_counter_activate(&fbc[i]); + } #ifdef CONFIG_HOTPLUG_CPU - INIT_LIST_HEAD(&fbc->list); spin_lock_irqsave(&percpu_counters_lock, flags); - list_add(&fbc->list, &percpu_counters); + for (i = 0; i < nr_counters; i++) + list_add(&fbc[i].list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } -EXPORT_SYMBOL(__percpu_counter_init); +EXPORT_SYMBOL(__percpu_counter_init_many); -void percpu_counter_destroy(struct percpu_counter *fbc) +void percpu_counter_destroy_many(struct percpu_counter *fbc, u32 nr_counters) { unsigned long flags __maybe_unused; + u32 i; + + if (WARN_ON_ONCE(!fbc)) + return; - if (!fbc->counters) + if (!fbc[0].counters) return; - debug_percpu_counter_deactivate(fbc); + for (i = 0; i < nr_counters; i++) + debug_percpu_counter_deactivate(&fbc[i]); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); - list_del(&fbc->list); + for (i = 0; i < nr_counters; i++) + list_del(&fbc[i].list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif - free_percpu(fbc->counters); - fbc->counters = NULL; + + free_percpu(fbc[0].counters); + + for (i = 0; i < nr_counters; i++) + fbc[i].counters = NULL; } -EXPORT_SYMBOL(percpu_counter_destroy); 
+EXPORT_SYMBOL(percpu_counter_destroy_many); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); diff --git a/mm/percpu.c b/mm/percpu.c index 28e07ede46f6..a7665de8485f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1890,13 +1890,15 @@ fail_unlock: fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); - if (!is_atomic && do_warn && warn_limit) { + if (do_warn && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); - dump_stack(); + if (!is_atomic) + dump_stack(); if (!--warn_limit) pr_info("limit reached, disable warning\n"); } + if (is_atomic) { /* see the flag handling in pcpu_balance_workfn() */ pcpu_atomic_alloc_failed = true; @@ -2581,14 +2583,12 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; - struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; - int map_size; unsigned long tmp_addr; size_t alloc_size; @@ -2615,7 +2615,6 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); - PCPU_SETUP_BUG_ON(!ai->dyn_size); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); @@ -2698,7 +2697,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; - pcpu_chunk_struct_size = struct_size(chunk, populated, + pcpu_chunk_struct_size = struct_size((struct pcpu_chunk *)0, 
populated, BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); @@ -2735,29 +2734,23 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, dyn_size = ai->dyn_size - (static_size - ai->static_size); /* - * Initialize first chunk. - * If the reserved_size is non-zero, this initializes the reserved - * chunk. If the reserved_size is zero, the reserved chunk is NULL - * and the dynamic region is initialized here. The first chunk, - * pcpu_first_chunk, will always point to the chunk that serves - * the dynamic region. + * Initialize first chunk: + * This chunk is broken up into 3 parts: + * < static | [reserved] | dynamic > + * - static - there is no backing chunk because these allocations can + * never be freed. + * - reserved (pcpu_reserved_chunk) - exists primarily to serve + * allocations from module load. + * - dynamic (pcpu_first_chunk) - serves the dynamic part of the first + * chunk. */ tmp_addr = (unsigned long)base_addr + static_size; - map_size = ai->reserved_size ?: dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); - - /* init dynamic chunk if necessary */ - if (ai->reserved_size) { - pcpu_reserved_chunk = chunk; + if (ai->reserved_size) + pcpu_reserved_chunk = pcpu_alloc_first_chunk(tmp_addr, + ai->reserved_size); + tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; + pcpu_first_chunk = pcpu_alloc_first_chunk(tmp_addr, dyn_size); - tmp_addr = (unsigned long)base_addr + static_size + - ai->reserved_size; - map_size = dyn_size; - chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); - } - - /* link the first chunk in */ - pcpu_first_chunk = chunk; pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); @@ -3189,32 +3182,26 @@ void __init __weak pcpu_populate_pte(unsigned long addr) pmd_t *pmd; if (pgd_none(*pgd)) { - p4d_t *new; - - new = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); - if (!new) + p4d = memblock_alloc(P4D_TABLE_SIZE, P4D_TABLE_SIZE); + 
if (!p4d) goto err_alloc; - pgd_populate(&init_mm, pgd, new); + pgd_populate(&init_mm, pgd, p4d); } p4d = p4d_offset(pgd, addr); if (p4d_none(*p4d)) { - pud_t *new; - - new = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); - if (!new) + pud = memblock_alloc(PUD_TABLE_SIZE, PUD_TABLE_SIZE); + if (!pud) goto err_alloc; - p4d_populate(&init_mm, p4d, new); + p4d_populate(&init_mm, p4d, pud); } pud = pud_offset(p4d, addr); if (pud_none(*pud)) { - pmd_t *new; - - new = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); - if (!new) + pmd = memblock_alloc(PMD_TABLE_SIZE, PMD_TABLE_SIZE); + if (!pmd) goto err_alloc; - pud_populate(&init_mm, pud, new); + pud_populate(&init_mm, pud, pmd); } pmd = pmd_offset(pud, addr); |