diff options
Diffstat (limited to 'fs/ext4/mballoc.c')
-rw-r--r-- | fs/ext4/mballoc.c | 603 |
1 files changed, 571 insertions, 32 deletions
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 99bf091fee10..3239e6669e84 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -127,11 +127,50 @@ * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * + * If "mb_optimize_scan" mount option is set, we maintain in memory group info + * structures in two data structures: + * + * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * + * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * + * This is an array of lists where the index in the array represents the + * largest free order in the buddy bitmap of the participating group infos of + * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * + * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * + * Locking: sbi->s_mb_rb_lock (rwlock) + * + * This is a red black tree consisting of group infos and the tree is sorted + * by average fragment sizes (which is calculated as ext4_group_info->bb_free + * / ext4_group_info->bb_fragments). + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for + * fulfilling an allocation request. + * + * At CR = 0, we look for groups which have the largest_free_order >= the order + * of the request. We directly look at the largest free order list in the data + * structure (1) above where largest_free_order = order of the request. If that + * list is empty, we look at remaining list in the increasing order of + * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or + * equal to request size using our rb tree (data structure 2) in O(log N) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req + * /sys/fs/ext4/<partition>/mb_linear_limit * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -149,6 +188,16 @@ * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * + * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not + * get traversed linearly. That may result in subsequent allocations being not + * close to each other. And so, the underlying device may get filled up in a + * non-linear fashion. While that may not matter on non-rotational devices, for + * rotational devices that may result in higher seek times. "mb_linear_limit" + * tells mballoc how many groups mballoc should search linearly before + * performing consulting above data structures for more efficient lookups. For + * non rotational devices, this value defaults to 0 and for rotational devices + * this is set to MB_DEFAULT_LINEAR_LIMIT. + * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the @@ -299,6 +348,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) + * - cr0 lists lock (cr0) + * - cr1 tree lock (cr1) * * Paths: * - new pa @@ -328,6 +379,9 @@ * group * object * + * - allocation path (ext4_mb_regular_allocator) + * group + * cr0/cr1 */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -351,6 +405,9 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr); + /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block @@ -744,6 +801,269 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, + int (*cmp)(struct rb_node *, struct rb_node *)) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + + while (*iter) { + parent = *iter; + if (cmp(new, *iter) > 0) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); +} + +static int +ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +{ + struct ext4_group_info *grp1 = rb_entry(rb1, + struct ext4_group_info, + bb_avg_fragment_size_rb); + struct ext4_group_info *grp2 = rb_entry(rb2, + struct ext4_group_info, + bb_avg_fragment_size_rb); + int num_frags_1, num_frags_2; + + num_frags_1 = grp1->bb_fragments ? + grp1->bb_free / grp1->bb_fragments : 0; + num_frags_2 = grp2->bb_fragments ? + grp2->bb_free / grp2->bb_fragments : 0; + + return (num_frags_2 - num_frags_1); +} + +/* + * Reinsert grpinfo into the avg_fragment_size tree with new average + * fragment size. + */ +static void +mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + + write_lock(&sbi->s_mb_rb_lock); + if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { + rb_erase(&grp->bb_avg_fragment_size_rb, + &sbi->s_mb_avg_fragment_size_root); + RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); + } + + ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, + &grp->bb_avg_fragment_size_rb, + ext4_mb_avg_fragment_size_cmp); + write_unlock(&sbi->s_mb_rb_lock); +} + +/* + * Choose next group by traversing largest_free_order lists. Updates *new_cr if + * cr level needs an update. + */ +static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *iter, *grp; + int i; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) + atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + + grp = NULL; + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_largest_free_orders[i])) + continue; + read_lock(&sbi->s_mb_largest_free_orders_locks[i]); + if (list_empty(&sbi->s_mb_largest_free_orders[i])) { + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + continue; + } + grp = NULL; + list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], + bb_largest_free_order_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + grp = iter; + break; + } + } + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + if (grp) + break; + } + + if (!grp) { + /* Increment cr and search again */ + *new_cr = 1; + } else { + *group = grp->bb_group; + ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } +} + +/* + * Choose next group by traversing average fragment size tree. Updates *new_cr + * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that + * the linear search should continue for one iteration since there's lock + * contention on the rb tree lock. + */ +static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int avg_fragment_size, best_so_far; + struct rb_node *node, *found; + struct ext4_group_info *grp; + + /* + * If there is contention on the lock, instead of waiting for the lock + * to become available, just continue searching lineraly. We'll resume + * our rb tree search later starting at ac->ac_last_optimal_group. + */ + if (!read_trylock(&sbi->s_mb_rb_lock)) { + ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; + return; + } + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + /* We have found something at CR 1 in the past */ + grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); + for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; + found = rb_next(found)) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); + if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + break; + } + goto done; + } + + node = sbi->s_mb_avg_fragment_size_root.rb_node; + best_so_far = 0; + found = NULL; + + while (node) { + grp = rb_entry(node, struct ext4_group_info, + bb_avg_fragment_size_rb); + avg_fragment_size = 0; + if (ext4_mb_good_group(ac, grp->bb_group, 1)) { + avg_fragment_size = grp->bb_fragments ? + grp->bb_free / grp->bb_fragments : 0; + if (!best_so_far || avg_fragment_size < best_so_far) { + best_so_far = avg_fragment_size; + found = node; + } + } + if (avg_fragment_size > ac->ac_g_ex.fe_len) + node = node->rb_right; + else + node = node->rb_left; + } + +done: + if (found) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } + + read_unlock(&sbi->s_mb_rb_lock); + ac->ac_last_optimal_group = *group; +} + +static inline int should_optimize_scan(struct ext4_allocation_context *ac) +{ + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) + return 0; + if (ac->ac_criteria >= 2) + return 0; + if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + return 0; + return 1; +} + +/* + * Return next linear group for allocation. If linear traversal should not be + * performed, this function just returns the same group + */ +static int +next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +{ + if (!should_optimize_scan(ac)) + goto inc_and_return; + + if (ac->ac_groups_linear_remaining) { + ac->ac_groups_linear_remaining--; + goto inc_and_return; + } + + if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { + ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; + goto inc_and_return; + } + + return group; +inc_and_return: + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + return group + 1 >= ngroups ? 0 : group + 1; +} + +/* + * ext4_mb_choose_next_group: choose next group for allocation. + * + * @ac Allocation Context + * @new_cr This is an output parameter. If the there is no good group + * available at current CR level, this field is updated to indicate + * the new cr level that should be used. + * @group This is an input / output parameter. As an input it indicates the + * next group that the allocator intends to use for allocation. As + * output, this field indicates the next group that should be used as + * determined by the optimization functions. + * @ngroups Total number of groups + */ +static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + *new_cr = ac->ac_criteria; + + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + return; + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); + } else if (*new_cr == 1) { + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else { + /* + * TODO: For CR=2, we can arrange groups in an rb tree sorted by + * bb_free. But until that happens, we should never come here. + */ + WARN_ON(1); + } +} + /* * Cache the order of the largest free extent we have available in this block * group. @@ -751,18 +1071,33 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - int bits; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } grp->bb_largest_free_order = -1; /* uninit */ - bits = sb->s_blocksize_bits + 1; - for (i = bits; i >= 0; i--) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { if (grp->bb_counters[i] > 0) { grp->bb_largest_free_order = i; break; } } + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && + grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, + &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } } static noinline_for_stack @@ -816,10 +1151,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; - spin_lock(&sbi->s_bal_lock); - sbi->s_mb_buddies_generated++; - sbi->s_mb_generation_time += period; - spin_unlock(&sbi->s_bal_lock); + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); + mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -959,7 +1293,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * - (sb->s_blocksize_bits+2)); + (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ @@ -1519,6 +1853,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, done: mb_set_largest_free_order(sb, e4b->bd_info); + mb_update_avg_fragment_size(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1655,6 +1990,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); @@ -1930,7 +2266,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, int max; BUG_ON(ac->ac_2order <= 0); - for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; @@ -2109,7 +2445,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, if (free < ac->ac_g_ex.fe_len) return false; - if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) @@ -2148,6 +2484,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_grpblk_t free; int ret = 0; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) ext4_lock_group(sb, group); free = grp->bb_free; @@ -2315,13 +2653,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ - if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { /* * This should tell if fe_len is exactly power of 2 */ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) ac->ac_2order = array_index_nospec(i - 1, - sb->s_blocksize_bits + 2); + MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ @@ -2347,17 +2685,21 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; + ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group++, i++) { - int ret = 0; + for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), + i++) { + int ret = 0, new_cr; + cond_resched(); - /* - * Artificially restricted ngroups for non-extent - * files makes group > ngroups possible on first loop. - */ - if (group >= ngroups) - group = 0; + + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; + } /* * Batch reads of the block allocation bitmaps @@ -2422,6 +2764,9 @@ repeat: if (ac->ac_status != AC_STATUS_CONTINUE) break; } + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2451,6 +2796,9 @@ repeat: goto repeat; } } + + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; @@ -2550,6 +2898,157 @@ const struct seq_operations ext4_mb_seq_groups_ops = { .show = ext4_mb_seq_groups_show, }; +int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = (struct super_block *)seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + seq_puts(seq, "mballoc:\n"); + if (!sbi->s_mb_stats) { + seq_puts(seq, "\tmb stats collection turned off.\n"); + seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + return 0; + } + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); + + seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); + + seq_puts(seq, "\tcr0_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[0])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + + seq_puts(seq, "\tcr1_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[1])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + + seq_puts(seq, "\tcr2_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[2])); + + seq_puts(seq, "\tcr3_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[3])); + seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); + + seq_printf(seq, "\tbuddies_generated: %u/%u\n", + atomic_read(&sbi->s_mb_buddies_generated), + ext4_get_groups_count(sb)); + seq_printf(seq, "\tbuddies_time_used: %llu\n", + atomic64_read(&sbi->s_mb_generation_time)); + seq_printf(seq, "\tpreallocated: %u\n", + atomic_read(&sbi->s_mb_preallocated)); + seq_printf(seq, "\tdiscarded: %u\n", + atomic_read(&sbi->s_mb_discarded)); + return 0; +} + +static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + read_lock(&EXT4_SB(sb)->s_mb_rb_lock); + + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + ++*pos; + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; + struct rb_node *n; + unsigned int count, min, max; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { + seq_puts(seq, "fragment_size_tree:\n"); + n = rb_first(&sbi->s_mb_avg_fragment_size_root); + if (!n) { + seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); + return 0; + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + count = 1; + while (rb_next(n)) { + count++; + n = rb_next(n); + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + + seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", + min, max, count); + return 0; + } + + if (position == 0) { + seq_printf(seq, "optimize_scan: %d\n", + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + + return 0; +} + +static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + + read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); +} + +const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .start = ext4_mb_seq_structs_summary_start, + .next = ext4_mb_seq_structs_summary_next, + .stop = ext4_mb_seq_structs_summary_stop, + .show = ext4_mb_seq_structs_summary_show, +}; + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; @@ -2590,7 +3089,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); - ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } @@ -2652,7 +3151,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); + RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; @@ -2709,8 +3211,15 @@ static int ext4_mb_init_backend(struct super_block *sb) } if (ext4_has_feature_flex_bg(sb)) { - /* a single flex group is supposed to be read by a single IO */ - sbi->s_mb_prefetch = min(1 << sbi->s_es->s_log_groups_per_flex, + /* a single flex group is supposed to be read by a single IO. + * 2 ^ s_log_groups_per_flex != UINT_MAX as s_mb_prefetch is + * unsigned integer, so the maximum shift is 32. + */ + if (sbi->s_es->s_log_groups_per_flex >= 32) { + ext4_msg(sb, KERN_ERR, "too many log groups per flexible block group"); + goto err_freesgi; + } + sbi->s_mb_prefetch = min_t(uint, 1 << sbi->s_es->s_log_groups_per_flex, BLK_MAX_SEGMENT_SIZE >> (sb->s_blocksize_bits - 9)); sbi->s_mb_prefetch *= 8; /* 8 prefetch IOs in flight at most */ } else { @@ -2806,7 +3315,7 @@ int ext4_mb_init(struct super_block *sb) unsigned max; int ret; - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { @@ -2814,7 +3323,7 @@ int ext4_mb_init(struct super_block *sb) goto out; } - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; @@ -2840,10 +3349,30 @@ int ext4_mb_init(struct super_block *sb) offset_incr = offset_incr >> 1; max = max >> 1; i++; - } while (i <= sb->s_blocksize_bits + 1); + } while (i < MB_NUM_ORDERS(sb)); + + sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_largest_free_orders_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } + rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); - spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); @@ -2894,6 +3423,10 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + sbi->s_mb_max_linear_groups = 0; + else + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) @@ -2905,6 +3438,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -2961,6 +3496,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); @@ -2971,17 +3508,18 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, - "mballoc: %u extents scanned, %u goal hits, " + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, - "mballoc: %lu generated and it took %Lu", - sbi->s_mb_buddies_generated, - sbi->s_mb_generation_time); + "mballoc: %u generated and it took %llu", + atomic_read(&sbi->s_mb_buddies_generated), + atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), @@ -3576,12 +4114,13 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); |