diff options
Diffstat (limited to 'mm/list_lru.c')
-rw-r--r-- | mm/list_lru.c | 383 |
1 files changed, 190 insertions, 193 deletions
diff --git a/mm/list_lru.c b/mm/list_lru.c index 9b7ff06e9d32..f93ada6a207b 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -59,6 +59,53 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } return &lru->node[nid].lru; } + +static inline struct list_lru_one * +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + bool irq, bool skip_empty) +{ + struct list_lru_one *l; + long nr_items; + + rcu_read_lock(); +again: + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + if (likely(l)) { + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + nr_items = READ_ONCE(l->nr_items); + if (likely(nr_items != LONG_MIN)) { + WARN_ON(nr_items < 0); + rcu_read_unlock(); + return l; + } + if (irq) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); + } + /* + * Caller may simply bail out if raced with reparenting or + * may iterate through the list_lru and expect empty slots. + */ + if (skip_empty) { + rcu_read_unlock(); + return NULL; + } + VM_WARN_ON(!css_is_dying(&memcg->css)); + memcg = parent_mem_cgroup(memcg); + goto again; +} + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); +} #else static void list_lru_register(struct list_lru *lru) { @@ -83,30 +130,52 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } + +static inline struct list_lru_one * +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + bool irq, bool skip_empty) +{ + struct list_lru_one *l = &lru->node[nid].lru; + + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + + return l; +} + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); +} #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - spin_lock(&nlru->lock); + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + if (!l) + return false; if (list_empty(item)) { - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - nlru->nr_items++; - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); + atomic_long_inc(&nlru->nr_items); return true; } - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); return false; } -EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { @@ -127,24 +196,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. */ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - - spin_lock(&nlru->lock); + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + if (!l) + return false; if (!list_empty(item)) { - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); list_del_init(item); l->nr_items--; - nlru->nr_items--; - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); + atomic_long_dec(&nlru->nr_items); return true; } - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); return false; } -EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { @@ -201,25 +269,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) struct list_lru_node *nlru; nlru = &lru->node[nid]; - return nlru->nr_items; + return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, +__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, - unsigned long *nr_to_walk) + unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; + struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: - l = list_lru_from_memcg_idx(lru, nid, memcg_idx); + l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) - goto out; - + return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; @@ -231,19 +298,19 @@ restart: break; --*nr_to_walk; - ret = isolate(item, l, &nlru->lock, cb_arg); + ret = isolate(item, l, cb_arg); switch (ret) { + /* + * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru + * lock. List traversal will have to restart from scratch. + */ + case LRU_RETRY: + goto restart; case LRU_REMOVED_RETRY: - assert_spin_locked(&nlru->lock); fallthrough; case LRU_REMOVED: isolated++; - nlru->nr_items--; - /* - * If the lru lock has been dropped, our list - * traversal is now invalid and so we have to - * restart from scratch. - */ + atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; @@ -252,20 +319,13 @@ restart: break; case LRU_SKIP: break; - case LRU_RETRY: - /* - * The lru lock has been dropped, our list traversal is - * now invalid and so we have to restart from scratch. - */ - assert_spin_locked(&nlru->lock); - goto restart; case LRU_STOP: - assert_spin_locked(&nlru->lock); goto out; default: BUG(); } } + unlock_list_lru(l, irq_off); out: return isolated; } @@ -275,14 +335,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; - unsigned long ret; - - spin_lock(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, - cb_arg, nr_to_walk); - spin_unlock(&nlru->lock); - return ret; + return __list_lru_walk_one(lru, nid, memcg, isolate, + cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); @@ -291,14 +345,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; - unsigned long ret; - - spin_lock_irq(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, - cb_arg, nr_to_walk); - spin_unlock_irq(&nlru->lock); - return ret; + return __list_lru_walk_one(lru, nid, memcg, isolate, + cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, @@ -313,16 +361,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; + struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { - struct list_lru_node *nlru = &lru->node[nid]; - - spin_lock(&nlru->lock); - isolated += __list_lru_walk_one(lru, nid, index, + rcu_read_lock(); + memcg = mem_cgroup_from_id(index); + if (!mem_cgroup_tryget(memcg)) { + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, - nr_to_walk); - spin_unlock(&nlru->lock); + nr_to_walk, false); + mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; @@ -334,14 +387,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, } EXPORT_SYMBOL_GPL(list_lru_walk_node); -static void init_one_lru(struct list_lru_one *l) +static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); + spin_lock_init(&l->lock); l->nr_items = 0; +#ifdef CONFIG_LOCKDEP + if (lru->key) + lockdep_set_class(&l->lock, lru->key); +#endif } #ifdef CONFIG_MEMCG -static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) +static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; @@ -351,25 +409,11 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) return NULL; for_each_node(nid) - init_one_lru(&mlru->node[nid]); + init_one_lru(lru, &mlru->node[nid]); return mlru; } -static void memcg_list_lru_free(struct list_lru *lru, int src_idx) -{ - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); - - /* - * The __list_lru_walk_one() can walk the list of this node. - * We need kvfree_rcu() here. And the walking of the list - * is under lru->node[nid]->lock, which can serve as a RCU - * read-side critical section. - */ - if (mlru) - kvfree_rcu(mlru, rcu); -} - static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) @@ -393,77 +437,64 @@ static void memcg_destroy_list_lru(struct list_lru *lru) xas_unlock_irq(&xas); } -static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, - int src_idx, struct mem_cgroup *dst_memcg) +static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, + struct list_lru_one *src, + struct mem_cgroup *dst_memcg) { - struct list_lru_node *nlru = &lru->node[nid]; int dst_idx = dst_memcg->kmemcg_id; - struct list_lru_one *src, *dst; - - /* - * Since list_lru_{add,del} may be called under an IRQ-safe lock, - * we have to use IRQ-safe primitives here to avoid deadlock. - */ - spin_lock_irq(&nlru->lock); + struct list_lru_one *dst; - src = list_lru_from_memcg_idx(lru, nid, src_idx); - if (!src) - goto out; + spin_lock_irq(&src->lock); dst = list_lru_from_memcg_idx(lru, nid, dst_idx); + spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); - if (src->nr_items) { dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); - src->nr_items = 0; } -out: - spin_unlock_irq(&nlru->lock); -} - -static void memcg_reparent_list_lru(struct list_lru *lru, - int src_idx, struct mem_cgroup *dst_memcg) -{ - int i; + /* Mark the list_lru_one dead */ + src->nr_items = LONG_MIN; - for_each_node(i) - memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); - - memcg_list_lru_free(lru, src_idx); + spin_unlock(&dst->lock); + spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { - struct cgroup_subsys_state *css; struct list_lru *lru; - int src_idx = memcg->kmemcg_id; + int i; - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. - * - * After we have finished, all list_lrus corresponding to this cgroup - * are guaranteed to remain empty. So we can safely free this cgroup's - * list lrus in memcg_list_lru_free(). - * - * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() - * from allocating list lrus for this cgroup after memcg_list_lru_free() - * call. - */ - rcu_read_lock(); - css_for_each_descendant_pre(css, &memcg->css) { - struct mem_cgroup *child; + mutex_lock(&list_lrus_mutex); + list_for_each_entry(lru, &memcg_list_lrus, list) { + struct list_lru_memcg *mlru; + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); - child = mem_cgroup_from_css(css); - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); - } - rcu_read_unlock(); + /* + * Lock the Xarray to ensure no on going list_lru_memcg + * allocation and further allocation will see css_is_dying(). + */ + xas_lock_irq(&xas); + mlru = xas_store(&xas, NULL); + xas_unlock_irq(&xas); + if (!mlru) + continue; - mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &memcg_list_lrus, list) - memcg_reparent_list_lru(lru, src_idx, parent); + /* + * With Xarray value set to NULL, holding the lru lock below + * prevents list_lru_{add,del,isolate} from touching the lru, + * safe to reparent. + */ + for_each_node(i) + memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); + + /* + * Here all list_lrus corresponding to the cgroup are guaranteed + * to remain empty, we can safely free this lru, any further + * memcg_list_lru_alloc() call will simply bail out. + */ + kvfree_rcu(mlru, rcu); + } mutex_unlock(&list_lrus_mutex); } @@ -478,77 +509,48 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { - int i; unsigned long flags; - struct list_lru_memcg_table { - struct list_lru_memcg *mlru; - struct mem_cgroup *memcg; - } *table; + struct list_lru_memcg *mlru; + struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); - if (!table) - return -ENOMEM; - /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { - if (memcg_list_lru_allocated(memcg, lru)) - break; - - table[i].memcg = memcg; - table[i].mlru = memcg_init_list_lru_one(gfp); - if (!table[i].mlru) { - while (i--) - kfree(table[i].mlru); - kfree(table); - return -ENOMEM; + do { + /* + * Keep finding the farest parent that wasn't populated + * until found memcg itself. + */ + pos = memcg; + parent = parent_mem_cgroup(pos); + while (!memcg_list_lru_allocated(parent, lru)) { + pos = parent; + parent = parent_mem_cgroup(pos); } - } - - xas_lock_irqsave(&xas, flags); - while (i--) { - int index = READ_ONCE(table[i].memcg->kmemcg_id); - struct list_lru_memcg *mlru = table[i].mlru; - xas_set(&xas, index); -retry: - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { - kfree(mlru); - } else { - xas_store(&xas, mlru); - if (xas_error(&xas) == -ENOMEM) { - xas_unlock_irqrestore(&xas, flags); - if (xas_nomem(&xas, gfp)) - xas_set_err(&xas, 0); - xas_lock_irqsave(&xas, flags); - /* - * The xas lock has been released, this memcg - * can be reparented before us. So reload - * memcg id. More details see the comments - * in memcg_reparent_list_lrus(). - */ - index = READ_ONCE(table[i].memcg->kmemcg_id); - if (index < 0) - xas_set_err(&xas, 0); - else if (!xas_error(&xas) && index != xas.xa_index) - xas_set(&xas, index); - goto retry; + mlru = memcg_init_list_lru_one(lru, gfp); + if (!mlru) + return -ENOMEM; + xas_set(&xas, pos->kmemcg_id); + do { + xas_lock_irqsave(&xas, flags); + if (!xas_load(&xas) && !css_is_dying(&pos->css)) { + xas_store(&xas, mlru); + if (!xas_error(&xas)) + mlru = NULL; } - } - } - /* xas_nomem() is used to free memory instead of memory allocation. */ - if (xas.xa_alloc) - xas_nomem(&xas, gfp); - xas_unlock_irqrestore(&xas, flags); - kfree(table); + xas_unlock_irqrestore(&xas, flags); + } while (xas_nomem(&xas, gfp)); + if (mlru) + kfree(mlru); + } while (pos != memcg && !css_is_dying(&pos->css)); return xas_error(&xas); } @@ -562,8 +564,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) } #endif /* CONFIG_MEMCG */ -int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key, struct shrinker *shrinker) +int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; @@ -581,12 +582,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, if (!lru->node) return -ENOMEM; - for_each_node(i) { - spin_lock_init(&lru->node[i].lock); - if (key) - lockdep_set_class(&lru->node[i].lock, key); - init_one_lru(&lru->node[i].lru); - } + for_each_node(i) + init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); |