memcg: introduce memory.min

Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim. But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer. It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees. This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system. If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall. [guro@fb.com: s/low/min/ in docs] Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com Signed-off-by: Roman Gushchin <guro@fb.com> Reviewed-by: Randy Dunlap <rdunlap@infradead.org> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Cc: Michal Hocko <mhocko@suse.com> Cc: Vladimir Davydov <vdavydov.dev@gmail.com> Cc: Tejun Heo <tj@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
author: Roman Gushchin <guro@fb.com> 2018-06-07 17:07:46 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2018-06-07 17:34:36 -0700
commit: bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 (patch)
tree: e0b0457ddf128b0562eb403762b2f2de2292e8b1 /mm/page_counter.c
parent: fb52bbaee598f58352d8732637ebe7013b2df79f (diff)
1 files changed, 46 insertions, 17 deletions
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a5ff4cbc355a..de31470655f6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,26 +13,38 @@
 #include <linux/bug.h>
 #include <asm/page.h>
 
-static void propagate_low_usage(struct page_counter *c, unsigned long usage)
+static void propagate_protected_usage(struct page_counter *c,
+				      unsigned long usage)
 {
-	unsigned long low_usage, old;
+	unsigned long protected, old_protected;
 	long delta;
 
 	if (!c->parent)
 		return;
 
-	if (!c->low && !atomic_long_read(&c->low_usage))
-		return;
+	if (c->min || atomic_long_read(&c->min_usage)) {
+		if (usage <= c->min)
+			protected = usage;
+		else
+			protected = 0;
+
+		old_protected = atomic_long_xchg(&c->min_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_min_usage);
+	}
 
-	if (usage <= c->low)
-		low_usage = usage;
-	else
-		low_usage = 0;
+	if (c->low || atomic_long_read(&c->low_usage)) {
+		if (usage <= c->low)
+			protected = usage;
+		else
+			protected = 0;
 
-	old = atomic_long_xchg(&c->low_usage, low_usage);
-	delta = low_usage - old;
-	if (delta)
-		atomic_long_add(delta, &c->parent->children_low_usage);
+		old_protected = atomic_long_xchg(&c->low_usage, protected);
+		delta = protected - old_protected;
+		if (delta)
+			atomic_long_add(delta, &c->parent->children_low_usage);
+	}
 }
 
 /**
@@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
 	long new;
 
 	new = atomic_long_sub_return(nr_pages, &counter->usage);
-	propagate_low_usage(counter, new);
+	propagate_protected_usage(counter, new);
 	/* More uncharges than charges? */
 	WARN_ON_ONCE(new < 0);
 }
@@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
 		long new;
 
 		new = atomic_long_add_return(nr_pages, &c->usage);
-		propagate_low_usage(counter, new);
+		propagate_protected_usage(counter, new);
 		/*
 		 * This is indeed racy, but we can live with some
 		 * inaccuracy in the watermark.
@@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter,
 		new = atomic_long_add_return(nr_pages, &c->usage);
 		if (new > c->max) {
 			atomic_long_sub(nr_pages, &c->usage);
-			propagate_low_usage(counter, new);
+			propagate_protected_usage(counter, new);
 			/*
 			 * This is racy, but we can live with some
 			 * inaccuracy in the failcnt.
@@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter,
 			*fail = c;
 			goto failed;
 		}
-		propagate_low_usage(counter, new);
+		propagate_protected_usage(counter, new);
 		/*
 		 * Just like with failcnt, we can live with some
 		 * inaccuracy in the watermark.
@@ -191,6 +203,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
 }
 
 /**
+ * page_counter_set_min - set the amount of protected memory
+ * @counter: counter
+ * @nr_pages: value to set
+ *
+ * The caller must serialize invocations on the same counter.
+ */
+void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages)
+{
+	struct page_counter *c;
+
+	counter->min = nr_pages;
+
+	for (c = counter; c; c = c->parent)
+		propagate_protected_usage(c, atomic_long_read(&c->usage));
+}
+
+/**
  * page_counter_set_low - set the amount of protected memory
  * @counter: counter
  * @nr_pages: value to set
@@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages)
 	counter->low = nr_pages;
 
 	for (c = counter; c; c = c->parent)
-		propagate_low_usage(c, atomic_long_read(&c->usage));
+		propagate_protected_usage(c, atomic_long_read(&c->usage));
 }
 
 /**
author	Roman Gushchin <guro@fb.com>	2018-06-07 17:07:46 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2018-06-07 17:34:36 -0700
commit	bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 (patch)
tree	e0b0457ddf128b0562eb403762b2f2de2292e8b1 /mm/page_counter.c
parent	fb52bbaee598f58352d8732637ebe7013b2df79f (diff)