summaryrefslogtreecommitdiff
path: root/mm/memcontrol.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-13 19:29:45 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-13 19:29:45 -0800
commite2ca6ba6ba0152361aa4fcbf6067db71b2c7a770 (patch)
treef7ed7753a2e66486a4ffe0fbbf98404ec4ba2212 /mm/memcontrol.c
parent7e68dd7d07a28faa2e6574dd6b9dbd90cdeaae91 (diff)
parentc45bc55a99957b20e4e0333bcd42e12d1833a7f5 (diff)
Merge tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - More userfaultfs work from Peter Xu - Several convert-to-folios series from Sidhartha Kumar and Huang Ying - Some filemap cleanups from Vishal Moola - David Hildenbrand added the ability to selftest anon memory COW handling - Some cpuset simplifications from Liu Shixin - Addition of vmalloc tracing support by Uladzislau Rezki - Some pagecache folioifications and simplifications from Matthew Wilcox - A pagemap cleanup from Kefeng Wang: we have VM_ACCESS_FLAGS, so use it - Miguel Ojeda contributed some cleanups for our use of the __no_sanitize_thread__ gcc keyword. This series should have been in the non-MM tree, my bad - Naoya Horiguchi improved the interaction between memory poisoning and memory section removal for huge pages - DAMON cleanups and tuneups from SeongJae Park - Tony Luck fixed the handling of COW faults against poisoned pages - Peter Xu utilized the PTE marker code for handling swapin errors - Hugh Dickins reworked compound page mapcount handling, simplifying it and making it more efficient - Removal of the autonuma savedwrite infrastructure from Nadav Amit and David Hildenbrand - zram support for multiple compression streams from Sergey Senozhatsky - David Hildenbrand reworked the GUP code's R/O long-term pinning so that drivers no longer need to use the FOLL_FORCE workaround which didn't work very well anyway - Mel Gorman altered the page allocator so that local IRQs can remnain enabled during per-cpu page allocations - Vishal Moola removed the try_to_release_page() wrapper - Stefan Roesch added some per-BDI sysfs tunables which are used to prevent network block devices from dirtying excessive amounts of pagecache - David Hildenbrand did some cleanup and repair work on KSM COW breaking - Nhat Pham and Johannes Weiner have implemented writeback in zswap's zsmalloc backend - Brian Foster has fixed a longstanding corner-case oddity in file[map]_write_and_wait_range() - sparse-vmemmap changes for MIPS, LoongArch and NIOS2 from Feiyang Chen - Shiyang Ruan has done some work on fsdax, to make its reflink mode work better under xfstests. Better, but still not perfect - Christoph Hellwig has removed the .writepage() method from several filesystems. They only need .writepages() - Yosry Ahmed wrote a series which fixes the memcg reclaim target beancounting - David Hildenbrand has fixed some of our MM selftests for 32-bit machines - Many singleton patches, as usual * tag 'mm-stable-2022-12-13' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (313 commits) mm/hugetlb: set head flag before setting compound_order in __prep_compound_gigantic_folio mm: mmu_gather: allow more than one batch of delayed rmaps mm: fix typo in struct pglist_data code comment kmsan: fix memcpy tests mm: add cond_resched() in swapin_walk_pmd_entry() mm: do not show fs mm pc for VM_LOCKONFAULT pages selftests/vm: ksm_functional_tests: fixes for 32bit selftests/vm: cow: fix compile warning on 32bit selftests/vm: madv_populate: fix missing MADV_POPULATE_(READ|WRITE) definitions mm/gup_test: fix PIN_LONGTERM_TEST_READ with highmem mm,thp,rmap: fix races between updates of subpages_mapcount mm: memcg: fix swapcached stat accounting mm: add nodes= arg to memory.reclaim mm: disable top-tier fallback to reclaim on proactive reclaim selftests: cgroup: make sure reclaim target memcg is unprotected selftests: cgroup: refactor proactive reclaim code to reclaim_until() mm: memcg: fix stale protection of reclaim target memcg mm/mmap: properly unaccount memory on mas_preallocate() failure omfs: remove ->writepage jfs: remove ->writepage ...
Diffstat (limited to 'mm/memcontrol.c')
-rw-r--r--mm/memcontrol.c113
1 files changed, 85 insertions, 28 deletions
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 266a1ab05434..ab457f0394ab 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,6 +63,7 @@
#include <linux/resume_user_mode.h>
#include <linux/psi.h>
#include <linux/seq_buf.h>
+#include <linux/parser.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
@@ -661,8 +662,10 @@ static const unsigned int memcg_vm_event_stat[] = {
PGPGOUT,
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
+ PGSCAN_KHUGEPAGED,
PGSTEAL_KSWAPD,
PGSTEAL_DIRECT,
+ PGSTEAL_KHUGEPAGED,
PGFAULT,
PGMAJFAULT,
PGREFILL,
@@ -1219,7 +1222,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
* cgroup root (root_mem_cgroup). So we have to handle
* dead_memcg from cgroup root separately.
*/
- if (last != root_mem_cgroup)
+ if (!mem_cgroup_is_root(last))
__invalidate_reclaim_iterators(root_mem_cgroup,
dead_memcg);
}
@@ -1243,7 +1246,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
struct mem_cgroup *iter;
int ret = 0;
- BUG_ON(memcg == root_mem_cgroup);
+ BUG_ON(mem_cgroup_is_root(memcg));
for_each_mem_cgroup_tree(iter, memcg) {
struct css_task_iter it;
@@ -1272,7 +1275,7 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio)
memcg = folio_memcg(folio);
if (!memcg)
- VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != root_mem_cgroup, folio);
+ VM_BUG_ON_FOLIO(!mem_cgroup_is_root(lruvec_memcg(lruvec)), folio);
else
VM_BUG_ON_FOLIO(lruvec_memcg(lruvec) != memcg, folio);
}
@@ -1574,10 +1577,12 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize)
/* Accumulated memory events */
seq_buf_printf(&s, "pgscan %lu\n",
memcg_events(memcg, PGSCAN_KSWAPD) +
- memcg_events(memcg, PGSCAN_DIRECT));
+ memcg_events(memcg, PGSCAN_DIRECT) +
+ memcg_events(memcg, PGSCAN_KHUGEPAGED));
seq_buf_printf(&s, "pgsteal %lu\n",
memcg_events(memcg, PGSTEAL_KSWAPD) +
- memcg_events(memcg, PGSTEAL_DIRECT));
+ memcg_events(memcg, PGSTEAL_DIRECT) +
+ memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
if (memcg_vm_event_stat[i] == PGPGIN ||
@@ -2036,7 +2041,7 @@ struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
rcu_read_lock();
memcg = mem_cgroup_from_task(victim);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
/*
@@ -2388,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask,
- MEMCG_RECLAIM_MAY_SWAP);
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
@@ -2679,7 +2685,8 @@ retry:
psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
- gfp_mask, reclaim_options);
+ gfp_mask, reclaim_options,
+ NULL);
psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
@@ -2995,7 +3002,7 @@ static struct obj_cgroup *__get_obj_cgroup_from_memcg(struct mem_cgroup *memcg)
{
struct obj_cgroup *objcg = NULL;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
objcg = rcu_dereference(memcg->objcg);
if (objcg && obj_cgroup_tryget(objcg))
break;
@@ -3499,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
}
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
+ memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
+ NULL)) {
ret = -EBUSY;
break;
}
@@ -3610,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
- MEMCG_RECLAIM_MAY_SWAP))
+ MEMCG_RECLAIM_MAY_SWAP,
+ NULL))
nr_retries--;
}
@@ -5659,15 +5668,21 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
unsigned long addr, pte_t ptent)
{
+ unsigned long index;
+ struct folio *folio;
+
if (!vma->vm_file) /* anonymous vma */
return NULL;
if (!(mc.flags & MOVE_FILE))
return NULL;
- /* page is moved even if it's not RSS of this task(page-faulted). */
+ /* folio is moved even if it's not RSS of this task(page-faulted). */
/* shmem/tmpfs may report page out on swap: account for that too. */
- return find_get_incore_page(vma->vm_file->f_mapping,
- linear_page_index(vma, addr));
+ index = linear_page_index(vma, addr);
+ folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index);
+ if (!folio)
+ return NULL;
+ return folio_file_page(folio, index);
}
/**
@@ -5752,6 +5767,12 @@ static int mem_cgroup_move_account(struct page *page,
}
}
+#ifdef CONFIG_SWAP
+ if (folio_test_swapcache(folio)) {
+ __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages);
+ }
+#endif
if (folio_test_writeback(folio)) {
__mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages);
__mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages);
@@ -6408,7 +6429,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
}
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL);
if (!reclaimed && !nr_retries--)
break;
@@ -6457,7 +6479,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
- GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
+ GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
+ NULL))
nr_reclaims--;
continue;
}
@@ -6580,21 +6603,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes;
}
+enum {
+ MEMORY_RECLAIM_NODES = 0,
+ MEMORY_RECLAIM_NULL,
+};
+
+static const match_table_t if_tokens = {
+ { MEMORY_RECLAIM_NODES, "nodes=%s" },
+ { MEMORY_RECLAIM_NULL, NULL },
+};
+
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0;
- unsigned int reclaim_options;
- int err;
+ unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
+ MEMCG_RECLAIM_PROACTIVE;
+ char *old_buf, *start;
+ substring_t args[MAX_OPT_ARGS];
+ int token;
+ char value[256];
+ nodemask_t nodemask = NODE_MASK_ALL;
buf = strstrip(buf);
- err = page_counter_memparse(buf, "", &nr_to_reclaim);
- if (err)
- return err;
- reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
+ old_buf = buf;
+ nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
+ if (buf == old_buf)
+ return -EINVAL;
+
+ buf = strstrip(buf);
+
+ while ((start = strsep(&buf, " ")) != NULL) {
+ if (!strlen(start))
+ continue;
+ token = match_token(start, if_tokens, args);
+ match_strlcpy(value, args, sizeof(value));
+ switch (token) {
+ case MEMORY_RECLAIM_NODES:
+ if (nodelist_parse(value, nodemask) < 0)
+ return -EINVAL;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+
while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed;
@@ -6611,7 +6667,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed,
- GFP_KERNEL, reclaim_options);
+ GFP_KERNEL, reclaim_options,
+ &nodemask);
if (!reclaimed && !nr_retries--)
return -EAGAIN;
@@ -7174,7 +7231,7 @@ void mem_cgroup_sk_alloc(struct sock *sk)
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
- if (memcg == root_mem_cgroup)
+ if (mem_cgroup_is_root(memcg))
goto out;
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
goto out;
@@ -7309,7 +7366,7 @@ static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
* The root cgroup cannot be destroyed, so it's refcount must
* always be >= 1.
*/
- if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
+ if (WARN_ON_ONCE(mem_cgroup_is_root(memcg))) {
VM_BUG_ON(1);
break;
}
@@ -7473,7 +7530,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
if (mem_cgroup_disabled() || do_memsw_account())
return nr_swap_pages;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
READ_ONCE(memcg->swap.max) -
page_counter_read(&memcg->swap));
@@ -7495,7 +7552,7 @@ bool mem_cgroup_swap_full(struct folio *folio)
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ for (; !mem_cgroup_is_root(memcg); memcg = parent_mem_cgroup(memcg)) {
unsigned long usage = page_counter_read(&memcg->swap);
if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
@@ -7659,7 +7716,7 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg)
return true;
original_memcg = get_mem_cgroup_from_objcg(objcg);
- for (memcg = original_memcg; memcg != root_mem_cgroup;
+ for (memcg = original_memcg; !mem_cgroup_is_root(memcg);
memcg = parent_mem_cgroup(memcg)) {
unsigned long max = READ_ONCE(memcg->zswap_max);
unsigned long pages;