Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig  86
-rw-r--r--  mm/Makefile  8
-rw-r--r--  mm/cma.c  57
-rw-r--r--  mm/compaction.c  47
-rw-r--r--  mm/damon/core.c  24
-rw-r--r--  mm/damon/dbgfs.c  2
-rw-r--r--  mm/damon/sysfs.c  2
-rw-r--r--  mm/damon/tests/.kunitconfig  22
-rw-r--r--  mm/damon/tests/core-kunit.h (renamed from mm/damon/core-test.h)  35
-rw-r--r--  mm/damon/tests/dbgfs-kunit.h (renamed from mm/damon/dbgfs-test.h)  10
-rw-r--r--  mm/damon/tests/sysfs-kunit.h (renamed from mm/damon/sysfs-test.h)  0
-rw-r--r--  mm/damon/tests/vaddr-kunit.h (renamed from mm/damon/vaddr-test.h)  2
-rw-r--r--  mm/damon/vaddr.c  2
-rw-r--r--  mm/debug.c  31
-rw-r--r--  mm/debug_vm_pgtable.c  50
-rw-r--r--  mm/filemap.c  67
-rw-r--r--  mm/folio-compat.c  12
-rw-r--r--  mm/gup.c  68
-rw-r--r--  mm/huge_memory.c  589
-rw-r--r--  mm/hugetlb.c  442
-rw-r--r--  mm/hugetlb_cgroup.c  4
-rw-r--r--  mm/hugetlb_vmemmap.c  40
-rw-r--r--  mm/internal.h  216
-rw-r--r--  mm/kfence/core.c  53
-rw-r--r--  mm/kfence/kfence.h  1
-rw-r--r--  mm/kfence/report.c  15
-rw-r--r--  mm/khugepaged.c  75
-rw-r--r--  mm/kmemleak.c  159
-rw-r--r--  mm/ksm.c  146
-rw-r--r--  mm/madvise.c  13
-rw-r--r--  mm/memblock.c  2
-rw-r--r--  mm/memcontrol-v1.c  126
-rw-r--r--  mm/memcontrol-v1.h  26
-rw-r--r--  mm/memcontrol.c  490
-rw-r--r--  mm/memory-failure.c  92
-rw-r--r--  mm/memory-tiers.c  22
-rw-r--r--  mm/memory.c  562
-rw-r--r--  mm/memory_hotplug.c  85
-rw-r--r--  mm/mempolicy.c  8
-rw-r--r--  mm/migrate.c  270
-rw-r--r--  mm/migrate_device.c  108
-rw-r--r--  mm/mm_init.c  12
-rw-r--r--  mm/mmap.c  2156
-rw-r--r--  mm/mmu_notifier.c  2
-rw-r--r--  mm/mmzone.c  2
-rw-r--r--  mm/mprotect.c  86
-rw-r--r--  mm/mremap.c  32
-rw-r--r--  mm/mseal.c  55
-rw-r--r--  mm/nommu.c  11
-rw-r--r--  mm/numa.c  69
-rw-r--r--  mm/numa_emulation.c  571
-rw-r--r--  mm/numa_memblks.c  571
-rw-r--r--  mm/page-writeback.c  2
-rw-r--r--  mm/page_alloc.c  349
-rw-r--r--  mm/page_counter.c  48
-rw-r--r--  mm/page_io.c  113
-rw-r--r--  mm/page_isolation.c  36
-rw-r--r--  mm/pagewalk.c  202
-rw-r--r--  mm/percpu.c  31
-rw-r--r--  mm/rmap.c  71
-rw-r--r--  mm/shmem.c  450
-rw-r--r--  mm/shmem_quota.c  3
-rw-r--r--  mm/show_mem.c  11
-rw-r--r--  mm/shrinker_debug.c  2
-rw-r--r--  mm/slab_common.c  27
-rw-r--r--  mm/swap.c  298
-rw-r--r--  mm/swap.h  44
-rw-r--r--  mm/swap_cgroup.c  2
-rw-r--r--  mm/swap_state.c  78
-rw-r--r--  mm/swapfile.c  1482
-rw-r--r--  mm/userfaultfd.c  170
-rw-r--r--  mm/util.c  102
-rw-r--r--  mm/vma.c  2068
-rw-r--r--  mm/vma.h  558
-rw-r--r--  mm/vma_internal.h  49
-rw-r--r--  mm/vmalloc.c  139
-rw-r--r--  mm/vmscan.c  67
-rw-r--r--  mm/vmstat.c  28
-rw-r--r--  mm/z3fold.c  2
-rw-r--r--  mm/zsmalloc.c  15
-rw-r--r--  mm/zswap.c  307
81 files changed, 9148 insertions, 5242 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index b72e7d040f78..09aebca1cae3 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -128,7 +128,7 @@ config ZSWAP_COMPRESSOR_DEFAULT
choice
prompt "Default allocator"
depends on ZSWAP
- default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if HAVE_ZSMALLOC
+ default ZSWAP_ZPOOL_DEFAULT_ZSMALLOC if MMU
default ZSWAP_ZPOOL_DEFAULT_ZBUD
help
Selects the default allocator for the compressed cache for
@@ -146,15 +146,17 @@ config ZSWAP_ZPOOL_DEFAULT_ZBUD
help
Use the zbud allocator as the default allocator.
-config ZSWAP_ZPOOL_DEFAULT_Z3FOLD
- bool "z3fold"
- select Z3FOLD
+config ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
+ bool "z3foldi (DEPRECATED)"
+ select Z3FOLD_DEPRECATED
help
Use the z3fold allocator as the default allocator.
+ Deprecated and scheduled for removal in a few cycles,
+ see CONFIG_Z3FOLD_DEPRECATED.
+
config ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
bool "zsmalloc"
- depends on HAVE_ZSMALLOC
select ZSMALLOC
help
Use the zsmalloc allocator as the default allocator.
@@ -164,7 +166,7 @@ config ZSWAP_ZPOOL_DEFAULT
string
depends on ZSWAP
default "zbud" if ZSWAP_ZPOOL_DEFAULT_ZBUD
- default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD
+ default "z3fold" if ZSWAP_ZPOOL_DEFAULT_Z3FOLD_DEPRECATED
default "zsmalloc" if ZSWAP_ZPOOL_DEFAULT_ZSMALLOC
default ""
@@ -178,24 +180,29 @@ config ZBUD
deterministic reclaim properties that make it preferable to a higher
density approach when reclaim will be used.
-config Z3FOLD
- tristate "3:1 compression allocator (z3fold)"
+config Z3FOLD_DEPRECATED
+ tristate "3:1 compression allocator (z3fold) (DEPRECATED)"
depends on ZSWAP
help
+ Deprecated and scheduled for removal in a few cycles. If you have
+ a good reason for using Z3FOLD over ZSMALLOC, please contact
+ linux-mm@kvack.org and the zswap maintainers.
+
A special purpose allocator for storing compressed pages.
It is designed to store up to three compressed pages per physical
page. It is a ZBUD derivative so the simplicity and determinism are
still there.
-config HAVE_ZSMALLOC
- def_bool y
- depends on MMU
- depends on PAGE_SIZE_LESS_THAN_256KB # we want <= 64 KiB
+config Z3FOLD
+ tristate
+ default y if Z3FOLD_DEPRECATED=y
+ default m if Z3FOLD_DEPRECATED=m
+ depends on Z3FOLD_DEPRECATED
config ZSMALLOC
tristate
- prompt "N:1 compression allocator (zsmalloc)" if ZSWAP
- depends on HAVE_ZSMALLOC
+ prompt "N:1 compression allocator (zsmalloc)" if (ZSWAP || ZRAM)
+ depends on MMU
help
zsmalloc is a slab-based memory allocator designed to store
pages of various compression levels efficiently. It achieves
@@ -585,17 +592,21 @@ config ARCH_MHP_MEMMAP_ON_MEMORY_ENABLE
# at the same time (e.g. copy_page_range()).
# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
-config SPLIT_PTLOCK_CPUS
- int
- default "999999" if !MMU
- default "999999" if ARM && !CPU_CACHE_VIPT
- default "999999" if PARISC && !PA20
- default "999999" if SPARC32
- default "4"
+config SPLIT_PTE_PTLOCKS
+ def_bool y
+ depends on MMU
+ depends on NR_CPUS >= 4
+ depends on !ARM || CPU_CACHE_VIPT
+ depends on !PARISC || PA20
+ depends on !SPARC32
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
bool
+config SPLIT_PMD_PTLOCKS
+ def_bool y
+ depends on SPLIT_PTE_PTLOCKS && ARCH_ENABLE_SPLIT_PMD_PTLOCK
+
#
# support for memory balloon
config MEMORY_BALLOON
@@ -877,6 +888,19 @@ endif # TRANSPARENT_HUGEPAGE
config PGTABLE_HAS_HUGE_LEAVES
def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
+# TODO: Allow to be enabled without THP
+config ARCH_SUPPORTS_HUGE_PFNMAP
+ def_bool n
+ depends on TRANSPARENT_HUGEPAGE
+
+config ARCH_SUPPORTS_PMD_PFNMAP
+ def_bool y
+ depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE
+
+config ARCH_SUPPORTS_PUD_PFNMAP
+ def_bool y
+ depends on ARCH_SUPPORTS_HUGE_PFNMAP && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+
#
# UP and nommu archs use km based percpu allocator
#
@@ -1081,13 +1105,10 @@ config ARCH_USES_HIGH_VMA_FLAGS
config ARCH_HAS_PKEYS
bool
-config ARCH_USES_PG_ARCH_X
+config ARCH_USES_PG_ARCH_2
+ bool
+config ARCH_USES_PG_ARCH_3
bool
- help
- Enable the definition of PG_arch_x page flags with x > 1. Only
- suitable for 64-bit architectures with CONFIG_FLATMEM or
- CONFIG_SPARSEMEM_VMEMMAP enabled, otherwise there may not be
- enough room for additional bits in page->flags.
config VM_EVENT_COUNTERS
default y
@@ -1263,6 +1284,17 @@ config IOMMU_MM_DATA
config EXECMEM
bool
+config NUMA_MEMBLKS
+ bool
+
+config NUMA_EMU
+ bool "NUMA emulation"
+ depends on NUMA_MEMBLKS
+ help
+ Enable NUMA emulation. A flat machine will be split
+ into virtual nodes when booted with "numa=fake=N", where N is the
+ number of nodes. This is only useful for debugging.
+
source "mm/damon/Kconfig"
endmenu
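
For the new NUMA_EMU option above, a minimal usage sketch (the node count 4 is arbitrary and the form shown is just the one named in the help text): booting with

	numa=fake=4

splits a flat machine into four emulated NUMA nodes, which is only intended for debugging.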
diff --git a/mm/Makefile b/mm/Makefile
index d2915f8c9dc0..d5639b036166 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -37,7 +37,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o
+ pgtable-generic.o rmap.o vmalloc.o vma.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -53,7 +53,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
readahead.o swap.o truncate.o vmscan.o shrinker.o \
shmem.o util.o mmzone.o vmstat.o backing-dev.o \
mm_init.o percpu.o slab_common.o \
- compaction.o show_mem.o shmem_quota.o\
+ compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
@@ -117,6 +117,9 @@ obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
obj-$(CONFIG_Z3FOLD) += z3fold.o
obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
obj-$(CONFIG_CMA) += cma.o
+obj-$(CONFIG_NUMA) += numa.o
+obj-$(CONFIG_NUMA_MEMBLKS) += numa_memblks.o
+obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
@@ -141,3 +144,4 @@ obj-$(CONFIG_HAVE_BOOTMEM_INFO_NODE) += bootmem_info.o
obj-$(CONFIG_GENERIC_IOREMAP) += ioremap.o
obj-$(CONFIG_SHRINKER_DEBUG) += shrinker_debug.o
obj-$(CONFIG_EXECMEM) += execmem.o
+obj-$(CONFIG_TMPFS_QUOTA) += shmem_quota.o
diff --git a/mm/cma.c b/mm/cma.c
index 3e9724716bad..2d9fae939283 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -202,7 +202,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
cma->order_per_bit = order_per_bit;
*res_cma = cma;
cma_area_count++;
- totalcma_pages += (size / PAGE_SIZE);
+ totalcma_pages += cma->count;
return 0;
}
@@ -403,18 +403,8 @@ static void cma_debug_show_areas(struct cma *cma)
spin_unlock_irq(&cma->lock);
}
-/**
- * cma_alloc() - allocate pages from contiguous area
- * @cma: Contiguous memory region for which the allocation is performed.
- * @count: Requested number of pages.
- * @align: Requested alignment of pages (in PAGE_SIZE order).
- * @no_warn: Avoid printing message about failed allocation
- *
- * This function allocates part of contiguous memory on specific
- * contiguous memory area.
- */
-struct page *cma_alloc(struct cma *cma, unsigned long count,
- unsigned int align, bool no_warn)
+static struct page *__cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, gfp_t gfp)
{
unsigned long mask, offset;
unsigned long pfn = -1;
@@ -463,8 +453,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
mutex_lock(&cma_mutex);
- ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
- GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
+ ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA, gfp);
mutex_unlock(&cma_mutex);
if (ret == 0) {
page = pfn_to_page(pfn);
@@ -494,7 +483,7 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
page_kasan_tag_reset(nth_page(page, i));
}
- if (ret && !no_warn) {
+ if (ret && !(gfp & __GFP_NOWARN)) {
pr_err_ratelimited("%s: %s: alloc failed, req-size: %lu pages, ret: %d\n",
__func__, cma->name, count, ret);
cma_debug_show_areas(cma);
@@ -513,6 +502,34 @@ struct page *cma_alloc(struct cma *cma, unsigned long count,
return page;
}
+/**
+ * cma_alloc() - allocate pages from contiguous area
+ * @cma: Contiguous memory region for which the allocation is performed.
+ * @count: Requested number of pages.
+ * @align: Requested alignment of pages (in PAGE_SIZE order).
+ * @no_warn: Avoid printing message about failed allocation
+ *
+ * This function allocates part of contiguous memory on specific
+ * contiguous memory area.
+ */
+struct page *cma_alloc(struct cma *cma, unsigned long count,
+ unsigned int align, bool no_warn)
+{
+ return __cma_alloc(cma, count, align, GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
+}
+
+struct folio *cma_alloc_folio(struct cma *cma, int order, gfp_t gfp)
+{
+ struct page *page;
+
+ if (WARN_ON(!order || !(gfp & __GFP_COMP)))
+ return NULL;
+
+ page = __cma_alloc(cma, 1 << order, order, gfp);
+
+ return page ? page_folio(page) : NULL;
+}
+
bool cma_pages_valid(struct cma *cma, const struct page *pages,
unsigned long count)
{
@@ -564,6 +581,14 @@ bool cma_release(struct cma *cma, const struct page *pages,
return true;
}
+bool cma_free_folio(struct cma *cma, const struct folio *folio)
+{
+ if (WARN_ON(!folio_test_large(folio)))
+ return false;
+
+ return cma_release(cma, &folio->page, folio_nr_pages(folio));
+}
+
int cma_for_each_area(int (*it)(struct cma *cma, void *data), void *data)
{
int i;
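
A minimal sketch of how a caller might use the folio interfaces added to cma.c above. Only cma_alloc_folio() and cma_free_folio() come from this patch; the area pointer and the demo_* helper names are hypothetical, and the order/__GFP_COMP constraints mirror the WARN_ON() checks shown.

	/* Sketch only: assumes <linux/cma.h> and a previously reserved CMA area. */
	static struct folio *demo_cma_grab(struct cma *area, int order)
	{
		/* order must be > 0 and gfp must carry __GFP_COMP, or NULL is returned */
		return cma_alloc_folio(area, order, GFP_KERNEL | __GFP_COMP);
	}

	static void demo_cma_drop(struct cma *area, struct folio *folio)
	{
		/* cma_free_folio() WARNs and returns false for non-large folios */
		if (!cma_free_folio(area, folio))
			pr_warn("demo: folio did not come from this CMA area\n");
	}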
diff --git a/mm/compaction.c b/mm/compaction.c
index eb95e9b435d0..a2b16b08cbbf 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -23,6 +23,7 @@
#include <linux/freezer.h>
#include <linux/page_owner.h>
#include <linux/psi.h>
+#include <linux/cpuset.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -86,33 +87,6 @@ static struct page *mark_allocated_noprof(struct page *page, unsigned int order,
}
#define mark_allocated(...) alloc_hooks(mark_allocated_noprof(__VA_ARGS__))
-static void split_map_pages(struct list_head *freepages)
-{
- unsigned int i, order;
- struct page *page, *next;
- LIST_HEAD(tmp_list);
-
- for (order = 0; order < NR_PAGE_ORDERS; order++) {
- list_for_each_entry_safe(page, next, &freepages[order], lru) {
- unsigned int nr_pages;
-
- list_del(&page->lru);
-
- nr_pages = 1 << order;
-
- mark_allocated(page, order, __GFP_MOVABLE);
- if (order)
- split_page(page, order);
-
- for (i = 0; i < nr_pages; i++) {
- list_add(&page->lru, &tmp_list);
- page++;
- }
- }
- list_splice_init(&tmp_list, &freepages[0]);
- }
-}
-
static unsigned long release_free_list(struct list_head *freepages)
{
int order;
@@ -742,11 +716,11 @@ isolate_fail:
*
* Non-free pages, invalid PFNs, or zone boundaries within the
* [start_pfn, end_pfn) range are considered errors, cause function to
- * undo its actions and return zero.
+ * undo its actions and return zero. cc->freepages[] are empty.
*
* Otherwise, function returns one-past-the-last PFN of isolated page
* (which may be greater then end_pfn if end fell in a middle of
- * a free page).
+ * a free page). cc->freepages[] contain free pages isolated.
*/
unsigned long
isolate_freepages_range(struct compact_control *cc,
@@ -754,10 +728,9 @@ isolate_freepages_range(struct compact_control *cc,
{
unsigned long isolated, pfn, block_start_pfn, block_end_pfn;
int order;
- struct list_head tmp_freepages[NR_PAGE_ORDERS];
for (order = 0; order < NR_PAGE_ORDERS; order++)
- INIT_LIST_HEAD(&tmp_freepages[order]);
+ INIT_LIST_HEAD(&cc->freepages[order]);
pfn = start_pfn;
block_start_pfn = pageblock_start_pfn(pfn);
@@ -788,7 +761,7 @@ isolate_freepages_range(struct compact_control *cc,
break;
isolated = isolate_freepages_block(cc, &isolate_start_pfn,
- block_end_pfn, tmp_freepages, 0, true);
+ block_end_pfn, cc->freepages, 0, true);
/*
* In strict mode, isolate_freepages_block() returns 0 if
@@ -807,13 +780,10 @@ isolate_freepages_range(struct compact_control *cc,
if (pfn < end_pfn) {
/* Loop terminated early, cleanup. */
- release_free_list(tmp_freepages);
+ release_free_list(cc->freepages);
return 0;
}
- /* __isolate_free_page() does not map the pages */
- split_map_pages(tmp_freepages);
-
/* We don't use freelists for anything. */
return pfn;
}
@@ -2853,6 +2823,11 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
ac->highest_zoneidx, ac->nodemask) {
enum compact_result status;
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+
if (prio > MIN_COMPACT_PRIORITY
&& compaction_deferred(zone, order)) {
rc = max_t(enum compact_result, COMPACT_DEFERRED, rc);
diff --git a/mm/damon/core.c b/mm/damon/core.c
index 7a87628b76ab..a83f3b736d51 100644
--- a/mm/damon/core.c
+++ b/mm/damon/core.c
@@ -552,7 +552,13 @@ static unsigned int damon_accesses_bp_to_nr_accesses(
return accesses_bp * damon_max_nr_accesses(attrs) / 10000;
}
-/* convert nr_accesses to access ratio in bp (per 10,000) */
+/*
+ * Convert nr_accesses to access ratio in bp (per 10,000).
+ *
+ * Callers should ensure attrs.aggr_interval is not zero, like
+ * damon_update_monitoring_results() does. Otherwise, divide-by-zero would
+ * happen.
+ */
static unsigned int damon_nr_accesses_to_accesses_bp(
unsigned int nr_accesses, struct damon_attrs *attrs)
{
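
A worked example of the conversion the new comment guards (hedged: it assumes damon_max_nr_accesses() is aggr_interval / sample_interval, which is not shown in this hunk): with sample_interval = 5000 us and aggr_interval = 100000 us, max_nr_accesses is 20, so nr_accesses = 10 maps to 10 * 10000 / 20 = 5000 bp, i.e. 50%. If aggr_interval were 0, that divisor collapses to 0, which is exactly the divide-by-zero callers are told to rule out.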
@@ -1582,13 +1588,16 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
return;
/* Fill up the score histogram */
- memset(quota->histogram, 0, sizeof(quota->histogram));
+ memset(c->regions_score_histogram, 0,
+ sizeof(*c->regions_score_histogram) *
+ (DAMOS_MAX_SCORE + 1));
damon_for_each_target(t, c) {
damon_for_each_region(r, t) {
if (!__damos_valid_target(r, s))
continue;
score = c->ops.get_scheme_score(c, t, r, s);
- quota->histogram[score] += damon_sz_region(r);
+ c->regions_score_histogram[score] +=
+ damon_sz_region(r);
if (score > max_score)
max_score = score;
}
@@ -1596,7 +1605,7 @@ static void damos_adjust_quota(struct damon_ctx *c, struct damos *s)
/* Set the min score limit */
for (cumulated_sz = 0, score = max_score; ; score--) {
- cumulated_sz += quota->histogram[score];
+ cumulated_sz += c->regions_score_histogram[score];
if (cumulated_sz >= quota->esz || !score)
break;
}
@@ -1957,6 +1966,10 @@ static int kdamond_fn(void *data)
ctx->ops.init(ctx);
if (ctx->callback.before_start && ctx->callback.before_start(ctx))
goto done;
+ ctx->regions_score_histogram = kmalloc_array(DAMOS_MAX_SCORE + 1,
+ sizeof(*ctx->regions_score_histogram), GFP_KERNEL);
+ if (!ctx->regions_score_histogram)
+ goto done;
sz_limit = damon_region_sz_limit(ctx);
@@ -2034,6 +2047,7 @@ done:
ctx->callback.before_terminate(ctx);
if (ctx->ops.cleanup)
ctx->ops.cleanup(ctx);
+ kfree(ctx->regions_score_histogram);
pr_debug("kdamond (%d) finishes\n", current->pid);
mutex_lock(&ctx->kdamond_lock);
@@ -2205,4 +2219,4 @@ static int __init damon_init(void)
subsys_initcall(damon_init);
-#include "core-test.h"
+#include "tests/core-kunit.h"
diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c
index 51a6f1cac385..b4213bc47e44 100644
--- a/mm/damon/dbgfs.c
+++ b/mm/damon/dbgfs.c
@@ -1145,4 +1145,4 @@ out:
module_init(damon_dbgfs_init);
-#include "dbgfs-test.h"
+#include "tests/dbgfs-kunit.h"
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index cffc755e7775..58145d59881d 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1882,4 +1882,4 @@ out:
}
subsys_initcall(damon_sysfs_init);
-#include "sysfs-test.h"
+#include "tests/sysfs-kunit.h"
diff --git a/mm/damon/tests/.kunitconfig b/mm/damon/tests/.kunitconfig
new file mode 100644
index 000000000000..a73be044fc9b
--- /dev/null
+++ b/mm/damon/tests/.kunitconfig
@@ -0,0 +1,22 @@
+# for DAMON core
+CONFIG_KUNIT=y
+CONFIG_DAMON=y
+CONFIG_DAMON_KUNIT_TEST=y
+
+# for DAMON vaddr ops
+CONFIG_MMU=y
+CONFIG_PAGE_IDLE_FLAG=y
+CONFIG_DAMON_VADDR=y
+CONFIG_DAMON_VADDR_KUNIT_TEST=y
+
+# for DAMON sysfs interface
+CONFIG_SYSFS=y
+CONFIG_DAMON_SYSFS=y
+CONFIG_DAMON_SYSFS_KUNIT_TEST=y
+
+# for DAMON debugfs interface
+CONFIG_DEBUG_FS=y
+CONFIG_DAMON_PADDR=y
+CONFIG_DAMON_DBGFS_DEPRECATED=y
+CONFIG_DAMON_DBGFS=y
+CONFIG_DAMON_DBGFS_KUNIT_TEST=y
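
The new per-directory .kunitconfig bundles the DAMON KUnit tests so they can be run as a group; presumably something like the following from the top of the kernel tree (kunit.py and its --kunitconfig option are standard KUnit tooling, not added by this patch):

	./tools/testing/kunit/kunit.py run --kunitconfig=mm/damon/tests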
diff --git a/mm/damon/core-test.h b/mm/damon/tests/core-kunit.h
index 0cee634f3544..cf22e09a3507 100644
--- a/mm/damon/core-test.h
+++ b/mm/damon/tests/core-kunit.h
@@ -246,16 +246,20 @@ static void damon_test_split_regions_of(struct kunit *test)
static void damon_test_ops_registration(struct kunit *test)
{
struct damon_ctx *c = damon_new_ctx();
- struct damon_operations ops, bak;
+ struct damon_operations ops = {.id = DAMON_OPS_VADDR}, bak;
+ bool need_cleanup = false;
+
+ /* DAMON_OPS_VADDR is registered only if CONFIG_DAMON_VADDR is set */
+ if (!damon_is_registered_ops(DAMON_OPS_VADDR)) {
+ bak.id = DAMON_OPS_VADDR;
+ KUNIT_EXPECT_EQ(test, damon_register_ops(&bak), 0);
+ need_cleanup = true;
+ }
- /* DAMON_OPS_{V,P}ADDR are registered on subsys_initcall */
+ /* DAMON_OPS_VADDR is ensured to be registered */
KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_VADDR), 0);
- KUNIT_EXPECT_EQ(test, damon_select_ops(c, DAMON_OPS_PADDR), 0);
/* Double-registration is prohibited */
- ops.id = DAMON_OPS_VADDR;
- KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
- ops.id = DAMON_OPS_PADDR;
KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
/* Unknown ops id cannot be registered */
@@ -278,6 +282,13 @@ static void damon_test_ops_registration(struct kunit *test)
KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL);
damon_destroy_ctx(c);
+
+ if (need_cleanup) {
+ mutex_lock(&damon_ops_lock);
+ damon_registered_ops[DAMON_OPS_VADDR] =
+ (struct damon_operations){};
+ mutex_unlock(&damon_ops_lock);
+ }
}
static void damon_test_set_regions(struct kunit *test)
@@ -309,6 +320,18 @@ static void damon_test_nr_accesses_to_accesses_bp(struct kunit *test)
.aggr_interval = ((unsigned long)UINT_MAX + 1) * 10
};
+ /*
+ * In some cases such as 32bit architectures where UINT_MAX is
+ * ULONG_MAX, attrs.aggr_interval becomes zero. Calling
+ * damon_nr_accesses_to_accesses_bp() in the case will cause
+ * divide-by-zero. Such case is prohibited in normal execution since
+ * the caution is documented on the comment for the function, and
+ * damon_update_monitoring_results() does the check. Skip the test in
+ * the case.
+ */
+ if (!attrs.aggr_interval)
+ kunit_skip(test, "aggr_interval is zero.");
+
KUNIT_EXPECT_EQ(test, damon_nr_accesses_to_accesses_bp(123, &attrs), 0);
}
diff --git a/mm/damon/dbgfs-test.h b/mm/damon/tests/dbgfs-kunit.h
index 2d85217f5ba4..d2ecfcc8db86 100644
--- a/mm/damon/dbgfs-test.h
+++ b/mm/damon/tests/dbgfs-kunit.h
@@ -73,6 +73,11 @@ static void damon_dbgfs_test_set_targets(struct kunit *test)
struct damon_ctx *ctx = dbgfs_new_ctx();
char buf[64];
+ if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
+ dbgfs_destroy_ctx(ctx);
+ kunit_skip(test, "PADDR not registered");
+ }
+
/* Make DAMON consider target has no pid */
damon_select_ops(ctx, DAMON_OPS_PADDR);
@@ -111,6 +116,11 @@ static void damon_dbgfs_test_set_init_regions(struct kunit *test)
int i, rc;
char buf[256];
+ if (!damon_is_registered_ops(DAMON_OPS_PADDR)) {
+ damon_destroy_ctx(ctx);
+ kunit_skip(test, "PADDR not registered");
+ }
+
damon_select_ops(ctx, DAMON_OPS_PADDR);
dbgfs_set_targets(ctx, 3, NULL);
diff --git a/mm/damon/sysfs-test.h b/mm/damon/tests/sysfs-kunit.h
index 1c9b596057a7..1c9b596057a7 100644
--- a/mm/damon/sysfs-test.h
+++ b/mm/damon/tests/sysfs-kunit.h
diff --git a/mm/damon/vaddr-test.h b/mm/damon/tests/vaddr-kunit.h
index 83626483f82b..a339d117150f 100644
--- a/mm/damon/vaddr-test.h
+++ b/mm/damon/tests/vaddr-kunit.h
@@ -77,7 +77,7 @@ static void damon_test_three_regions_in_vmas(struct kunit *test)
(struct vm_area_struct) {.vm_start = 307, .vm_end = 330},
};
- mt_init_flags(&mm.mm_mt, MM_MT_FLAGS);
+ mt_init_flags(&mm.mm_mt, MT_FLAGS_ALLOC_RANGE | MT_FLAGS_USE_RCU);
if (__link_vmas(&mm.mm_mt, vmas, ARRAY_SIZE(vmas)))
kunit_skip(test, "Failed to create VMA tree");
diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c
index a0036dc78a3b..08cfd22b5249 100644
--- a/mm/damon/vaddr.c
+++ b/mm/damon/vaddr.c
@@ -732,4 +732,4 @@ static int __init damon_va_initcall(void)
subsys_initcall(damon_va_initcall);
-#include "vaddr-test.h"
+#include "tests/vaddr-kunit.h"
diff --git a/mm/debug.c b/mm/debug.c
index 69e524c3e601..aa57d3ffd4ed 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -36,11 +36,6 @@ const struct trace_print_flags pageflag_names[] = {
{0, NULL}
};
-const struct trace_print_flags pagetype_names[] = {
- __def_pagetype_names,
- {0, NULL}
-};
-
const struct trace_print_flags gfpflag_names[] = {
__def_gfpflag_names,
{0, NULL}
@@ -51,6 +46,27 @@ const struct trace_print_flags vmaflag_names[] = {
{0, NULL}
};
+#define DEF_PAGETYPE_NAME(_name) [PGTY_##_name - 0xf0] = __stringify(_name)
+
+static const char *page_type_names[] = {
+ DEF_PAGETYPE_NAME(slab),
+ DEF_PAGETYPE_NAME(hugetlb),
+ DEF_PAGETYPE_NAME(offline),
+ DEF_PAGETYPE_NAME(guard),
+ DEF_PAGETYPE_NAME(table),
+ DEF_PAGETYPE_NAME(buddy),
+ DEF_PAGETYPE_NAME(unaccepted),
+};
+
+static const char *page_type_name(unsigned int page_type)
+{
+ unsigned i = (page_type >> 24) - 0xf0;
+
+ if (i >= ARRAY_SIZE(page_type_names))
+ return "unknown";
+ return page_type_names[i];
+}
+
static void __dump_folio(struct folio *folio, struct page *page,
unsigned long pfn, unsigned long idx)
{
@@ -58,7 +74,7 @@ static void __dump_folio(struct folio *folio, struct page *page,
int mapcount = atomic_read(&page->_mapcount);
char *type = "";
- mapcount = page_type_has_type(mapcount) ? 0 : mapcount + 1;
+ mapcount = page_mapcount_is_type(mapcount) ? 0 : mapcount + 1;
pr_warn("page: refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
folio_ref_count(folio), mapcount, mapping,
folio->index + idx, pfn);
@@ -92,7 +108,8 @@ static void __dump_folio(struct folio *folio, struct page *page,
pr_warn("%sflags: %pGp%s\n", type, &folio->flags,
is_migrate_cma_folio(folio, pfn) ? " CMA" : "");
if (page_has_type(&folio->page))
- pr_warn("page_type: %pGt\n", &folio->page.page_type);
+ pr_warn("page_type: %x(%s)\n", folio->page.page_type >> 24,
+ page_type_name(folio->page.page_type));
print_hex_dump(KERN_WARNING, "raw: ", DUMP_PREFIX_NONE, 32,
sizeof(unsigned long), page,
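
As a quick illustration of the new page_type reporting above (the value is hypothetical): a folio whose page_type top byte is 0xf4 indexes entry 4 of page_type_names[] and would be dumped as

	page_type: f4(table)

while any top byte beyond the known entries falls back to "unknown".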
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
index e4969fb54da3..bc748f700a9e 100644
--- a/mm/debug_vm_pgtable.c
+++ b/mm/debug_vm_pgtable.c
@@ -231,10 +231,10 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
flush_dcache_page(page);
pmdp_set_wrprotect(args->mm, vaddr, args->pmdp);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(pmd_write(pmd));
pmdp_huge_get_and_clear(args->mm, vaddr, args->pmdp);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(!pmd_none(pmd));
pmd = pfn_pmd(args->pmd_pfn, args->page_prot);
@@ -245,10 +245,10 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
pmd = pmd_mkwrite(pmd, args->vma);
pmd = pmd_mkdirty(pmd);
pmdp_set_access_flags(args->vma, vaddr, args->pmdp, pmd, 1);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
pmdp_huge_get_and_clear_full(args->vma, vaddr, args->pmdp, 1);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(!pmd_none(pmd));
pmd = pmd_mkhuge(pfn_pmd(args->pmd_pfn, args->page_prot));
@@ -256,7 +256,7 @@ static void __init pmd_advanced_tests(struct pgtable_debug_args *args)
set_pmd_at(args->mm, vaddr, args->pmdp, pmd);
flush_dcache_page(page);
pmdp_test_and_clear_young(args->vma, vaddr, args->pmdp);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(pmd_young(pmd));
/* Clear the pte entries */
@@ -357,12 +357,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_set_wrprotect(args->mm, vaddr, args->pudp);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(pud_write(pud));
#ifndef __PAGETABLE_PMD_FOLDED
pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
pud = pfn_pud(args->pud_pfn, args->page_prot);
@@ -374,12 +374,12 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
pud = pud_mkwrite(pud);
pud = pud_mkdirty(pud);
pudp_set_access_flags(args->vma, vaddr, args->pudp, pud, 1);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(!(pud_write(pud) && pud_dirty(pud)));
#ifndef __PAGETABLE_PMD_FOLDED
pudp_huge_get_and_clear_full(args->vma, vaddr, args->pudp, 1);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(!pud_none(pud));
#endif /* __PAGETABLE_PMD_FOLDED */
@@ -389,7 +389,7 @@ static void __init pud_advanced_tests(struct pgtable_debug_args *args)
set_pud_at(args->mm, vaddr, args->pudp, pud);
flush_dcache_page(page);
pudp_test_and_clear_young(args->vma, vaddr, args->pudp);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(pud_young(pud));
pudp_huge_get_and_clear(args->mm, vaddr, args->pudp);
@@ -441,7 +441,7 @@ static void __init pmd_huge_tests(struct pgtable_debug_args *args)
WRITE_ONCE(*args->pmdp, __pmd(0));
WARN_ON(!pmd_set_huge(args->pmdp, __pfn_to_phys(args->fixed_pmd_pfn), args->page_prot));
WARN_ON(!pmd_clear_huge(args->pmdp));
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(!pmd_none(pmd));
}
@@ -461,7 +461,7 @@ static void __init pud_huge_tests(struct pgtable_debug_args *args)
WRITE_ONCE(*args->pudp, __pud(0));
WARN_ON(!pud_set_huge(args->pudp, __pfn_to_phys(args->fixed_pud_pfn), args->page_prot));
WARN_ON(!pud_clear_huge(args->pudp));
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(!pud_none(pud));
}
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
@@ -490,7 +490,7 @@ static void __init pgd_basic_tests(struct pgtable_debug_args *args)
#ifndef __PAGETABLE_PUD_FOLDED
static void __init pud_clear_tests(struct pgtable_debug_args *args)
{
- pud_t pud = READ_ONCE(*args->pudp);
+ pud_t pud = pudp_get(args->pudp);
if (mm_pmd_folded(args->mm))
return;
@@ -498,7 +498,7 @@ static void __init pud_clear_tests(struct pgtable_debug_args *args)
pr_debug("Validating PUD clear\n");
WARN_ON(pud_none(pud));
pud_clear(args->pudp);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(!pud_none(pud));
}
@@ -515,7 +515,7 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args)
* Hence this must not qualify as pud_bad().
*/
pud_populate(args->mm, args->pudp, args->start_pmdp);
- pud = READ_ONCE(*args->pudp);
+ pud = pudp_get(args->pudp);
WARN_ON(pud_bad(pud));
}
#else /* !__PAGETABLE_PUD_FOLDED */
@@ -526,7 +526,7 @@ static void __init pud_populate_tests(struct pgtable_debug_args *args) { }
#ifndef __PAGETABLE_P4D_FOLDED
static void __init p4d_clear_tests(struct pgtable_debug_args *args)
{
- p4d_t p4d = READ_ONCE(*args->p4dp);
+ p4d_t p4d = p4dp_get(args->p4dp);
if (mm_pud_folded(args->mm))
return;
@@ -534,7 +534,7 @@ static void __init p4d_clear_tests(struct pgtable_debug_args *args)
pr_debug("Validating P4D clear\n");
WARN_ON(p4d_none(p4d));
p4d_clear(args->p4dp);
- p4d = READ_ONCE(*args->p4dp);
+ p4d = p4dp_get(args->p4dp);
WARN_ON(!p4d_none(p4d));
}
@@ -553,13 +553,13 @@ static void __init p4d_populate_tests(struct pgtable_debug_args *args)
pud_clear(args->pudp);
p4d_clear(args->p4dp);
p4d_populate(args->mm, args->p4dp, args->start_pudp);
- p4d = READ_ONCE(*args->p4dp);
+ p4d = p4dp_get(args->p4dp);
WARN_ON(p4d_bad(p4d));
}
static void __init pgd_clear_tests(struct pgtable_debug_args *args)
{
- pgd_t pgd = READ_ONCE(*(args->pgdp));
+ pgd_t pgd = pgdp_get(args->pgdp);
if (mm_p4d_folded(args->mm))
return;
@@ -567,7 +567,7 @@ static void __init pgd_clear_tests(struct pgtable_debug_args *args)
pr_debug("Validating PGD clear\n");
WARN_ON(pgd_none(pgd));
pgd_clear(args->pgdp);
- pgd = READ_ONCE(*args->pgdp);
+ pgd = pgdp_get(args->pgdp);
WARN_ON(!pgd_none(pgd));
}
@@ -586,7 +586,7 @@ static void __init pgd_populate_tests(struct pgtable_debug_args *args)
p4d_clear(args->p4dp);
pgd_clear(args->pgdp);
pgd_populate(args->mm, args->pgdp, args->start_p4dp);
- pgd = READ_ONCE(*args->pgdp);
+ pgd = pgdp_get(args->pgdp);
WARN_ON(pgd_bad(pgd));
}
#else /* !__PAGETABLE_P4D_FOLDED */
@@ -627,12 +627,12 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
static void __init pmd_clear_tests(struct pgtable_debug_args *args)
{
- pmd_t pmd = READ_ONCE(*args->pmdp);
+ pmd_t pmd = pmdp_get(args->pmdp);
pr_debug("Validating PMD clear\n");
WARN_ON(pmd_none(pmd));
pmd_clear(args->pmdp);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(!pmd_none(pmd));
}
@@ -646,7 +646,7 @@ static void __init pmd_populate_tests(struct pgtable_debug_args *args)
* Hence this must not qualify as pmd_bad().
*/
pmd_populate(args->mm, args->pmdp, args->start_ptep);
- pmd = READ_ONCE(*args->pmdp);
+ pmd = pmdp_get(args->pmdp);
WARN_ON(pmd_bad(pmd));
}
@@ -1251,7 +1251,7 @@ static int __init init_args(struct pgtable_debug_args *args)
ret = -ENOMEM;
goto error;
}
- args->start_ptep = pmd_pgtable(READ_ONCE(*args->pmdp));
+ args->start_ptep = pmd_pgtable(pmdp_get(args->pmdp));
WARN_ON(!args->start_ptep);
init_fixed_pfns(args);
diff --git a/mm/filemap.c b/mm/filemap.c
index 65c515e7bbf0..4f3753f0a158 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -46,6 +46,7 @@
#include <linux/pipe_fs_i.h>
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
+#include <linux/sched/mm.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -112,8 +113,8 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->lruvec->lru_lock (follow_page->mark_page_accessed)
- * ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
+ * ->lruvec->lru_lock (follow_page_mask->mark_page_accessed)
+ * ->lruvec->lru_lock (check_pte_range->folio_isolate_lru)
* ->private_lock (folio_remove_rmap_pte->set_page_dirty)
* ->i_pages lock (folio_remove_rmap_pte->set_page_dirty)
* bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty)
@@ -530,7 +531,6 @@ static void __filemap_fdatawait_range(struct address_space *mapping,
struct folio *folio = fbatch.folios[i];
folio_wait_writeback(folio);
- folio_clear_error(folio);
}
folio_batch_release(&fbatch);
cond_resched();
@@ -2049,17 +2049,20 @@ unsigned find_get_entries(struct address_space *mapping, pgoff_t *start,
if (!folio_batch_add(fbatch, folio))
break;
}
- rcu_read_unlock();
if (folio_batch_count(fbatch)) {
- unsigned long nr = 1;
+ unsigned long nr;
int idx = folio_batch_count(fbatch) - 1;
folio = fbatch->folios[idx];
if (!xa_is_value(folio))
nr = folio_nr_pages(folio);
- *start = indices[idx] + nr;
+ else
+ nr = 1 << xa_get_order(&mapping->i_pages, indices[idx]);
+ *start = round_down(indices[idx] + nr, nr);
}
+ rcu_read_unlock();
+
return folio_batch_count(fbatch);
}
@@ -2091,10 +2094,17 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
rcu_read_lock();
while ((folio = find_get_entry(&xas, end, XA_PRESENT))) {
+ unsigned long base;
+ unsigned long nr;
+
if (!xa_is_value(folio)) {
- if (folio->index < *start)
+ nr = folio_nr_pages(folio);
+ base = folio->index;
+ /* Omit large folio which begins before the start */
+ if (base < *start)
goto put;
- if (folio_next_index(folio) - 1 > end)
+ /* Omit large folio which extends beyond the end */
+ if (base + nr - 1 > end)
goto put;
if (!folio_trylock(folio))
goto put;
@@ -2103,7 +2113,19 @@ unsigned find_lock_entries(struct address_space *mapping, pgoff_t *start,
goto unlock;
VM_BUG_ON_FOLIO(!folio_contains(folio, xas.xa_index),
folio);
+ } else {
+ nr = 1 << xas_get_order(&xas);
+ base = xas.xa_index & ~(nr - 1);
+ /* Omit order>0 value which begins before the start */
+ if (base < *start)
+ continue;
+ /* Omit order>0 value which extends beyond the end */
+ if (base + nr - 1 > end)
+ break;
}
+
+ /* Update start now so that last update is correct on return */
+ *start = base + nr;
indices[fbatch->nr] = xas.xa_index;
if (!folio_batch_add(fbatch, folio))
break;
@@ -2115,15 +2137,6 @@ put:
}
rcu_read_unlock();
- if (folio_batch_count(fbatch)) {
- unsigned long nr = 1;
- int idx = folio_batch_count(fbatch) - 1;
-
- folio = fbatch->folios[idx];
- if (!xa_is_value(folio))
- nr = folio_nr_pages(folio);
- *start = indices[idx] + nr;
- }
return folio_batch_count(fbatch);
}
@@ -2344,13 +2357,6 @@ static int filemap_read_folio(struct file *file, filler_t filler,
unsigned long pflags;
int error;
- /*
- * A previous I/O error may have been due to temporary failures,
- * eg. multipath errors. PG_error will be set again if read_folio
- * fails.
- */
- folio_clear_error(folio);
-
/* Start the actual read. The read will unlock the page. */
if (unlikely(workingset))
psi_memstall_enter(&pflags);
@@ -2519,6 +2525,7 @@ static int filemap_get_pages(struct kiocb *iocb, size_t count,
pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
pgoff_t last_index;
struct folio *folio;
+ unsigned int flags;
int err = 0;
/* "last_index" is the index of the page beyond the end of the read */
@@ -2531,8 +2538,12 @@ retry:
if (!folio_batch_count(fbatch)) {
if (iocb->ki_flags & IOCB_NOIO)
return -EAGAIN;
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ flags = memalloc_noio_save();
page_cache_sync_readahead(mapping, ra, filp, index,
last_index - index);
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ memalloc_noio_restore(flags);
filemap_get_read_batch(mapping, index, last_index - 1, fbatch);
}
if (!folio_batch_count(fbatch)) {
@@ -2560,6 +2571,7 @@ retry:
goto err;
}
+ trace_mm_filemap_get_pages(mapping, index, last_index - 1);
return 0;
err:
if (err < 0)
@@ -3000,7 +3012,7 @@ unlock:
static inline size_t seek_folio_size(struct xa_state *xas, struct folio *folio)
{
if (xa_is_value(folio))
- return PAGE_SIZE << xa_get_order(xas->xa, xas->xa_index);
+ return PAGE_SIZE << xas_get_order(xas);
return folio_size(folio);
}
@@ -3298,6 +3310,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
if (unlikely(index >= max_idx))
return VM_FAULT_SIGBUS;
+ trace_mm_filemap_fault(mapping, index);
+
/*
* Do we have something in the page cache already?
*/
@@ -3668,6 +3682,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
add_mm_counter(vma->vm_mm, folio_type, rss);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ trace_mm_filemap_map_pages(mapping, start_pgoff, end_pgoff);
out:
rcu_read_unlock();
@@ -4297,7 +4312,7 @@ static void filemap_cachestat(struct address_space *mapping,
if (xas_retry(&xas, folio))
continue;
- order = xa_get_order(xas.xa, xas.xa_index);
+ order = xas_get_order(&xas);
nr_pages = 1 << order;
folio_first_index = round_down(xas.xa_index, 1 << order);
folio_last_index = folio_first_index + nr_pages - 1;
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index f05906006b3c..80746182e9e8 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,15 +92,3 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(grab_cache_page_write_begin);
-
-bool isolate_lru_page(struct page *page)
-{
- if (WARN_RATELIMIT(PageTail(page), "trying to isolate tail page"))
- return false;
- return folio_isolate_lru((struct folio *)page);
-}
-
-void putback_lru_page(struct page *page)
-{
- folio_putback_lru(page_folio(page));
-}
diff --git a/mm/gup.c b/mm/gup.c
index 02c46ae33028..8232c8c9c372 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -832,6 +832,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
struct dev_pagemap **pgmap)
{
struct mm_struct *mm = vma->vm_mm;
+ struct folio *folio;
struct page *page;
spinlock_t *ptl;
pte_t *ptep, pte;
@@ -889,6 +890,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
goto out;
}
}
+ folio = page_folio(page);
if (!pte_write(pte) && gup_must_unshare(vma, flags, page)) {
page = ERR_PTR(-EMLINK);
@@ -899,7 +901,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
!PageAnonExclusive(page), page);
/* try_grab_folio() does nothing unless FOLL_GET or FOLL_PIN is set. */
- ret = try_grab_folio(page_folio(page), 1, flags);
+ ret = try_grab_folio(folio, 1, flags);
if (unlikely(ret)) {
page = ERR_PTR(ret);
goto out;
@@ -911,7 +913,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
* Documentation/core-api/pin_user_pages.rst for details.
*/
if (flags & FOLL_PIN) {
- ret = arch_make_page_accessible(page);
+ ret = arch_make_folio_accessible(folio);
if (ret) {
unpin_user_page(page);
page = ERR_PTR(ret);
@@ -1083,28 +1085,6 @@ static struct page *follow_page_mask(struct vm_area_struct *vma,
return page;
}
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int foll_flags)
-{
- struct follow_page_context ctx = { NULL };
- struct page *page;
-
- if (vma_is_secretmem(vma))
- return NULL;
-
- if (WARN_ON_ONCE(foll_flags & FOLL_PIN))
- return NULL;
-
- /*
- * We never set FOLL_HONOR_NUMA_FAULT because callers don't expect
- * to fail on PROT_NONE-mapped pages.
- */
- page = follow_page_mask(vma, address, foll_flags, &ctx);
- if (ctx.pgmap)
- put_dev_pagemap(ctx.pgmap);
- return page;
-}
-
static int get_gate_page(struct mm_struct *mm, unsigned long address,
unsigned int gup_flags, struct vm_area_struct **vma,
struct page **page)
@@ -1166,19 +1146,19 @@ unmap:
* to 0 and -EBUSY returned.
*/
static int faultin_page(struct vm_area_struct *vma,
- unsigned long address, unsigned int *flags, bool unshare,
+ unsigned long address, unsigned int flags, bool unshare,
int *locked)
{
unsigned int fault_flags = 0;
vm_fault_t ret;
- if (*flags & FOLL_NOFAULT)
+ if (flags & FOLL_NOFAULT)
return -EFAULT;
- if (*flags & FOLL_WRITE)
+ if (flags & FOLL_WRITE)
fault_flags |= FAULT_FLAG_WRITE;
- if (*flags & FOLL_REMOTE)
+ if (flags & FOLL_REMOTE)
fault_flags |= FAULT_FLAG_REMOTE;
- if (*flags & FOLL_UNLOCKABLE) {
+ if (flags & FOLL_UNLOCKABLE) {
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE;
/*
* FAULT_FLAG_INTERRUPTIBLE is opt-in. GUP callers must set
@@ -1186,12 +1166,12 @@ static int faultin_page(struct vm_area_struct *vma,
* That's because some callers may not be prepared to
* handle early exits caused by non-fatal signals.
*/
- if (*flags & FOLL_INTERRUPTIBLE)
+ if (flags & FOLL_INTERRUPTIBLE)
fault_flags |= FAULT_FLAG_INTERRUPTIBLE;
}
- if (*flags & FOLL_NOWAIT)
+ if (flags & FOLL_NOWAIT)
fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT;
- if (*flags & FOLL_TRIED) {
+ if (flags & FOLL_TRIED) {
/*
* Note: FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_TRIED
* can co-exist
@@ -1225,7 +1205,7 @@ static int faultin_page(struct vm_area_struct *vma,
}
if (ret & VM_FAULT_ERROR) {
- int err = vm_fault_to_errno(ret, *flags);
+ int err = vm_fault_to_errno(ret, flags);
if (err)
return err;
@@ -1450,7 +1430,6 @@ static long __get_user_pages(struct mm_struct *mm,
do {
struct page *page;
- unsigned int foll_flags = gup_flags;
unsigned int page_increm;
/* first iteration or cross vma bound */
@@ -1501,9 +1480,9 @@ retry:
}
cond_resched();
- page = follow_page_mask(vma, start, foll_flags, &ctx);
+ page = follow_page_mask(vma, start, gup_flags, &ctx);
if (!page || PTR_ERR(page) == -EMLINK) {
- ret = faultin_page(vma, start, &foll_flags,
+ ret = faultin_page(vma, start, gup_flags,
PTR_ERR(page) == -EMLINK, locked);
switch (ret) {
case 0:
@@ -1560,13 +1539,12 @@ next_page:
* large folio, this should never fail.
*/
if (try_grab_folio(folio, page_increm - 1,
- foll_flags)) {
+ gup_flags)) {
/*
* Release the 1st page ref if the
* folio is problematic, fail hard.
*/
- gup_put_folio(folio, 1,
- foll_flags);
+ gup_put_folio(folio, 1, gup_flags);
ret = -EFAULT;
goto out;
}
@@ -2370,7 +2348,7 @@ static int migrate_longterm_unpinnable_folios(
folio_get(folio);
gup_put_folio(folio, 1, FOLL_PIN);
- if (migrate_device_coherent_page(&folio->page)) {
+ if (migrate_device_coherent_folio(folio)) {
ret = -EBUSY;
goto err;
}
@@ -2532,7 +2510,7 @@ static bool is_valid_gup_args(struct page **pages, int *locked,
* These flags not allowed to be specified externally to the gup
* interfaces:
* - FOLL_TOUCH/FOLL_PIN/FOLL_TRIED/FOLL_FAST_ONLY are internal only
- * - FOLL_REMOTE is internal only and used on follow_page()
+ * - FOLL_REMOTE is internal only, set in (get|pin)_user_pages_remote()
* - FOLL_UNLOCKABLE is internal only and used if locked is !NULL
*/
if (WARN_ON_ONCE(gup_flags & INTERNAL_GUP_FLAGS))
@@ -2934,7 +2912,7 @@ static int gup_fast_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr,
* details.
*/
if (flags & FOLL_PIN) {
- ret = arch_make_page_accessible(page);
+ ret = arch_make_folio_accessible(folio);
if (ret) {
gup_put_folio(folio, 1, flags);
goto pte_unmap;
@@ -3073,6 +3051,9 @@ static int gup_fast_pmd_leaf(pmd_t orig, pmd_t *pmdp, unsigned long addr,
if (!pmd_access_permitted(orig, flags & FOLL_WRITE))
return 0;
+ if (pmd_special(orig))
+ return 0;
+
if (pmd_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
@@ -3117,6 +3098,9 @@ static int gup_fast_pud_leaf(pud_t orig, pud_t *pudp, unsigned long addr,
if (!pud_access_permitted(orig, flags & FOLL_WRITE))
return 0;
+ if (pud_special(orig))
+ return 0;
+
if (pud_devmap(orig)) {
if (unlikely(flags & FOLL_LONGTERM))
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b71f744d360c..0580ac9e47b9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -40,6 +40,7 @@
#include <linux/memory-tiers.h>
#include <linux/compat.h>
#include <linux/pgalloc_tag.h>
+#include <linux/pagewalk.h>
#include <asm/tlb.h>
#include <asm/pgalloc.h>
@@ -73,6 +74,7 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc);
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc);
+static bool split_underused_thp = true;
static atomic_t huge_zero_refcount;
struct folio *huge_zero_folio __read_mostly;
@@ -80,6 +82,7 @@ unsigned long huge_zero_pfn __read_mostly = ~0UL;
unsigned long huge_anon_orders_always __read_mostly;
unsigned long huge_anon_orders_madvise __read_mostly;
unsigned long huge_anon_orders_inherit __read_mostly;
+static bool anon_orders_configured __initdata;
unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
unsigned long vm_flags,
@@ -94,8 +97,8 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
/* Check the intersection of requested and supported orders. */
if (vma_is_anonymous(vma))
supported_orders = THP_ORDERS_ALL_ANON;
- else if (vma_is_dax(vma))
- supported_orders = THP_ORDERS_ALL_FILE_DAX;
+ else if (vma_is_special_huge(vma))
+ supported_orders = THP_ORDERS_ALL_SPECIAL;
else
supported_orders = THP_ORDERS_ALL_FILE_DEFAULT;
@@ -159,15 +162,10 @@ unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma,
* Must be done before hugepage flags check since shmem has its
* own flags.
*/
- if (!in_pf && shmem_file(vma->vm_file)) {
- bool global_huge = shmem_is_huge(file_inode(vma->vm_file), vma->vm_pgoff,
- !enforce_sysfs, vma->vm_mm, vm_flags);
-
- if (!vma_is_anon_shmem(vma))
- return global_huge ? orders : 0;
+ if (!in_pf && shmem_file(vma->vm_file))
return shmem_allowable_huge_orders(file_inode(vma->vm_file),
- vma, vma->vm_pgoff, global_huge);
- }
+ vma, vma->vm_pgoff, 0,
+ !enforce_sysfs);
if (!vma_is_anonymous(vma)) {
/*
@@ -445,6 +443,27 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
+static ssize_t split_underused_thp_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sysfs_emit(buf, "%d\n", split_underused_thp);
+}
+
+static ssize_t split_underused_thp_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err = kstrtobool(buf, &split_underused_thp);
+
+ if (err < 0)
+ return err;
+
+ return count;
+}
+
+static struct kobj_attribute split_underused_thp_attr = __ATTR(
+ shrink_underused, 0644, split_underused_thp_show, split_underused_thp_store);
+
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
@@ -453,6 +472,7 @@ static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
+ &split_underused_thp_attr.attr,
NULL,
};
@@ -465,8 +485,8 @@ static void thpsize_release(struct kobject *kobj);
static DEFINE_SPINLOCK(huge_anon_orders_lock);
static LIST_HEAD(thpsize_list);
-static ssize_t thpsize_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+static ssize_t anon_enabled_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
{
int order = to_thpsize(kobj)->order;
const char *output;
@@ -483,9 +503,9 @@ static ssize_t thpsize_enabled_show(struct kobject *kobj,
return sysfs_emit(buf, "%s\n", output);
}
-static ssize_t thpsize_enabled_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
+static ssize_t anon_enabled_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
{
int order = to_thpsize(kobj)->order;
ssize_t ret = count;
@@ -527,19 +547,35 @@ static ssize_t thpsize_enabled_store(struct kobject *kobj,
return ret;
}
-static struct kobj_attribute thpsize_enabled_attr =
- __ATTR(enabled, 0644, thpsize_enabled_show, thpsize_enabled_store);
+static struct kobj_attribute anon_enabled_attr =
+ __ATTR(enabled, 0644, anon_enabled_show, anon_enabled_store);
+
+static struct attribute *anon_ctrl_attrs[] = {
+ &anon_enabled_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group anon_ctrl_attr_grp = {
+ .attrs = anon_ctrl_attrs,
+};
-static struct attribute *thpsize_attrs[] = {
- &thpsize_enabled_attr.attr,
+static struct attribute *file_ctrl_attrs[] = {
#ifdef CONFIG_SHMEM
&thpsize_shmem_enabled_attr.attr,
#endif
NULL,
};
-static const struct attribute_group thpsize_attr_group = {
- .attrs = thpsize_attrs,
+static const struct attribute_group file_ctrl_attr_grp = {
+ .attrs = file_ctrl_attrs,
+};
+
+static struct attribute *any_ctrl_attrs[] = {
+ NULL,
+};
+
+static const struct attribute_group any_ctrl_attr_grp = {
+ .attrs = any_ctrl_attrs,
};
static const struct kobj_type thpsize_ktype = {
@@ -578,64 +614,136 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK);
DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE);
DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT);
DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK);
+#ifdef CONFIG_SHMEM
DEFINE_MTHP_STAT_ATTR(shmem_alloc, MTHP_STAT_SHMEM_ALLOC);
DEFINE_MTHP_STAT_ATTR(shmem_fallback, MTHP_STAT_SHMEM_FALLBACK);
DEFINE_MTHP_STAT_ATTR(shmem_fallback_charge, MTHP_STAT_SHMEM_FALLBACK_CHARGE);
+#endif
DEFINE_MTHP_STAT_ATTR(split, MTHP_STAT_SPLIT);
DEFINE_MTHP_STAT_ATTR(split_failed, MTHP_STAT_SPLIT_FAILED);
DEFINE_MTHP_STAT_ATTR(split_deferred, MTHP_STAT_SPLIT_DEFERRED);
+DEFINE_MTHP_STAT_ATTR(nr_anon, MTHP_STAT_NR_ANON);
+DEFINE_MTHP_STAT_ATTR(nr_anon_partially_mapped, MTHP_STAT_NR_ANON_PARTIALLY_MAPPED);
-static struct attribute *stats_attrs[] = {
+static struct attribute *anon_stats_attrs[] = {
&anon_fault_alloc_attr.attr,
&anon_fault_fallback_attr.attr,
&anon_fault_fallback_charge_attr.attr,
+#ifndef CONFIG_SHMEM
&swpout_attr.attr,
&swpout_fallback_attr.attr,
+#endif
+ &split_deferred_attr.attr,
+ &nr_anon_attr.attr,
+ &nr_anon_partially_mapped_attr.attr,
+ NULL,
+};
+
+static struct attribute_group anon_stats_attr_grp = {
+ .name = "stats",
+ .attrs = anon_stats_attrs,
+};
+
+static struct attribute *file_stats_attrs[] = {
+#ifdef CONFIG_SHMEM
&shmem_alloc_attr.attr,
&shmem_fallback_attr.attr,
&shmem_fallback_charge_attr.attr,
+#endif
+ NULL,
+};
+
+static struct attribute_group file_stats_attr_grp = {
+ .name = "stats",
+ .attrs = file_stats_attrs,
+};
+
+static struct attribute *any_stats_attrs[] = {
+#ifdef CONFIG_SHMEM
+ &swpout_attr.attr,
+ &swpout_fallback_attr.attr,
+#endif
&split_attr.attr,
&split_failed_attr.attr,
- &split_deferred_attr.attr,
NULL,
};
-static struct attribute_group stats_attr_group = {
+static struct attribute_group any_stats_attr_grp = {
.name = "stats",
- .attrs = stats_attrs,
+ .attrs = any_stats_attrs,
};
+static int sysfs_add_group(struct kobject *kobj,
+ const struct attribute_group *grp)
+{
+ int ret = -ENOENT;
+
+ /*
+ * If the group is named, try to merge first, assuming the subdirectory
+ * was already created. This avoids the warning emitted by
+ * sysfs_create_group() if the directory already exists.
+ */
+ if (grp->name)
+ ret = sysfs_merge_group(kobj, grp);
+ if (ret)
+ ret = sysfs_create_group(kobj, grp);
+
+ return ret;
+}
+
static struct thpsize *thpsize_create(int order, struct kobject *parent)
{
unsigned long size = (PAGE_SIZE << order) / SZ_1K;
struct thpsize *thpsize;
- int ret;
+ int ret = -ENOMEM;
thpsize = kzalloc(sizeof(*thpsize), GFP_KERNEL);
if (!thpsize)
- return ERR_PTR(-ENOMEM);
+ goto err;
+
+ thpsize->order = order;
ret = kobject_init_and_add(&thpsize->kobj, &thpsize_ktype, parent,
"hugepages-%lukB", size);
if (ret) {
kfree(thpsize);
- return ERR_PTR(ret);
+ goto err;
}
- ret = sysfs_create_group(&thpsize->kobj, &thpsize_attr_group);
- if (ret) {
- kobject_put(&thpsize->kobj);
- return ERR_PTR(ret);
+
+ ret = sysfs_add_group(&thpsize->kobj, &any_ctrl_attr_grp);
+ if (ret)
+ goto err_put;
+
+ ret = sysfs_add_group(&thpsize->kobj, &any_stats_attr_grp);
+ if (ret)
+ goto err_put;
+
+ if (BIT(order) & THP_ORDERS_ALL_ANON) {
+ ret = sysfs_add_group(&thpsize->kobj, &anon_ctrl_attr_grp);
+ if (ret)
+ goto err_put;
+
+ ret = sysfs_add_group(&thpsize->kobj, &anon_stats_attr_grp);
+ if (ret)
+ goto err_put;
}
- ret = sysfs_create_group(&thpsize->kobj, &stats_attr_group);
- if (ret) {
- kobject_put(&thpsize->kobj);
- return ERR_PTR(ret);
+ if (BIT(order) & THP_ORDERS_ALL_FILE_DEFAULT) {
+ ret = sysfs_add_group(&thpsize->kobj, &file_ctrl_attr_grp);
+ if (ret)
+ goto err_put;
+
+ ret = sysfs_add_group(&thpsize->kobj, &file_stats_attr_grp);
+ if (ret)
+ goto err_put;
}
- thpsize->order = order;
return thpsize;
+err_put:
+ kobject_put(&thpsize->kobj);
+err:
+ return ERR_PTR(ret);
}
static void thpsize_release(struct kobject *kobj)
@@ -655,7 +763,8 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
* disable all other sizes. powerpc's PMD_ORDER isn't a compile-time
* constant so we have to do this here.
*/
- huge_anon_orders_inherit = BIT(PMD_ORDER);
+ if (!anon_orders_configured)
+ huge_anon_orders_inherit = BIT(PMD_ORDER);
*hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj);
if (unlikely(!*hugepage_kobj)) {
@@ -675,7 +784,7 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj)
goto remove_hp_group;
}
- orders = THP_ORDERS_ALL_ANON;
+ orders = THP_ORDERS_ALL_ANON | THP_ORDERS_ALL_FILE_DEFAULT;
order = highest_order(orders);
while (orders) {
thpsize = thpsize_create(order, *hugepage_kobj);
@@ -840,6 +949,100 @@ out:
}
__setup("transparent_hugepage=", setup_transparent_hugepage);
+static inline int get_order_from_str(const char *size_str)
+{
+ unsigned long size;
+ char *endptr;
+ int order;
+
+ size = memparse(size_str, &endptr);
+
+ if (!is_power_of_2(size))
+ goto err;
+ order = get_order(size);
+ if (BIT(order) & ~THP_ORDERS_ALL_ANON)
+ goto err;
+
+ return order;
+err:
+ pr_err("invalid size %s in thp_anon boot parameter\n", size_str);
+ return -EINVAL;
+}
+
+static char str_dup[PAGE_SIZE] __initdata;
+static int __init setup_thp_anon(char *str)
+{
+ char *token, *range, *policy, *subtoken;
+ unsigned long always, inherit, madvise;
+ char *start_size, *end_size;
+ int start, end, nr;
+ char *p;
+
+ if (!str || strlen(str) + 1 > PAGE_SIZE)
+ goto err;
+ strcpy(str_dup, str);
+
+ always = huge_anon_orders_always;
+ madvise = huge_anon_orders_madvise;
+ inherit = huge_anon_orders_inherit;
+ p = str_dup;
+ while ((token = strsep(&p, ";")) != NULL) {
+ range = strsep(&token, ":");
+ policy = token;
+
+ if (!policy)
+ goto err;
+
+ while ((subtoken = strsep(&range, ",")) != NULL) {
+ if (strchr(subtoken, '-')) {
+ start_size = strsep(&subtoken, "-");
+ end_size = subtoken;
+
+ start = get_order_from_str(start_size);
+ end = get_order_from_str(end_size);
+ } else {
+ start = end = get_order_from_str(subtoken);
+ }
+
+ if (start < 0 || end < 0 || start > end)
+ goto err;
+
+ nr = end - start + 1;
+ if (!strcmp(policy, "always")) {
+ bitmap_set(&always, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ } else if (!strcmp(policy, "madvise")) {
+ bitmap_set(&madvise, start, nr);
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&always, start, nr);
+ } else if (!strcmp(policy, "inherit")) {
+ bitmap_set(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ } else if (!strcmp(policy, "never")) {
+ bitmap_clear(&inherit, start, nr);
+ bitmap_clear(&madvise, start, nr);
+ bitmap_clear(&always, start, nr);
+ } else {
+ pr_err("invalid policy %s in thp_anon boot parameter\n", policy);
+ goto err;
+ }
+ }
+ }
+
+ huge_anon_orders_always = always;
+ huge_anon_orders_madvise = madvise;
+ huge_anon_orders_inherit = inherit;
+ anon_orders_configured = true;
+ return 1;
+
+err:
+ pr_warn("thp_anon=%s: error parsing string, ignoring setting\n", str);
+ return 0;
+}
+__setup("thp_anon=", setup_thp_anon);
+
pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
{
if (likely(vma->vm_flags & VM_WRITE))
@@ -1009,6 +1212,7 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
+ deferred_split_folio(folio, false);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC);
@@ -1168,6 +1372,8 @@ static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr,
entry = pmd_mkhuge(pfn_t_pmd(pfn, prot));
if (pfn_t_devmap(pfn))
entry = pmd_mkdevmap(entry);
+ else
+ entry = pmd_mkspecial(entry);
if (write) {
entry = pmd_mkyoung(pmd_mkdirty(entry));
entry = maybe_pmd_mkwrite(entry, vma);
@@ -1251,10 +1457,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
ptl = pud_lock(mm, pud);
if (!pud_none(*pud)) {
if (write) {
- if (pud_pfn(*pud) != pfn_t_to_pfn(pfn)) {
- WARN_ON_ONCE(!is_huge_zero_pud(*pud));
+ if (WARN_ON_ONCE(pud_pfn(*pud) != pfn_t_to_pfn(pfn)))
goto out_unlock;
- }
entry = pud_mkyoung(*pud);
entry = maybe_pud_mkwrite(pud_mkdirty(entry), vma);
if (pudp_set_access_flags(vma, addr, pud, entry, 1))
@@ -1266,6 +1470,8 @@ static void insert_pfn_pud(struct vm_area_struct *vma, unsigned long addr,
entry = pud_mkhuge(pfn_t_pud(pfn, prot));
if (pfn_t_devmap(pfn))
entry = pud_mkdevmap(entry);
+ else
+ entry = pud_mkspecial(entry);
if (write) {
entry = pud_mkyoung(pud_mkdirty(entry));
entry = maybe_pud_mkwrite(entry, vma);
@@ -1379,6 +1585,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pgtable_t pgtable = NULL;
int ret = -ENOMEM;
+ pmd = pmdp_get_lockless(src_pmd);
+ if (unlikely(pmd_special(pmd))) {
+ dst_ptl = pmd_lock(dst_mm, dst_pmd);
+ src_ptl = pmd_lockptr(src_mm, src_pmd);
+ spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ /*
+ * No need to recheck the pmd, it can't change with write
+ * mmap lock held here.
+ *
+	 * Meanwhile, make sure it's not a CoW VMA with a writable
+	 * mapping; otherwise it would mean either the anon page wrongly
+	 * had the special bit applied, or we made the PRIVATE mapping
+	 * able to wrongly write to the backend MMIO.
+ */
+ VM_WARN_ON_ONCE(is_cow_mapping(src_vma->vm_flags) && pmd_write(pmd));
+ goto set_pmd;
+ }
+
/* Skip if can be re-fill on fault */
if (!vma_is_anonymous(dst_vma))
return 0;
@@ -1460,7 +1684,9 @@ out_zero_page:
pmdp_set_wrprotect(src_mm, addr, src_pmd);
if (!userfaultfd_wp(dst_vma))
pmd = pmd_clear_uffd_wp(pmd);
- pmd = pmd_mkold(pmd_wrprotect(pmd));
+ pmd = pmd_wrprotect(pmd);
+set_pmd:
+ pmd = pmd_mkold(pmd);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
ret = 0;
@@ -1503,20 +1729,14 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
/*
- * When page table lock is held, the huge zero pud should not be
- * under splitting since we don't split the page itself, only pud to
- * a page table.
- */
- if (is_huge_zero_pud(pud)) {
- /* No huge zero pud yet */
- }
-
- /*
* TODO: once we support anonymous pages, use
* folio_try_dup_anon_rmap_*() and split if duplicating fails.
*/
- pudp_set_wrprotect(src_mm, addr, src_pud);
- pud = pud_mkold(pud_wrprotect(pud));
+ if (is_cow_mapping(vma->vm_flags) && pud_write(pud)) {
+ pudp_set_wrprotect(src_mm, addr, src_pud);
+ pud = pud_wrprotect(pud);
+ }
+ pud = pud_mkold(pud);
set_pud_at(dst_mm, addr, dst_pud, pud);
ret = 0;
@@ -1675,22 +1895,23 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- pmd_t oldpmd = vmf->orig_pmd;
- pmd_t pmd;
struct folio *folio;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
int nid = NUMA_NO_NODE;
- int target_nid, last_cpupid = (-1 & LAST_CPUPID_MASK);
+ int target_nid, last_cpupid;
+ pmd_t pmd, old_pmd;
bool writable = false;
int flags = 0;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+ old_pmd = pmdp_get(vmf->pmd);
+
+ if (unlikely(!pmd_same(old_pmd, vmf->orig_pmd))) {
spin_unlock(vmf->ptl);
return 0;
}
- pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ pmd = pmd_modify(old_pmd, vma->vm_page_prot);
/*
* Detect now whether the PMD could be writable; this information
@@ -1705,18 +1926,10 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
if (!folio)
goto out_map;
- /* See similar comment in do_numa_page for explanation */
- if (!writable)
- flags |= TNF_NO_GROUP;
-
nid = folio_nid(folio);
- /*
- * For memory tiering mode, cpupid of slow memory page is used
- * to record page access time. So use default value.
- */
- if (node_is_toptier(nid))
- last_cpupid = folio_last_cpupid(folio);
- target_nid = numa_migrate_prep(folio, vmf, haddr, nid, &flags);
+
+ target_nid = numa_migrate_check(folio, vmf, haddr, &flags, writable,
+ &last_cpupid);
if (target_nid == NUMA_NO_NODE)
goto out_map;
if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
@@ -1736,13 +1949,13 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
flags |= TNF_MIGRATE_FAIL;
vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(oldpmd, *vmf->pmd))) {
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
spin_unlock(vmf->ptl);
return 0;
}
out_map:
/* Restore the PMD */
- pmd = pmd_modify(oldpmd, vma->vm_page_prot);
+ pmd = pmd_modify(pmdp_get(vmf->pmd), vma->vm_page_prot);
pmd = pmd_mkyoung(pmd);
if (writable)
pmd = pmd_mkwrite(pmd, vma);
@@ -2064,8 +2277,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
toptier)
goto unlock;
- if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
- !toptier)
+ if (folio_use_access_time(folio))
folio_xchg_access_time(folio,
jiffies_to_msecs(jiffies));
}
@@ -2118,6 +2330,53 @@ unlock:
return ret;
}
+/*
+ * Returns:
+ *
+ * - 0: if pud leaf changed from under us
+ * - 1: if pud can be skipped
+ * - HPAGE_PUD_NR: if pud was successfully processed
+ */
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+int change_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ pud_t *pudp, unsigned long addr, pgprot_t newprot,
+ unsigned long cp_flags)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t oldpud, entry;
+ spinlock_t *ptl;
+
+ tlb_change_page_size(tlb, HPAGE_PUD_SIZE);
+
+ /* NUMA balancing doesn't apply to dax */
+ if (cp_flags & MM_CP_PROT_NUMA)
+ return 1;
+
+ /*
+	 * Huge entries on userfault-wp only work with anonymous memory,
+	 * while we don't have anonymous PUDs yet.
+ */
+ if (WARN_ON_ONCE(cp_flags & MM_CP_UFFD_WP_ALL))
+ return 1;
+
+ ptl = __pud_trans_huge_lock(pudp, vma);
+ if (!ptl)
+ return 0;
+
+ /*
+ * Can't clear PUD or it can race with concurrent zapping. See
+ * change_huge_pmd().
+ */
+ oldpud = pudp_invalidate(vma, addr, pudp);
+ entry = pud_modify(oldpud, newprot);
+ set_pud_at(mm, addr, pudp, entry);
+ tlb_flush_pud_range(tlb, addr, HPAGE_PUD_SIZE);
+
+ spin_unlock(ptl);
+ return HPAGE_PUD_NR;
+}
+#endif
+
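As a hedged, self-contained model of the 0 / 1 / HPAGE_PUD_NR return convention documented above, the toy below shows how a caller could distinguish "retry", "skip" and "processed"; the stand-in function and the order value are assumptions.

/* Toy model of change_huge_pud()'s return values: 0 = leaf changed under us,
 * 1 = nothing to do, HPAGE_PUD_NR = whole PUD updated. All values assumed. */
#include <stdio.h>

#define MODEL_HPAGE_PUD_NR (1 << 18)	/* assumed 1 GiB PUD on 4 KiB pages */

static int model_change_huge_pud(int scenario)
{
	switch (scenario) {
	case 0: return 0;			/* raced: caller should re-examine */
	case 1: return 1;			/* skip (e.g. NUMA hinting on dax) */
	default: return MODEL_HPAGE_PUD_NR;	/* processed the whole PUD */
	}
}

int main(void)
{
	long pages = 0;

	for (int s = 0; s < 3; s++) {
		int ret = model_change_huge_pud(s);

		if (ret > 1)
			pages += ret;
	}
	printf("pages updated: %ld\n", pages);	/* prints 262144 */
	return 0;
}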
#ifdef CONFIG_USERFAULTFD
/*
* The PT lock for src_pmd and dst_vma/src_vma (for reading) are locked by
@@ -2297,12 +2556,14 @@ int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma,
pud_t *pud, unsigned long addr)
{
spinlock_t *ptl;
+ pud_t orig_pud;
ptl = __pud_trans_huge_lock(pud, vma);
if (!ptl)
return 0;
- pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
+ orig_pud = pudp_huge_get_and_clear_full(vma, addr, pud, tlb->fullmm);
+ arch_check_zapped_pud(vma, orig_pud);
tlb_remove_pud_tlb_entry(tlb, pud, addr);
if (vma_is_special_huge(vma)) {
spin_unlock(ptl);
@@ -2346,6 +2607,11 @@ out:
spin_unlock(ptl);
mmu_notifier_invalidate_range_end(&range);
}
+#else
+void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
+ unsigned long address)
+{
+}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
@@ -2780,7 +3046,7 @@ bool unmap_huge_pmd_locked(struct vm_area_struct *vma, unsigned long addr,
return false;
}
-static void remap_page(struct folio *folio, unsigned long nr)
+static void remap_page(struct folio *folio, unsigned long nr, int flags)
{
int i = 0;
@@ -2788,7 +3054,7 @@ static void remap_page(struct folio *folio, unsigned long nr)
if (!folio_test_anon(folio))
return;
for (;;) {
- remove_migration_ptes(folio, folio, true);
+ remove_migration_ptes(folio, folio, RMP_LOCKED | flags);
i += folio_nr_pages(folio);
if (i >= nr)
break;
@@ -2796,25 +3062,25 @@ static void remap_page(struct folio *folio, unsigned long nr)
}
}
-static void lru_add_page_tail(struct page *head, struct page *tail,
+static void lru_add_page_tail(struct folio *folio, struct page *tail,
struct lruvec *lruvec, struct list_head *list)
{
- VM_BUG_ON_PAGE(!PageHead(head), head);
- VM_BUG_ON_PAGE(PageLRU(tail), head);
+ VM_BUG_ON_FOLIO(!folio_test_large(folio), folio);
+ VM_BUG_ON_FOLIO(PageLRU(tail), folio);
lockdep_assert_held(&lruvec->lru_lock);
if (list) {
/* page reclaim is reclaiming a huge page */
- VM_WARN_ON(PageLRU(head));
+ VM_WARN_ON(folio_test_lru(folio));
get_page(tail);
list_add_tail(&tail->lru, list);
} else {
/* head is still on lru (and we have it frozen) */
- VM_WARN_ON(!PageLRU(head));
- if (PageUnevictable(tail))
+ VM_WARN_ON(!folio_test_lru(folio));
+ if (folio_test_unevictable(folio))
tail->mlock_count = 0;
else
- list_add_tail(&tail->lru, &head->lru);
+ list_add_tail(&tail->lru, &folio->lru);
SetPageLRU(tail);
}
}
@@ -2857,8 +3123,10 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
(1L << PG_workingset) |
(1L << PG_locked) |
(1L << PG_unevictable) |
-#ifdef CONFIG_ARCH_USES_PG_ARCH_X
+#ifdef CONFIG_ARCH_USES_PG_ARCH_2
(1L << PG_arch_2) |
+#endif
+#ifdef CONFIG_ARCH_USES_PG_ARCH_3
(1L << PG_arch_3) |
#endif
(1L << PG_dirty) |
@@ -2913,7 +3181,7 @@ static void __split_huge_page_tail(struct folio *folio, int tail,
* pages to show after the currently processed elements - e.g.
* migrate_pages
*/
- lru_add_page_tail(head, page_tail, lruvec, list);
+ lru_add_page_tail(folio, page_tail, lruvec, list);
}
static void __split_huge_page(struct page *page, struct list_head *list,
@@ -2976,7 +3244,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
/* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, order, new_order);
- pgalloc_tag_split(head, 1 << order);
+ pgalloc_tag_split(folio, order, new_order);
/* See comment in __split_huge_page_tail() */
if (folio_test_anon(folio)) {
@@ -2996,7 +3264,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
if (nr_dropped)
shmem_uncharge(folio->mapping->host, nr_dropped);
- remap_page(folio, nr);
+ remap_page(folio, nr, PageAnon(head) ? RMP_USE_SHARED_ZEROPAGE : 0);
/*
* set page to its compound_head when split to non order-0 pages, so
@@ -3025,7 +3293,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
/* Racy check whether the huge page can be split */
-bool can_split_folio(struct folio *folio, int *pextra_pins)
+bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
{
int extra_pins;
@@ -3037,7 +3305,8 @@ bool can_split_folio(struct folio *folio, int *pextra_pins)
extra_pins = folio_nr_pages(folio);
if (pextra_pins)
*pextra_pins = extra_pins;
- return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins - 1;
+ return folio_mapcount(folio) == folio_ref_count(folio) - extra_pins -
+ caller_pins;
}
/*
@@ -3094,8 +3363,9 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
/* reset xarray order to new order after split */
XA_STATE_ORDER(xas, &folio->mapping->i_pages, folio->index, new_order);
- struct anon_vma *anon_vma = NULL;
+ bool is_anon = folio_test_anon(folio);
struct address_space *mapping = NULL;
+ struct anon_vma *anon_vma = NULL;
int order = folio_order(folio);
int extra_pins, ret;
pgoff_t end;
@@ -3107,7 +3377,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (new_order >= folio_order(folio))
return -EINVAL;
- if (folio_test_anon(folio)) {
+ if (is_anon) {
/* order-1 is not supported for anonymous THP. */
if (new_order == 1) {
VM_WARN_ONCE(1, "Cannot split to order-1 folio");
@@ -3147,7 +3417,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (folio_test_writeback(folio))
return -EBUSY;
- if (folio_test_anon(folio)) {
+ if (is_anon) {
/*
* The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
@@ -3217,7 +3487,7 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
* Racy check if we can split the page, before unmap_folio() will
* split PMDs
*/
- if (!can_split_folio(folio, &extra_pins)) {
+ if (!can_split_folio(folio, 1, &extra_pins)) {
ret = -EAGAIN;
goto out_unlock;
}
@@ -3243,6 +3513,11 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
if (folio_order(folio) > 1 &&
!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
+ if (folio_test_partially_mapped(folio)) {
+ __folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
/*
* Reinitialize page_deferred_list after removing the
* page from the split_queue, otherwise a subsequent
@@ -3269,6 +3544,10 @@ int split_huge_page_to_list_to_order(struct page *page, struct list_head *list,
}
}
+ if (is_anon) {
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
+ mod_mthp_stat(new_order, MTHP_STAT_NR_ANON, 1 << (order - new_order));
+ }
__split_huge_page(page, list, end, new_order);
ret = 0;
} else {
@@ -3277,7 +3556,7 @@ fail:
if (mapping)
xas_unlock(&xas);
local_irq_enable();
- remap_page(folio, folio_nr_pages(folio));
+ remap_page(folio, folio_nr_pages(folio), 0);
ret = -EAGAIN;
}
@@ -3329,12 +3608,18 @@ void __folio_undo_large_rmappable(struct folio *folio)
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
if (!list_empty(&folio->_deferred_list)) {
ds_queue->split_queue_len--;
+ if (folio_test_partially_mapped(folio)) {
+ __folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
list_del_init(&folio->_deferred_list);
}
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
}
-void deferred_split_folio(struct folio *folio)
+/* partially_mapped=false won't clear the PG_partially_mapped folio flag */
+void deferred_split_folio(struct folio *folio, bool partially_mapped)
{
struct deferred_split *ds_queue = get_deferred_split_queue(folio);
#ifdef CONFIG_MEMCG
@@ -3349,6 +3634,9 @@ void deferred_split_folio(struct folio *folio)
if (folio_order(folio) <= 1)
return;
+ if (!partially_mapped && !split_underused_thp)
+ return;
+
/*
* The try_to_unmap() in page reclaim path might reach here too,
* this may cause a race condition to corrupt deferred split queue.
@@ -3362,14 +3650,21 @@ void deferred_split_folio(struct folio *folio)
if (folio_test_swapcache(folio))
return;
- if (!list_empty(&folio->_deferred_list))
- return;
-
spin_lock_irqsave(&ds_queue->split_queue_lock, flags);
+ if (partially_mapped) {
+ if (!folio_test_partially_mapped(folio)) {
+ __folio_set_partially_mapped(folio);
+ if (folio_test_pmd_mappable(folio))
+ count_vm_event(THP_DEFERRED_SPLIT_PAGE);
+ count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
+ mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, 1);
+
+ }
+ } else {
+ /* partially mapped folios cannot become non-partially mapped */
+ VM_WARN_ON_FOLIO(folio_test_partially_mapped(folio), folio);
+ }
if (list_empty(&folio->_deferred_list)) {
- if (folio_test_pmd_mappable(folio))
- count_vm_event(THP_DEFERRED_SPLIT_PAGE);
- count_mthp_stat(folio_order(folio), MTHP_STAT_SPLIT_DEFERRED);
list_add_tail(&folio->_deferred_list, &ds_queue->split_queue);
ds_queue->split_queue_len++;
#ifdef CONFIG_MEMCG
@@ -3394,6 +3689,39 @@ static unsigned long deferred_split_count(struct shrinker *shrink,
return READ_ONCE(ds_queue->split_queue_len);
}
+static bool thp_underused(struct folio *folio)
+{
+ int num_zero_pages = 0, num_filled_pages = 0;
+ void *kaddr;
+ int i;
+
+ if (khugepaged_max_ptes_none == HPAGE_PMD_NR - 1)
+ return false;
+
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ kaddr = kmap_local_folio(folio, i * PAGE_SIZE);
+ if (!memchr_inv(kaddr, 0, PAGE_SIZE)) {
+ num_zero_pages++;
+ if (num_zero_pages > khugepaged_max_ptes_none) {
+ kunmap_local(kaddr);
+ return true;
+ }
+ } else {
+ /*
+			 * Another early-exit path once the number of
+			 * non-zero-filled pages exceeds the threshold.
+ */
+ num_filled_pages++;
+ if (num_filled_pages >= HPAGE_PMD_NR - khugepaged_max_ptes_none) {
+ kunmap_local(kaddr);
+ return false;
+ }
+ }
+ kunmap_local(kaddr);
+ }
+ return false;
+}
+
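A rough userspace rendering of the heuristic in thp_underused(): count fully zeroed 4 KiB chunks with a memchr_inv()-style test and exit early in both directions. The chunk count and threshold below are assumptions, not the kernel's HPAGE_PMD_NR or khugepaged_max_ptes_none values.

/* Illustrative zero-fill scan in the spirit of thp_underused(); sizes and the
 * threshold are assumptions for the sketch. */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK		4096
#define NR_CHUNKS	512
#define MAX_ZERO	256	/* assumed threshold */

static bool chunk_is_zero(const unsigned char *p)
{
	/* memchr_inv()-style check: all bytes equal to the (zero) first byte */
	return p[0] == 0 && !memcmp(p, p + 1, CHUNK - 1);
}

static bool buffer_underused(const unsigned char *buf)
{
	int zero = 0, filled = 0;

	for (size_t i = 0; i < NR_CHUNKS; i++) {
		if (chunk_is_zero(buf + i * CHUNK)) {
			if (++zero > MAX_ZERO)
				return true;	/* mostly zero: worth splitting */
		} else {
			if (++filled >= NR_CHUNKS - MAX_ZERO)
				return false;	/* mostly used: keep it huge */
		}
	}
	return false;
}

int main(void)
{
	unsigned char *buf = calloc(NR_CHUNKS, CHUNK);

	if (!buf)
		return 1;
	buf[0] = 1;	/* dirty a single byte in the first chunk */
	printf("underused: %d\n", buffer_underused(buf));	/* prints 1 */
	free(buf);
	return 0;
}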
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
@@ -3417,6 +3745,11 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
list_move(&folio->_deferred_list, &list);
} else {
/* We lost race with folio_put() */
+ if (folio_test_partially_mapped(folio)) {
+ __folio_clear_partially_mapped(folio);
+ mod_mthp_stat(folio_order(folio),
+ MTHP_STAT_NR_ANON_PARTIALLY_MAPPED, -1);
+ }
list_del_init(&folio->_deferred_list);
ds_queue->split_queue_len--;
}
@@ -3426,13 +3759,35 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_unlock_irqrestore(&ds_queue->split_queue_lock, flags);
list_for_each_entry_safe(folio, next, &list, _deferred_list) {
+ bool did_split = false;
+ bool underused = false;
+
+ if (!folio_test_partially_mapped(folio)) {
+ underused = thp_underused(folio);
+ if (!underused)
+ goto next;
+ }
if (!folio_trylock(folio))
goto next;
- /* split_huge_page() removes page from list on success */
- if (!split_folio(folio))
+ if (!split_folio(folio)) {
+ did_split = true;
+ if (underused)
+ count_vm_event(THP_UNDERUSED_SPLIT_PAGE);
split++;
+ }
folio_unlock(folio);
next:
+ /*
+		 * split_folio() removes the folio from the list on success.
+		 * Only add it back to the queue if the folio is partially mapped.
+		 * If thp_underused() returns false, or if split_folio() fails on
+		 * an underused folio, consider it used and don't add it back to
+		 * the split queue.
+ */
+ if (!did_split && !folio_test_partially_mapped(folio)) {
+ list_del_init(&folio->_deferred_list);
+ ds_queue->split_queue_len--;
+ }
folio_put(folio);
}
@@ -3518,16 +3873,11 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
vaddr_start &= PAGE_MASK;
vaddr_end &= PAGE_MASK;
- /* Find the task_struct from pid */
- rcu_read_lock();
- task = find_task_by_vpid(pid);
+ task = find_get_task_by_vpid(pid);
if (!task) {
- rcu_read_unlock();
ret = -ESRCH;
goto out;
}
- get_task_struct(task);
- rcu_read_unlock();
/* Find the mm_struct */
mm = get_task_mm(task);
@@ -3548,7 +3898,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
*/
for (addr = vaddr_start; addr < vaddr_end; addr += PAGE_SIZE) {
struct vm_area_struct *vma = vma_lookup(mm, addr);
- struct page *page;
+ struct folio_walk fw;
struct folio *folio;
struct address_space *mapping;
unsigned int target_order = new_order;
@@ -3562,13 +3912,10 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
continue;
}
- /* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
- if (IS_ERR_OR_NULL(page))
+ folio = folio_walk_start(&fw, vma, addr, 0);
+ if (!folio)
continue;
- folio = page_folio(page);
if (!is_transparent_hugepage(folio))
goto next;
@@ -3588,11 +3935,13 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
* can be split or not. So skip the check here.
*/
if (!folio_test_private(folio) &&
- !can_split_folio(folio, NULL))
+ !can_split_folio(folio, 0, NULL))
goto next;
if (!folio_trylock(folio))
goto next;
+ folio_get(folio);
+ folio_walk_end(&fw, vma);
if (!folio_test_anon(folio) && folio->mapping != mapping)
goto unlock;
@@ -3603,8 +3952,12 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
unlock:
folio_unlock(folio);
-next:
folio_put(folio);
+
+ cond_resched();
+ continue;
+next:
+ folio_walk_end(&fw, vma);
cond_resched();
}
mmap_read_unlock(mm);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9a3a6e2dee97..def84d8bcf2d 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -56,16 +56,6 @@ struct hstate hstates[HUGE_MAX_HSTATE];
#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
static unsigned long hugetlb_cma_size_in_node[MAX_NUMNODES] __initdata;
-static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
-{
- return cma_pages_valid(hugetlb_cma[folio_nid(folio)], &folio->page,
- 1 << order);
-}
-#else
-static bool hugetlb_cma_folio(struct folio *folio, unsigned int order)
-{
- return false;
-}
#endif
static unsigned long hugetlb_cma_size __initdata;
@@ -82,14 +72,14 @@ static unsigned int default_hugepages_in_node[MAX_NUMNODES] __initdata;
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
* free_huge_pages, and surplus_huge_pages.
*/
-DEFINE_SPINLOCK(hugetlb_lock);
+__cacheline_aligned_in_smp DEFINE_SPINLOCK(hugetlb_lock);
/*
* Serializes faults on the same logical page. This is used to
* prevent spurious OOMs when the hugepage pool is fully utilized.
*/
-static int num_fault_mutexes;
-struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp;
+static int num_fault_mutexes __ro_after_init;
+struct mutex *hugetlb_fault_mutex_table __ro_after_init;
/* Forward declaration */
static int hugetlb_acct_memory(struct hstate *h, long delta);
@@ -100,6 +90,17 @@ static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
unsigned long start, unsigned long end);
static struct resv_map *vma_resv_map(struct vm_area_struct *vma);
+static void hugetlb_free_folio(struct folio *folio)
+{
+#ifdef CONFIG_CMA
+ int nid = folio_nid(folio);
+
+ if (cma_free_folio(hugetlb_cma[nid], folio))
+ return;
+#endif
+ folio_put(folio);
+}
+
static inline bool subpool_is_free(struct hugepage_subpool *spool)
{
if (spool->count)
@@ -1512,95 +1513,54 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-/* used to demote non-gigantic_huge pages as well */
-static void __destroy_compound_gigantic_folio(struct folio *folio,
- unsigned int order, bool demote)
-{
- int i;
- int nr_pages = 1 << order;
- struct page *p;
-
- atomic_set(&folio->_entire_mapcount, 0);
- atomic_set(&folio->_large_mapcount, 0);
- atomic_set(&folio->_pincount, 0);
-
- for (i = 1; i < nr_pages; i++) {
- p = folio_page(folio, i);
- p->flags &= ~PAGE_FLAGS_CHECK_AT_FREE;
- p->mapping = NULL;
- clear_compound_head(p);
- if (!demote)
- set_page_refcounted(p);
- }
-
- __folio_clear_head(folio);
-}
-
-static void destroy_compound_hugetlb_folio_for_demote(struct folio *folio,
- unsigned int order)
-{
- __destroy_compound_gigantic_folio(folio, order, true);
-}
-
#ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
-static void destroy_compound_gigantic_folio(struct folio *folio,
- unsigned int order)
-{
- __destroy_compound_gigantic_folio(folio, order, false);
-}
-
-static void free_gigantic_folio(struct folio *folio, unsigned int order)
-{
- /*
- * If the page isn't allocated using the cma allocator,
- * cma_release() returns false.
- */
-#ifdef CONFIG_CMA
- int nid = folio_nid(folio);
-
- if (cma_release(hugetlb_cma[nid], &folio->page, 1 << order))
- return;
-#endif
-
- free_contig_range(folio_pfn(folio), 1 << order);
-}
-
#ifdef CONFIG_CONTIG_ALLOC
static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
- struct page *page;
- unsigned long nr_pages = pages_per_huge_page(h);
+ struct folio *folio;
+ int order = huge_page_order(h);
+ bool retried = false;
+
if (nid == NUMA_NO_NODE)
nid = numa_mem_id();
-
+retry:
+ folio = NULL;
#ifdef CONFIG_CMA
{
int node;
- if (hugetlb_cma[nid]) {
- page = cma_alloc(hugetlb_cma[nid], nr_pages,
- huge_page_order(h), true);
- if (page)
- return page_folio(page);
- }
+ if (hugetlb_cma[nid])
+ folio = cma_alloc_folio(hugetlb_cma[nid], order, gfp_mask);
- if (!(gfp_mask & __GFP_THISNODE)) {
+ if (!folio && !(gfp_mask & __GFP_THISNODE)) {
for_each_node_mask(node, *nodemask) {
if (node == nid || !hugetlb_cma[node])
continue;
- page = cma_alloc(hugetlb_cma[node], nr_pages,
- huge_page_order(h), true);
- if (page)
- return page_folio(page);
+ folio = cma_alloc_folio(hugetlb_cma[node], order, gfp_mask);
+ if (folio)
+ break;
}
}
}
#endif
+ if (!folio) {
+ folio = folio_alloc_gigantic(order, gfp_mask, nid, nodemask);
+ if (!folio)
+ return NULL;
+ }
- page = alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
- return page ? page_folio(page) : NULL;
+ if (folio_ref_freeze(folio, 1))
+ return folio;
+
+ pr_warn("HugeTLB: unexpected refcount on PFN %lu\n", folio_pfn(folio));
+ hugetlb_free_folio(folio);
+ if (!retried) {
+ retried = true;
+ goto retry;
+ }
+ return NULL;
}
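The retry logic above gives the allocation one second chance if folio_ref_freeze() trips over a transient elevated refcount; a minimal self-contained model of that retry-once pattern follows, with a stand-in freeze function that is purely an assumption simulating the race.

/* Toy retry-once pattern mirroring alloc_gigantic_folio(): if freezing the
 * refcount fails, free and try exactly one more time. */
#include <stdbool.h>
#include <stdio.h>

static bool try_freeze(int attempt)
{
	return attempt > 0;	/* pretend the first attempt sees a stale reference */
}

int main(void)
{
	bool retried = false;
	int attempt = 0;

retry:
	if (try_freeze(attempt)) {
		printf("folio usable on attempt %d\n", attempt + 1);
		return 0;
	}
	fprintf(stderr, "unexpected refcount, retrying\n");
	if (!retried) {
		retried = true;
		attempt++;
		goto retry;
	}
	return 1;
}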
#else /* !CONFIG_CONTIG_ALLOC */
@@ -1617,10 +1577,6 @@ static struct folio *alloc_gigantic_folio(struct hstate *h, gfp_t gfp_mask,
{
return NULL;
}
-static inline void free_gigantic_folio(struct folio *folio,
- unsigned int order) { }
-static inline void destroy_compound_gigantic_folio(struct folio *folio,
- unsigned int order) { }
#endif
/*
@@ -1748,18 +1704,8 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
folio_ref_unfreeze(folio, 1);
- /*
- * Non-gigantic pages demoted from CMA allocated gigantic pages
- * need to be given back to CMA in free_gigantic_folio.
- */
- if (hstate_is_gigantic(h) ||
- hugetlb_cma_folio(folio, huge_page_order(h))) {
- destroy_compound_gigantic_folio(folio, huge_page_order(h));
- free_gigantic_folio(folio, huge_page_order(h));
- } else {
- INIT_LIST_HEAD(&folio->_deferred_list);
- folio_put(folio);
- }
+ INIT_LIST_HEAD(&folio->_deferred_list);
+ hugetlb_free_folio(folio);
}
/*
@@ -2032,95 +1978,6 @@ static void prep_new_hugetlb_folio(struct hstate *h, struct folio *folio, int ni
spin_unlock_irq(&hugetlb_lock);
}
-static bool __prep_compound_gigantic_folio(struct folio *folio,
- unsigned int order, bool demote)
-{
- int i, j;
- int nr_pages = 1 << order;
- struct page *p;
-
- __folio_clear_reserved(folio);
- for (i = 0; i < nr_pages; i++) {
- p = folio_page(folio, i);
-
- /*
- * For gigantic hugepages allocated through bootmem at
- * boot, it's safer to be consistent with the not-gigantic
- * hugepages and clear the PG_reserved bit from all tail pages
- * too. Otherwise drivers using get_user_pages() to access tail
- * pages may get the reference counting wrong if they see
- * PG_reserved set on a tail page (despite the head page not
- * having PG_reserved set). Enforcing this consistency between
- * head and tail pages allows drivers to optimize away a check
- * on the head page when they need know if put_page() is needed
- * after get_user_pages().
- */
- if (i != 0) /* head page cleared above */
- __ClearPageReserved(p);
- /*
- * Subtle and very unlikely
- *
- * Gigantic 'page allocators' such as memblock or cma will
- * return a set of pages with each page ref counted. We need
- * to turn this set of pages into a compound page with tail
- * page ref counts set to zero. Code such as speculative page
- * cache adding could take a ref on a 'to be' tail page.
- * We need to respect any increased ref count, and only set
- * the ref count to zero if count is currently 1. If count
- * is not 1, we return an error. An error return indicates
- * the set of pages can not be converted to a gigantic page.
- * The caller who allocated the pages should then discard the
- * pages using the appropriate free interface.
- *
- * In the case of demote, the ref count will be zero.
- */
- if (!demote) {
- if (!page_ref_freeze(p, 1)) {
- pr_warn("HugeTLB page can not be used due to unexpected inflated ref count\n");
- goto out_error;
- }
- } else {
- VM_BUG_ON_PAGE(page_count(p), p);
- }
- if (i != 0)
- set_compound_head(p, &folio->page);
- }
- __folio_set_head(folio);
- /* we rely on prep_new_hugetlb_folio to set the hugetlb flag */
- folio_set_order(folio, order);
- atomic_set(&folio->_entire_mapcount, -1);
- atomic_set(&folio->_large_mapcount, -1);
- atomic_set(&folio->_pincount, 0);
- return true;
-
-out_error:
- /* undo page modifications made above */
- for (j = 0; j < i; j++) {
- p = folio_page(folio, j);
- if (j != 0)
- clear_compound_head(p);
- set_page_refcounted(p);
- }
- /* need to clear PG_reserved on remaining tail pages */
- for (; j < nr_pages; j++) {
- p = folio_page(folio, j);
- __ClearPageReserved(p);
- }
- return false;
-}
-
-static bool prep_compound_gigantic_folio(struct folio *folio,
- unsigned int order)
-{
- return __prep_compound_gigantic_folio(folio, order, false);
-}
-
-static bool prep_compound_gigantic_folio_for_demote(struct folio *folio,
- unsigned int order)
-{
- return __prep_compound_gigantic_folio(folio, order, true);
-}
-
/*
* Find and lock address space (mapping) in write mode.
*
@@ -2159,7 +2016,6 @@ static struct folio *alloc_buddy_hugetlb_folio(struct hstate *h,
*/
if (node_alloc_noretry && node_isset(nid, *node_alloc_noretry))
alloc_try_hard = false;
- gfp_mask |= __GFP_COMP|__GFP_NOWARN;
if (alloc_try_hard)
gfp_mask |= __GFP_RETRY_MAYFAIL;
if (nid == NUMA_NO_NODE)
@@ -2206,48 +2062,16 @@ retry:
return folio;
}
-static struct folio *__alloc_fresh_hugetlb_folio(struct hstate *h,
- gfp_t gfp_mask, int nid, nodemask_t *nmask,
- nodemask_t *node_alloc_noretry)
-{
- struct folio *folio;
- bool retry = false;
-
-retry:
- if (hstate_is_gigantic(h))
- folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
- else
- folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
- nid, nmask, node_alloc_noretry);
- if (!folio)
- return NULL;
-
- if (hstate_is_gigantic(h)) {
- if (!prep_compound_gigantic_folio(folio, huge_page_order(h))) {
- /*
- * Rare failure to convert pages to compound page.
- * Free pages and try again - ONCE!
- */
- free_gigantic_folio(folio, huge_page_order(h));
- if (!retry) {
- retry = true;
- goto retry;
- }
- return NULL;
- }
- }
-
- return folio;
-}
-
static struct folio *only_alloc_fresh_hugetlb_folio(struct hstate *h,
gfp_t gfp_mask, int nid, nodemask_t *nmask,
nodemask_t *node_alloc_noretry)
{
struct folio *folio;
- folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask,
- node_alloc_noretry);
+ if (hstate_is_gigantic(h))
+ folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
+ else
+ folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, node_alloc_noretry);
if (folio)
init_new_hugetlb_folio(h, folio);
return folio;
@@ -2265,7 +2089,10 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
{
struct folio *folio;
- folio = __alloc_fresh_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
+ if (hstate_is_gigantic(h))
+ folio = alloc_gigantic_folio(h, gfp_mask, nid, nmask);
+ else
+ folio = alloc_buddy_hugetlb_folio(h, gfp_mask, nid, nmask, NULL);
if (!folio)
return NULL;
@@ -2549,9 +2376,8 @@ struct folio *alloc_buddy_hugetlb_folio_with_mpol(struct hstate *h,
nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask);
if (mpol_is_preferred_many(mpol)) {
- gfp_t gfp = gfp_mask | __GFP_NOWARN;
+ gfp_t gfp = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
- gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
folio = alloc_surplus_hugetlb_folio(h, gfp, nid, nodemask);
/* Fallback to all nodes if page==NULL */
@@ -3333,6 +3159,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
for (pfn = head_pfn + start_page_number; pfn < end_pfn; pfn++) {
struct page *page = pfn_to_page(pfn);
+ __ClearPageReserved(folio_page(folio, pfn - head_pfn));
__init_single_page(page, pfn, zone, nid);
prep_compound_tail((struct page *)folio, pfn - head_pfn);
ret = page_ref_freeze(page, 1);
@@ -3921,101 +3748,120 @@ out:
return 0;
}
-static int demote_free_hugetlb_folio(struct hstate *h, struct folio *folio)
+static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
+ struct list_head *src_list)
{
- int i, nid = folio_nid(folio);
- struct hstate *target_hstate;
- struct page *subpage;
- struct folio *inner_folio;
- int rc = 0;
-
- target_hstate = size_to_hstate(PAGE_SIZE << h->demote_order);
-
- remove_hugetlb_folio(h, folio, false);
- spin_unlock_irq(&hugetlb_lock);
+ long rc;
+ struct folio *folio, *next;
+ LIST_HEAD(dst_list);
+ LIST_HEAD(ret_list);
- /*
- * If vmemmap already existed for folio, the remove routine above would
- * have cleared the hugetlb folio flag. Hence the folio is technically
- * no longer a hugetlb folio. hugetlb_vmemmap_restore_folio can only be
- * passed hugetlb folios and will BUG otherwise.
- */
- if (folio_test_hugetlb(folio)) {
- rc = hugetlb_vmemmap_restore_folio(h, folio);
- if (rc) {
- /* Allocation of vmemmmap failed, we can not demote folio */
- spin_lock_irq(&hugetlb_lock);
- add_hugetlb_folio(h, folio, false);
- return rc;
- }
- }
-
- /*
- * Use destroy_compound_hugetlb_folio_for_demote for all huge page
- * sizes as it will not ref count folios.
- */
- destroy_compound_hugetlb_folio_for_demote(folio, huge_page_order(h));
+ rc = hugetlb_vmemmap_restore_folios(src, src_list, &ret_list);
+ list_splice_init(&ret_list, src_list);
/*
* Taking target hstate mutex synchronizes with set_max_huge_pages.
* Without the mutex, pages added to target hstate could be marked
* as surplus.
*
- * Note that we already hold h->resize_lock. To prevent deadlock,
+ * Note that we already hold src->resize_lock. To prevent deadlock,
* use the convention of always taking larger size hstate mutex first.
*/
- mutex_lock(&target_hstate->resize_lock);
- for (i = 0; i < pages_per_huge_page(h);
- i += pages_per_huge_page(target_hstate)) {
- subpage = folio_page(folio, i);
- inner_folio = page_folio(subpage);
- if (hstate_is_gigantic(target_hstate))
- prep_compound_gigantic_folio_for_demote(inner_folio,
- target_hstate->order);
- else
- prep_compound_page(subpage, target_hstate->order);
- folio_change_private(inner_folio, NULL);
- prep_new_hugetlb_folio(target_hstate, inner_folio, nid);
- free_huge_folio(inner_folio);
+ mutex_lock(&dst->resize_lock);
+
+ list_for_each_entry_safe(folio, next, src_list, lru) {
+ int i;
+
+ if (folio_test_hugetlb_vmemmap_optimized(folio))
+ continue;
+
+ list_del(&folio->lru);
+
+ split_page_owner(&folio->page, huge_page_order(src), huge_page_order(dst));
+ pgalloc_tag_split(folio, huge_page_order(src), huge_page_order(dst));
+
+ for (i = 0; i < pages_per_huge_page(src); i += pages_per_huge_page(dst)) {
+ struct page *page = folio_page(folio, i);
+
+ page->mapping = NULL;
+ clear_compound_head(page);
+ prep_compound_page(page, dst->order);
+
+ init_new_hugetlb_folio(dst, page_folio(page));
+ list_add(&page->lru, &dst_list);
+ }
}
- mutex_unlock(&target_hstate->resize_lock);
- spin_lock_irq(&hugetlb_lock);
+ prep_and_add_allocated_folios(dst, &dst_list);
- /*
- * Not absolutely necessary, but for consistency update max_huge_pages
- * based on pool changes for the demoted page.
- */
- h->max_huge_pages--;
- target_hstate->max_huge_pages +=
- pages_per_huge_page(h) / pages_per_huge_page(target_hstate);
+ mutex_unlock(&dst->resize_lock);
return rc;
}
-static int demote_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed)
+static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
+ unsigned long nr_to_demote)
__must_hold(&hugetlb_lock)
{
int nr_nodes, node;
- struct folio *folio;
+ struct hstate *dst;
+ long rc = 0;
+ long nr_demoted = 0;
lockdep_assert_held(&hugetlb_lock);
/* We should never get here if no demote order */
- if (!h->demote_order) {
+ if (!src->demote_order) {
pr_warn("HugeTLB: NULL demote order passed to demote_pool_huge_page.\n");
return -EINVAL; /* internal error */
}
+ dst = size_to_hstate(PAGE_SIZE << src->demote_order);
- for_each_node_mask_to_free(h, nr_nodes, node, nodes_allowed) {
- list_for_each_entry(folio, &h->hugepage_freelists[node], lru) {
+ for_each_node_mask_to_free(src, nr_nodes, node, nodes_allowed) {
+ LIST_HEAD(list);
+ struct folio *folio, *next;
+
+ list_for_each_entry_safe(folio, next, &src->hugepage_freelists[node], lru) {
if (folio_test_hwpoison(folio))
continue;
- return demote_free_hugetlb_folio(h, folio);
+
+ remove_hugetlb_folio(src, folio, false);
+ list_add(&folio->lru, &list);
+
+ if (++nr_demoted == nr_to_demote)
+ break;
+ }
+
+ spin_unlock_irq(&hugetlb_lock);
+
+ rc = demote_free_hugetlb_folios(src, dst, &list);
+
+ spin_lock_irq(&hugetlb_lock);
+
+ list_for_each_entry_safe(folio, next, &list, lru) {
+ list_del(&folio->lru);
+ add_hugetlb_folio(src, folio, false);
+
+ nr_demoted--;
}
+
+ if (rc < 0 || nr_demoted == nr_to_demote)
+ break;
}
/*
+ * Not absolutely necessary, but for consistency update max_huge_pages
+ * based on pool changes for the demoted page.
+ */
+ src->max_huge_pages -= nr_demoted;
+ dst->max_huge_pages += nr_demoted << (huge_page_order(src) - huge_page_order(dst));
+
+ if (rc < 0)
+ return rc;
+
+ if (nr_demoted)
+ return nr_demoted;
+ /*
* Only way to get here is if all pages on free lists are poisoned.
* Return -EBUSY so that caller will not retry.
*/
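The bookkeeping above adjusts max_huge_pages on both hstates by the size ratio of the two orders; below is a quick self-contained check of that shift arithmetic with assumed orders (1 GiB source and 2 MiB destination on 4 KiB base pages).

/* Order arithmetic used when demoting hugetlb folios: each source folio
 * becomes 2^(src_order - dst_order) destination folios. Orders are assumptions. */
#include <stdio.h>

int main(void)
{
	unsigned int src_order = 18;	/* assumed 1 GiB on 4 KiB pages */
	unsigned int dst_order = 9;	/* assumed 2 MiB on 4 KiB pages */
	long nr_demoted = 4;

	long dst_gained = nr_demoted << (src_order - dst_order);

	printf("%ld source folios -> %ld destination folios\n",
	       nr_demoted, dst_gained);	/* prints 4 -> 2048 */
	return 0;
}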
@@ -4249,6 +4095,8 @@ static ssize_t demote_store(struct kobject *kobj,
spin_lock_irq(&hugetlb_lock);
while (nr_demote) {
+ long rc;
+
/*
		 * Check for available pages to demote each time through the
* loop as demote_pool_huge_page will drop hugetlb_lock.
@@ -4261,11 +4109,13 @@ static ssize_t demote_store(struct kobject *kobj,
if (!nr_available)
break;
- err = demote_pool_huge_page(h, n_mask);
- if (err)
+ rc = demote_pool_huge_page(h, n_mask, nr_demote);
+ if (rc < 0) {
+ err = rc;
break;
+ }
- nr_demote--;
+ nr_demote -= rc;
}
spin_unlock_irq(&hugetlb_lock);
@@ -7227,7 +7077,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end,
return 0;
}
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
static unsigned long page_table_shareable(struct vm_area_struct *svma,
struct vm_area_struct *vma,
unsigned long addr, pgoff_t idx)
@@ -7389,7 +7239,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
}
-#else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+#else /* !CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
@@ -7412,7 +7262,7 @@ bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr)
{
return false;
}
-#endif /* CONFIG_ARCH_WANT_HUGE_PMD_SHARE */
+#endif /* CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING */
#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -7510,7 +7360,7 @@ unsigned long hugetlb_mask_last_page(struct hstate *h)
/* See description above. Architectures can provide their own version. */
__weak unsigned long hugetlb_mask_last_page(struct hstate *h)
{
-#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
+#ifdef CONFIG_HUGETLB_PMD_PAGE_TABLE_SHARING
if (huge_page_size(h) == PMD_SIZE)
return PUD_SIZE - PMD_SIZE;
#endif
@@ -7519,10 +7369,6 @@ __weak unsigned long hugetlb_mask_last_page(struct hstate *h)
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-/*
- * These functions are overwritable if your architecture needs its own
- * behavior.
- */
bool isolate_hugetlb(struct folio *folio, struct list_head *list)
{
bool ret = true;
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 4ff238ba1250..e716c4671a15 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -114,10 +114,10 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
}
page_counter_init(hugetlb_cgroup_counter_from_cgroup(h_cgroup,
idx),
- fault_parent);
+ fault_parent, false);
page_counter_init(
hugetlb_cgroup_counter_from_cgroup_rsvd(h_cgroup, idx),
- rsvd_parent);
+ rsvd_parent, false);
limit = round_down(PAGE_COUNTER_MAX,
pages_per_huge_page(&hstates[idx]));
diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c
index 0c3f56b3578e..57b7f591eee8 100644
--- a/mm/hugetlb_vmemmap.c
+++ b/mm/hugetlb_vmemmap.c
@@ -43,6 +43,8 @@ struct vmemmap_remap_walk {
#define VMEMMAP_SPLIT_NO_TLB_FLUSH BIT(0)
/* Skip the TLB flush when we remap the PTE */
#define VMEMMAP_REMAP_NO_TLB_FLUSH BIT(1)
+/* synchronize_rcu() to avoid writes from page_ref_add_unless() */
+#define VMEMMAP_SYNCHRONIZE_RCU BIT(2)
unsigned long flags;
};
@@ -457,6 +459,9 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
if (!folio_test_hugetlb_vmemmap_optimized(folio))
return 0;
+ if (flags & VMEMMAP_SYNCHRONIZE_RCU)
+ synchronize_rcu();
+
vmemmap_end = vmemmap_start + hugetlb_vmemmap_size(h);
vmemmap_reuse = vmemmap_start;
vmemmap_start += HUGETLB_VMEMMAP_RESERVE_SIZE;
@@ -489,10 +494,7 @@ static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
*/
int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
{
- /* avoid writes from page_ref_add_unless() while unfolding vmemmap */
- synchronize_rcu();
-
- return __hugetlb_vmemmap_restore_folio(h, folio, 0);
+ return __hugetlb_vmemmap_restore_folio(h, folio, VMEMMAP_SYNCHRONIZE_RCU);
}
/**
@@ -515,14 +517,14 @@ long hugetlb_vmemmap_restore_folios(const struct hstate *h,
struct folio *folio, *t_folio;
long restored = 0;
long ret = 0;
-
- /* avoid writes from page_ref_add_unless() while unfolding vmemmap */
- synchronize_rcu();
+ unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
if (folio_test_hugetlb_vmemmap_optimized(folio)) {
- ret = __hugetlb_vmemmap_restore_folio(h, folio,
- VMEMMAP_REMAP_NO_TLB_FLUSH);
+ ret = __hugetlb_vmemmap_restore_folio(h, folio, flags);
+ /* only need to synchronize_rcu() once for each batch */
+ flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
+
if (ret)
break;
restored++;
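The flag handling above implements a "pay the expensive synchronization only once per batch" pattern: the first folio in the list triggers synchronize_rcu(), then the bit is cleared so later folios skip it. A minimal self-contained model follows; the flag value and the stand-in sync step are assumptions.

/* Model of the once-per-batch synchronization used with
 * VMEMMAP_SYNCHRONIZE_RCU; the flag value and fake sync are assumptions. */
#include <stdio.h>

#define SYNC_ONCE 0x4	/* stands in for VMEMMAP_SYNCHRONIZE_RCU */

static void restore_one(int idx, unsigned long flags)
{
	if (flags & SYNC_ONCE)
		printf("expensive sync before folio %d\n", idx);
	printf("restored folio %d\n", idx);
}

int main(void)
{
	unsigned long flags = SYNC_ONCE;

	for (int i = 0; i < 3; i++) {
		restore_one(i, flags);
		flags &= ~SYNC_ONCE;	/* only the first iteration pays the sync */
	}
	return 0;
}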
@@ -570,6 +572,9 @@ static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
return ret;
static_branch_inc(&hugetlb_optimize_vmemmap_key);
+
+ if (flags & VMEMMAP_SYNCHRONIZE_RCU)
+ synchronize_rcu();
/*
* Very Subtle
* If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
@@ -617,10 +622,7 @@ void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
{
LIST_HEAD(vmemmap_pages);
- /* avoid writes from page_ref_add_unless() while folding vmemmap */
- synchronize_rcu();
-
- __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
+ __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, VMEMMAP_SYNCHRONIZE_RCU);
free_vmemmap_page_list(&vmemmap_pages);
}
@@ -647,6 +649,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
{
struct folio *folio;
LIST_HEAD(vmemmap_pages);
+ unsigned long flags = VMEMMAP_REMAP_NO_TLB_FLUSH | VMEMMAP_SYNCHRONIZE_RCU;
list_for_each_entry(folio, folio_list, lru) {
int ret = hugetlb_vmemmap_split_folio(h, folio);
@@ -663,14 +666,12 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
flush_tlb_all();
- /* avoid writes from page_ref_add_unless() while folding vmemmap */
- synchronize_rcu();
-
list_for_each_entry(folio, folio_list, lru) {
int ret;
- ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
- VMEMMAP_REMAP_NO_TLB_FLUSH);
+ ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
+ /* only need to synchronize_rcu() once for each batch */
+ flags &= ~VMEMMAP_SYNCHRONIZE_RCU;
/*
* Pages to be freed may have been accumulated. If we
@@ -684,8 +685,7 @@ void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_l
flush_tlb_all();
free_vmemmap_page_list(&vmemmap_pages);
INIT_LIST_HEAD(&vmemmap_pages);
- __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
- VMEMMAP_REMAP_NO_TLB_FLUSH);
+ __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, flags);
}
}
diff --git a/mm/internal.h b/mm/internal.h
index a963f67d3452..93083bbeeefa 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -8,13 +8,19 @@
#define __MM_INTERNAL_H
#include <linux/fs.h>
+#include <linux/khugepaged.h>
#include <linux/mm.h>
+#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/swap_cgroup.h>
#include <linux/tracepoint-defs.h>
+/* Internal core VMA manipulation functions. */
+#include "vma.h"
+
struct folio_batch;
/*
@@ -270,18 +276,22 @@ static inline int swap_pte_batch(pte_t *start_ptep, int max_nr, pte_t pte)
{
pte_t expected_pte = pte_next_swp_offset(pte);
const pte_t *end_ptep = start_ptep + max_nr;
+ swp_entry_t entry = pte_to_swp_entry(pte);
pte_t *ptep = start_ptep + 1;
+ unsigned short cgroup_id;
VM_WARN_ON(max_nr < 1);
VM_WARN_ON(!is_swap_pte(pte));
- VM_WARN_ON(non_swap_entry(pte_to_swp_entry(pte)));
+ VM_WARN_ON(non_swap_entry(entry));
+ cgroup_id = lookup_swap_cgroup_id(entry);
while (ptep < end_ptep) {
pte = ptep_get(ptep);
if (!pte_same(pte, expected_pte))
break;
-
+ if (lookup_swap_cgroup_id(pte_to_swp_entry(pte)) != cgroup_id)
+ break;
expected_pte = pte_next_swp_offset(expected_pte);
ptep++;
}
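The extra check added above ends a batch as soon as a PTE's swap cgroup id differs from the first entry's; here is a tiny self-contained model of that "batch while the attribute matches" rule, with made-up ids.

/* Self-contained model of the batching rule: a run of entries only batches
 * while the looked-up cgroup id stays identical to the first one. */
#include <stdio.h>

static int batch_len(const int *cgroup_ids, int max_nr)
{
	int nr = 1;

	while (nr < max_nr && cgroup_ids[nr] == cgroup_ids[0])
		nr++;
	return nr;
}

int main(void)
{
	int ids[] = { 7, 7, 7, 3, 7 };

	printf("batch length: %d\n", batch_len(ids, 5));	/* prints 3 */
	return 0;
}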
@@ -415,9 +425,7 @@ extern unsigned long highest_memmap_pfn;
/*
* in mm/vmscan.c:
*/
-bool isolate_lru_page(struct page *page);
bool folio_isolate_lru(struct folio *folio);
-void putback_lru_page(struct page *page);
void folio_putback_lru(struct folio *folio);
extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason);
@@ -787,37 +795,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype)
return list_empty(&area->free_list[migratetype]);
}
-/*
- * These three helpers classifies VMAs for virtual memory accounting.
- */
-
-/*
- * Executable code area - executable, not writable, not stack
- */
-static inline bool is_exec_mapping(vm_flags_t flags)
-{
- return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
-}
-
-/*
- * Stack area (including shadow stacks)
- *
- * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
- * do_mmap() forbids all other combinations.
- */
-static inline bool is_stack_mapping(vm_flags_t flags)
-{
- return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
-}
-
-/*
- * Data area - private, writable, not stack
- */
-static inline bool is_data_mapping(vm_flags_t flags)
-{
- return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
-}
-
/* mm/util.c */
struct anon_vma *folio_anon_vma(struct folio *folio);
@@ -1078,6 +1055,8 @@ static inline int find_next_best_node(int node, nodemask_t *used_node_mask)
/*
* mm/memory-failure.c
*/
+#ifdef CONFIG_MEMORY_FAILURE
+void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu);
void shake_folio(struct folio *folio);
extern int hwpoison_filter(struct page *p);
@@ -1098,6 +1077,12 @@ void add_to_kill_ksm(struct task_struct *tsk, struct page *p,
unsigned long ksm_addr);
unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
+#else
+static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+{
+}
+#endif
+
extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long,
unsigned long, unsigned long);
@@ -1174,7 +1159,6 @@ static inline void flush_tlb_batched_pending(struct mm_struct *mm)
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
extern const struct trace_print_flags pageflag_names[];
-extern const struct trace_print_flags pagetype_names[];
extern const struct trace_print_flags vmaflag_names[];
extern const struct trace_print_flags gfpflag_names[];
@@ -1226,11 +1210,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end);
void __vunmap_range_noflush(unsigned long start, unsigned long end);
-int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
- unsigned long addr, int page_nid, int *flags);
+int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
+ unsigned long addr, int *flags, bool writable,
+ int *last_cpupid);
void free_zone_device_folio(struct folio *folio);
-int migrate_device_coherent_page(struct page *page);
+int migrate_device_coherent_folio(struct folio *folio);
/*
* mm/gup.c
@@ -1246,13 +1231,6 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr,
void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
-/*
- * mm/mmap.c
- */
-struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta);
-
enum {
/* mark page accessed */
FOLL_TOUCH = 1 << 16,
@@ -1379,117 +1357,6 @@ static inline bool pte_needs_soft_dirty_wp(struct vm_area_struct *vma, pte_t pte
return vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte);
}
-static inline void vma_iter_config(struct vma_iterator *vmi,
- unsigned long index, unsigned long last)
-{
- __mas_set_range(&vmi->mas, index, last - 1);
-}
-
-static inline void vma_iter_reset(struct vma_iterator *vmi)
-{
- mas_reset(&vmi->mas);
-}
-
-static inline
-struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
-{
- return mas_prev_range(&vmi->mas, min);
-}
-
-static inline
-struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
-{
- return mas_next_range(&vmi->mas, max);
-}
-
-static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
- unsigned long max, unsigned long size)
-{
- return mas_empty_area(&vmi->mas, min, max - 1, size);
-}
-
-static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
- unsigned long max, unsigned long size)
-{
- return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
-}
-
-/*
- * VMA Iterator functions shared between nommu and mmap
- */
-static inline int vma_iter_prealloc(struct vma_iterator *vmi,
- struct vm_area_struct *vma)
-{
- return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
-}
-
-static inline void vma_iter_clear(struct vma_iterator *vmi)
-{
- mas_store_prealloc(&vmi->mas, NULL);
-}
-
-static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
-{
- return mas_walk(&vmi->mas);
-}
-
-/* Store a VMA with preallocated memory */
-static inline void vma_iter_store(struct vma_iterator *vmi,
- struct vm_area_struct *vma)
-{
-
-#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
- if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
- vmi->mas.index > vma->vm_start)) {
- pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
- vmi->mas.index, vma->vm_start, vma->vm_start,
- vma->vm_end, vmi->mas.index, vmi->mas.last);
- }
- if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
- vmi->mas.last < vma->vm_start)) {
- pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
- vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
- vmi->mas.index, vmi->mas.last);
- }
-#endif
-
- if (vmi->mas.status != ma_start &&
- ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
- vma_iter_invalidate(vmi);
-
- __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
- mas_store_prealloc(&vmi->mas, vma);
-}
-
-static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
- struct vm_area_struct *vma, gfp_t gfp)
-{
- if (vmi->mas.status != ma_start &&
- ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
- vma_iter_invalidate(vmi);
-
- __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
- mas_store_gfp(&vmi->mas, vma, gfp);
- if (unlikely(mas_is_err(&vmi->mas)))
- return -ENOMEM;
-
- return 0;
-}
-
-/*
- * VMA lock generalization
- */
-struct vma_prepare {
- struct vm_area_struct *vma;
- struct vm_area_struct *adj_next;
- struct file *file;
- struct address_space *mapping;
- struct anon_vma *anon_vma;
- struct vm_area_struct *insert;
- struct vm_area_struct *remove;
- struct vm_area_struct *remove2;
-};
-
void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid);
@@ -1506,27 +1373,11 @@ static inline int can_do_mseal(unsigned long flags)
return 0;
}
-bool can_modify_mm(struct mm_struct *mm, unsigned long start,
- unsigned long end);
-bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
- unsigned long end, int behavior);
#else
static inline int can_do_mseal(unsigned long flags)
{
return -EPERM;
}
-
-static inline bool can_modify_mm(struct mm_struct *mm, unsigned long start,
- unsigned long end)
-{
- return true;
-}
-
-static inline bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start,
- unsigned long end, int behavior)
-{
- return true;
-}
#endif
#ifdef CONFIG_SHRINKER_DEBUG
@@ -1578,13 +1429,18 @@ static inline void shrinker_debugfs_remove(struct dentry *debugfs_entry,
void workingset_update_node(struct xa_node *node);
extern struct list_lru shadow_nodes;
-struct unlink_vma_file_batch {
- int count;
- struct vm_area_struct *vmas[8];
-};
+/* mremap.c */
+unsigned long move_page_tables(struct vm_area_struct *vma,
+ unsigned long old_addr, struct vm_area_struct *new_vma,
+ unsigned long new_addr, unsigned long len,
+ bool need_rmap_locks, bool for_stack);
-void unlink_file_vma_batch_init(struct unlink_vma_file_batch *);
-void unlink_file_vma_batch_add(struct unlink_vma_file_batch *, struct vm_area_struct *);
-void unlink_file_vma_batch_final(struct unlink_vma_file_batch *);
+#ifdef CONFIG_UNACCEPTED_MEMORY
+void accept_page(struct page *page);
+#else /* CONFIG_UNACCEPTED_MEMORY */
+static inline void accept_page(struct page *page)
+{
+}
+#endif /* CONFIG_UNACCEPTED_MEMORY */
#endif /* __MM_INTERNAL_H */
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index c5cb54fc696d..67fc321db79b 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -99,6 +99,10 @@ module_param_cb(sample_interval, &sample_interval_param_ops, &kfence_sample_inte
static unsigned long kfence_skip_covered_thresh __read_mostly = 75;
module_param_named(skip_covered_thresh, kfence_skip_covered_thresh, ulong, 0644);
+/* Allocation burst count: number of excess KFENCE allocations per sample. */
+static unsigned int kfence_burst __read_mostly;
+module_param_named(burst, kfence_burst, uint, 0644);
+
/* If true, use a deferrable timer. */
static bool kfence_deferrable __read_mostly = IS_ENABLED(CONFIG_KFENCE_DEFERRABLE);
module_param_named(deferrable, kfence_deferrable, bool, 0444);
@@ -269,6 +273,13 @@ static inline unsigned long metadata_to_pageaddr(const struct kfence_metadata *m
return pageaddr;
}
+static inline bool kfence_obj_allocated(const struct kfence_metadata *meta)
+{
+ enum kfence_object_state state = READ_ONCE(meta->state);
+
+ return state == KFENCE_OBJECT_ALLOCATED || state == KFENCE_OBJECT_RCU_FREEING;
+}
+
/*
* Update the object's metadata state, including updating the alloc/free stacks
* depending on the state transition.
@@ -278,10 +289,14 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex
unsigned long *stack_entries, size_t num_stack_entries)
{
struct kfence_track *track =
- next == KFENCE_OBJECT_FREED ? &meta->free_track : &meta->alloc_track;
+ next == KFENCE_OBJECT_ALLOCATED ? &meta->alloc_track : &meta->free_track;
lockdep_assert_held(&meta->lock);
+	/* The stack was already saved when the object entered RCU freeing; skip. */
+ if (READ_ONCE(meta->state) == KFENCE_OBJECT_RCU_FREEING)
+ goto out;
+
if (stack_entries) {
memcpy(track->stack_entries, stack_entries,
num_stack_entries * sizeof(stack_entries[0]));
@@ -297,6 +312,7 @@ metadata_update_state(struct kfence_metadata *meta, enum kfence_object_state nex
track->cpu = raw_smp_processor_id();
track->ts_nsec = local_clock(); /* Same source as printk timestamps. */
+out:
/*
* Pairs with READ_ONCE() in
* kfence_shutdown_cache(),
@@ -502,7 +518,7 @@ static void kfence_guarded_free(void *addr, struct kfence_metadata *meta, bool z
raw_spin_lock_irqsave(&meta->lock, flags);
- if (meta->state != KFENCE_OBJECT_ALLOCATED || meta->addr != (unsigned long)addr) {
+ if (!kfence_obj_allocated(meta) || meta->addr != (unsigned long)addr) {
/* Invalid or double-free, bail out. */
atomic_long_inc(&counters[KFENCE_COUNTER_BUGS]);
kfence_report_error((unsigned long)addr, false, NULL, meta,
@@ -780,7 +796,7 @@ static void kfence_check_all_canary(void)
for (i = 0; i < CONFIG_KFENCE_NUM_OBJECTS; i++) {
struct kfence_metadata *meta = &kfence_metadata[i];
- if (meta->state == KFENCE_OBJECT_ALLOCATED)
+ if (kfence_obj_allocated(meta))
check_canary(meta);
}
}
@@ -827,12 +843,12 @@ static void toggle_allocation_gate(struct work_struct *work)
if (!READ_ONCE(kfence_enabled))
return;
- atomic_set(&kfence_allocation_gate, 0);
+ atomic_set(&kfence_allocation_gate, -kfence_burst);
#ifdef CONFIG_KFENCE_STATIC_KEYS
/* Enable static key, and await allocation to happen. */
static_branch_enable(&kfence_allocation_key);
- wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate));
+ wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate) > 0);
/* Disable static key and reset timer. */
static_branch_disable(&kfence_allocation_key);
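A small self-contained model of the burst logic above: seeding the gate at -burst lets burst+1 allocations pass per sample interval before the gate reads above 1. The counter model is an assumption mirroring atomic_set()/atomic_inc_return() semantics, not the kernel code itself.

/* Model of the KFENCE allocation gate with burst: gate starts at -burst,
 * each attempt increments and tests; only values <= 1 are admitted. */
#include <stdio.h>

int main(void)
{
	int burst = 2;
	int gate = -burst;	/* toggle_allocation_gate(): atomic_set(..., -kfence_burst) */
	int admitted = 0;

	for (int i = 0; i < 6; i++) {
		int v = ++gate;	/* __kfence_alloc(): atomic_inc_return() */

		if (v <= 1)
			admitted++;
	}
	printf("admitted %d of 6 attempts\n", admitted);	/* prints 3 */
	return 0;
}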
@@ -1006,12 +1022,11 @@ void kfence_shutdown_cache(struct kmem_cache *s)
* the lock will not help, as different critical section
* serialization will have the same outcome.
*/
- if (READ_ONCE(meta->cache) != s ||
- READ_ONCE(meta->state) != KFENCE_OBJECT_ALLOCATED)
+ if (READ_ONCE(meta->cache) != s || !kfence_obj_allocated(meta))
continue;
raw_spin_lock_irqsave(&meta->lock, flags);
- in_use = meta->cache == s && meta->state == KFENCE_OBJECT_ALLOCATED;
+ in_use = meta->cache == s && kfence_obj_allocated(meta);
raw_spin_unlock_irqrestore(&meta->lock, flags);
if (in_use) {
@@ -1052,6 +1067,7 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
unsigned long stack_entries[KFENCE_STACK_DEPTH];
size_t num_stack_entries;
u32 alloc_stack_hash;
+ int allocation_gate;
/*
* Perform size check before switching kfence_allocation_gate, so that
@@ -1080,14 +1096,15 @@ void *__kfence_alloc(struct kmem_cache *s, size_t size, gfp_t flags)
if (s->flags & SLAB_SKIP_KFENCE)
return NULL;
- if (atomic_inc_return(&kfence_allocation_gate) > 1)
+ allocation_gate = atomic_inc_return(&kfence_allocation_gate);
+ if (allocation_gate > 1)
return NULL;
#ifdef CONFIG_KFENCE_STATIC_KEYS
/*
* waitqueue_active() is fully ordered after the update of
* kfence_allocation_gate per atomic_inc_return().
*/
- if (waitqueue_active(&allocation_wait)) {
+ if (allocation_gate == 1 && waitqueue_active(&allocation_wait)) {
/*
* Calling wake_up() here may deadlock when allocations happen
* from within timer code. Use an irq_work to defer it.
@@ -1154,11 +1171,19 @@ void __kfence_free(void *addr)
* the object, as the object page may be recycled for other-typed
* objects once it has been freed. meta->cache may be NULL if the cache
* was destroyed.
+ * Save the stack trace here so that reports show where the user freed
+ * the object.
*/
- if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU)))
+ if (unlikely(meta->cache && (meta->cache->flags & SLAB_TYPESAFE_BY_RCU))) {
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&meta->lock, flags);
+ metadata_update_state(meta, KFENCE_OBJECT_RCU_FREEING, NULL, 0);
+ raw_spin_unlock_irqrestore(&meta->lock, flags);
call_rcu(&meta->rcu_head, rcu_guarded_free);
- else
+ } else {
kfence_guarded_free(addr, meta, false);
+ }
}
bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs *regs)
@@ -1182,14 +1207,14 @@ bool kfence_handle_page_fault(unsigned long addr, bool is_write, struct pt_regs
int distance = 0;
meta = addr_to_metadata(addr - PAGE_SIZE);
- if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ if (meta && kfence_obj_allocated(meta)) {
to_report = meta;
/* Data race ok; distance calculation approximate. */
distance = addr - data_race(meta->addr + meta->size);
}
meta = addr_to_metadata(addr + PAGE_SIZE);
- if (meta && READ_ONCE(meta->state) == KFENCE_OBJECT_ALLOCATED) {
+ if (meta && kfence_obj_allocated(meta)) {
/* Data race ok; distance calculation approximate. */
if (!to_report || distance > data_race(meta->addr) - addr)
to_report = meta;
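Note on the kfence_allocation_gate change above: seeding the gate with -kfence_burst lets a single toggle admit 1 + kfence_burst sample allocations, and only the increment that brings the gate to 1 needs to wake the toggling worker. A minimal userspace sketch of that counting, assuming stdatomic as a stand-in for the kernel atomics (not kernel code):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int gate;

/* Returns 1 if this allocation attempt is sampled, 0 if it is skipped. */
static int try_sample(void)
{
	int v = atomic_fetch_add(&gate, 1) + 1;	/* like atomic_inc_return() */

	if (v > 1)
		return 0;	/* gate already used up for this period */
	if (v == 1)
		printf("gate reached 1: wake the toggling worker\n");
	return 1;
}

int main(void)
{
	int burst = 2, i, sampled = 0;

	/* like atomic_set(&kfence_allocation_gate, -kfence_burst) */
	atomic_store(&gate, -burst);
	for (i = 0; i < 10; i++)
		sampled += try_sample();
	printf("sampled %d of 10 attempts (expected %d)\n", sampled, burst + 1);
	return 0;
}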
diff --git a/mm/kfence/kfence.h b/mm/kfence/kfence.h
index db87a05047bd..dfba5ea06b01 100644
--- a/mm/kfence/kfence.h
+++ b/mm/kfence/kfence.h
@@ -38,6 +38,7 @@
enum kfence_object_state {
KFENCE_OBJECT_UNUSED, /* Object is unused. */
KFENCE_OBJECT_ALLOCATED, /* Object is currently allocated. */
+ KFENCE_OBJECT_RCU_FREEING, /* Object was allocated, and is now being freed via RCU. */
KFENCE_OBJECT_FREED, /* Object was allocated, and then freed. */
};
diff --git a/mm/kfence/report.c b/mm/kfence/report.c
index c509aed326ce..451991a3a8f2 100644
--- a/mm/kfence/report.c
+++ b/mm/kfence/report.c
@@ -16,6 +16,7 @@
#include <linux/sprintf.h>
#include <linux/stacktrace.h>
#include <linux/string.h>
+#include <linux/sched/clock.h>
#include <trace/events/error_report.h>
#include <asm/kfence.h>
@@ -108,11 +109,15 @@ static void kfence_print_stack(struct seq_file *seq, const struct kfence_metadat
const struct kfence_track *track = show_alloc ? &meta->alloc_track : &meta->free_track;
u64 ts_sec = track->ts_nsec;
unsigned long rem_nsec = do_div(ts_sec, NSEC_PER_SEC);
+ u64 interval_nsec = local_clock() - meta->alloc_track.ts_nsec;
+ unsigned long rem_interval_nsec = do_div(interval_nsec, NSEC_PER_SEC);
/* Timestamp matches printk timestamp format. */
- seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus:\n",
- show_alloc ? "allocated" : "freed", track->pid,
- track->cpu, (unsigned long)ts_sec, rem_nsec / 1000);
+ seq_con_printf(seq, "%s by task %d on cpu %d at %lu.%06lus (%lu.%06lus ago):\n",
+ show_alloc ? "allocated" : meta->state == KFENCE_OBJECT_RCU_FREEING ?
+ "rcu freeing" : "freed", track->pid,
+ track->cpu, (unsigned long)ts_sec, rem_nsec / 1000,
+ (unsigned long)interval_nsec, rem_interval_nsec / 1000);
if (track->num_stack_entries) {
/* Skip allocation/free internals stack. */
@@ -145,7 +150,7 @@ void kfence_print_object(struct seq_file *seq, const struct kfence_metadata *met
kfence_print_stack(seq, meta, true);
- if (meta->state == KFENCE_OBJECT_FREED) {
+ if (meta->state == KFENCE_OBJECT_FREED || meta->state == KFENCE_OBJECT_RCU_FREEING) {
seq_con_printf(seq, "\n");
kfence_print_stack(seq, meta, false);
}
@@ -314,7 +319,7 @@ bool __kfence_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *sla
kpp->kp_slab_cache = meta->cache;
kpp->kp_objp = (void *)meta->addr;
kfence_to_kp_stack(&meta->alloc_track, kpp->kp_stack);
- if (meta->state == KFENCE_OBJECT_FREED)
+ if (meta->state == KFENCE_OBJECT_FREED || meta->state == KFENCE_OBJECT_RCU_FREEING)
kfence_to_kp_stack(&meta->free_track, kpp->kp_free_stack);
/* get_stack_skipnr() ensures the first entry is outside allocator. */
kpp->kp_ret = kpp->kp_stack[0];
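Note on the report format change above: the interval printed as "(%lu.%06lus ago)" is local_clock() minus the allocation timestamp, split into whole seconds and microseconds the same way the do_div() call and "rem / 1000" do. A standalone illustration of that arithmetic with made-up timestamps:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	uint64_t now_ns = 5123456789ULL;	/* pretend local_clock() value */
	uint64_t alloc_ns = 1000000000ULL;	/* pretend meta->alloc_track.ts_nsec */
	uint64_t interval = now_ns - alloc_ns;
	unsigned long sec = interval / NSEC_PER_SEC;
	unsigned long usec = (interval % NSEC_PER_SEC) / 1000;

	printf("allocated %lu.%06lus ago\n", sec, usec);	/* prints 4.123456s ago */
	return 0;
}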
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cdd1d8655a76..f9c39898eaff 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -85,7 +85,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*
* Note that these are only respected if collapse was initiated by khugepaged.
*/
-static unsigned int khugepaged_max_ptes_none __read_mostly;
+unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
static unsigned int khugepaged_max_ptes_shared __read_mostly;
@@ -546,12 +546,14 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
static bool is_refcount_suitable(struct folio *folio)
{
- int expected_refcount;
+ int expected_refcount = folio_mapcount(folio);
- expected_refcount = folio_mapcount(folio);
- if (folio_test_swapcache(folio))
+ if (!folio_test_anon(folio) || folio_test_swapcache(folio))
expected_refcount += folio_nr_pages(folio);
+ if (folio_test_private(folio))
+ expected_refcount++;
+
return folio_ref_count(folio) == expected_refcount;
}
@@ -625,8 +627,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * We can do it before isolate_lru_page because the
- * page can't be freed from under us. NOTE: PG_lock
+ * We can do it before folio_isolate_lru because the
+ * folio can't be freed from under us. NOTE: PG_lock
* is needed to serialize against split_huge_page
* when invoked from the VM.
*/
@@ -1235,6 +1237,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address,
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
+ deferred_split_folio(folio, false);
spin_unlock(pmd_ptl);
folio = NULL;
@@ -1841,7 +1844,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
} while (1);
- for (index = start; index < end; index++) {
+ for (index = start; index < end;) {
xas_set(&xas, index);
folio = xas_load(&xas);
@@ -1860,18 +1863,19 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
}
}
nr_none++;
+ index++;
continue;
}
if (xa_is_value(folio) || !folio_test_uptodate(folio)) {
xas_unlock_irq(&xas);
/* swap in or instantiate fallocated page */
- if (shmem_get_folio(mapping->host, index,
+ if (shmem_get_folio(mapping->host, index, 0,
&folio, SGP_NOALLOC)) {
result = SCAN_FAIL;
goto xa_unlocked;
}
- /* drain lru cache to help isolate_lru_page() */
+ /* drain lru cache to help folio_isolate_lru() */
lru_add_drain();
} else if (folio_trylock(folio)) {
folio_get(folio);
@@ -1886,7 +1890,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
end - index);
- /* drain lru cache to help isolate_lru_page() */
+ /* drain lru cache to help folio_isolate_lru() */
lru_add_drain();
folio = filemap_lock_folio(mapping, index);
if (IS_ERR(folio)) {
@@ -1941,12 +1945,10 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* we locked the first folio, then a THP might be there already.
* This will be discovered on the first iteration.
*/
- if (folio_test_large(folio)) {
- result = folio_order(folio) == HPAGE_PMD_ORDER &&
- folio->index == start
- /* Maybe PMD-mapped */
- ? SCAN_PTE_MAPPED_HUGEPAGE
- : SCAN_PAGE_COMPOUND;
+ if (folio_order(folio) == HPAGE_PMD_ORDER &&
+ folio->index == start) {
+ /* Maybe PMD-mapped */
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
goto out_unlock;
}
@@ -1986,9 +1988,9 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
VM_BUG_ON_FOLIO(folio != xa_load(xas.xa, index), folio);
/*
- * We control three references to the folio:
+ * We control 2 + nr_pages references to the folio:
* - we hold a pin on it;
- * - one reference from page cache;
+ * - nr_pages references from page cache;
* - one from lru_isolate_folio;
* If those are the only references, then any new usage
* of the folio will have to fetch it from the page
@@ -1996,7 +1998,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* truncate, so any new usage will be blocked until we
* unlock folio after collapse/during rollback.
*/
- if (folio_ref_count(folio) != 3) {
+ if (folio_ref_count(folio) != 2 + folio_nr_pages(folio)) {
result = SCAN_PAGE_COUNT;
xas_unlock_irq(&xas);
folio_putback_lru(folio);
@@ -2007,6 +2009,7 @@ static int collapse_file(struct mm_struct *mm, unsigned long addr,
* Accumulate the folios that are being collapsed.
*/
list_add_tail(&folio->lru, &pagelist);
+ index += folio_nr_pages(folio);
continue;
out_unlock:
folio_unlock(folio);
@@ -2054,17 +2057,22 @@ xa_unlocked:
index = start;
dst = folio_page(new_folio, 0);
list_for_each_entry(folio, &pagelist, lru) {
+ int i, nr_pages = folio_nr_pages(folio);
+
while (index < folio->index) {
clear_highpage(dst);
index++;
dst++;
}
- if (copy_mc_highpage(dst, folio_page(folio, 0)) > 0) {
- result = SCAN_COPY_MC;
- goto rollback;
+
+ for (i = 0; i < nr_pages; i++) {
+ if (copy_mc_highpage(dst, folio_page(folio, i)) > 0) {
+ result = SCAN_COPY_MC;
+ goto rollback;
+ }
+ index++;
+ dst++;
}
- index++;
- dst++;
}
while (index < end) {
clear_highpage(dst);
@@ -2179,7 +2187,7 @@ immap_locked:
folio_clear_active(folio);
folio_clear_unevictable(folio);
folio_unlock(folio);
- folio_put_refs(folio, 3);
+ folio_put_refs(folio, 2 + folio_nr_pages(folio));
}
goto out;
@@ -2254,16 +2262,10 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
continue;
}
- /*
- * TODO: khugepaged should compact smaller compound pages
- * into a PMD sized page
- */
- if (folio_test_large(folio)) {
- result = folio_order(folio) == HPAGE_PMD_ORDER &&
- folio->index == start
- /* Maybe PMD-mapped */
- ? SCAN_PTE_MAPPED_HUGEPAGE
- : SCAN_PAGE_COMPOUND;
+ if (folio_order(folio) == HPAGE_PMD_ORDER &&
+ folio->index == start) {
+ /* Maybe PMD-mapped */
+ result = SCAN_PTE_MAPPED_HUGEPAGE;
/*
* For SCAN_PTE_MAPPED_HUGEPAGE, further processing
* by the caller won't touch the page cache, and so
@@ -2285,8 +2287,7 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr,
break;
}
- if (folio_ref_count(folio) !=
- 1 + folio_mapcount(folio) + folio_test_private(folio)) {
+ if (!is_refcount_suitable(folio)) {
result = SCAN_PAGE_COUNT;
break;
}
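Note on the refcount expectations above: is_refcount_suitable() now accounts for large folios (mapcount, plus nr_pages cache references for non-anon or swapcache folios, plus one for private data), and collapse_file() expects 2 + nr_pages references once it holds a pin and an LRU isolation. A small userspace model of that arithmetic; the struct below is a stand-in, not a kernel type:

#include <stdbool.h>
#include <stdio.h>

struct folio_model {
	int nr_pages;		/* folio_nr_pages() */
	int mapcount;		/* folio_mapcount() */
	bool anon;		/* folio_test_anon() */
	bool swapcache;		/* folio_test_swapcache() */
	bool has_private;	/* folio_test_private() */
};

/* Mirrors the reworked is_refcount_suitable() expectation. */
static int expected_refcount(const struct folio_model *f)
{
	int expected = f->mapcount;

	if (!f->anon || f->swapcache)
		expected += f->nr_pages;	/* page cache / swap cache references */
	if (f->has_private)
		expected++;			/* private data, e.g. buffers */
	return expected;
}

int main(void)
{
	/* A clean, unmapped 16-page shmem folio sitting only in the page cache. */
	struct folio_model f = { .nr_pages = 16, .mapcount = 0, .anon = false };

	printf("scan-time expectation: %d\n", expected_refcount(&f));	/* 16 */
	/* After collapse_file() pins and isolates it: pin + LRU + page cache. */
	printf("collapse-time expectation: %d\n", 2 + f.nr_pages);	/* 18 */
	return 0;
}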
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 764b08100570..0400f5e8ac60 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -224,6 +224,10 @@ static int kmemleak_error;
static unsigned long min_addr = ULONG_MAX;
static unsigned long max_addr;
+/* minimum and maximum address that may be valid per-CPU pointers */
+static unsigned long min_percpu_addr = ULONG_MAX;
+static unsigned long max_percpu_addr;
+
static struct task_struct *scan_thread;
/* used to avoid reporting of recently allocated objects */
static unsigned long jiffies_min_age;
@@ -294,13 +298,20 @@ static void hex_dump_object(struct seq_file *seq,
const u8 *ptr = (const u8 *)object->pointer;
size_t len;
- if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU)))
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
return;
+ if (object->flags & OBJECT_PERCPU)
+ ptr = (const u8 *)this_cpu_ptr((void __percpu *)object->pointer);
+
/* limit the number of lines to HEX_MAX_LINES */
len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE);
- warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
+ if (object->flags & OBJECT_PERCPU)
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes on cpu %d):\n",
+ len, raw_smp_processor_id());
+ else
+ warn_or_seq_printf(seq, " hex dump (first %zu bytes):\n", len);
kasan_disable_current();
warn_or_seq_hex_dump(seq, DUMP_PREFIX_NONE, HEX_ROW_SIZE,
HEX_GROUP_SIZE, kasan_reset_tag((void *)ptr), len, HEX_ASCII);
@@ -695,10 +706,14 @@ static int __link_object(struct kmemleak_object *object, unsigned long ptr,
untagged_ptr = (unsigned long)kasan_reset_tag((void *)ptr);
/*
- * Only update min_addr and max_addr with object
- * storing virtual address.
+ * Only update min_addr and max_addr with objects storing a virtual
+ * address, and update min_percpu_addr and max_percpu_addr for
+ * per-CPU objects.
*/
- if (!(objflags & (OBJECT_PHYS | OBJECT_PERCPU))) {
+ if (objflags & OBJECT_PERCPU) {
+ min_percpu_addr = min(min_percpu_addr, untagged_ptr);
+ max_percpu_addr = max(max_percpu_addr, untagged_ptr + size);
+ } else if (!(objflags & OBJECT_PHYS)) {
min_addr = min(min_addr, untagged_ptr);
max_addr = max(max_addr, untagged_ptr + size);
}
@@ -1055,12 +1070,8 @@ void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
{
pr_debug("%s(0x%px, %zu)\n", __func__, ptr, size);
- /*
- * Percpu allocations are only scanned and not reported as leaks
- * (min_count is set to 0).
- */
- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
- create_object_percpu((unsigned long)ptr, size, 0, gfp);
+ if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+ create_object_percpu((__force unsigned long)ptr, size, 0, gfp);
}
EXPORT_SYMBOL_GPL(kmemleak_alloc_percpu);
@@ -1134,8 +1145,8 @@ void __ref kmemleak_free_percpu(const void __percpu *ptr)
{
pr_debug("%s(0x%px)\n", __func__, ptr);
- if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
- delete_object_full((unsigned long)ptr, OBJECT_PERCPU);
+ if (kmemleak_free_enabled && ptr && !IS_ERR_PCPU(ptr))
+ delete_object_full((__force unsigned long)ptr, OBJECT_PERCPU);
}
EXPORT_SYMBOL_GPL(kmemleak_free_percpu);
@@ -1304,12 +1315,23 @@ static bool update_checksum(struct kmemleak_object *object)
{
u32 old_csum = object->checksum;
- if (WARN_ON_ONCE(object->flags & (OBJECT_PHYS | OBJECT_PERCPU)))
+ if (WARN_ON_ONCE(object->flags & OBJECT_PHYS))
return false;
kasan_disable_current();
kcsan_disable_current();
- object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
+ if (object->flags & OBJECT_PERCPU) {
+ unsigned int cpu;
+
+ object->checksum = 0;
+ for_each_possible_cpu(cpu) {
+ void *ptr = per_cpu_ptr((void __percpu *)object->pointer, cpu);
+
+ object->checksum ^= crc32(0, kasan_reset_tag((void *)ptr), object->size);
+ }
+ } else {
+ object->checksum = crc32(0, kasan_reset_tag((void *)object->pointer), object->size);
+ }
kasan_enable_current();
kcsan_enable_current();
@@ -1340,6 +1362,64 @@ static void update_refs(struct kmemleak_object *object)
}
}
+static void pointer_update_refs(struct kmemleak_object *scanned,
+ unsigned long pointer, unsigned int objflags)
+{
+ struct kmemleak_object *object;
+ unsigned long untagged_ptr;
+ unsigned long excess_ref;
+
+ untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
+ if (objflags & OBJECT_PERCPU) {
+ if (untagged_ptr < min_percpu_addr || untagged_ptr >= max_percpu_addr)
+ return;
+ } else {
+ if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
+ return;
+ }
+
+ /*
+ * No need for get_object() here since we hold kmemleak_lock.
+ * object->use_count cannot be dropped to 0 while the object
+ * is still present in object_tree_root and object_list
+ * (with updates protected by kmemleak_lock).
+ */
+ object = __lookup_object(pointer, 1, objflags);
+ if (!object)
+ return;
+ if (object == scanned)
+ /* self referenced, ignore */
+ return;
+
+ /*
+ * Avoid the lockdep recursive warning on object->lock being
+ * previously acquired in scan_object(). These locks are
+ * enclosed by scan_mutex.
+ */
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ /* only pass surplus references (object already gray) */
+ if (color_gray(object)) {
+ excess_ref = object->excess_ref;
+ /* no need for update_refs() if object already gray */
+ } else {
+ excess_ref = 0;
+ update_refs(object);
+ }
+ raw_spin_unlock(&object->lock);
+
+ if (excess_ref) {
+ object = lookup_object(excess_ref, 0);
+ if (!object)
+ return;
+ if (object == scanned)
+ /* circular reference, ignore */
+ return;
+ raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
+ update_refs(object);
+ raw_spin_unlock(&object->lock);
+ }
+}
+
/*
* Memory scanning is a long process and it needs to be interruptible. This
* function checks whether such interrupt condition occurred.
@@ -1372,13 +1452,10 @@ static void scan_block(void *_start, void *_end,
unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER);
unsigned long *end = _end - (BYTES_PER_POINTER - 1);
unsigned long flags;
- unsigned long untagged_ptr;
raw_spin_lock_irqsave(&kmemleak_lock, flags);
for (ptr = start; ptr < end; ptr++) {
- struct kmemleak_object *object;
unsigned long pointer;
- unsigned long excess_ref;
if (scan_should_stop())
break;
@@ -1387,50 +1464,8 @@ static void scan_block(void *_start, void *_end,
pointer = *(unsigned long *)kasan_reset_tag((void *)ptr);
kasan_enable_current();
- untagged_ptr = (unsigned long)kasan_reset_tag((void *)pointer);
- if (untagged_ptr < min_addr || untagged_ptr >= max_addr)
- continue;
-
- /*
- * No need for get_object() here since we hold kmemleak_lock.
- * object->use_count cannot be dropped to 0 while the object
- * is still present in object_tree_root and object_list
- * (with updates protected by kmemleak_lock).
- */
- object = lookup_object(pointer, 1);
- if (!object)
- continue;
- if (object == scanned)
- /* self referenced, ignore */
- continue;
-
- /*
- * Avoid the lockdep recursive warning on object->lock being
- * previously acquired in scan_object(). These locks are
- * enclosed by scan_mutex.
- */
- raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
- /* only pass surplus references (object already gray) */
- if (color_gray(object)) {
- excess_ref = object->excess_ref;
- /* no need for update_refs() if object already gray */
- } else {
- excess_ref = 0;
- update_refs(object);
- }
- raw_spin_unlock(&object->lock);
-
- if (excess_ref) {
- object = lookup_object(excess_ref, 0);
- if (!object)
- continue;
- if (object == scanned)
- /* circular reference, ignore */
- continue;
- raw_spin_lock_nested(&object->lock, SINGLE_DEPTH_NESTING);
- update_refs(object);
- raw_spin_unlock(&object->lock);
- }
+ pointer_update_refs(scanned, pointer, 0);
+ pointer_update_refs(scanned, pointer, OBJECT_PERCPU);
}
raw_spin_unlock_irqrestore(&kmemleak_lock, flags);
}
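Note on the OBJECT_PERCPU handling above: per-CPU objects are now checksummed by XOR-combining a CRC over each CPU's copy, so a write on any CPU perturbs the object's checksum and keeps it out of the "unreferenced" report for another scan. A standalone sketch of that combining step; the bitwise CRC below is only a stand-in for the kernel's crc32():

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS 4
#define OBJ_SIZE 8

static unsigned char pcpu_copy[NR_CPUS][OBJ_SIZE];	/* "per-CPU" copies */

static uint32_t crc32_le(uint32_t crc, const unsigned char *p, size_t len)
{
	crc = ~crc;
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0xedb88320u & -(crc & 1u));
	}
	return ~crc;
}

/* Mirrors the OBJECT_PERCPU branch of update_checksum(): XOR of per-CPU CRCs. */
static uint32_t percpu_checksum(void)
{
	uint32_t csum = 0;

	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		csum ^= crc32_le(0, pcpu_copy[cpu], OBJ_SIZE);
	return csum;
}

int main(void)
{
	uint32_t before = percpu_checksum();

	pcpu_copy[2][3] = 0xff;		/* "CPU 2" writes to its copy */
	printf("checksum changed: %s\n",
	       percpu_checksum() != before ? "yes" : "no");
	return 0;
}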
diff --git a/mm/ksm.c b/mm/ksm.c
index 14d9e53b1ec2..a2e2a521df0a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -608,47 +608,6 @@ static inline bool ksm_test_exit(struct mm_struct *mm)
return atomic_read(&mm->mm_users) == 0;
}
-static int break_ksm_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long next,
- struct mm_walk *walk)
-{
- struct page *page = NULL;
- spinlock_t *ptl;
- pte_t *pte;
- pte_t ptent;
- int ret;
-
- pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
- if (!pte)
- return 0;
- ptent = ptep_get(pte);
- if (pte_present(ptent)) {
- page = vm_normal_page(walk->vma, addr, ptent);
- } else if (!pte_none(ptent)) {
- swp_entry_t entry = pte_to_swp_entry(ptent);
-
- /*
- * As KSM pages remain KSM pages until freed, no need to wait
- * here for migration to end.
- */
- if (is_migration_entry(entry))
- page = pfn_swap_entry_to_page(entry);
- }
- /* return 1 if the page is an normal ksm page or KSM-placed zero page */
- ret = (page && PageKsm(page)) || is_ksm_zero_pte(ptent);
- pte_unmap_unlock(pte, ptl);
- return ret;
-}
-
-static const struct mm_walk_ops break_ksm_ops = {
- .pmd_entry = break_ksm_pmd_entry,
- .walk_lock = PGWALK_RDLOCK,
-};
-
-static const struct mm_walk_ops break_ksm_lock_vma_ops = {
- .pmd_entry = break_ksm_pmd_entry,
- .walk_lock = PGWALK_WRLOCK,
-};
-
/*
* We use break_ksm to break COW on a ksm page by triggering unsharing,
* such that the ksm page will get replaced by an exclusive anonymous page.
@@ -665,16 +624,26 @@ static const struct mm_walk_ops break_ksm_lock_vma_ops = {
static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_vma)
{
vm_fault_t ret = 0;
- const struct mm_walk_ops *ops = lock_vma ?
- &break_ksm_lock_vma_ops : &break_ksm_ops;
+
+ if (lock_vma)
+ vma_start_write(vma);
do {
- int ksm_page;
+ bool ksm_page = false;
+ struct folio_walk fw;
+ struct folio *folio;
cond_resched();
- ksm_page = walk_page_range_vma(vma, addr, addr + 1, ops, NULL);
- if (WARN_ON_ONCE(ksm_page < 0))
- return ksm_page;
+ folio = folio_walk_start(&fw, vma, addr,
+ FW_MIGRATION | FW_ZEROPAGE);
+ if (folio) {
+ /* Small folio implies FW_LEVEL_PTE. */
+ if (!folio_test_large(folio) &&
+ (folio_test_ksm(folio) || is_ksm_zero_pte(fw.pte)))
+ ksm_page = true;
+ folio_walk_end(&fw, vma);
+ }
+
if (!ksm_page)
return 0;
ret = handle_mm_fault(vma, addr,
@@ -767,26 +736,28 @@ static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item)
struct mm_struct *mm = rmap_item->mm;
unsigned long addr = rmap_item->address;
struct vm_area_struct *vma;
- struct page *page;
+ struct page *page = NULL;
+ struct folio_walk fw;
+ struct folio *folio;
mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (!vma)
goto out;
- page = follow_page(vma, addr, FOLL_GET);
- if (IS_ERR_OR_NULL(page))
- goto out;
- if (is_zone_device_page(page))
- goto out_putpage;
- if (PageAnon(page)) {
+ folio = folio_walk_start(&fw, vma, addr, 0);
+ if (folio) {
+ if (!folio_is_zone_device(folio) &&
+ folio_test_anon(folio)) {
+ folio_get(folio);
+ page = fw.page;
+ }
+ folio_walk_end(&fw, vma);
+ }
+out:
+ if (page) {
flush_anon_page(vma, page, addr);
flush_dcache_page(page);
- } else {
-out_putpage:
- put_page(page);
-out:
- page = NULL;
}
mmap_read_unlock(mm);
return page;
@@ -938,12 +909,13 @@ again:
*/
while (!folio_try_get(folio)) {
/*
- * Another check for page->mapping != expected_mapping would
- * work here too. We have chosen the !PageSwapCache test to
- * optimize the common case, when the page is or is about to
- * be freed: PageSwapCache is cleared (under spin_lock_irq)
- * in the ref_freeze section of __remove_mapping(); but Anon
- * folio->mapping reset to NULL later, in free_pages_prepare().
+ * Another check for folio->mapping != expected_mapping
+ * would work here too. We have chosen to test the
+ * swapcache flag to optimize the common case, when the
+ * folio is or is about to be freed: the swapcache flag
+ * is cleared (under spin_lock_irq) in the ref_freeze
+ * section of __remove_mapping(); but anon folio->mapping
+ * is reset to NULL later, in free_pages_prepare().
*/
if (!folio_test_swapcache(folio))
goto stale;
@@ -974,7 +946,7 @@ again:
stale:
/*
- * We come here from above when page->mapping or !PageSwapCache
+ * We come here from above when folio->mapping or the swapcache flag
* suggests that the node is stale; but it might be under migration.
* We need smp_rmb(), matching the smp_wmb() in folio_migrate_ksm(),
* before checking whether node->kpfn has been changed.
@@ -1481,7 +1453,7 @@ static int try_to_merge_one_page(struct vm_area_struct *vma,
goto out;
/*
- * We need the page lock to read a stable PageSwapCache in
+ * We need the folio lock to read a stable swapcache flag in
* write_protect_page(). We use trylock_page() instead of
* lock_page() because we don't want to wait here - we
* prefer to continue scanning and merging different pages,
@@ -2562,36 +2534,46 @@ next_mm:
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
+ struct page *tmp_page = NULL;
+ struct folio_walk fw;
+ struct folio *folio;
+
if (ksm_test_exit(mm))
break;
- *page = follow_page(vma, ksm_scan.address, FOLL_GET);
- if (IS_ERR_OR_NULL(*page)) {
- ksm_scan.address += PAGE_SIZE;
- cond_resched();
- continue;
+
+ folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
+ if (folio) {
+ if (!folio_is_zone_device(folio) &&
+ folio_test_anon(folio)) {
+ folio_get(folio);
+ tmp_page = fw.page;
+ }
+ folio_walk_end(&fw, vma);
}
- if (is_zone_device_page(*page))
- goto next_page;
- if (PageAnon(*page)) {
- flush_anon_page(vma, *page, ksm_scan.address);
- flush_dcache_page(*page);
+
+ if (tmp_page) {
+ flush_anon_page(vma, tmp_page, ksm_scan.address);
+ flush_dcache_page(tmp_page);
rmap_item = get_next_rmap_item(mm_slot,
ksm_scan.rmap_list, ksm_scan.address);
if (rmap_item) {
ksm_scan.rmap_list =
&rmap_item->rmap_list;
- if (should_skip_rmap_item(*page, rmap_item))
+ if (should_skip_rmap_item(tmp_page, rmap_item)) {
+ folio_put(folio);
goto next_page;
+ }
ksm_scan.address += PAGE_SIZE;
- } else
- put_page(*page);
+ *page = tmp_page;
+ } else {
+ folio_put(folio);
+ }
mmap_read_unlock(mm);
return rmap_item;
}
next_page:
- put_page(*page);
ksm_scan.address += PAGE_SIZE;
cond_resched();
}
@@ -3142,7 +3124,7 @@ void folio_migrate_ksm(struct folio *newfolio, struct folio *folio)
* newfolio->mapping was set in advance; now we need smp_wmb()
* to make sure that the new stable_node->kpfn is visible
* to ksm_get_folio() before it can see that folio->mapping
- * has gone stale (or that folio_test_swapcache has been cleared).
+ * has gone stale (or that the swapcache flag has been cleared).
*/
smp_wmb();
folio_set_stable_node(folio, NULL);
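Note on the break_ksm() conversion above: the pagewalk ops are replaced by a direct folio_walk probe followed by an unsharing fault, repeated until no KSM page (or KSM-placed zero page) remains mapped at the address. A small userspace model of that retry loop, with stub functions standing in for folio_walk_start()/folio_walk_end() and handle_mm_fault(FAULT_FLAG_UNSHARE):

#include <stdbool.h>
#include <stdio.h>

static int remaining_ksm_mappings = 3;	/* pretend state of one address */

/* Stands in for the folio_walk probe plus the KSM/zero-PTE checks. */
static bool probe_is_ksm_page(void)
{
	return remaining_ksm_mappings > 0;
}

/* Stands in for the unsharing fault that replaces one shared mapping. */
static void fault_unshare(void)
{
	remaining_ksm_mappings--;
	printf("unshare fault, %d KSM mapping(s) left\n", remaining_ksm_mappings);
}

int main(void)
{
	int faults = 0;

	do {
		if (!probe_is_ksm_page())
			break;		/* nothing left to break, like "return 0" */
		fault_unshare();
		faults++;
	} while (true);

	printf("broke KSM sharing after %d fault(s)\n", faults);
	return 0;
}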
diff --git a/mm/madvise.c b/mm/madvise.c
index 6e3a137b8e50..ff139e57cca2 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -1031,6 +1031,9 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
struct anon_vma_name *anon_name;
unsigned long new_flags = vma->vm_flags;
+ if (unlikely(!can_modify_vma_madv(vma, behavior)))
+ return -EPERM;
+
switch (behavior) {
case MADV_REMOVE:
return madvise_remove(vma, prev, start, end);
@@ -1448,15 +1451,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
start = untagged_addr_remote(mm, start);
end = start + len;
- /*
- * Check if the address range is sealed for do_madvise().
- * can_modify_mm_madv assumes we have acquired the lock on MM.
- */
- if (unlikely(!can_modify_mm_madv(mm, start, end, behavior))) {
- error = -EPERM;
- goto out;
- }
-
blk_start_plug(&plug);
switch (behavior) {
case MADV_POPULATE_READ:
@@ -1470,7 +1464,6 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
}
blk_finish_plug(&plug);
-out:
if (write)
mmap_write_unlock(mm);
else
diff --git a/mm/memblock.c b/mm/memblock.c
index 3b9dc2d89b8a..0a77a748a8eb 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1500,7 +1500,7 @@ done:
*
* Accept the memory of the allocated buffer.
*/
- accept_memory(found, found + size);
+ accept_memory(found, size);
return found;
}
diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c
index 417c96f2da28..b37c0d870816 100644
--- a/mm/memcontrol-v1.c
+++ b/mm/memcontrol-v1.c
@@ -742,6 +742,9 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return folio_file_page(folio, index);
}
+static void memcg1_check_events(struct mem_cgroup *memcg, int nid);
+static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
+
/**
* mem_cgroup_move_account - move account of the folio
* @folio: The folio.
@@ -853,9 +856,9 @@ static int mem_cgroup_move_account(struct folio *folio,
nid = folio_nid(folio);
local_irq_disable();
- mem_cgroup_charge_statistics(to, nr_pages);
+ memcg1_charge_statistics(to, nr_pages);
memcg1_check_events(to, nid);
- mem_cgroup_charge_statistics(from, -nr_pages);
+ memcg1_charge_statistics(from, -nr_pages);
memcg1_check_events(from, nid);
local_irq_enable();
out:
@@ -1439,21 +1442,68 @@ static void mem_cgroup_threshold(struct mem_cgroup *memcg)
}
}
+/* Cgroup1: threshold notifications & softlimit tree updates */
+struct memcg1_events_percpu {
+ unsigned long nr_page_events;
+ unsigned long targets[MEM_CGROUP_NTARGETS];
+};
+
+static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
+{
+ /* pagein of a big page is an event. So, ignore page size */
+ if (nr_pages > 0)
+ __count_memcg_events(memcg, PGPGIN, 1);
+ else {
+ __count_memcg_events(memcg, PGPGOUT, 1);
+ nr_pages = -nr_pages; /* for event */
+ }
+
+ __this_cpu_add(memcg->events_percpu->nr_page_events, nr_pages);
+}
+
+#define THRESHOLDS_EVENTS_TARGET 128
+#define SOFTLIMIT_EVENTS_TARGET 1024
+
+static bool memcg1_event_ratelimit(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_target target)
+{
+ unsigned long val, next;
+
+ val = __this_cpu_read(memcg->events_percpu->nr_page_events);
+ next = __this_cpu_read(memcg->events_percpu->targets[target]);
+ /* from time_after() in jiffies.h */
+ if ((long)(next - val) < 0) {
+ switch (target) {
+ case MEM_CGROUP_TARGET_THRESH:
+ next = val + THRESHOLDS_EVENTS_TARGET;
+ break;
+ case MEM_CGROUP_TARGET_SOFTLIMIT:
+ next = val + SOFTLIMIT_EVENTS_TARGET;
+ break;
+ default:
+ break;
+ }
+ __this_cpu_write(memcg->events_percpu->targets[target], next);
+ return true;
+ }
+ return false;
+}
+
/*
* Check events in order.
*
*/
-void memcg1_check_events(struct mem_cgroup *memcg, int nid)
+static void memcg1_check_events(struct mem_cgroup *memcg, int nid)
{
if (IS_ENABLED(CONFIG_PREEMPT_RT))
return;
/* threshold event is triggered in finer grain than soft limit */
- if (unlikely(mem_cgroup_event_ratelimit(memcg,
+ if (unlikely(memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_THRESH))) {
bool do_softlimit;
- do_softlimit = mem_cgroup_event_ratelimit(memcg,
+ do_softlimit = memcg1_event_ratelimit(memcg,
MEM_CGROUP_TARGET_SOFTLIMIT);
mem_cgroup_threshold(memcg);
if (unlikely(do_softlimit))
@@ -1461,6 +1511,43 @@ void memcg1_check_events(struct mem_cgroup *memcg, int nid)
}
}
+void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ memcg1_charge_statistics(memcg, folio_nr_pages(folio));
+ memcg1_check_events(memcg, folio_nid(folio));
+ local_irq_restore(flags);
+}
+
+void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg)
+{
+ /*
+ * Interrupts should be disabled here because the caller holds the
+ * i_pages lock which is taken with interrupts-off. It is
+ * important here to have the interrupts disabled because it is the
+ * only synchronisation we have for updating the per-CPU variables.
+ */
+ preempt_disable_nested();
+ VM_WARN_ON_IRQS_ENABLED();
+ memcg1_charge_statistics(memcg, -folio_nr_pages(folio));
+ preempt_enable_nested();
+ memcg1_check_events(memcg, folio_nid(folio));
+}
+
+void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ unsigned long nr_memory, int nid)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __count_memcg_events(memcg, PGPGOUT, pgpgout);
+ __this_cpu_add(memcg->events_percpu->nr_page_events, nr_memory);
+ memcg1_check_events(memcg, nid);
+ local_irq_restore(flags);
+}
+
static int compare_thresholds(const void *a, const void *b)
{
const struct mem_cgroup_threshold *_a = a;
@@ -1907,9 +1994,15 @@ static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
event->register_event = mem_cgroup_usage_register_event;
event->unregister_event = mem_cgroup_usage_unregister_event;
} else if (!strcmp(name, "memory.oom_control")) {
+ pr_warn_once("oom_control is deprecated and will be removed. "
+ "Please report your usecase to linux-mm-@kvack.org"
+ " if you depend on this functionality. \n");
event->register_event = mem_cgroup_oom_register_event;
event->unregister_event = mem_cgroup_oom_unregister_event;
} else if (!strcmp(name, "memory.pressure_level")) {
+ pr_warn_once("pressure_level is deprecated and will be removed. "
+ "Please report your usecase to linux-mm-@kvack.org "
+ "if you depend on this functionality. \n");
event->register_event = vmpressure_register_event;
event->unregister_event = vmpressure_unregister_event;
} else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
@@ -2447,6 +2540,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
ret = 0;
break;
case _TCP:
+ pr_warn_once("kmem.tcp.limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
ret = memcg_update_tcp_max(memcg, nr_pages);
break;
}
@@ -2455,6 +2551,9 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
if (IS_ENABLED(CONFIG_PREEMPT_RT)) {
ret = -EOPNOTSUPP;
} else {
+ pr_warn_once("soft_limit_in_bytes is deprecated and will be removed. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
WRITE_ONCE(memcg->soft_limit, nr_pages);
ret = 0;
}
@@ -2748,6 +2847,10 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ pr_warn_once("oom_control is deprecated and will be removed. "
+ "Please report your usecase to linux-mm-@kvack.org if you "
+ "depend on this functionality. \n");
+
/* cannot set to root cgroup and only 0 and 1 are allowed */
if (mem_cgroup_is_root(memcg) || !((val == 0) || (val == 1)))
return -EINVAL;
@@ -2952,6 +3055,19 @@ bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages,
return false;
}
+bool memcg1_alloc_events(struct mem_cgroup *memcg)
+{
+ memcg->events_percpu = alloc_percpu_gfp(struct memcg1_events_percpu,
+ GFP_KERNEL_ACCOUNT);
+ return !!memcg->events_percpu;
+}
+
+void memcg1_free_events(struct mem_cgroup *memcg)
+{
+ if (memcg->events_percpu)
+ free_percpu(memcg->events_percpu);
+}
+
static int __init memcg1_init(void)
{
int node;
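Note on the ratelimiting code moved into memcontrol-v1.c above: threshold and softlimit work only runs once the per-CPU page-event counter passes a per-target "next" value, which is then advanced by 128 or 1024 events. A compilable single-threaded model of that check; plain variables stand in for the kernel's per-CPU counters:

#include <stdbool.h>
#include <stdio.h>

#define THRESHOLDS_EVENTS_TARGET 128

static unsigned long nr_page_events;	/* stand-in for the __this_cpu counter */
static unsigned long next_threshold;	/* stand-in for targets[MEM_CGROUP_TARGET_THRESH] */

static bool event_ratelimit(void)
{
	/* same wraparound-safe comparison as time_after() */
	if ((long)(next_threshold - nr_page_events) < 0) {
		next_threshold = nr_page_events + THRESHOLDS_EVENTS_TARGET;
		return true;
	}
	return false;
}

int main(void)
{
	int fired = 0;

	for (int i = 0; i < 1000; i++) {
		nr_page_events++;		/* one page charged or uncharged */
		if (event_ratelimit())
			fired++;
	}
	printf("threshold work ran %d time(s) for 1000 page events\n", fired);
	return 0;
}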
diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h
index 56d7eaa98274..c0672e25bcdb 100644
--- a/mm/memcontrol-v1.h
+++ b/mm/memcontrol-v1.h
@@ -7,7 +7,6 @@
/* Cgroup v1 and v2 common declarations */
-void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages);
int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages);
@@ -56,8 +55,6 @@ enum mem_cgroup_events_target {
MEM_CGROUP_NTARGETS,
};
-bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
- enum mem_cgroup_events_target target);
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap);
void drain_all_stock(struct mem_cgroup *root_memcg);
@@ -71,6 +68,10 @@ int memory_stat_show(struct seq_file *m, void *v);
/* Cgroup v1-specific declarations */
#ifdef CONFIG_MEMCG_V1
+
+bool memcg1_alloc_events(struct mem_cgroup *memcg);
+void memcg1_free_events(struct mem_cgroup *memcg);
+
void memcg1_memcg_init(struct mem_cgroup *memcg);
void memcg1_remove_from_trees(struct mem_cgroup *memcg);
@@ -99,7 +100,10 @@ bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked);
void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked);
void memcg1_oom_recover(struct mem_cgroup *memcg);
-void memcg1_check_events(struct mem_cgroup *memcg, int nid);
+void memcg1_commit_charge(struct folio *folio, struct mem_cgroup *memcg);
+void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg);
+void memcg1_uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+ unsigned long nr_memory, int nid);
void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s);
@@ -120,6 +124,9 @@ extern struct cftype mem_cgroup_legacy_files[];
#else /* CONFIG_MEMCG_V1 */
+static inline bool memcg1_alloc_events(struct mem_cgroup *memcg) { return true; }
+static inline void memcg1_free_events(struct mem_cgroup *memcg) {}
+
static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {}
static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {}
static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {}
@@ -130,7 +137,14 @@ static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) {
static inline void memcg1_oom_finish(struct mem_cgroup *memcg, bool locked) {}
static inline void memcg1_oom_recover(struct mem_cgroup *memcg) {}
-static inline void memcg1_check_events(struct mem_cgroup *memcg, int nid) {}
+static inline void memcg1_commit_charge(struct folio *folio,
+ struct mem_cgroup *memcg) {}
+
+static inline void memcg1_swapout(struct folio *folio, struct mem_cgroup *memcg) {}
+
+static inline void memcg1_uncharge_batch(struct mem_cgroup *memcg,
+ unsigned long pgpgout,
+ unsigned long nr_memory, int nid) {}
static inline void memcg1_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) {}
@@ -140,8 +154,6 @@ static inline bool memcg1_charge_skmem(struct mem_cgroup *memcg, unsigned int nr
gfp_t gfp_mask) { return true; }
static inline void memcg1_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) {}
-extern struct cftype memsw_files[];
-extern struct cftype mem_cgroup_legacy_files[];
#endif /* CONFIG_MEMCG_V1 */
#endif /* __MM_MEMCONTROL_V1_H */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d563fb515766..7845c64a2c57 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -25,6 +25,7 @@
* Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
+#include <linux/cgroup-defs.h>
#include <linux/page_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
@@ -41,6 +42,7 @@
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
+#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
@@ -93,9 +95,6 @@ static bool cgroup_memory_nobpf __ro_after_init;
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
-#define THRESHOLDS_EVENTS_TARGET 128
-#define SOFTLIMIT_EVENTS_TARGET 1024
-
static inline bool task_is_dying(void)
{
return tsk_is_oom_victim(current) || fatal_signal_pending(current) ||
@@ -305,6 +304,12 @@ static const unsigned int memcg_node_stat_items[] = {
#ifdef CONFIG_SWAP
NR_SWAPCACHE,
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ PGPROMOTE_SUCCESS,
+#endif
+ PGDEMOTE_KSWAPD,
+ PGDEMOTE_DIRECT,
+ PGDEMOTE_KHUGEPAGED,
};
static const unsigned int memcg_stat_items[] = {
@@ -320,24 +325,27 @@ static const unsigned int memcg_stat_items[] = {
#define NR_MEMCG_NODE_STAT_ITEMS ARRAY_SIZE(memcg_node_stat_items)
#define MEMCG_VMSTAT_SIZE (NR_MEMCG_NODE_STAT_ITEMS + \
ARRAY_SIZE(memcg_stat_items))
-static int8_t mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
+#define BAD_STAT_IDX(index) ((u32)(index) >= U8_MAX)
+static u8 mem_cgroup_stats_index[MEMCG_NR_STAT] __read_mostly;
static void init_memcg_stats(void)
{
- int8_t i, j = 0;
+ u8 i, j = 0;
+
+ BUILD_BUG_ON(MEMCG_NR_STAT >= U8_MAX);
- BUILD_BUG_ON(MEMCG_NR_STAT >= S8_MAX);
+ memset(mem_cgroup_stats_index, U8_MAX, sizeof(mem_cgroup_stats_index));
- for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i)
- mem_cgroup_stats_index[memcg_node_stat_items[i]] = ++j;
+ for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; ++i, ++j)
+ mem_cgroup_stats_index[memcg_node_stat_items[i]] = j;
- for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i)
- mem_cgroup_stats_index[memcg_stat_items[i]] = ++j;
+ for (i = 0; i < ARRAY_SIZE(memcg_stat_items); ++i, ++j)
+ mem_cgroup_stats_index[memcg_stat_items[i]] = j;
}
static inline int memcg_stats_index(int idx)
{
- return mem_cgroup_stats_index[idx] - 1;
+ return mem_cgroup_stats_index[idx];
}
struct lruvec_stats_percpu {
@@ -369,7 +377,7 @@ unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx)
return node_page_state(lruvec_pgdat(lruvec), idx);
i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
@@ -392,7 +400,7 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
return node_page_state(lruvec_pgdat(lruvec), idx);
i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
@@ -406,8 +414,10 @@ unsigned long lruvec_page_state_local(struct lruvec *lruvec,
/* Subset of vm_event_item to report for memcg event stats */
static const unsigned int memcg_vm_event_stat[] = {
+#ifdef CONFIG_MEMCG_V1
PGPGIN,
PGPGOUT,
+#endif
PGSCAN_KSWAPD,
PGSCAN_DIRECT,
PGSCAN_KHUGEPAGED,
@@ -432,24 +442,32 @@ static const unsigned int memcg_vm_event_stat[] = {
THP_SWPOUT,
THP_SWPOUT_FALLBACK,
#endif
+#ifdef CONFIG_NUMA_BALANCING
+ NUMA_PAGE_MIGRATE,
+ NUMA_PTE_UPDATES,
+ NUMA_HINT_FAULTS,
+#endif
};
#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat)
-static int8_t mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
+static u8 mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly;
static void init_memcg_events(void)
{
- int8_t i;
+ u8 i;
- BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= S8_MAX);
+ BUILD_BUG_ON(NR_VM_EVENT_ITEMS >= U8_MAX);
+
+ memset(mem_cgroup_events_index, U8_MAX,
+ sizeof(mem_cgroup_events_index));
for (i = 0; i < NR_MEMCG_EVENTS; ++i)
- mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1;
+ mem_cgroup_events_index[memcg_vm_event_stat[i]] = i;
}
static inline int memcg_events_index(enum vm_event_item idx)
{
- return mem_cgroup_events_index[idx] - 1;
+ return mem_cgroup_events_index[idx];
}
struct memcg_vmstats_percpu {
@@ -469,10 +487,6 @@ struct memcg_vmstats_percpu {
/* Delta calculation for lockless upward propagation */
long state_prev[MEMCG_VMSTAT_SIZE];
unsigned long events_prev[NR_MEMCG_EVENTS];
-
- /* Cgroup1: threshold notifications & softlimit tree updates */
- unsigned long nr_page_events;
- unsigned long targets[MEM_CGROUP_NTARGETS];
} ____cacheline_aligned;
struct memcg_vmstats {
@@ -621,7 +635,7 @@ unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx)
long x;
int i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
x = READ_ONCE(memcg->vmstats->state[i]);
@@ -662,7 +676,7 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx,
if (mem_cgroup_disabled())
return;
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
__this_cpu_add(memcg->vmstats_percpu->state[i], val);
@@ -675,7 +689,7 @@ unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
long x;
int i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return 0;
x = READ_ONCE(memcg->vmstats->state_local[i]);
@@ -694,7 +708,7 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec,
struct mem_cgroup *memcg;
int i = memcg_stats_index(idx);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
@@ -810,7 +824,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx,
if (mem_cgroup_disabled())
return;
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, idx))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, idx))
return;
memcg_stats_lock();
@@ -823,7 +837,7 @@ unsigned long memcg_events(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
return 0;
return READ_ONCE(memcg->vmstats->events[i]);
@@ -833,50 +847,12 @@ unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
{
int i = memcg_events_index(event);
- if (WARN_ONCE(i < 0, "%s: missing stat item %d\n", __func__, event))
+ if (WARN_ONCE(BAD_STAT_IDX(i), "%s: missing stat item %d\n", __func__, event))
return 0;
return READ_ONCE(memcg->vmstats->events_local[i]);
}
-void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, int nr_pages)
-{
- /* pagein of a big page is an event. So, ignore page size */
- if (nr_pages > 0)
- __count_memcg_events(memcg, PGPGIN, 1);
- else {
- __count_memcg_events(memcg, PGPGOUT, 1);
- nr_pages = -nr_pages; /* for event */
- }
-
- __this_cpu_add(memcg->vmstats_percpu->nr_page_events, nr_pages);
-}
-
-bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
- enum mem_cgroup_events_target target)
-{
- unsigned long val, next;
-
- val = __this_cpu_read(memcg->vmstats_percpu->nr_page_events);
- next = __this_cpu_read(memcg->vmstats_percpu->targets[target]);
- /* from time_after() in jiffies.h */
- if ((long)(next - val) < 0) {
- switch (target) {
- case MEM_CGROUP_TARGET_THRESH:
- next = val + THRESHOLDS_EVENTS_TARGET;
- break;
- case MEM_CGROUP_TARGET_SOFTLIMIT:
- next = val + SOFTLIMIT_EVENTS_TARGET;
- break;
- default:
- break;
- }
- __this_cpu_write(memcg->vmstats_percpu->targets[target], next);
- return true;
- }
- return false;
-}
-
struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
{
/*
@@ -971,6 +947,24 @@ again:
}
/**
+ * get_mem_cgroup_from_folio - Obtain a reference on a given folio's memcg.
+ * @folio: folio from which memcg should be extracted.
+ */
+struct mem_cgroup *get_mem_cgroup_from_folio(struct folio *folio)
+{
+ struct mem_cgroup *memcg = folio_memcg(folio);
+
+ if (mem_cgroup_disabled())
+ return NULL;
+
+ rcu_read_lock();
+ if (!memcg || WARN_ON_ONCE(!css_tryget(&memcg->css)))
+ memcg = root_mem_cgroup;
+ rcu_read_unlock();
+ return memcg;
+}
+
+/**
* mem_cgroup_iter - iterate over memory cgroup hierarchy
* @root: hierarchy root
* @prev: previously returned memcg, NULL on first invocation
@@ -992,9 +986,9 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup_reclaim_cookie *reclaim)
{
struct mem_cgroup_reclaim_iter *iter;
- struct cgroup_subsys_state *css = NULL;
- struct mem_cgroup *memcg = NULL;
- struct mem_cgroup *pos = NULL;
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *pos;
+ struct mem_cgroup *next;
if (mem_cgroup_disabled())
return NULL;
@@ -1003,81 +997,67 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
root = root_mem_cgroup;
rcu_read_lock();
+restart:
+ next = NULL;
if (reclaim) {
- struct mem_cgroup_per_node *mz;
+ int gen;
+ int nid = reclaim->pgdat->node_id;
- mz = root->nodeinfo[reclaim->pgdat->node_id];
- iter = &mz->iter;
+ iter = &root->nodeinfo[nid]->iter;
+ gen = atomic_read(&iter->generation);
/*
* On start, join the current reclaim iteration cycle.
* Exit when a concurrent walker completes it.
*/
if (!prev)
- reclaim->generation = iter->generation;
- else if (reclaim->generation != iter->generation)
+ reclaim->generation = gen;
+ else if (reclaim->generation != gen)
goto out_unlock;
- while (1) {
- pos = READ_ONCE(iter->position);
- if (!pos || css_tryget(&pos->css))
- break;
- /*
- * css reference reached zero, so iter->position will
- * be cleared by ->css_released. However, we should not
- * rely on this happening soon, because ->css_released
- * is called from a work queue, and by busy-waiting we
- * might block it. So we clear iter->position right
- * away.
- */
- (void)cmpxchg(&iter->position, pos, NULL);
- }
- } else if (prev) {
+ pos = READ_ONCE(iter->position);
+ } else
pos = prev;
- }
- if (pos)
- css = &pos->css;
-
- for (;;) {
- css = css_next_descendant_pre(css, &root->css);
- if (!css) {
- /*
- * Reclaimers share the hierarchy walk, and a
- * new one might jump in right at the end of
- * the hierarchy - make sure they see at least
- * one group and restart from the beginning.
- */
- if (!prev)
- continue;
- break;
- }
+ css = pos ? &pos->css : NULL;
+ while ((css = css_next_descendant_pre(css, &root->css))) {
/*
* Verify the css and acquire a reference. The root
* is provided by the caller, so we know it's alive
* and kicking, and don't take an extra reference.
*/
- if (css == &root->css || css_tryget(css)) {
- memcg = mem_cgroup_from_css(css);
+ if (css == &root->css || css_tryget(css))
break;
- }
}
+ next = mem_cgroup_from_css(css);
+
if (reclaim) {
/*
* The position could have already been updated by a competing
* thread, so check that the value hasn't changed since we read
* it to avoid reclaiming from the same cgroup twice.
*/
- (void)cmpxchg(&iter->position, pos, memcg);
+ if (cmpxchg(&iter->position, pos, next) != pos) {
+ if (css && css != &root->css)
+ css_put(css);
+ goto restart;
+ }
- if (pos)
- css_put(&pos->css);
+ if (!next) {
+ atomic_inc(&iter->generation);
- if (!memcg)
- iter->generation++;
+ /*
+ * Reclaimers share the hierarchy walk, and a
+ * new one might jump in right at the end of
+ * the hierarchy - make sure they see at least
+ * one group and restart from the beginning.
+ */
+ if (!prev)
+ goto restart;
+ }
}
out_unlock:
@@ -1085,7 +1065,7 @@ out_unlock:
if (prev && prev != root)
css_put(&prev->css);
- return memcg;
+ return next;
}
/**
@@ -1375,6 +1355,13 @@ static const struct memory_stat memory_stats[] = {
{ "workingset_restore_anon", WORKINGSET_RESTORE_ANON },
{ "workingset_restore_file", WORKINGSET_RESTORE_FILE },
{ "workingset_nodereclaim", WORKINGSET_NODERECLAIM },
+
+ { "pgdemote_kswapd", PGDEMOTE_KSWAPD },
+ { "pgdemote_direct", PGDEMOTE_DIRECT },
+ { "pgdemote_khugepaged", PGDEMOTE_KHUGEPAGED },
+#ifdef CONFIG_NUMA_BALANCING
+ { "pgpromote_success", PGPROMOTE_SUCCESS },
+#endif
};
/* The actual unit of the state item, not the same as the output unit */
@@ -1399,6 +1386,9 @@ static int memcg_page_state_output_unit(int item)
/*
* Workingset state is actually in pages, but we export it to userspace
* as a scalar count of events, so special case it here.
+ *
+ * Demotion and promotion activities are exported in pages, consistent
+ * with their global counterparts.
*/
switch (item) {
case WORKINGSET_REFAULT_ANON:
@@ -1408,6 +1398,12 @@ static int memcg_page_state_output_unit(int item)
case WORKINGSET_RESTORE_ANON:
case WORKINGSET_RESTORE_FILE:
case WORKINGSET_NODERECLAIM:
+ case PGDEMOTE_KSWAPD:
+ case PGDEMOTE_DIRECT:
+ case PGDEMOTE_KHUGEPAGED:
+#ifdef CONFIG_NUMA_BALANCING
+ case PGPROMOTE_SUCCESS:
+#endif
return 1;
default:
return memcg_page_state_unit(item);
@@ -1466,10 +1462,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s)
memcg_events(memcg, PGSTEAL_KHUGEPAGED));
for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) {
+#ifdef CONFIG_MEMCG_V1
if (memcg_vm_event_stat[i] == PGPGIN ||
memcg_vm_event_stat[i] == PGPGOUT)
continue;
-
+#endif
seq_buf_printf(s, "%s %lu\n",
vm_event_name(memcg_vm_event_stat[i]),
memcg_events(memcg, memcg_vm_event_stat[i]));
@@ -2366,7 +2363,7 @@ void mem_cgroup_cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
static void commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
- VM_BUG_ON_FOLIO(folio_memcg(folio), folio);
+ VM_BUG_ON_FOLIO(folio_memcg_charged(folio), folio);
/*
* Any of the following ensures page's memcg stability:
*
@@ -2388,11 +2385,7 @@ void mem_cgroup_commit_charge(struct folio *folio, struct mem_cgroup *memcg)
{
css_get(&memcg->css);
commit_charge(folio, memcg);
-
- local_irq_disable();
- mem_cgroup_charge_statistics(memcg, folio_nr_pages(folio));
- memcg1_check_events(memcg, folio_nid(folio));
- local_irq_enable();
+ memcg1_commit_charge(folio, memcg);
}
static inline void __mod_objcg_mlstate(struct obj_cgroup *objcg,
@@ -2446,37 +2439,7 @@ struct mem_cgroup *mem_cgroup_from_obj_folio(struct folio *folio, void *p)
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
- *
- * A passed kernel object can be a slab object, vmalloc object or a generic
- * kernel page, so different mechanisms for getting the memory cgroup pointer
- * should be used.
- *
- * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
- * can not know for sure how the kernel object is implemented.
- * mem_cgroup_from_obj() can be safely used in such cases.
- *
- * The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
- * cgroup_mutex, etc.
- */
-struct mem_cgroup *mem_cgroup_from_obj(void *p)
-{
- struct folio *folio;
-
- if (mem_cgroup_disabled())
- return NULL;
-
- if (unlikely(is_vmalloc_addr(p)))
- folio = page_folio(vmalloc_to_page(p));
- else
- folio = virt_to_folio(p);
-
- return mem_cgroup_from_obj_folio(folio, p);
-}
-
-/*
- * Returns a pointer to the memory cgroup to which the kernel object is charged.
- * Similar to mem_cgroup_from_obj(), but faster and not suitable for objects,
- * allocated using vmalloc().
+ * It is not suitable for objects allocated using vmalloc().
*
* A passed kernel object must be a slab object or a generic kernel page.
*
@@ -3057,12 +3020,11 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab,
void split_page_memcg(struct page *head, int old_order, int new_order)
{
struct folio *folio = page_folio(head);
- struct mem_cgroup *memcg = folio_memcg(folio);
int i;
unsigned int old_nr = 1 << old_order;
unsigned int new_nr = 1 << new_order;
- if (mem_cgroup_disabled() || !memcg)
+ if (mem_cgroup_disabled() || !folio_memcg_charged(folio))
return;
for (i = new_nr; i < old_nr; i += new_nr)
@@ -3071,7 +3033,7 @@ void split_page_memcg(struct page *head, int old_order, int new_order)
if (folio_memcg_kmem(folio))
obj_cgroup_get_many(__folio_objcg(folio), old_nr / new_nr - 1);
else
- css_get_many(&memcg->css, old_nr / new_nr - 1);
+ css_get_many(&folio_memcg(folio)->css, old_nr / new_nr - 1);
}
unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3385,29 +3347,12 @@ static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
*/
#define MEM_CGROUP_ID_MAX ((1UL << MEM_CGROUP_ID_SHIFT) - 1)
-static DEFINE_IDR(mem_cgroup_idr);
-static DEFINE_SPINLOCK(memcg_idr_lock);
-
-static int mem_cgroup_alloc_id(void)
-{
- int ret;
-
- idr_preload(GFP_KERNEL);
- spin_lock(&memcg_idr_lock);
- ret = idr_alloc(&mem_cgroup_idr, NULL, 1, MEM_CGROUP_ID_MAX + 1,
- GFP_NOWAIT);
- spin_unlock(&memcg_idr_lock);
- idr_preload_end();
- return ret;
-}
+static DEFINE_XARRAY_ALLOC1(mem_cgroup_ids);
static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
{
if (memcg->id.id > 0) {
- spin_lock(&memcg_idr_lock);
- idr_remove(&mem_cgroup_idr, memcg->id.id);
- spin_unlock(&memcg_idr_lock);
-
+ xa_erase(&mem_cgroup_ids, memcg->id.id);
memcg->id.id = 0;
}
}
@@ -3442,7 +3387,7 @@ static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
WARN_ON_ONCE(!rcu_read_lock_held());
- return idr_find(&mem_cgroup_idr, id);
+ return xa_load(&mem_cgroup_ids, id);
}
#ifdef CONFIG_SHRINKER_DEBUG
@@ -3517,6 +3462,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
+ memcg1_free_events(memcg);
kfree(memcg->vmstats);
free_percpu(memcg->vmstats_percpu);
kfree(memcg);
@@ -3535,17 +3481,17 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
struct mem_cgroup *memcg;
int node, cpu;
int __maybe_unused i;
- long error = -ENOMEM;
+ long error;
memcg = kzalloc(struct_size(memcg, nodeinfo, nr_node_ids), GFP_KERNEL);
if (!memcg)
- return ERR_PTR(error);
+ return ERR_PTR(-ENOMEM);
- memcg->id.id = mem_cgroup_alloc_id();
- if (memcg->id.id < 0) {
- error = memcg->id.id;
+ error = xa_alloc(&mem_cgroup_ids, &memcg->id.id, NULL,
+ XA_LIMIT(1, MEM_CGROUP_ID_MAX), GFP_KERNEL);
+ if (error)
goto fail;
- }
+ error = -ENOMEM;
memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats),
GFP_KERNEL_ACCOUNT);
@@ -3557,6 +3503,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
if (!memcg->vmstats_percpu)
goto fail;
+ if (!memcg1_alloc_events(memcg))
+ goto fail;
+
for_each_possible_cpu(cpu) {
if (parent)
pstatc = per_cpu_ptr(parent->vmstats_percpu, cpu);
@@ -3574,6 +3523,9 @@ static struct mem_cgroup *mem_cgroup_alloc(struct mem_cgroup *parent)
INIT_WORK(&memcg->high_work, high_work_func);
vmpressure_init(&memcg->vmpressure);
+ INIT_LIST_HEAD(&memcg->memory_peaks);
+ INIT_LIST_HEAD(&memcg->swap_peaks);
+ spin_lock_init(&memcg->peaks_lock);
memcg->socket_pressure = jiffies;
memcg1_memcg_init(memcg);
memcg->kmemcg_id = -1;
@@ -3619,21 +3571,21 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
WRITE_ONCE(memcg->swappiness, mem_cgroup_swappiness(parent));
- page_counter_init(&memcg->memory, &parent->memory);
- page_counter_init(&memcg->swap, &parent->swap);
+ page_counter_init(&memcg->memory, &parent->memory, true);
+ page_counter_init(&memcg->swap, &parent->swap, false);
#ifdef CONFIG_MEMCG_V1
WRITE_ONCE(memcg->oom_kill_disable, READ_ONCE(parent->oom_kill_disable));
- page_counter_init(&memcg->kmem, &parent->kmem);
- page_counter_init(&memcg->tcpmem, &parent->tcpmem);
+ page_counter_init(&memcg->kmem, &parent->kmem, false);
+ page_counter_init(&memcg->tcpmem, &parent->tcpmem, false);
#endif
} else {
init_memcg_stats();
init_memcg_events();
- page_counter_init(&memcg->memory, NULL);
- page_counter_init(&memcg->swap, NULL);
+ page_counter_init(&memcg->memory, NULL, true);
+ page_counter_init(&memcg->swap, NULL, false);
#ifdef CONFIG_MEMCG_V1
- page_counter_init(&memcg->kmem, NULL);
- page_counter_init(&memcg->tcpmem, NULL);
+ page_counter_init(&memcg->kmem, NULL, false);
+ page_counter_init(&memcg->tcpmem, NULL, false);
#endif
root_mem_cgroup = memcg;
return &memcg->css;
@@ -3682,9 +3634,7 @@ static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
* publish it here at the end of onlining. This matches the
* regular ID destruction during offlining.
*/
- spin_lock(&memcg_idr_lock);
- idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
- spin_unlock(&memcg_idr_lock);
+ xa_store(&mem_cgroup_ids, memcg->id.id, memcg, GFP_KERNEL);
return 0;
offline_kmem:
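The conversion above replaces the IDR and its spinlock with an allocating XArray: an ID is reserved with a NULL entry at allocation time, so concurrent mem_cgroup_from_id() lookups keep returning NULL until the css is online, at which point xa_store() publishes the pointer. A minimal sketch of this reserve-then-publish idiom (illustrative names, not part of the patch):

#include <linux/xarray.h>

static DEFINE_XARRAY_ALLOC1(example_ids);	/* IDs start at 1, as above */

static int example_register(void *object)
{
	u32 id;
	int err;

	/* Reserve an ID; the NULL entry keeps lookups failing for now. */
	err = xa_alloc(&example_ids, &id, NULL, XA_LIMIT(1, U16_MAX),
		       GFP_KERNEL);
	if (err)
		return err;

	/* ... finish initialising the object ... */

	/* Publish: from here on, xa_load(&example_ids, id) returns object. */
	xa_store(&example_ids, id, object, GFP_KERNEL);
	return 0;
}

static void example_unregister(u32 id)
{
	xa_erase(&example_ids, id);	/* lookups return NULL again */
}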
@@ -3967,14 +3917,91 @@ static u64 memory_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
}
-static u64 memory_peak_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+#define OFP_PEAK_UNSET (((-1UL)))
+
+static int peak_show(struct seq_file *sf, void *v, struct page_counter *pc)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct cgroup_of_peak *ofp = of_peak(sf->private);
+ u64 fd_peak = READ_ONCE(ofp->value), peak;
+
+ /* User wants global or local peak? */
+ if (fd_peak == OFP_PEAK_UNSET)
+ peak = pc->watermark;
+ else
+ peak = max(fd_peak, READ_ONCE(pc->local_watermark));
+
+ seq_printf(sf, "%llu\n", peak * PAGE_SIZE);
+ return 0;
+}
+
+static int memory_peak_show(struct seq_file *sf, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
- return (u64)memcg->memory.watermark * PAGE_SIZE;
+ return peak_show(sf, v, &memcg->memory);
}
+static int peak_open(struct kernfs_open_file *of)
+{
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ ofp->value = OFP_PEAK_UNSET;
+ return 0;
+}
+
+static void peak_release(struct kernfs_open_file *of)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ if (ofp->value == OFP_PEAK_UNSET) {
+ /* fast path (no writes on this fd) */
+ return;
+ }
+ spin_lock(&memcg->peaks_lock);
+ list_del(&ofp->list);
+ spin_unlock(&memcg->peaks_lock);
+}
+
+static ssize_t peak_write(struct kernfs_open_file *of, char *buf, size_t nbytes,
+ loff_t off, struct page_counter *pc,
+ struct list_head *watchers)
+{
+ unsigned long usage;
+ struct cgroup_of_peak *peer_ctx;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ struct cgroup_of_peak *ofp = of_peak(of);
+
+ spin_lock(&memcg->peaks_lock);
+
+ usage = page_counter_read(pc);
+ WRITE_ONCE(pc->local_watermark, usage);
+
+ list_for_each_entry(peer_ctx, watchers, list)
+ if (usage > peer_ctx->value)
+ WRITE_ONCE(peer_ctx->value, usage);
+
+ /* initial write, register watcher */
+ if (ofp->value == -1)
+ list_add(&ofp->list, watchers);
+
+ WRITE_ONCE(ofp->value, usage);
+ spin_unlock(&memcg->peaks_lock);
+
+ return nbytes;
+}
+
+static ssize_t memory_peak_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+
+ return peak_write(of, buf, nbytes, off, &memcg->memory,
+ &memcg->memory_peaks);
+}
+
+#undef OFP_PEAK_UNSET
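With these handlers, memory.peak (and swap.peak below) gains per-file-descriptor semantics: a read on an fd that has never been written still reports the global watermark, while any non-empty write resets the fd-local peak to the current usage and registers that fd as a watcher, so later reads report the maximum usage seen since that write. A hedged userspace sketch of how a monitor might use this (the cgroup path and buffer handling are illustrative, not part of the patch):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[64];
	ssize_t n;
	int fd = open("/sys/fs/cgroup/mygroup/memory.peak", O_RDWR);

	if (fd < 0)
		return 1;

	/* Any non-empty write resets the peak tracked by this fd. */
	if (write(fd, "reset\n", 6) < 0)
		return 1;

	/* ... run the phase of the workload we want to measure ... */

	/* Reads on this fd now report the peak usage since the write. */
	n = pread(fd, buf, sizeof(buf) - 1, 0);
	if (n > 0) {
		buf[n] = '\0';
		printf("peak since reset: %s", buf);
	}

	close(fd);
	return 0;
}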
+
static int memory_min_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -4324,7 +4351,10 @@ static struct cftype memory_files[] = {
{
.name = "peak",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = memory_peak_read,
+ .open = peak_open,
+ .release = peak_release,
+ .seq_show = memory_peak_show,
+ .write = memory_peak_write,
},
{
.name = "min",
@@ -4528,14 +4558,15 @@ int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm,
/*
* mem_cgroup_swapin_uncharge_swap - uncharge swap slot
- * @entry: swap entry for which the page is charged
+ * @entry: the first swap entry for which the pages are charged
+ * @nr_pages: number of pages which will be uncharged
*
* Call this function after successfully adding the charged page to swapcache.
*
* Note: This function assumes the page for which swap slot is being uncharged
* is order 0 page.
*/
-void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
{
/*
* Cgroup1's unified memory+swap counter has been charged with the
@@ -4555,7 +4586,7 @@ void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
* let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate.
*/
- mem_cgroup_uncharge_swap(entry, 1);
+ mem_cgroup_uncharge_swap(entry, nr_pages);
}
}
@@ -4574,8 +4605,6 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long flags;
-
if (ug->nr_memory) {
page_counter_uncharge(&ug->memcg->memory, ug->nr_memory);
if (do_memsw_account())
@@ -4587,11 +4616,7 @@ static void uncharge_batch(const struct uncharge_gather *ug)
memcg1_oom_recover(ug->memcg);
}
- local_irq_save(flags);
- __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory);
- memcg1_check_events(ug->memcg, ug->nid);
- local_irq_restore(flags);
+ memcg1_uncharge_batch(ug->memcg, ug->pgpgout, ug->nr_memory, ug->nid);
/* drop reference from uncharge_folio */
css_put(&ug->memcg->css);
@@ -4606,7 +4631,8 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
VM_BUG_ON_FOLIO(folio_order(folio) > 1 &&
!folio_test_hugetlb(folio) &&
- !list_empty(&folio->_deferred_list), folio);
+ !list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio), folio);
/*
* Nobody should be changing or seriously looking at
@@ -4664,7 +4690,7 @@ void __mem_cgroup_uncharge(struct folio *folio)
struct uncharge_gather ug;
/* Don't touch folio->lru of any random page, pre-check: */
- if (!folio_memcg(folio))
+ if (!folio_memcg_charged(folio))
return;
uncharge_gather_clear(&ug);
@@ -4698,7 +4724,6 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
{
struct mem_cgroup *memcg;
long nr_pages = folio_nr_pages(new);
- unsigned long flags;
VM_BUG_ON_FOLIO(!folio_test_locked(old), old);
VM_BUG_ON_FOLIO(!folio_test_locked(new), new);
@@ -4709,7 +4734,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
return;
/* Page cache replacement: new folio already charged? */
- if (folio_memcg(new))
+ if (folio_memcg_charged(new))
return;
memcg = folio_memcg(old);
@@ -4726,11 +4751,7 @@ void mem_cgroup_replace_folio(struct folio *old, struct folio *new)
css_get(&memcg->css);
commit_charge(new, memcg);
-
- local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, nr_pages);
- memcg1_check_events(memcg, folio_nid(new));
- local_irq_restore(flags);
+ memcg1_commit_charge(new, memcg);
}
/**
@@ -4966,17 +4987,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry)
page_counter_uncharge(&memcg->memsw, nr_entries);
}
- /*
- * Interrupts should be disabled here because the caller holds the
- * i_pages lock which is taken with interrupts-off. It is
- * important here to have the interrupts disabled because it is the
- * only synchronisation we have for updating the per-CPU variables.
- */
- memcg_stats_lock();
- mem_cgroup_charge_statistics(memcg, -nr_entries);
- memcg_stats_unlock();
- memcg1_check_events(memcg, folio_nid(folio));
-
+ memcg1_swapout(folio, memcg);
css_put(&memcg->css);
}
@@ -5116,12 +5127,20 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
-static u64 swap_peak_read(struct cgroup_subsys_state *css,
- struct cftype *cft)
+static int swap_peak_show(struct seq_file *sf, void *v)
{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
+
+ return peak_show(sf, v, &memcg->swap);
+}
+
+static ssize_t swap_peak_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- return (u64)memcg->swap.watermark * PAGE_SIZE;
+ return peak_write(of, buf, nbytes, off, &memcg->swap,
+ &memcg->swap_peaks);
}
static int swap_high_show(struct seq_file *m, void *v)
@@ -5205,7 +5224,10 @@ static struct cftype swap_files[] = {
{
.name = "swap.peak",
.flags = CFTYPE_NOT_ON_ROOT,
- .read_u64 = swap_peak_read,
+ .open = peak_open,
+ .release = peak_release,
+ .seq_show = swap_peak_show,
+ .write = swap_peak_write,
},
{
.name = "swap.events",
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 7066fc84f351..96ce31e5a203 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1554,6 +1554,32 @@ static int get_hwpoison_page(struct page *p, unsigned long flags)
return ret;
}
+void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu)
+{
+ if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+ struct address_space *mapping;
+
+ /*
+ * For hugetlb folios in shared mappings, try_to_unmap
+ * could potentially call huge_pmd_unshare. Because of
+ * this, take semaphore in write mode here and set
+ * TTU_RMAP_LOCKED to indicate we have taken the lock
+ * at this higher level.
+ */
+ mapping = hugetlb_folio_mapping_lock_write(folio);
+ if (!mapping) {
+ pr_info("%#lx: could not lock mapping for mapped hugetlb folio\n",
+ folio_pfn(folio));
+ return;
+ }
+
+ try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
+ i_mmap_unlock_write(mapping);
+ } else {
+ try_to_unmap(folio, ttu);
+ }
+}
+
/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
@@ -1615,23 +1641,7 @@ static bool hwpoison_user_mappings(struct folio *folio, struct page *p,
*/
collect_procs(folio, p, &tokill, flags & MF_ACTION_REQUIRED);
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
- /*
- * For hugetlb pages in shared mappings, try_to_unmap
- * could potentially call huge_pmd_unshare. Because of
- * this, take semaphore in write mode here and set
- * TTU_RMAP_LOCKED to indicate we have taken the lock
- * at this higher level.
- */
- mapping = hugetlb_folio_mapping_lock_write(folio);
- if (mapping) {
- try_to_unmap(folio, ttu|TTU_RMAP_LOCKED);
- i_mmap_unlock_write(mapping);
- } else
- pr_info("%#lx: could not lock mapping for mapped huge page\n", pfn);
- } else {
- try_to_unmap(folio, ttu);
- }
+ unmap_poisoned_folio(folio, ttu);
unmap_success = !folio_mapped(folio);
if (!unmap_success)
@@ -2643,40 +2653,6 @@ EXPORT_SYMBOL(unpoison_memory);
#undef pr_fmt
#define pr_fmt(fmt) "Soft offline: " fmt
-static bool mf_isolate_folio(struct folio *folio, struct list_head *pagelist)
-{
- bool isolated = false;
-
- if (folio_test_hugetlb(folio)) {
- isolated = isolate_hugetlb(folio, pagelist);
- } else {
- bool lru = !__folio_test_movable(folio);
-
- if (lru)
- isolated = folio_isolate_lru(folio);
- else
- isolated = isolate_movable_page(&folio->page,
- ISOLATE_UNEVICTABLE);
-
- if (isolated) {
- list_add(&folio->lru, pagelist);
- if (lru)
- node_stat_add_folio(folio, NR_ISOLATED_ANON +
- folio_is_file_lru(folio));
- }
- }
-
- /*
- * If we succeed to isolate the folio, we grabbed another refcount on
- * the folio, so we can safely drop the one we got from get_any_page().
- * If we failed to isolate the folio, it means that we cannot go further
- * and we will return an error, so drop the reference we got from
- * get_any_page() as well.
- */
- folio_put(folio);
- return isolated;
-}
-
/*
* soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages.
* If the page is a non-dirty unmapped page-cache page, it simply invalidates.
@@ -2689,6 +2665,7 @@ static int soft_offline_in_use_page(struct page *page)
struct folio *folio = page_folio(page);
char const *msg_page[] = {"page", "hugepage"};
bool huge = folio_test_hugetlb(folio);
+ bool isolated;
LIST_HEAD(pagelist);
struct migration_target_control mtc = {
.nid = NUMA_NO_NODE,
@@ -2728,7 +2705,18 @@ static int soft_offline_in_use_page(struct page *page)
return 0;
}
- if (mf_isolate_folio(folio, &pagelist)) {
+ isolated = isolate_folio_to_list(folio, &pagelist);
+
+ /*
+	 * If we succeeded in isolating the folio, we grabbed another refcount on
+	 * the folio, so we can safely drop the one we got from get_any_page().
+	 * If we failed to isolate the folio, we cannot go further and will
+	 * return an error, so drop the reference we got from
+ * get_any_page() as well.
+ */
+ folio_put(folio);
+
+ if (isolated) {
ret = migrate_pages(&pagelist, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_FAILURE, NULL);
if (!ret) {
diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c
index 4775b3a3dabe..ed7607f692bd 100644
--- a/mm/memory-tiers.c
+++ b/mm/memory-tiers.c
@@ -6,6 +6,7 @@
#include <linux/memory.h>
#include <linux/memory-tiers.h>
#include <linux/notifier.h>
+#include <linux/sched/sysctl.h>
#include "internal.h"
@@ -50,6 +51,24 @@ static const struct bus_type memory_tier_subsys = {
.dev_name = "memory_tier",
};
+#ifdef CONFIG_NUMA_BALANCING
+/**
+ * folio_use_access_time - check if a folio reuses cpupid for page access time
+ * @folio: folio to check
+ *
+ * A folio's _last_cpupid field is repurposed by memory tiering. In memory
+ * tiering mode, the cpupid of a slow memory folio (not toptier memory) is
+ * used to record the page access time instead.
+ *
+ * Return: true if the folio's _last_cpupid is used to record page access time
+ */
+bool folio_use_access_time(struct folio *folio)
+{
+ return (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+ !node_is_toptier(folio_nid(folio));
+}
+#endif
+
#ifdef CONFIG_MIGRATION
static int top_tier_adistance;
/*
@@ -895,13 +914,14 @@ static int __init memory_tier_init(void)
WARN_ON(!node_demotion);
#endif
- guard(mutex)(&memory_tier_lock);
+ mutex_lock(&memory_tier_lock);
/*
* For now we can have 4 faster memory tiers with smaller adistance
* than default DRAM tier.
*/
default_dram_type = mt_find_alloc_memory_type(MEMTIER_ADISTANCE_DRAM,
&default_memory_types);
+ mutex_unlock(&memory_tier_lock);
if (IS_ERR(default_dram_type))
panic("%s() failed to allocate default DRAM tier\n", __func__);
diff --git a/mm/memory.c b/mm/memory.c
index cda2c12c500b..2366578015ad 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -666,17 +666,16 @@ struct folio *vm_normal_folio(struct vm_area_struct *vma, unsigned long addr,
return NULL;
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#ifdef CONFIG_PGTABLE_HAS_HUGE_LEAVES
struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t pmd)
{
unsigned long pfn = pmd_pfn(pmd);
- /*
- * There is no pmd_special() but there may be special pmds, e.g.
- * in a direct-access (dax) mapping, so let's just replicate the
- * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here.
- */
+ /* Currently it's only used for huge pfnmaps */
+ if (unlikely(pmd_special(pmd)))
+ return NULL;
+
if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) {
if (vma->vm_flags & VM_MIXEDMAP) {
if (!pfn_valid(pfn))
@@ -927,8 +926,11 @@ copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma
* We have a prealloc page, all good! Take it
* over and copy the page & arm it.
*/
+
+ if (copy_mc_user_highpage(&new_folio->page, page, addr, src_vma))
+ return -EHWPOISON;
+
*prealloc = NULL;
- copy_user_highpage(&new_folio->page, page, addr, src_vma);
__folio_mark_uptodate(new_folio);
folio_add_new_anon_rmap(new_folio, dst_vma, addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(new_folio, dst_vma);
@@ -1167,8 +1169,9 @@ again:
/*
* If we need a pre-allocated page for this pte, drop the
* locks, allocate, and try again.
+ * If copy failed due to hwpoison in source page, break out.
*/
- if (unlikely(ret == -EAGAIN))
+ if (unlikely(ret == -EAGAIN || ret == -EHWPOISON))
break;
if (unlikely(prealloc)) {
/*
@@ -1198,7 +1201,7 @@ again:
goto out;
}
entry.val = 0;
- } else if (ret == -EBUSY) {
+ } else if (ret == -EBUSY || unlikely(ret == -EHWPOISON)) {
goto out;
} else if (ret == -EAGAIN) {
prealloc = folio_prealloc(src_mm, src_vma, addr, false);
@@ -4001,6 +4004,194 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
+static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ struct folio *folio;
+ swp_entry_t entry;
+
+ folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma,
+ vmf->address, false);
+ if (!folio)
+ return NULL;
+
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
+ GFP_KERNEL, entry)) {
+ folio_put(folio);
+ return NULL;
+ }
+
+ return folio;
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+ struct swap_info_struct *si = swp_swap_info(entry);
+ pgoff_t offset = swp_offset(entry);
+ int i;
+
+ /*
+	 * We allocate a large folio and read it in via swap_read_folio() only
+	 * when the faulting PTE has no swapcache. We must ensure that none of
+	 * the other PTEs has a swapcache entry either; otherwise we might read
+	 * from the swap device while the up-to-date content is still in swapcache.
+ */
+ for (i = 0; i < max_nr; i++) {
+ if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+ return i;
+ }
+
+ return i;
+}
+
+/*
+ * Check if the PTEs within a range are contiguous swap entries
+ * and have consistent swapcache and zeromap state.
+ */
+static bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+ unsigned long addr;
+ swp_entry_t entry;
+ int idx;
+ pte_t pte;
+
+ addr = ALIGN_DOWN(vmf->address, nr_pages * PAGE_SIZE);
+ idx = (vmf->address - addr) / PAGE_SIZE;
+ pte = ptep_get(ptep);
+
+ if (!pte_same(pte, pte_move_swp_offset(vmf->orig_pte, -idx)))
+ return false;
+ entry = pte_to_swp_entry(pte);
+ if (swap_pte_batch(ptep, nr_pages, pte) != nr_pages)
+ return false;
+
+ /*
+	 * swap_read_folio() can't handle a large folio whose subpages come
+	 * from a mix of backends (e.g. zeromap and swap device), and such
+	 * mixes are likely corner cases anyway. Similar checks may be needed
+	 * once zswap supports large folios.
+ */
+ if (unlikely(swap_zeromap_batch(entry, nr_pages, NULL) != nr_pages))
+ return false;
+ if (unlikely(non_swapcache_batch(entry, nr_pages) != nr_pages))
+ return false;
+
+ return true;
+}
+
+static inline unsigned long thp_swap_suitable_orders(pgoff_t swp_offset,
+ unsigned long addr,
+ unsigned long orders)
+{
+ int order, nr;
+
+ order = highest_order(orders);
+
+ /*
+ * To swap in a THP with nr pages, we require that its first swap_offset
+ * is aligned with that number, as it was when the THP was swapped out.
+ * This helps filter out most invalid entries.
+ */
+ while (orders) {
+ nr = 1 << order;
+ if ((addr >> PAGE_SHIFT) % nr == swp_offset % nr)
+ break;
+ order = next_order(&orders, order);
+ }
+
+ return orders;
+}
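As an illustration of the filter above (numbers are only an example, assuming 4 KiB pages): a fault at a virtual address whose page number is 0x7f1234565 has index 5 within its 16-page block, so an order-4 (16-page) swap-in stays in the candidate orders only if the faulting swap entry's offset is also congruent to 5 mod 16, e.g. offset 0x8c5. That is the layout a 16-page THP had when it was swapped out to a naturally aligned run of slots, so any other combination can be rejected without inspecting the remaining PTEs.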
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+ unsigned long orders;
+ struct folio *folio;
+ unsigned long addr;
+ swp_entry_t entry;
+ spinlock_t *ptl;
+ pte_t *pte;
+ gfp_t gfp;
+ int order;
+
+ /*
+ * If uffd is active for the vma we need per-page fault fidelity to
+ * maintain the uffd semantics.
+ */
+ if (unlikely(userfaultfd_armed(vma)))
+ goto fallback;
+
+ /*
+	 * A large swapped-out folio could be partially or fully in zswap. We
+	 * lack handling for such cases, so fall back to swapping in an
+	 * order-0 folio.
+ */
+ if (!zswap_never_enabled())
+ goto fallback;
+
+ entry = pte_to_swp_entry(vmf->orig_pte);
+ /*
+ * Get a list of all the (large) orders below PMD_ORDER that are enabled
+ * and suitable for swapping THP.
+ */
+ orders = thp_vma_allowable_orders(vma, vma->vm_flags,
+ TVA_IN_PF | TVA_ENFORCE_SYSFS, BIT(PMD_ORDER) - 1);
+ orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+ orders = thp_swap_suitable_orders(swp_offset(entry),
+ vmf->address, orders);
+
+ if (!orders)
+ goto fallback;
+
+ pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+ vmf->address & PMD_MASK, &ptl);
+ if (unlikely(!pte))
+ goto fallback;
+
+ /*
+ * For do_swap_page, find the highest order where the aligned range is
+ * completely swap entries with contiguous swap offsets.
+ */
+ order = highest_order(orders);
+ while (orders) {
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ if (can_swapin_thp(vmf, pte + pte_index(addr), 1 << order))
+ break;
+ order = next_order(&orders, order);
+ }
+
+ pte_unmap_unlock(pte, ptl);
+
+ /* Try allocating the highest of the remaining orders. */
+ gfp = vma_thp_gfp_mask(vma);
+ while (orders) {
+ addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order);
+ folio = vma_alloc_folio(gfp, order, vma, addr, true);
+ if (folio) {
+ if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm,
+ gfp, entry))
+ return folio;
+ folio_put(folio);
+ }
+ order = next_order(&orders, order);
+ }
+
+fallback:
+ return __alloc_swap_folio(vmf);
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static inline bool can_swapin_thp(struct vm_fault *vmf, pte_t *ptep, int nr_pages)
+{
+ return false;
+}
+
+static struct folio *alloc_swap_folio(struct vm_fault *vmf)
+{
+ return __alloc_swap_folio(vmf);
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
/*
* We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
@@ -4089,35 +4280,34 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!folio) {
if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
__swap_count(entry) == 1) {
- /*
- * Prevent parallel swapin from proceeding with
- * the cache flag. Otherwise, another thread may
- * finish swapin first, free the entry, and swapout
- * reusing the same entry. It's undetectable as
- * pte_same() returns true due to entry reuse.
- */
- if (swapcache_prepare(entry)) {
- /* Relax a bit to prevent rapid repeated page faults */
- schedule_timeout_uninterruptible(1);
- goto out;
- }
- need_clear_cache = true;
-
/* skip swapcache */
- folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0,
- vma, vmf->address, false);
- page = &folio->page;
+ folio = alloc_swap_folio(vmf);
if (folio) {
__folio_set_locked(folio);
__folio_set_swapbacked(folio);
- if (mem_cgroup_swapin_charge_folio(folio,
- vma->vm_mm, GFP_KERNEL,
- entry)) {
- ret = VM_FAULT_OOM;
+ nr_pages = folio_nr_pages(folio);
+ if (folio_test_large(folio))
+ entry.val = ALIGN_DOWN(entry.val, nr_pages);
+ /*
+ * Prevent parallel swapin from proceeding with
+ * the cache flag. Otherwise, another thread
+ * may finish swapin first, free the entry, and
+ * swapout reusing the same entry. It's
+ * undetectable as pte_same() returns true due
+ * to entry reuse.
+ */
+ if (swapcache_prepare(entry, nr_pages)) {
+ /*
+ * Relax a bit to prevent rapid
+ * repeated page faults.
+ */
+ schedule_timeout_uninterruptible(1);
goto out_page;
}
- mem_cgroup_swapin_uncharge_swap(entry);
+ need_clear_cache = true;
+
+ mem_cgroup_swapin_uncharge_swap(entry, nr_pages);
shadow = get_shadow_from_swap_cache(entry);
if (shadow)
@@ -4131,10 +4321,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
folio->private = NULL;
}
} else {
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
vmf);
- if (page)
- folio = page_folio(page);
swapcache = folio;
}
@@ -4155,6 +4343,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
ret = VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
+ page = folio_file_page(folio, swp_offset(entry));
} else if (PageHWPoison(page)) {
/*
* hwpoisoned dirty swapcache pages are kept for killing
@@ -4224,6 +4413,24 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_nomap;
}
+ /* allocated large folios for SWP_SYNCHRONOUS_IO */
+ if (folio_test_large(folio) && !folio_test_swapcache(folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+ unsigned long folio_start = ALIGN_DOWN(vmf->address, nr * PAGE_SIZE);
+ unsigned long idx = (vmf->address - folio_start) / PAGE_SIZE;
+ pte_t *folio_ptep = vmf->pte - idx;
+ pte_t folio_pte = ptep_get(folio_ptep);
+
+ if (!pte_same(folio_pte, pte_move_swp_offset(vmf->orig_pte, -idx)) ||
+ swap_pte_batch(folio_ptep, nr, folio_pte) != nr)
+ goto out_nomap;
+
+ page_idx = idx;
+ address = folio_start;
+ ptep = folio_ptep;
+ goto check_folio;
+ }
+
nr_pages = 1;
page_idx = 0;
address = vmf->address;
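To make the new SWP_SYNCHRONOUS_IO path above concrete (illustrative numbers, assuming 4 KiB pages): for a 16-page folio, entry.val was aligned down earlier so the read covers the 16 contiguous swap slots starting at the block boundary. If the fault hit subpage 5, folio_start is the address rounded down to 64 KiB, idx is 5, and folio_ptep points 5 PTE slots before vmf->pte. The pte_same() check against pte_move_swp_offset(orig_pte, -5) plus swap_pte_batch() returning 16 verifies that all 16 PTEs still hold the expected contiguous swap entries before the whole folio is mapped at folio_start.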
@@ -4355,11 +4562,12 @@ check_folio:
folio_add_lru_vma(folio, vma);
} else if (!folio_test_anon(folio)) {
/*
- * We currently only expect small !anon folios, which are either
- * fully exclusive or fully shared. If we ever get large folios
- * here, we have to be careful.
+ * We currently only expect small !anon folios which are either
+ * fully exclusive or fully shared, or new allocated large
+ * folios which are fully exclusive. If we ever get large
+ * folios within swapcache here, we have to be careful.
*/
- VM_WARN_ON_ONCE(folio_test_large(folio));
+ VM_WARN_ON_ONCE(folio_test_large(folio) && folio_test_swapcache(folio));
VM_WARN_ON_FOLIO(!folio_test_locked(folio), folio);
folio_add_new_anon_rmap(folio, vma, address, rmap_flags);
} else {
@@ -4402,7 +4610,7 @@ unlock:
out:
/* Clear the swap cache pin for direct swapin after PTL unlock */
if (need_clear_cache)
- swapcache_clear(si, entry);
+ swapcache_clear(si, entry, nr_pages);
if (si)
put_swap_device(si);
return ret;
@@ -4418,7 +4626,7 @@ out_release:
folio_put(swapcache);
}
if (need_clear_cache)
- swapcache_clear(si, entry);
+ swapcache_clear(si, entry, nr_pages);
if (si)
put_swap_device(si);
return ret;
@@ -4612,9 +4820,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
folio_ref_add(folio, nr_pages - 1);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, nr_pages);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_mthp_stat(folio_order(folio), MTHP_STAT_ANON_FAULT_ALLOC);
-#endif
folio_add_new_anon_rmap(folio, vma, addr, RMAP_EXCLUSIVE);
folio_add_lru_vma(folio, vma);
setpte:
@@ -5109,10 +5315,14 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (ret & VM_FAULT_DONE_COW)
return ret;
- copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
+ if (copy_mc_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma)) {
+ ret = VM_FAULT_HWPOISON;
+ goto unlock;
+ }
__folio_mark_uptodate(folio);
ret |= finish_fault(vmf);
+unlock:
unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -5217,16 +5427,46 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
return ret;
}
-int numa_migrate_prep(struct folio *folio, struct vm_fault *vmf,
- unsigned long addr, int page_nid, int *flags)
+int numa_migrate_check(struct folio *folio, struct vm_fault *vmf,
+ unsigned long addr, int *flags,
+ bool writable, int *last_cpupid)
{
struct vm_area_struct *vma = vmf->vma;
+ /*
+ * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
+ * much anyway since they can be in shared cache state. This misses
+ * the case where a mapping is writable but the process never writes
+ * to it but pte_write gets cleared during protection updates and
+ * pte_dirty has unpredictable behaviour between PTE scan updates,
+ * background writeback, dirty balancing and application behaviour.
+ */
+ if (!writable)
+ *flags |= TNF_NO_GROUP;
+
+ /*
+ * Flag if the folio is shared between multiple address spaces. This
+ * is later used when determining whether to group tasks together
+ */
+ if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
+ *flags |= TNF_SHARED;
+ /*
+ * For memory tiering mode, cpupid of slow memory page is used
+ * to record page access time. So use default value.
+ */
+ if (folio_use_access_time(folio))
+ *last_cpupid = (-1 & LAST_CPUPID_MASK);
+ else
+ *last_cpupid = folio_last_cpupid(folio);
+
	/* Record the current PID accessing VMA */
vma_set_access_pid_bit(vma);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == numa_node_id()) {
+#ifdef CONFIG_NUMA_BALANCING
+ count_memcg_folio_events(folio, NUMA_HINT_FAULTS, 1);
+#endif
+ if (folio_nid(folio) == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
*flags |= TNF_FAULT_LOCAL;
}
@@ -5328,36 +5568,11 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
if (!folio || folio_is_zone_device(folio))
goto out_map;
- /*
- * Avoid grouping on RO pages in general. RO pages shouldn't hurt as
- * much anyway since they can be in shared cache state. This misses
- * the case where a mapping is writable but the process never writes
- * to it but pte_write gets cleared during protection updates and
- * pte_dirty has unpredictable behaviour between PTE scan updates,
- * background writeback, dirty balancing and application behaviour.
- */
- if (!writable)
- flags |= TNF_NO_GROUP;
-
- /*
- * Flag if the folio is shared between multiple address spaces. This
- * is later used when determining whether to group tasks together
- */
- if (folio_likely_mapped_shared(folio) && (vma->vm_flags & VM_SHARED))
- flags |= TNF_SHARED;
-
nid = folio_nid(folio);
nr_pages = folio_nr_pages(folio);
- /*
- * For memory tiering mode, cpupid of slow memory page is used
- * to record page access time. So use default value.
- */
- if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
- !node_is_toptier(nid))
- last_cpupid = (-1 & LAST_CPUPID_MASK);
- else
- last_cpupid = folio_last_cpupid(folio);
- target_nid = numa_migrate_prep(folio, vmf, vmf->address, nid, &flags);
+
+ target_nid = numa_migrate_check(folio, vmf, vmf->address, &flags,
+ writable, &last_cpupid);
if (target_nid == NUMA_NO_NODE)
goto out_map;
if (migrate_misplaced_folio_prepare(folio, vma, target_nid)) {
@@ -6013,10 +6228,6 @@ retry:
if (!vma_start_read(vma))
goto inval;
- /* Check since vm_start/vm_end might change before we lock the VMA */
- if (unlikely(address < vma->vm_start || address >= vma->vm_end))
- goto inval_end_read;
-
/* Check if the VMA got isolated after we found it */
if (vma->detached) {
vma_end_read(vma);
@@ -6024,6 +6235,16 @@ retry:
/* The area was replaced with another one */
goto retry;
}
+ /*
+ * At this point, we have a stable reference to a VMA: The VMA is
+ * locked and we know it hasn't already been isolated.
+ * From here on, we can access the VMA without worrying about which
+ * fields are accessible for RCU readers.
+ */
+
+ /* Check since vm_start/vm_end might change before we lock the VMA */
+ if (unlikely(address < vma->vm_start || address >= vma->vm_end))
+ goto inval_end_read;
rcu_read_unlock();
return vma;
@@ -6108,78 +6329,155 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
+static inline void pfnmap_args_setup(struct follow_pfnmap_args *args,
+ spinlock_t *lock, pte_t *ptep,
+ pgprot_t pgprot, unsigned long pfn_base,
+ unsigned long addr_mask, bool writable,
+ bool special)
+{
+ args->lock = lock;
+ args->ptep = ptep;
+ args->pfn = pfn_base + ((args->address & ~addr_mask) >> PAGE_SHIFT);
+ args->pgprot = pgprot;
+ args->writable = writable;
+ args->special = special;
+}
+
+static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma)
+{
+#ifdef CONFIG_LOCKDEP
+ struct address_space *mapping = vma->vm_file->f_mapping;
+
+ if (mapping)
+ lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) ||
+ lockdep_is_held(&vma->vm_mm->mmap_lock));
+ else
+ lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock));
+#endif
+}
+
/**
- * follow_pte - look up PTE at a user virtual address
- * @vma: the memory mapping
- * @address: user virtual address
- * @ptepp: location to store found PTE
- * @ptlp: location to store the lock for the PTE
+ * follow_pfnmap_start() - Look up a pfn mapping at a user virtual address
+ * @args: Pointer to struct @follow_pfnmap_args
+ *
+ * The caller needs to set up args->vma and args->address to point to the
+ * virtual address as the target of such lookup. On a successful return,
+ * the results will be put into other output fields.
*
- * On a successful return, the pointer to the PTE is stored in @ptepp;
- * the corresponding lock is taken and its location is stored in @ptlp.
+ * After the caller has finished using the fields, it must invoke
+ * follow_pfnmap_end() to properly release the locks and resources
+ * of such a lookup request.
*
- * The contents of the PTE are only stable until @ptlp is released using
- * pte_unmap_unlock(). This function will fail if the PTE is non-present.
- * Present PTEs may include PTEs that map refcounted pages, such as
- * anonymous folios in COW mappings.
+ * Between the start() and end() calls, the results in @args are valid as
+ * the proper locks are held. After end() is called, none of the fields
+ * in @follow_pfnmap_args may be accessed any further. Using that
+ * information after end() requires the caller to properly synchronize
+ * with page table updates, otherwise it can create a security bug.
*
- * Callers must be careful when relying on PTE content after
- * pte_unmap_unlock(). Especially if the PTE maps a refcounted page,
- * callers must protect against invalidation with MMU notifiers; otherwise
- * access to the PFN at a later point in time can trigger use-after-free.
+ * If the PTE maps a refcounted page, callers are responsible to protect
+ * against invalidation with MMU notifiers; otherwise access to the PFN at
+ * a later point in time can trigger use-after-free.
*
* Only IO mappings and raw PFN mappings are allowed. The mmap semaphore
- * should be taken for read.
+ * should be taken for read, and the mmap semaphore cannot be released
+ * before the end() is invoked.
*
* This function must not be used to modify PTE content.
*
- * Return: zero on success, -ve otherwise.
+ * Return: zero on success, negative otherwise.
*/
-int follow_pte(struct vm_area_struct *vma, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
+int follow_pfnmap_start(struct follow_pfnmap_args *args)
{
+ struct vm_area_struct *vma = args->vma;
+ unsigned long address = args->address;
struct mm_struct *mm = vma->vm_mm;
- pgd_t *pgd;
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *ptep;
+ spinlock_t *lock;
+ pgd_t *pgdp;
+ p4d_t *p4dp, p4d;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+
+ pfnmap_lockdep_assert(vma);
- mmap_assert_locked(mm);
if (unlikely(address < vma->vm_start || address >= vma->vm_end))
goto out;
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
-
- pgd = pgd_offset(mm, address);
- if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
+retry:
+ pgdp = pgd_offset(mm, address);
+ if (pgd_none(*pgdp) || unlikely(pgd_bad(*pgdp)))
goto out;
- p4d = p4d_offset(pgd, address);
- if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
+ p4dp = p4d_offset(pgdp, address);
+ p4d = READ_ONCE(*p4dp);
+ if (p4d_none(p4d) || unlikely(p4d_bad(p4d)))
goto out;
- pud = pud_offset(p4d, address);
- if (pud_none(*pud) || unlikely(pud_bad(*pud)))
+ pudp = pud_offset(p4dp, address);
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
goto out;
+ if (pud_leaf(pud)) {
+ lock = pud_lock(mm, pudp);
+ if (!unlikely(pud_leaf(pud))) {
+ spin_unlock(lock);
+ goto retry;
+ }
+ pfnmap_args_setup(args, lock, NULL, pud_pgprot(pud),
+ pud_pfn(pud), PUD_MASK, pud_write(pud),
+ pud_special(pud));
+ return 0;
+ }
- pmd = pmd_offset(pud, address);
- VM_BUG_ON(pmd_trans_huge(*pmd));
+ pmdp = pmd_offset(pudp, address);
+ pmd = pmdp_get_lockless(pmdp);
+ if (pmd_leaf(pmd)) {
+ lock = pmd_lock(mm, pmdp);
+ if (!unlikely(pmd_leaf(pmd))) {
+ spin_unlock(lock);
+ goto retry;
+ }
+ pfnmap_args_setup(args, lock, NULL, pmd_pgprot(pmd),
+ pmd_pfn(pmd), PMD_MASK, pmd_write(pmd),
+ pmd_special(pmd));
+ return 0;
+ }
- ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
+ ptep = pte_offset_map_lock(mm, pmdp, address, &lock);
if (!ptep)
goto out;
- if (!pte_present(ptep_get(ptep)))
+ pte = ptep_get(ptep);
+ if (!pte_present(pte))
goto unlock;
- *ptepp = ptep;
+ pfnmap_args_setup(args, lock, ptep, pte_pgprot(pte),
+ pte_pfn(pte), PAGE_MASK, pte_write(pte),
+ pte_special(pte));
return 0;
unlock:
- pte_unmap_unlock(ptep, *ptlp);
+ pte_unmap_unlock(ptep, lock);
out:
return -EINVAL;
}
-EXPORT_SYMBOL_GPL(follow_pte);
+EXPORT_SYMBOL_GPL(follow_pfnmap_start);
+
+/**
+ * follow_pfnmap_end(): End a follow_pfnmap_start() process
+ * @args: Pointer to struct @follow_pfnmap_args
+ *
+ * Must be used in pair with follow_pfnmap_start(). See the start() function
+ * above for more information.
+ */
+void follow_pfnmap_end(struct follow_pfnmap_args *args)
+{
+ if (args->lock)
+ spin_unlock(args->lock);
+ if (args->ptep)
+ pte_unmap(args->ptep);
+}
+EXPORT_SYMBOL_GPL(follow_pfnmap_end);
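The pairing rule documented above is the whole calling convention; a minimal caller sketch (the surrounding function is hypothetical, not part of the patch) could look like the following, with generic_access_phys() below being the in-tree example:

static int example_lookup_pfn(struct vm_area_struct *vma, unsigned long addr,
			      unsigned long *pfn, bool *writable)
{
	struct follow_pfnmap_args args = { .vma = vma, .address = addr };

	/* The mmap lock (or i_mmap_rwsem) must be held across the pair. */
	if (follow_pfnmap_start(&args))
		return -EINVAL;

	*pfn = args.pfn;
	*writable = args.writable;

	/* The fields of args must not be used after this point. */
	follow_pfnmap_end(&args);
	return 0;
}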
#ifdef CONFIG_HAVE_IOREMAP_PROT
/**
@@ -6200,34 +6498,34 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
resource_size_t phys_addr;
unsigned long prot = 0;
void __iomem *maddr;
- pte_t *ptep, pte;
- spinlock_t *ptl;
int offset = offset_in_page(addr);
int ret = -EINVAL;
+ bool writable;
+ struct follow_pfnmap_args args = { .vma = vma, .address = addr };
retry:
- if (follow_pte(vma, addr, &ptep, &ptl))
+ if (follow_pfnmap_start(&args))
return -EINVAL;
- pte = ptep_get(ptep);
- pte_unmap_unlock(ptep, ptl);
-
- prot = pgprot_val(pte_pgprot(pte));
- phys_addr = (resource_size_t)pte_pfn(pte) << PAGE_SHIFT;
+ prot = pgprot_val(args.pgprot);
+ phys_addr = (resource_size_t)args.pfn << PAGE_SHIFT;
+ writable = args.writable;
+ follow_pfnmap_end(&args);
- if ((write & FOLL_WRITE) && !pte_write(pte))
+ if ((write & FOLL_WRITE) && !writable)
return -EINVAL;
maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (!maddr)
return -ENOMEM;
- if (follow_pte(vma, addr, &ptep, &ptl))
+ if (follow_pfnmap_start(&args))
goto out_unmap;
- if (!pte_same(pte, ptep_get(ptep))) {
- pte_unmap_unlock(ptep, ptl);
+ if ((prot != pgprot_val(args.pgprot)) ||
+ (phys_addr != (args.pfn << PAGE_SHIFT)) ||
+ (writable != args.writable)) {
+ follow_pfnmap_end(&args);
iounmap(maddr);
-
goto retry;
}
@@ -6236,7 +6534,7 @@ retry:
else
memcpy_fromio(buf, maddr + offset, len);
ret = len;
- pte_unmap_unlock(ptep, ptl);
+ follow_pfnmap_end(&args);
out_unmap:
iounmap(maddr);
@@ -6587,7 +6885,7 @@ long copy_folio_from_user(struct folio *dst_folio,
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-#if USE_SPLIT_PTE_PTLOCKS && ALLOC_SPLIT_PTLOCKS
+#if defined(CONFIG_SPLIT_PTE_PTLOCKS) && ALLOC_SPLIT_PTLOCKS
static struct kmem_cache *page_ptl_cachep;
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 951878ab627a..621ae1015106 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -366,7 +366,7 @@ struct page *pfn_to_online_page(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(pfn_to_online_page);
-int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
+int __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
struct mhp_params *params)
{
const unsigned long end_pfn = pfn + nr_pages;
@@ -524,7 +524,7 @@ static void update_pgdat_span(struct pglist_data *pgdat)
pgdat->node_spanned_pages = node_end_pfn - node_start_pfn;
}
-void __ref remove_pfn_range_from_zone(struct zone *zone,
+void remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
{
@@ -629,7 +629,7 @@ int restore_online_page_callback(online_page_callback_t callback)
EXPORT_SYMBOL_GPL(restore_online_page_callback);
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-void __ref generic_online_page(struct page *page, unsigned int order)
+void generic_online_page(struct page *page, unsigned int order)
{
__free_pages_core(page, order, MEMINIT_HOTPLUG);
}
@@ -741,7 +741,7 @@ static inline void section_taint_zone_device(unsigned long pfn)
* (usually MIGRATE_MOVABLE). Besides setting the migratetype, no related
* zone stats (e.g., nr_isolate_pageblock) are touched.
*/
-void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
+void move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
unsigned long nr_pages,
struct vmem_altmap *altmap, int migratetype)
{
@@ -1143,7 +1143,7 @@ void mhp_deinit_memmap_on_memory(unsigned long pfn, unsigned long nr_pages)
/*
* Must be called with mem_hotplug_lock in write mode.
*/
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
+int online_pages(unsigned long pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
unsigned long flags;
@@ -1233,7 +1233,7 @@ failed_addition:
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_init_pgdat(int nid)
+static pg_data_t *hotadd_init_pgdat(int nid)
{
struct pglist_data *pgdat;
@@ -1386,7 +1386,7 @@ bool mhp_supports_memmap_on_memory(void)
}
EXPORT_SYMBOL_GPL(mhp_supports_memmap_on_memory);
-static void __ref remove_memory_blocks_and_altmaps(u64 start, u64 size)
+static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1473,7 +1473,7 @@ out:
*
* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG
*/
-int __ref add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
+int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
{
struct mhp_params params = { .pgprot = pgprot_mhp(PAGE_KERNEL) };
enum memblock_flags memblock_flags = MEMBLOCK_NONE;
@@ -1580,7 +1580,7 @@ error_mem_hotplug_end:
}
/* requires device_hotplug_lock, see add_memory_resource() */
-int __ref __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
+int __add_memory(int nid, u64 start, u64 size, mhp_t mhp_flags)
{
struct resource *res;
int ret;
@@ -1772,67 +1772,59 @@ found:
static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
+ struct folio *folio;
unsigned long pfn;
- struct page *page, *head;
LIST_HEAD(source);
static DEFINE_RATELIMIT_STATE(migrate_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- struct folio *folio;
- bool isolated;
+ struct page *page;
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
folio = page_folio(page);
- head = &folio->page;
- if (PageHuge(page)) {
- pfn = page_to_pfn(head) + compound_nr(head) - 1;
- isolate_hugetlb(folio, &source);
- continue;
- } else if (PageTransHuge(page))
- pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
+ /*
+ * No reference or lock is held on the folio, so it might
+ * be modified concurrently (e.g. split). As such,
+ * folio_nr_pages() may read garbage. This is fine as the outer
+ * loop will revisit the split folio later.
+ */
+ if (folio_test_large(folio))
+ pfn = folio_pfn(folio) + folio_nr_pages(folio) - 1;
/*
* HWPoison pages have elevated reference counts so the migration would
* fail on them. It also doesn't make any sense to migrate them in the
* first place. Still try to unmap such a page in case it is still mapped
- * (e.g. current hwpoison implementation doesn't unmap KSM pages but keep
- * the unmap as the catch all safety net).
+ * (keep the unmap as the catch all safety net).
*/
- if (PageHWPoison(page)) {
+ if (folio_test_hwpoison(folio) ||
+ (folio_test_large(folio) && folio_test_has_hwpoisoned(folio))) {
if (WARN_ON(folio_test_lru(folio)))
folio_isolate_lru(folio);
if (folio_mapped(folio))
- try_to_unmap(folio, TTU_IGNORE_MLOCK);
+ unmap_poisoned_folio(folio, TTU_IGNORE_MLOCK);
continue;
}
- if (!get_page_unless_zero(page))
+ if (!folio_try_get(folio))
continue;
- /*
- * We can skip free pages. And we can deal with pages on
- * LRU and non-lru movable pages.
- */
- if (PageLRU(page))
- isolated = isolate_lru_page(page);
- else
- isolated = isolate_movable_page(page, ISOLATE_UNEVICTABLE);
- if (isolated) {
- list_add_tail(&page->lru, &source);
- if (!__PageMovable(page))
- inc_node_page_state(page, NR_ISOLATED_ANON +
- page_is_file_lru(page));
- } else {
+ if (unlikely(page_folio(page) != folio))
+ goto put_folio;
+
+ if (!isolate_folio_to_list(folio, &source)) {
if (__ratelimit(&migrate_rs)) {
- pr_warn("failed to isolate pfn %lx\n", pfn);
+ pr_warn("failed to isolate pfn %lx\n",
+ page_to_pfn(page));
dump_page(page, "isolation failed");
}
}
- put_page(page);
+put_folio:
+ folio_put(folio);
}
if (!list_empty(&source)) {
nodemask_t nmask = node_states[N_MEMORY];
@@ -1847,7 +1839,7 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
* We have checked that migration range is on a single zone so
* we can use the nid of the first page to all the others.
*/
- mtc.nid = page_to_nid(list_first_entry(&source, struct page, lru));
+ mtc.nid = folio_nid(list_first_entry(&source, struct folio, lru));
/*
* try to allocate from a different node but reuse this node
@@ -1860,11 +1852,12 @@ static void do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
ret = migrate_pages(&source, alloc_migration_target, NULL,
(unsigned long)&mtc, MIGRATE_SYNC, MR_MEMORY_HOTPLUG, NULL);
if (ret) {
- list_for_each_entry(page, &source, lru) {
+ list_for_each_entry(folio, &source, lru) {
if (__ratelimit(&migrate_rs)) {
pr_warn("migrating pfn %lx failed ret:%d\n",
- page_to_pfn(page), ret);
- dump_page(page, "migration failure");
+ folio_pfn(folio), ret);
+ dump_page(&folio->page,
+ "migration failure");
}
}
putback_movable_pages(&source);
@@ -1939,7 +1932,7 @@ static int count_system_ram_pages_cb(unsigned long start_pfn,
/*
* Must be called with mem_hotplug_lock in write mode.
*/
-int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages,
+int offline_pages(unsigned long start_pfn, unsigned long nr_pages,
struct zone *zone, struct memory_group *group)
{
const unsigned long end_pfn = start_pfn + nr_pages;
@@ -2240,7 +2233,7 @@ static int memory_blocks_have_altmaps(u64 start, u64 size)
return 1;
}
-static int __ref try_remove_memory(u64 start, u64 size)
+static int try_remove_memory(u64 start, u64 size)
{
int rc, nid = NUMA_NO_NODE;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b858e22b259d..b646fab3e45e 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -676,8 +676,10 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
tlb_gather_mmu(&tlb, vma->vm_mm);
nr_updated = change_protection(&tlb, vma, addr, end, MM_CP_PROT_NUMA);
- if (nr_updated > 0)
+ if (nr_updated > 0) {
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
+ count_memcg_events_mm(vma->vm_mm, NUMA_PTE_UPDATES, nr_updated);
+ }
tlb_finish_mmu(&tlb);
@@ -1951,7 +1953,7 @@ unsigned int mempolicy_slab_node(void)
zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK];
z = first_zones_zonelist(zonelist, highest_zoneidx,
&policy->nodes);
- return z->zone ? zone_to_nid(z->zone) : node;
+ return zonelist_zone(z) ? zonelist_node_idx(z) : node;
}
case MPOL_LOCAL:
return node;
@@ -2809,7 +2811,7 @@ int mpol_misplaced(struct folio *folio, struct vm_fault *vmf,
node_zonelist(thisnid, GFP_HIGHUSER),
gfp_zone(GFP_HIGHUSER),
&pol->nodes);
- polnid = zone_to_nid(z->zone);
+ polnid = zonelist_node_idx(z);
break;
default:
diff --git a/mm/migrate.c b/mm/migrate.c
index 923ea80ba744..dfdb3a136bf8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -20,7 +20,6 @@
#include <linux/pagemap.h>
#include <linux/buffer_head.h>
#include <linux/mm_inline.h>
-#include <linux/nsproxy.h>
#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/topology.h>
@@ -35,21 +34,16 @@
#include <linux/syscalls.h>
#include <linux/compat.h>
#include <linux/hugetlb.h>
-#include <linux/hugetlb_cgroup.h>
#include <linux/gfp.h>
#include <linux/pfn_t.h>
-#include <linux/memremap.h>
-#include <linux/userfaultfd_k.h>
-#include <linux/balloon_compaction.h>
#include <linux/page_idle.h>
#include <linux/page_owner.h>
#include <linux/sched/mm.h>
#include <linux/ptrace.h>
-#include <linux/oom.h>
#include <linux/memory.h>
-#include <linux/random.h>
#include <linux/sched/sysctl.h>
#include <linux/memory-tiers.h>
+#include <linux/pagewalk.h>
#include <asm/tlbflush.h>
@@ -177,13 +171,83 @@ void putback_movable_pages(struct list_head *l)
}
}
+/* Must be called with an elevated refcount on the non-hugetlb folio */
+bool isolate_folio_to_list(struct folio *folio, struct list_head *list)
+{
+ bool isolated, lru;
+
+ if (folio_test_hugetlb(folio))
+ return isolate_hugetlb(folio, list);
+
+ lru = !__folio_test_movable(folio);
+ if (lru)
+ isolated = folio_isolate_lru(folio);
+ else
+ isolated = isolate_movable_page(&folio->page,
+ ISOLATE_UNEVICTABLE);
+
+ if (!isolated)
+ return false;
+
+ list_add(&folio->lru, list);
+ if (lru)
+ node_stat_add_folio(folio, NR_ISOLATED_ANON +
+ folio_is_file_lru(folio));
+
+ return true;
+}
+
+static bool try_to_map_unused_to_zeropage(struct page_vma_mapped_walk *pvmw,
+ struct folio *folio,
+ unsigned long idx)
+{
+ struct page *page = folio_page(folio, idx);
+ bool contains_data;
+ pte_t newpte;
+ void *addr;
+
+ VM_BUG_ON_PAGE(PageCompound(page), page);
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ VM_BUG_ON_PAGE(pte_present(*pvmw->pte), page);
+
+ if (folio_test_mlocked(folio) || (pvmw->vma->vm_flags & VM_LOCKED) ||
+ mm_forbids_zeropage(pvmw->vma->vm_mm))
+ return false;
+
+ /*
+	 * The pmd entry mapping the old thp was flushed and the pte mapping
+	 * this subpage has been made non-present. If the subpage contains
+	 * only zeroes, map it to the shared zeropage.
+ */
+ addr = kmap_local_page(page);
+ contains_data = memchr_inv(addr, 0, PAGE_SIZE);
+ kunmap_local(addr);
+
+ if (contains_data)
+ return false;
+
+ newpte = pte_mkspecial(pfn_pte(my_zero_pfn(pvmw->address),
+ pvmw->vma->vm_page_prot));
+ set_pte_at(pvmw->vma->vm_mm, pvmw->address, pvmw->pte, newpte);
+
+ dec_mm_counter(pvmw->vma->vm_mm, mm_counter(folio));
+ return true;
+}
+
+struct rmap_walk_arg {
+ struct folio *folio;
+ bool map_unused_to_zeropage;
+};
+
/*
* Restore a potential migration pte to a working pte entry
*/
static bool remove_migration_pte(struct folio *folio,
- struct vm_area_struct *vma, unsigned long addr, void *old)
+ struct vm_area_struct *vma, unsigned long addr, void *arg)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, old, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
+ struct rmap_walk_arg *rmap_walk_arg = arg;
+ DEFINE_FOLIO_VMA_WALK(pvmw, rmap_walk_arg->folio, vma, addr, PVMW_SYNC | PVMW_MIGRATION);
while (page_vma_mapped_walk(&pvmw)) {
rmap_t rmap_flags = RMAP_NONE;
@@ -207,6 +271,9 @@ static bool remove_migration_pte(struct folio *folio,
continue;
}
#endif
+ if (rmap_walk_arg->map_unused_to_zeropage &&
+ try_to_map_unused_to_zeropage(&pvmw, folio, idx))
+ continue;
folio_get(folio);
pte = mk_pte(new, READ_ONCE(vma->vm_page_prot));
@@ -285,14 +352,21 @@ static bool remove_migration_pte(struct folio *folio,
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
-void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked)
+void remove_migration_ptes(struct folio *src, struct folio *dst, int flags)
{
+ struct rmap_walk_arg rmap_walk_arg = {
+ .folio = src,
+ .map_unused_to_zeropage = flags & RMP_USE_SHARED_ZEROPAGE,
+ };
+
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
- .arg = src,
+ .arg = &rmap_walk_arg,
};
- if (locked)
+ VM_BUG_ON_FOLIO((flags & RMP_USE_SHARED_ZEROPAGE) && (src != dst), src);
+
+ if (flags & RMP_LOCKED)
rmap_walk_locked(dst, &rwc);
else
rmap_walk(dst, &rwc);
@@ -422,6 +496,8 @@ static int __folio_migrate_mapping(struct address_space *mapping,
/* No turning back from here */
newfolio->index = folio->index;
newfolio->mapping = folio->mapping;
+ if (folio_test_anon(folio) && folio_test_large(folio))
+ mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
if (folio_test_swapbacked(folio))
__folio_set_swapbacked(newfolio);
@@ -446,6 +522,8 @@ static int __folio_migrate_mapping(struct address_space *mapping,
*/
newfolio->index = folio->index;
newfolio->mapping = folio->mapping;
+ if (folio_test_anon(folio) && folio_test_large(folio))
+ mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
folio_ref_add(newfolio, nr); /* add cache reference */
if (folio_test_swapbacked(folio)) {
__folio_set_swapbacked(newfolio);
@@ -585,8 +663,6 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
{
int cpupid;
- if (folio_test_error(folio))
- folio_set_error(newfolio);
if (folio_test_referenced(folio))
folio_set_referenced(newfolio);
if (folio_test_uptodate(folio))
@@ -640,7 +716,8 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_migrate_ksm(newfolio, folio);
/*
* Please do not reorder this without considering how mm/ksm.c's
- * ksm_get_folio() depends upon ksm_migrate_page() and PageSwapCache().
+ * ksm_get_folio() depends upon ksm_migrate_page() and the
+ * swapcache flag.
*/
if (folio_test_swapcache(folio))
folio_clear_swapcache(folio);
@@ -666,6 +743,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio)
folio_set_readahead(newfolio);
folio_copy_owner(newfolio, folio);
+ pgalloc_tag_copy(newfolio, folio);
mem_cgroup_migrate(folio, newfolio);
}
@@ -904,7 +982,7 @@ static int writeout(struct address_space *mapping, struct folio *folio)
* At this point we know that the migration attempt cannot
* be successful.
*/
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, 0);
rc = mapping->a_ops->writepage(&folio->page, &wbc);
@@ -1068,7 +1146,7 @@ static void migrate_folio_undo_src(struct folio *src,
struct list_head *ret)
{
if (page_was_mapped)
- remove_migration_ptes(src, src, false);
+ remove_migration_ptes(src, src, 0);
/* Drop an anon_vma reference if we took one */
if (anon_vma)
put_anon_vma(anon_vma);
@@ -1306,7 +1384,7 @@ static int migrate_folio_move(free_folio_t put_new_folio, unsigned long private,
lru_add_drain();
if (old_page_state & PAGE_WAS_MAPPED)
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, 0);
out_unlock_both:
folio_unlock(dst);
@@ -1444,7 +1522,7 @@ static int unmap_and_move_huge_page(new_folio_t get_new_folio,
if (page_was_mapped)
remove_migration_ptes(src,
- rc == MIGRATEPAGE_SUCCESS ? dst : src, false);
+ rc == MIGRATEPAGE_SUCCESS ? dst : src, 0);
unlock_put_anon:
folio_unlock(dst);
@@ -1682,7 +1760,8 @@ static int migrate_pages_batch(struct list_head *from,
* use _deferred_list.
*/
if (nr_pages > 2 &&
- !list_empty(&folio->_deferred_list)) {
+ !list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio)) {
if (!try_split_folio(folio, split_folios, mode)) {
nr_failed++;
stats->nr_thp_failed += is_thp;
@@ -2111,76 +2190,66 @@ static int do_move_pages_to_node(struct list_head *pagelist, int node)
return err;
}
+static int __add_folio_for_migration(struct folio *folio, int node,
+ struct list_head *pagelist, bool migrate_all)
+{
+ if (is_zero_folio(folio) || is_huge_zero_folio(folio))
+ return -EFAULT;
+
+ if (folio_is_zone_device(folio))
+ return -ENOENT;
+
+ if (folio_nid(folio) == node)
+ return 0;
+
+ if (folio_likely_mapped_shared(folio) && !migrate_all)
+ return -EACCES;
+
+ if (folio_test_hugetlb(folio)) {
+ if (isolate_hugetlb(folio, pagelist))
+ return 1;
+ } else if (folio_isolate_lru(folio)) {
+ list_add_tail(&folio->lru, pagelist);
+ node_stat_mod_folio(folio,
+ NR_ISOLATED_ANON + folio_is_file_lru(folio),
+ folio_nr_pages(folio));
+ return 1;
+ }
+ return -EBUSY;
+}
+
/*
- * Resolves the given address to a struct page, isolates it from the LRU and
+ * Resolves the given address to a struct folio, isolates it from the LRU and
* puts it to the given pagelist.
* Returns:
- * errno - if the page cannot be found/isolated
+ * errno - if the folio cannot be found/isolated
* 0 - when it doesn't have to be migrated because it is already on the
* target node
* 1 - when it has been queued
*/
-static int add_page_for_migration(struct mm_struct *mm, const void __user *p,
+static int add_folio_for_migration(struct mm_struct *mm, const void __user *p,
int node, struct list_head *pagelist, bool migrate_all)
{
struct vm_area_struct *vma;
- unsigned long addr;
- struct page *page;
+ struct folio_walk fw;
struct folio *folio;
- int err;
+ unsigned long addr;
+ int err = -EFAULT;
mmap_read_lock(mm);
addr = (unsigned long)untagged_addr_remote(mm, p);
- err = -EFAULT;
vma = vma_lookup(mm, addr);
- if (!vma || !vma_migratable(vma))
- goto out;
-
- /* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto out;
-
- err = -ENOENT;
- if (!page)
- goto out;
-
- folio = page_folio(page);
- if (folio_is_zone_device(folio))
- goto out_putfolio;
-
- err = 0;
- if (folio_nid(folio) == node)
- goto out_putfolio;
-
- err = -EACCES;
- if (folio_likely_mapped_shared(folio) && !migrate_all)
- goto out_putfolio;
-
- err = -EBUSY;
- if (folio_test_hugetlb(folio)) {
- if (isolate_hugetlb(folio, pagelist))
- err = 1;
- } else {
- if (!folio_isolate_lru(folio))
- goto out_putfolio;
-
- err = 1;
- list_add_tail(&folio->lru, pagelist);
- node_stat_mod_folio(folio,
- NR_ISOLATED_ANON + folio_is_file_lru(folio),
- folio_nr_pages(folio));
+ if (vma && vma_migratable(vma)) {
+ folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
+ if (folio) {
+ err = __add_folio_for_migration(folio, node, pagelist,
+ migrate_all);
+ folio_walk_end(&fw, vma);
+ } else {
+ err = -ENOENT;
+ }
}
-out_putfolio:
- /*
- * Either remove the duplicate refcount from folio_isolate_lru()
- * or drop the folio ref if it was not isolated.
- */
- folio_put(folio);
-out:
mmap_read_unlock(mm);
return err;
}
@@ -2274,8 +2343,8 @@ static int do_pages_move(struct mm_struct *mm, nodemask_t task_nodes,
* Errors in the page lookup or isolation are not fatal and we simply
* report them via status
*/
- err = add_page_for_migration(mm, p, current_node, &pagelist,
- flags & MPOL_MF_MOVE_ALL);
+ err = add_folio_for_migration(mm, p, current_node, &pagelist,
+ flags & MPOL_MF_MOVE_ALL);
if (err > 0) {
/* The page is successfully queued for migration */
@@ -2331,28 +2400,26 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
struct vm_area_struct *vma;
- struct page *page;
+ struct folio_walk fw;
+ struct folio *folio;
int err = -EFAULT;
vma = vma_lookup(mm, addr);
if (!vma)
goto set_status;
- /* FOLL_DUMP to ignore special (like zero) pages */
- page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP);
-
- err = PTR_ERR(page);
- if (IS_ERR(page))
- goto set_status;
-
- err = -ENOENT;
- if (!page)
- goto set_status;
-
- if (!is_zone_device_page(page))
- err = page_to_nid(page);
-
- put_page(page);
+ folio = folio_walk_start(&fw, vma, addr, FW_ZEROPAGE);
+ if (folio) {
+ if (is_zero_folio(folio) || is_huge_zero_folio(folio))
+ err = -EFAULT;
+ else if (folio_is_zone_device(folio))
+ err = -ENOENT;
+ else
+ err = folio_nid(folio);
+ folio_walk_end(&fw, vma);
+ } else {
+ err = -ENOENT;
+ }
set_status:
*status = err;
@@ -2432,25 +2499,19 @@ static struct mm_struct *find_mm_struct(pid_t pid, nodemask_t *mem_nodes)
return current->mm;
}
- /* Find the mm_struct */
- rcu_read_lock();
- task = find_task_by_vpid(pid);
+ task = find_get_task_by_vpid(pid);
if (!task) {
- rcu_read_unlock();
return ERR_PTR(-ESRCH);
}
- get_task_struct(task);
/*
* Check if this process has the right to modify the specified
* process. Use the regular "ptrace_may_access()" checks.
*/
if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) {
- rcu_read_unlock();
mm = ERR_PTR(-EPERM);
goto out;
}
- rcu_read_unlock();
mm = ERR_PTR(security_task_movememory(task));
if (IS_ERR(mm))
@@ -2526,7 +2587,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
if (!zone_watermark_ok(zone, 0,
high_wmark_pages(zone) +
nr_migrate_pages,
- ZONE_MOVABLE, 0))
+ ZONE_MOVABLE, ALLOC_CMA))
continue;
return true;
}
@@ -2627,6 +2688,8 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
int nr_remaining;
unsigned int nr_succeeded;
LIST_HEAD(migratepages);
+ struct mem_cgroup *memcg = get_mem_cgroup_from_folio(folio);
+ struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat);
list_add(&folio->lru, &migratepages);
nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_folio,
@@ -2636,10 +2699,13 @@ int migrate_misplaced_folio(struct folio *folio, struct vm_area_struct *vma,
putback_movable_pages(&migratepages);
if (nr_succeeded) {
count_vm_numa_events(NUMA_PAGE_MIGRATE, nr_succeeded);
- if (!node_is_toptier(folio_nid(folio)) && node_is_toptier(node))
- mod_node_page_state(pgdat, PGPROMOTE_SUCCESS,
- nr_succeeded);
+ count_memcg_events(memcg, NUMA_PAGE_MIGRATE, nr_succeeded);
+ if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
+ && !node_is_toptier(folio_nid(folio))
+ && node_is_toptier(node))
+ mod_lruvec_state(lruvec, PGPROMOTE_SUCCESS, nr_succeeded);
}
+ mem_cgroup_put(memcg);
BUG_ON(!list_empty(&migratepages));
return nr_remaining ? -EAGAIN : 0;
}
diff --git a/mm/migrate_device.c b/mm/migrate_device.c
index 6d66dc1c6ffa..9cf26592ac93 100644
--- a/mm/migrate_device.c
+++ b/mm/migrate_device.c
@@ -328,8 +328,8 @@ static bool migrate_vma_check_page(struct page *page, struct page *fault_page)
/*
* One extra ref because caller holds an extra reference, either from
- * isolate_lru_page() for a regular page, or migrate_vma_collect() for
- * a device page.
+ * folio_isolate_lru() for a regular folio, or migrate_vma_collect() for
+ * a device folio.
*/
int extra = 1 + (page == fault_page);
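
As a rough illustration of the reference budget described above (a sketch only; the real check in this function also accounts for pagecache references), a folio would be considered pinned elsewhere when its refcount exceeds its mapcount plus the references we already know about:

static bool example_folio_is_pinned(struct folio *folio, int extra)
{
	/* Anything beyond the mappings plus our known "extra" refs is a pin. */
	return folio_ref_count(folio) > folio_mapcount(folio) + extra;
}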
@@ -379,33 +379,33 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
continue;
}
- /* ZONE_DEVICE pages are not on LRU */
- if (!is_zone_device_page(page)) {
- if (!PageLRU(page) && allow_drain) {
+ folio = page_folio(page);
+ /* ZONE_DEVICE folios are not on LRU */
+ if (!folio_is_zone_device(folio)) {
+ if (!folio_test_lru(folio) && allow_drain) {
/* Drain CPU's lru cache */
lru_add_drain_all();
allow_drain = false;
}
- if (!isolate_lru_page(page)) {
+ if (!folio_isolate_lru(folio)) {
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
restore++;
continue;
}
/* Drop the reference we took in collect */
- put_page(page);
+ folio_put(folio);
}
- folio = page_folio(page);
if (folio_mapped(folio))
try_to_migrate(folio, 0);
- if (page_mapped(page) ||
+ if (folio_mapped(folio) ||
!migrate_vma_check_page(page, fault_page)) {
- if (!is_zone_device_page(page)) {
- get_page(page);
- putback_lru_page(page);
+ if (!folio_is_zone_device(folio)) {
+ folio_get(folio);
+ folio_putback_lru(folio);
}
src_pfns[i] &= ~MIGRATE_PFN_MIGRATE;
@@ -424,7 +424,7 @@ static unsigned long migrate_device_unmap(unsigned long *src_pfns,
continue;
folio = page_folio(page);
- remove_migration_ptes(folio, folio, false);
+ remove_migration_ptes(folio, folio, 0);
src_pfns[i] = 0;
folio_unlock(folio);
@@ -708,7 +708,7 @@ static void __migrate_device_pages(unsigned long *src_pfns,
/*
* The only time there is no vma is when called from
- * migrate_device_coherent_page(). However this isn't
+ * migrate_device_coherent_folio(). However this isn't
* called if the page could not be unmapped.
*/
VM_BUG_ON(!migrate);
@@ -815,42 +815,45 @@ void migrate_device_finalize(unsigned long *src_pfns,
unsigned long i;
for (i = 0; i < npages; i++) {
- struct folio *dst, *src;
+ struct folio *dst = NULL, *src = NULL;
struct page *newpage = migrate_pfn_to_page(dst_pfns[i]);
struct page *page = migrate_pfn_to_page(src_pfns[i]);
+ if (newpage)
+ dst = page_folio(newpage);
+
if (!page) {
- if (newpage) {
- unlock_page(newpage);
- put_page(newpage);
+ if (dst) {
+ folio_unlock(dst);
+ folio_put(dst);
}
continue;
}
- if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !newpage) {
- if (newpage) {
- unlock_page(newpage);
- put_page(newpage);
+ src = page_folio(page);
+
+ if (!(src_pfns[i] & MIGRATE_PFN_MIGRATE) || !dst) {
+ if (dst) {
+ folio_unlock(dst);
+ folio_put(dst);
}
- newpage = page;
+ dst = src;
}
- src = page_folio(page);
- dst = page_folio(newpage);
- remove_migration_ptes(src, dst, false);
+ remove_migration_ptes(src, dst, 0);
folio_unlock(src);
- if (is_zone_device_page(page))
- put_page(page);
+ if (folio_is_zone_device(src))
+ folio_put(src);
else
- putback_lru_page(page);
+ folio_putback_lru(src);
- if (newpage != page) {
- unlock_page(newpage);
- if (is_zone_device_page(newpage))
- put_page(newpage);
+ if (dst != src) {
+ folio_unlock(dst);
+ if (folio_is_zone_device(dst))
+ folio_put(dst);
else
- putback_lru_page(newpage);
+ folio_putback_lru(dst);
}
}
}
@@ -898,16 +901,17 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
unsigned long i, pfn;
for (pfn = start, i = 0; i < npages; pfn++, i++) {
- struct page *page = pfn_to_page(pfn);
+ struct folio *folio;
- if (!get_page_unless_zero(page)) {
+ folio = folio_get_nontail_page(pfn_to_page(pfn));
+ if (!folio) {
src_pfns[i] = 0;
continue;
}
- if (!trylock_page(page)) {
+ if (!folio_trylock(folio)) {
src_pfns[i] = 0;
- put_page(page);
+ folio_put(folio);
continue;
}
@@ -921,38 +925,38 @@ int migrate_device_range(unsigned long *src_pfns, unsigned long start,
EXPORT_SYMBOL(migrate_device_range);
/*
- * Migrate a device coherent page back to normal memory. The caller should have
- * a reference on page which will be copied to the new page if migration is
+ * Migrate a device coherent folio back to normal memory. The caller should have
+ * a reference on the folio which will be copied to the new folio if migration is
* successful or dropped on failure.
*/
-int migrate_device_coherent_page(struct page *page)
+int migrate_device_coherent_folio(struct folio *folio)
{
unsigned long src_pfn, dst_pfn = 0;
- struct page *dpage;
+ struct folio *dfolio;
- WARN_ON_ONCE(PageCompound(page));
+ WARN_ON_ONCE(folio_test_large(folio));
- lock_page(page);
- src_pfn = migrate_pfn(page_to_pfn(page)) | MIGRATE_PFN_MIGRATE;
+ folio_lock(folio);
+ src_pfn = migrate_pfn(folio_pfn(folio)) | MIGRATE_PFN_MIGRATE;
/*
* We don't have a VMA and don't need to walk the page tables to find
- * the source page. So call migrate_vma_unmap() directly to unmap the
- * page as migrate_vma_setup() will fail if args.vma == NULL.
+ * the source folio. So call migrate_vma_unmap() directly to unmap the
+ * folio as migrate_vma_setup() will fail if args.vma == NULL.
*/
migrate_device_unmap(&src_pfn, 1, NULL);
if (!(src_pfn & MIGRATE_PFN_MIGRATE))
return -EBUSY;
- dpage = alloc_page(GFP_USER | __GFP_NOWARN);
- if (dpage) {
- lock_page(dpage);
- dst_pfn = migrate_pfn(page_to_pfn(dpage));
+ dfolio = folio_alloc(GFP_USER | __GFP_NOWARN, 0);
+ if (dfolio) {
+ folio_lock(dfolio);
+ dst_pfn = migrate_pfn(folio_pfn(dfolio));
}
migrate_device_pages(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
- copy_highpage(dpage, page);
+ folio_copy(dfolio, folio);
migrate_device_finalize(&src_pfn, &dst_pfn, 1);
if (src_pfn & MIGRATE_PFN_MIGRATE)
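
A hedged caller sketch for the function above (the helper name is invented; only order-0 folios are handled, and the caller must already hold a reference as the comment requires):

static int example_force_folio_to_ram(struct folio *folio)
{
	if (WARN_ON_ONCE(folio_test_large(folio)))
		return -EINVAL;		/* only small folios are supported here */

	/* 0 on success; -EBUSY if the folio could not be unmapped/migrated. */
	return migrate_device_coherent_folio(folio);
}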
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 51960079875b..4ba5607aaf19 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -1835,14 +1835,8 @@ void __init free_area_init(unsigned long *max_zone_pfn)
for_each_node(nid) {
pg_data_t *pgdat;
- if (!node_online(nid)) {
- /* Allocator not initialized yet */
- pgdat = arch_alloc_nodedata(nid);
- if (!pgdat)
- panic("Cannot allocate %zuB for node %d.\n",
- sizeof(*pgdat), nid);
- arch_refresh_nodedata(nid, pgdat);
- }
+ if (!node_online(nid))
+ alloc_offline_node_data(nid);
pgdat = NODE_DATA(nid);
free_area_init_node(nid);
@@ -1939,7 +1933,7 @@ static void __init deferred_free_pages(unsigned long pfn,
}
/* Accept chunks smaller than MAX_PAGE_ORDER upfront */
- accept_memory(PFN_PHYS(pfn), PFN_PHYS(pfn + nr_pages));
+ accept_memory(PFN_PHYS(pfn), nr_pages * PAGE_SIZE);
for (i = 0; i < nr_pages; i++, page++, pfn++) {
if (pageblock_aligned(pfn))
diff --git a/mm/mmap.c b/mm/mmap.c
index 6ddb278a5ee8..ee8f91eaadb9 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -76,16 +76,6 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS;
static bool ignore_rlimit_data;
core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644);
-static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
- struct vm_area_struct *vma, struct vm_area_struct *prev,
- struct vm_area_struct *next, unsigned long start,
- unsigned long end, unsigned long tree_end, bool mm_wr_locked);
-
-static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
-{
- return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
-}
-
/* Update vma->vm_page_prot to reflect vma->vm_flags. */
void vma_set_page_prot(struct vm_area_struct *vma)
{
@@ -102,100 +92,6 @@ void vma_set_page_prot(struct vm_area_struct *vma)
}
/*
- * Requires inode->i_mapping->i_mmap_rwsem
- */
-static void __remove_shared_vm_struct(struct vm_area_struct *vma,
- struct address_space *mapping)
-{
- if (vma_is_shared_maywrite(vma))
- mapping_unmap_writable(mapping);
-
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_remove(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
-}
-
-/*
- * Unlink a file-based vm structure from its interval tree, to hide
- * vma from rmap and vmtruncate before freeing its page tables.
- */
-void unlink_file_vma(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
-
- if (file) {
- struct address_space *mapping = file->f_mapping;
- i_mmap_lock_write(mapping);
- __remove_shared_vm_struct(vma, mapping);
- i_mmap_unlock_write(mapping);
- }
-}
-
-void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
-{
- vb->count = 0;
-}
-
-static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
-{
- struct address_space *mapping;
- int i;
-
- mapping = vb->vmas[0]->vm_file->f_mapping;
- i_mmap_lock_write(mapping);
- for (i = 0; i < vb->count; i++) {
- VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
- __remove_shared_vm_struct(vb->vmas[i], mapping);
- }
- i_mmap_unlock_write(mapping);
-
- unlink_file_vma_batch_init(vb);
-}
-
-void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
- struct vm_area_struct *vma)
-{
- if (vma->vm_file == NULL)
- return;
-
- if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
- vb->count == ARRAY_SIZE(vb->vmas))
- unlink_file_vma_batch_process(vb);
-
- vb->vmas[vb->count] = vma;
- vb->count++;
-}
-
-void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
-{
- if (vb->count > 0)
- unlink_file_vma_batch_process(vb);
-}
-
-/*
- * Close a vm structure and free it.
- */
-static void remove_vma(struct vm_area_struct *vma, bool unreachable)
-{
- might_sleep();
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
- if (vma->vm_file)
- fput(vma->vm_file);
- mpol_put(vma_policy(vma));
- if (unreachable)
- __vm_area_free(vma);
- else
- vm_area_free(vma);
-}
-
-static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
- unsigned long min)
-{
- return mas_prev(&vmi->mas, min);
-}
-
-/*
* check_brk_limits() - Use platform specific check of range & verify mlock
* limits.
* @addr: The address to check
@@ -273,11 +169,12 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
goto out; /* mapping intersects with an existing non-brk vma. */
/*
* mm->brk must be protected by write mmap_lock.
- * do_vma_munmap() will drop the lock on success, so update it
- * before calling do_vma_munmap().
+ * do_vmi_align_munmap() will drop the lock on success, so
+	 * update it before calling do_vmi_align_munmap().
*/
mm->brk = brk;
- if (do_vma_munmap(&vmi, brkvma, newbrk, oldbrk, &uf, true))
+ if (do_vmi_align_munmap(&vmi, brkvma, mm, newbrk, oldbrk, &uf,
+ /* unlock = */ true))
goto out;
goto success_unlocked;
@@ -318,875 +215,6 @@ out:
return origbrk;
}
-#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
-static void validate_mm(struct mm_struct *mm)
-{
- int bug = 0;
- int i = 0;
- struct vm_area_struct *vma;
- VMA_ITERATOR(vmi, mm, 0);
-
- mt_validate(&mm->mm_mt);
- for_each_vma(vmi, vma) {
-#ifdef CONFIG_DEBUG_VM_RB
- struct anon_vma *anon_vma = vma->anon_vma;
- struct anon_vma_chain *avc;
-#endif
- unsigned long vmi_start, vmi_end;
- bool warn = 0;
-
- vmi_start = vma_iter_addr(&vmi);
- vmi_end = vma_iter_end(&vmi);
- if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
- warn = 1;
-
- if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
- warn = 1;
-
- if (warn) {
- pr_emerg("issue in %s\n", current->comm);
- dump_stack();
- dump_vma(vma);
- pr_emerg("tree range: %px start %lx end %lx\n", vma,
- vmi_start, vmi_end - 1);
- vma_iter_dump_tree(&vmi);
- }
-
-#ifdef CONFIG_DEBUG_VM_RB
- if (anon_vma) {
- anon_vma_lock_read(anon_vma);
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_verify(avc);
- anon_vma_unlock_read(anon_vma);
- }
-#endif
- i++;
- }
- if (i != mm->map_count) {
- pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
- bug = 1;
- }
- VM_BUG_ON_MM(bug, mm);
-}
-
-#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */
-#define validate_mm(mm) do { } while (0)
-#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
-
-/*
- * vma has some anon_vma assigned, and is already inserted on that
- * anon_vma's interval trees.
- *
- * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
- * vma must be removed from the anon_vma's interval trees using
- * anon_vma_interval_tree_pre_update_vma().
- *
- * After the update, the vma will be reinserted using
- * anon_vma_interval_tree_post_update_vma().
- *
- * The entire update must be protected by exclusive mmap_lock and by
- * the root anon_vma's mutex.
- */
-static inline void
-anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
-}
-
-static inline void
-anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
-{
- struct anon_vma_chain *avc;
-
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
-}
-
-static unsigned long count_vma_pages_range(struct mm_struct *mm,
- unsigned long addr, unsigned long end)
-{
- VMA_ITERATOR(vmi, mm, addr);
- struct vm_area_struct *vma;
- unsigned long nr_pages = 0;
-
- for_each_vma_range(vmi, vma, end) {
- unsigned long vm_start = max(addr, vma->vm_start);
- unsigned long vm_end = min(end, vma->vm_end);
-
- nr_pages += PHYS_PFN(vm_end - vm_start);
- }
-
- return nr_pages;
-}
-
-static void __vma_link_file(struct vm_area_struct *vma,
- struct address_space *mapping)
-{
- if (vma_is_shared_maywrite(vma))
- mapping_allow_writable(mapping);
-
- flush_dcache_mmap_lock(mapping);
- vma_interval_tree_insert(vma, &mapping->i_mmap);
- flush_dcache_mmap_unlock(mapping);
-}
-
-static void vma_link_file(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping;
-
- if (file) {
- mapping = file->f_mapping;
- i_mmap_lock_write(mapping);
- __vma_link_file(vma, mapping);
- i_mmap_unlock_write(mapping);
- }
-}
-
-static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
-{
- VMA_ITERATOR(vmi, mm, 0);
-
- vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
- if (vma_iter_prealloc(&vmi, vma))
- return -ENOMEM;
-
- vma_start_write(vma);
- vma_iter_store(&vmi, vma);
- vma_link_file(vma);
- mm->map_count++;
- validate_mm(mm);
- return 0;
-}
-
-/*
- * init_multi_vma_prep() - Initializer for struct vma_prepare
- * @vp: The vma_prepare struct
- * @vma: The vma that will be altered once locked
- * @next: The next vma if it is to be adjusted
- * @remove: The first vma to be removed
- * @remove2: The second vma to be removed
- */
-static inline void init_multi_vma_prep(struct vma_prepare *vp,
- struct vm_area_struct *vma, struct vm_area_struct *next,
- struct vm_area_struct *remove, struct vm_area_struct *remove2)
-{
- memset(vp, 0, sizeof(struct vma_prepare));
- vp->vma = vma;
- vp->anon_vma = vma->anon_vma;
- vp->remove = remove;
- vp->remove2 = remove2;
- vp->adj_next = next;
- if (!vp->anon_vma && next)
- vp->anon_vma = next->anon_vma;
-
- vp->file = vma->vm_file;
- if (vp->file)
- vp->mapping = vma->vm_file->f_mapping;
-
-}
-
-/*
- * init_vma_prep() - Initializer wrapper for vma_prepare struct
- * @vp: The vma_prepare struct
- * @vma: The vma that will be altered once locked
- */
-static inline void init_vma_prep(struct vma_prepare *vp,
- struct vm_area_struct *vma)
-{
- init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
-}
-
-
-/*
- * vma_prepare() - Helper function for handling locking VMAs prior to altering
- * @vp: The initialized vma_prepare struct
- */
-static inline void vma_prepare(struct vma_prepare *vp)
-{
- if (vp->file) {
- uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
-
- if (vp->adj_next)
- uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
- vp->adj_next->vm_end);
-
- i_mmap_lock_write(vp->mapping);
- if (vp->insert && vp->insert->vm_file) {
- /*
- * Put into interval tree now, so instantiated pages
- * are visible to arm/parisc __flush_dcache_page
- * throughout; but we cannot insert into address
- * space until vma start or end is updated.
- */
- __vma_link_file(vp->insert,
- vp->insert->vm_file->f_mapping);
- }
- }
-
- if (vp->anon_vma) {
- anon_vma_lock_write(vp->anon_vma);
- anon_vma_interval_tree_pre_update_vma(vp->vma);
- if (vp->adj_next)
- anon_vma_interval_tree_pre_update_vma(vp->adj_next);
- }
-
- if (vp->file) {
- flush_dcache_mmap_lock(vp->mapping);
- vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
- if (vp->adj_next)
- vma_interval_tree_remove(vp->adj_next,
- &vp->mapping->i_mmap);
- }
-
-}
-
-/*
- * vma_complete- Helper function for handling the unlocking after altering VMAs,
- * or for inserting a VMA.
- *
- * @vp: The vma_prepare struct
- * @vmi: The vma iterator
- * @mm: The mm_struct
- */
-static inline void vma_complete(struct vma_prepare *vp,
- struct vma_iterator *vmi, struct mm_struct *mm)
-{
- if (vp->file) {
- if (vp->adj_next)
- vma_interval_tree_insert(vp->adj_next,
- &vp->mapping->i_mmap);
- vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
- flush_dcache_mmap_unlock(vp->mapping);
- }
-
- if (vp->remove && vp->file) {
- __remove_shared_vm_struct(vp->remove, vp->mapping);
- if (vp->remove2)
- __remove_shared_vm_struct(vp->remove2, vp->mapping);
- } else if (vp->insert) {
- /*
- * split_vma has split insert from vma, and needs
- * us to insert it before dropping the locks
- * (it may either follow vma or precede it).
- */
- vma_iter_store(vmi, vp->insert);
- mm->map_count++;
- }
-
- if (vp->anon_vma) {
- anon_vma_interval_tree_post_update_vma(vp->vma);
- if (vp->adj_next)
- anon_vma_interval_tree_post_update_vma(vp->adj_next);
- anon_vma_unlock_write(vp->anon_vma);
- }
-
- if (vp->file) {
- i_mmap_unlock_write(vp->mapping);
- uprobe_mmap(vp->vma);
-
- if (vp->adj_next)
- uprobe_mmap(vp->adj_next);
- }
-
- if (vp->remove) {
-again:
- vma_mark_detached(vp->remove, true);
- if (vp->file) {
- uprobe_munmap(vp->remove, vp->remove->vm_start,
- vp->remove->vm_end);
- fput(vp->file);
- }
- if (vp->remove->anon_vma)
- anon_vma_merge(vp->vma, vp->remove);
- mm->map_count--;
- mpol_put(vma_policy(vp->remove));
- if (!vp->remove2)
- WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
- vm_area_free(vp->remove);
-
- /*
- * In mprotect's case 6 (see comments on vma_merge),
- * we are removing both mid and next vmas
- */
- if (vp->remove2) {
- vp->remove = vp->remove2;
- vp->remove2 = NULL;
- goto again;
- }
- }
- if (vp->insert && vp->file)
- uprobe_mmap(vp->insert);
- validate_mm(mm);
-}
-
-/*
- * dup_anon_vma() - Helper function to duplicate anon_vma
- * @dst: The destination VMA
- * @src: The source VMA
- * @dup: Pointer to the destination VMA when successful.
- *
- * Returns: 0 on success.
- */
-static inline int dup_anon_vma(struct vm_area_struct *dst,
- struct vm_area_struct *src, struct vm_area_struct **dup)
-{
- /*
- * Easily overlooked: when mprotect shifts the boundary, make sure the
- * expanding vma has anon_vma set if the shrinking vma had, to cover any
- * anon pages imported.
- */
- if (src->anon_vma && !dst->anon_vma) {
- int ret;
-
- vma_assert_write_locked(dst);
- dst->anon_vma = src->anon_vma;
- ret = anon_vma_clone(dst, src);
- if (ret)
- return ret;
-
- *dup = dst;
- }
-
- return 0;
-}
-
-/*
- * vma_expand - Expand an existing VMA
- *
- * @vmi: The vma iterator
- * @vma: The vma to expand
- * @start: The start of the vma
- * @end: The exclusive end of the vma
- * @pgoff: The page offset of vma
- * @next: The current of next vma.
- *
- * Expand @vma to @start and @end. Can expand off the start and end. Will
- * expand over @next if it's different from @vma and @end == @next->vm_end.
- * Checking if the @vma can expand and merge with @next needs to be handled by
- * the caller.
- *
- * Returns: 0 on success
- */
-int vma_expand(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff,
- struct vm_area_struct *next)
-{
- struct vm_area_struct *anon_dup = NULL;
- bool remove_next = false;
- struct vma_prepare vp;
-
- vma_start_write(vma);
- if (next && (vma != next) && (end == next->vm_end)) {
- int ret;
-
- remove_next = true;
- vma_start_write(next);
- ret = dup_anon_vma(vma, next, &anon_dup);
- if (ret)
- return ret;
- }
-
- init_multi_vma_prep(&vp, vma, NULL, remove_next ? next : NULL, NULL);
- /* Not merging but overwriting any part of next is not handled. */
- VM_WARN_ON(next && !vp.remove &&
- next != vma && end > next->vm_start);
- /* Only handles expanding */
- VM_WARN_ON(vma->vm_start < start || vma->vm_end > end);
-
- /* Note: vma iterator must be pointing to 'start' */
- vma_iter_config(vmi, start, end);
- if (vma_iter_prealloc(vmi, vma))
- goto nomem;
-
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
- vma_set_range(vma, start, end, pgoff);
- vma_iter_store(vmi, vma);
-
- vma_complete(&vp, vmi, vma->vm_mm);
- return 0;
-
-nomem:
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
- return -ENOMEM;
-}
-
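
A minimal usage sketch for vma_expand() as documented above (helper invented for illustration): grow @vma forward to @new_end, absorbing @next when the new end coincides with next->vm_end; verifying that the two VMAs are actually mergeable remains the caller's job.

static int example_grow_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
			    unsigned long new_end, struct vm_area_struct *next)
{
	/* The iterator must already point at vma->vm_start (see the note above). */
	return vma_expand(vmi, vma, vma->vm_start, new_end, vma->vm_pgoff, next);
}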
-/*
- * vma_shrink() - Reduce an existing VMAs memory area
- * @vmi: The vma iterator
- * @vma: The VMA to modify
- * @start: The new start
- * @end: The new end
- *
- * Returns: 0 on success, -ENOMEM otherwise
- */
-int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, pgoff_t pgoff)
-{
- struct vma_prepare vp;
-
- WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
-
- if (vma->vm_start < start)
- vma_iter_config(vmi, vma->vm_start, start);
- else
- vma_iter_config(vmi, end, vma->vm_end);
-
- if (vma_iter_prealloc(vmi, NULL))
- return -ENOMEM;
-
- vma_start_write(vma);
-
- init_vma_prep(&vp, vma);
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, start, end, 0);
-
- vma_iter_clear(vmi);
- vma_set_range(vma, start, end, pgoff);
- vma_complete(&vp, vmi, vma->vm_mm);
- return 0;
-}
-
-/*
- * If the vma has a ->close operation then the driver probably needs to release
- * per-vma resources, so we don't attempt to merge those if the caller indicates
- * the current vma may be removed as part of the merge.
- */
-static inline bool is_mergeable_vma(struct vm_area_struct *vma,
- struct file *file, unsigned long vm_flags,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name, bool may_remove_vma)
-{
- /*
- * VM_SOFTDIRTY should not prevent from VMA merging, if we
- * match the flags but dirty bit -- the caller should mark
- * merged VMA as dirty. If dirty bit won't be excluded from
- * comparison, we increase pressure on the memory system forcing
- * the kernel to generate new VMAs when old one could be
- * extended instead.
- */
- if ((vma->vm_flags ^ vm_flags) & ~VM_SOFTDIRTY)
- return false;
- if (vma->vm_file != file)
- return false;
- if (may_remove_vma && vma->vm_ops && vma->vm_ops->close)
- return false;
- if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
- return false;
- if (!anon_vma_name_eq(anon_vma_name(vma), anon_name))
- return false;
- return true;
-}
-
-static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
- struct anon_vma *anon_vma2, struct vm_area_struct *vma)
-{
- /*
- * The list_is_singular() test is to avoid merging VMA cloned from
- * parents. This can improve scalability caused by anon_vma lock.
- */
- if ((!anon_vma1 || !anon_vma2) && (!vma ||
- list_is_singular(&vma->anon_vma_chain)))
- return true;
- return anon_vma1 == anon_vma2;
-}
-
-/*
- * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
- * in front of (at a lower virtual address and file offset than) the vma.
- *
- * We cannot merge two vmas if they have differently assigned (non-NULL)
- * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
- *
- * We don't check here for the merged mmap wrapping around the end of pagecache
- * indices (16TB on ia32) because do_mmap() does not permit mmap's which
- * wrap, nor mmaps which cover the final page at index -1UL.
- *
- * We assume the vma may be removed as part of the merge.
- */
-static bool
-can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
-{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, true) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
- if (vma->vm_pgoff == vm_pgoff)
- return true;
- }
- return false;
-}
-
-/*
- * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
- * beyond (at a higher virtual address and file offset than) the vma.
- *
- * We cannot merge two vmas if they have differently assigned (non-NULL)
- * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
- *
- * We assume that vma is not removed as part of the merge.
- */
-static bool
-can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
- struct anon_vma *anon_vma, struct file *file,
- pgoff_t vm_pgoff, struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
-{
- if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name, false) &&
- is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
- pgoff_t vm_pglen;
- vm_pglen = vma_pages(vma);
- if (vma->vm_pgoff + vm_pglen == vm_pgoff)
- return true;
- }
- return false;
-}
-
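
A worked example of the file-offset arithmetic used by can_vma_merge_before()/can_vma_merge_after() above (numbers are hypothetical): a predecessor covering file pages [10, 14) only merges with a following mapping whose pgoff is 10 + 4 = 14, i.e. file offsets must be contiguous as well as virtual addresses.

static bool example_pgoff_is_contiguous(pgoff_t prev_pgoff, unsigned long prev_pages,
					pgoff_t next_pgoff)
{
	/* e.g. prev_pgoff = 10, prev_pages = 4  =>  only next_pgoff == 14 merges */
	return prev_pgoff + prev_pages == next_pgoff;
}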
-/*
- * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
- * figure out whether that can be merged with its predecessor or its
- * successor. Or both (it neatly fills a hole).
- *
- * In most cases - when called for mmap, brk or mremap - [addr,end) is
- * certain not to be mapped by the time vma_merge is called; but when
- * called for mprotect, it is certain to be already mapped (either at
- * an offset within prev, or at the start of next), and the flags of
- * this area are about to be changed to vm_flags - and the no-change
- * case has already been eliminated.
- *
- * The following mprotect cases have to be considered, where **** is
- * the area passed down from mprotect_fixup, never extending beyond one
- * vma, PPPP is the previous vma, CCCC is a concurrent vma that starts
- * at the same address as **** and is of the same or larger span, and
- * NNNN the next vma after ****:
- *
- * **** **** ****
- * PPPPPPNNNNNN PPPPPPNNNNNN PPPPPPCCCCCC
- * cannot merge might become might become
- * PPNNNNNNNNNN PPPPPPPPPPCC
- * mmap, brk or case 4 below case 5 below
- * mremap move:
- * **** ****
- * PPPP NNNN PPPPCCCCNNNN
- * might become might become
- * PPPPPPPPPPPP 1 or PPPPPPPPPPPP 6 or
- * PPPPPPPPNNNN 2 or PPPPPPPPNNNN 7 or
- * PPPPNNNNNNNN 3 PPPPNNNNNNNN 8
- *
- * It is important for case 8 that the vma CCCC overlapping the
- * region **** is never going to extended over NNNN. Instead NNNN must
- * be extended in region **** and CCCC must be removed. This way in
- * all cases where vma_merge succeeds, the moment vma_merge drops the
- * rmap_locks, the properties of the merged vma will be already
- * correct for the whole merged range. Some of those properties like
- * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
- * be correct for the whole merged range immediately after the
- * rmap_locks are released. Otherwise if NNNN would be removed and
- * CCCC would be extended over the NNNN range, remove_migration_ptes
- * or other rmap walkers (if working on addresses beyond the "end"
- * parameter) may establish ptes with the wrong permissions of CCCC
- * instead of the right permissions of NNNN.
- *
- * In the code below:
- * PPPP is represented by *prev
- * CCCC is represented by *curr or not represented at all (NULL)
- * NNNN is represented by *next or not represented at all (NULL)
- * **** is not represented - it will be merged and the vma containing the
- * area is returned, or the function will return NULL
- */
-static struct vm_area_struct
-*vma_merge(struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *src, unsigned long addr, unsigned long end,
- unsigned long vm_flags, pgoff_t pgoff, struct mempolicy *policy,
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
- struct anon_vma_name *anon_name)
-{
- struct mm_struct *mm = src->vm_mm;
- struct anon_vma *anon_vma = src->anon_vma;
- struct file *file = src->vm_file;
- struct vm_area_struct *curr, *next, *res;
- struct vm_area_struct *vma, *adjust, *remove, *remove2;
- struct vm_area_struct *anon_dup = NULL;
- struct vma_prepare vp;
- pgoff_t vma_pgoff;
- int err = 0;
- bool merge_prev = false;
- bool merge_next = false;
- bool vma_expanded = false;
- unsigned long vma_start = addr;
- unsigned long vma_end = end;
- pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
- long adj_start = 0;
-
- /*
- * We later require that vma->vm_flags == vm_flags,
- * so this tests vma->vm_flags & VM_SPECIAL, too.
- */
- if (vm_flags & VM_SPECIAL)
- return NULL;
-
- /* Does the input range span an existing VMA? (cases 5 - 8) */
- curr = find_vma_intersection(mm, prev ? prev->vm_end : 0, end);
-
- if (!curr || /* cases 1 - 4 */
- end == curr->vm_end) /* cases 6 - 8, adjacent VMA */
- next = vma_lookup(mm, end);
- else
- next = NULL; /* case 5 */
-
- if (prev) {
- vma_start = prev->vm_start;
- vma_pgoff = prev->vm_pgoff;
-
- /* Can we merge the predecessor? */
- if (addr == prev->vm_end && mpol_equal(vma_policy(prev), policy)
- && can_vma_merge_after(prev, vm_flags, anon_vma, file,
- pgoff, vm_userfaultfd_ctx, anon_name)) {
- merge_prev = true;
- vma_prev(vmi);
- }
- }
-
- /* Can we merge the successor? */
- if (next && mpol_equal(policy, vma_policy(next)) &&
- can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen,
- vm_userfaultfd_ctx, anon_name)) {
- merge_next = true;
- }
-
- /* Verify some invariant that must be enforced by the caller. */
- VM_WARN_ON(prev && addr <= prev->vm_start);
- VM_WARN_ON(curr && (addr != curr->vm_start || end > curr->vm_end));
- VM_WARN_ON(addr >= end);
-
- if (!merge_prev && !merge_next)
- return NULL; /* Not mergeable. */
-
- if (merge_prev)
- vma_start_write(prev);
-
- res = vma = prev;
- remove = remove2 = adjust = NULL;
-
- /* Can we merge both the predecessor and the successor? */
- if (merge_prev && merge_next &&
- is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) {
- vma_start_write(next);
- remove = next; /* case 1 */
- vma_end = next->vm_end;
- err = dup_anon_vma(prev, next, &anon_dup);
- if (curr) { /* case 6 */
- vma_start_write(curr);
- remove = curr;
- remove2 = next;
- /*
- * Note that the dup_anon_vma below cannot overwrite err
- * since the first caller would do nothing unless next
- * has an anon_vma.
- */
- if (!next->anon_vma)
- err = dup_anon_vma(prev, curr, &anon_dup);
- }
- } else if (merge_prev) { /* case 2 */
- if (curr) {
- vma_start_write(curr);
- if (end == curr->vm_end) { /* case 7 */
- /*
- * can_vma_merge_after() assumed we would not be
- * removing prev vma, so it skipped the check
- * for vm_ops->close, but we are removing curr
- */
- if (curr->vm_ops && curr->vm_ops->close)
- err = -EINVAL;
- remove = curr;
- } else { /* case 5 */
- adjust = curr;
- adj_start = (end - curr->vm_start);
- }
- if (!err)
- err = dup_anon_vma(prev, curr, &anon_dup);
- }
- } else { /* merge_next */
- vma_start_write(next);
- res = next;
- if (prev && addr < prev->vm_end) { /* case 4 */
- vma_start_write(prev);
- vma_end = addr;
- adjust = next;
- adj_start = -(prev->vm_end - addr);
- err = dup_anon_vma(next, prev, &anon_dup);
- } else {
- /*
- * Note that cases 3 and 8 are the ONLY ones where prev
- * is permitted to be (but is not necessarily) NULL.
- */
- vma = next; /* case 3 */
- vma_start = addr;
- vma_end = next->vm_end;
- vma_pgoff = next->vm_pgoff - pglen;
- if (curr) { /* case 8 */
- vma_pgoff = curr->vm_pgoff;
- vma_start_write(curr);
- remove = curr;
- err = dup_anon_vma(next, curr, &anon_dup);
- }
- }
- }
-
- /* Error in anon_vma clone. */
- if (err)
- goto anon_vma_fail;
-
- if (vma_start < vma->vm_start || vma_end > vma->vm_end)
- vma_expanded = true;
-
- if (vma_expanded) {
- vma_iter_config(vmi, vma_start, vma_end);
- } else {
- vma_iter_config(vmi, adjust->vm_start + adj_start,
- adjust->vm_end);
- }
-
- if (vma_iter_prealloc(vmi, vma))
- goto prealloc_fail;
-
- init_multi_vma_prep(&vp, vma, adjust, remove, remove2);
- VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
- vp.anon_vma != adjust->anon_vma);
-
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma_start, vma_end, adj_start);
- vma_set_range(vma, vma_start, vma_end, vma_pgoff);
-
- if (vma_expanded)
- vma_iter_store(vmi, vma);
-
- if (adj_start) {
- adjust->vm_start += adj_start;
- adjust->vm_pgoff += adj_start >> PAGE_SHIFT;
- if (adj_start < 0) {
- WARN_ON(vma_expanded);
- vma_iter_store(vmi, next);
- }
- }
-
- vma_complete(&vp, vmi, mm);
- khugepaged_enter_vma(res, vm_flags);
- return res;
-
-prealloc_fail:
- if (anon_dup)
- unlink_anon_vmas(anon_dup);
-
-anon_vma_fail:
- vma_iter_set(vmi, addr);
- vma_iter_load(vmi);
- return NULL;
-}
-
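
A compact restatement of the case analysis above, purely for illustration (the enum and helper are invented and do not reflect vma_merge()'s actual control flow):

enum example_merge_kind { EX_NO_MERGE, EX_MERGE_PREV, EX_MERGE_NEXT, EX_MERGE_BOTH };

static enum example_merge_kind example_classify_merge(bool can_merge_prev,
						      bool can_merge_next)
{
	if (can_merge_prev && can_merge_next)
		return EX_MERGE_BOTH;	/* cases 1 and 6: prev absorbs through next */
	if (can_merge_prev)
		return EX_MERGE_PREV;	/* cases 2, 5 and 7: prev grows forward */
	if (can_merge_next)
		return EX_MERGE_NEXT;	/* cases 3, 4 and 8: next grows backward */
	return EX_NO_MERGE;		/* no compatible neighbour */
}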
-/*
- * Rough compatibility check to quickly see if it's even worth looking
- * at sharing an anon_vma.
- *
- * They need to have the same vm_file, and the flags can only differ
- * in things that mprotect may change.
- *
- * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
- * we can merge the two vma's. For example, we refuse to merge a vma if
- * there is a vm_ops->close() function, because that indicates that the
- * driver is doing some kind of reference counting. But that doesn't
- * really matter for the anon_vma sharing case.
- */
-static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
-{
- return a->vm_end == b->vm_start &&
- mpol_equal(vma_policy(a), vma_policy(b)) &&
- a->vm_file == b->vm_file &&
- !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
- b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
-}
-
-/*
- * Do some basic sanity checking to see if we can re-use the anon_vma
- * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
- * the same as 'old', the other will be the new one that is trying
- * to share the anon_vma.
- *
- * NOTE! This runs with mmap_lock held for reading, so it is possible that
- * the anon_vma of 'old' is concurrently in the process of being set up
- * by another page fault trying to merge _that_. But that's ok: if it
- * is being set up, that automatically means that it will be a singleton
- * acceptable for merging, so we can do all of this optimistically. But
- * we do that READ_ONCE() to make sure that we never re-load the pointer.
- *
- * IOW: that the "list_is_singular()" test on the anon_vma_chain only
- * matters for the 'stable anon_vma' case (ie the thing we want to avoid
- * is to return an anon_vma that is "complex" due to having gone through
- * a fork).
- *
- * We also make sure that the two vma's are compatible (adjacent,
- * and with the same memory policies). That's all stable, even with just
- * a read lock on the mmap_lock.
- */
-static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b)
-{
- if (anon_vma_compatible(a, b)) {
- struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
-
- if (anon_vma && list_is_singular(&old->anon_vma_chain))
- return anon_vma;
- }
- return NULL;
-}
-
-/*
- * find_mergeable_anon_vma is used by anon_vma_prepare, to check
- * neighbouring vmas for a suitable anon_vma, before it goes off
- * to allocate a new anon_vma. It checks because a repetitive
- * sequence of mprotects and faults may otherwise lead to distinct
- * anon_vmas being allocated, preventing vma merge in subsequent
- * mprotect.
- */
-struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
-{
- struct anon_vma *anon_vma = NULL;
- struct vm_area_struct *prev, *next;
- VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
-
- /* Try next first. */
- next = vma_iter_load(&vmi);
- if (next) {
- anon_vma = reusable_anon_vma(next, vma, next);
- if (anon_vma)
- return anon_vma;
- }
-
- prev = vma_prev(&vmi);
- VM_BUG_ON_VMA(prev != vma, vma);
- prev = vma_prev(&vmi);
- /* Try prev next. */
- if (prev)
- anon_vma = reusable_anon_vma(prev, prev, vma);
-
- /*
- * We might reach here with anon_vma == NULL if we can't find
- * any reusable anon_vma.
- * There's no absolute need to look only at touching neighbours:
- * we could search further afield for "compatible" anon_vmas.
- * But it would probably just be a waste of time searching,
- * or lead to too many vmas hanging off the same anon_vma.
- * We're trying to allow mprotect remerging later on,
- * not trying to minimize memory used for anon_vmas.
- */
- return anon_vma;
-}
-
/*
* If a hint addr is less than mmap_min_addr change hint to be as
* low as possible but still greater than mmap_min_addr
@@ -1549,85 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg)
}
#endif /* __ARCH_WANT_SYS_OLD_MMAP */
-static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
-{
- return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
-}
-
-static bool vma_is_shared_writable(struct vm_area_struct *vma)
-{
- return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
- (VM_WRITE | VM_SHARED);
-}
-
-static bool vma_fs_can_writeback(struct vm_area_struct *vma)
-{
- /* No managed pages to writeback. */
- if (vma->vm_flags & VM_PFNMAP)
- return false;
-
- return vma->vm_file && vma->vm_file->f_mapping &&
- mapping_can_writeback(vma->vm_file->f_mapping);
-}
-
-/*
- * Does this VMA require the underlying folios to have their dirty state
- * tracked?
- */
-bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
-{
- /* Only shared, writable VMAs require dirty tracking. */
- if (!vma_is_shared_writable(vma))
- return false;
-
- /* Does the filesystem need to be notified? */
- if (vm_ops_needs_writenotify(vma->vm_ops))
- return true;
-
- /*
- * Even if the filesystem doesn't indicate a need for writenotify, if it
- * can writeback, dirty tracking is still required.
- */
- return vma_fs_can_writeback(vma);
-}
-
-/*
- * Some shared mappings will want the pages marked read-only
- * to track write events. If so, we'll downgrade vm_page_prot
- * to the private version (using protection_map[] without the
- * VM_SHARED bit).
- */
-bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
-{
- /* If it was private or non-writable, the write bit is already clear */
- if (!vma_is_shared_writable(vma))
- return false;
-
- /* The backer wishes to know when pages are first written to? */
- if (vm_ops_needs_writenotify(vma->vm_ops))
- return true;
-
- /* The open routine did something to the protections that pgprot_modify
- * won't preserve? */
- if (pgprot_val(vm_page_prot) !=
- pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
- return false;
-
- /*
- * Do we need to track softdirty? hugetlb does not support softdirty
- * tracking yet.
- */
- if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
- return true;
-
- /* Do we need write faults for uffd-wp tracking? */
- if (userfaultfd_wp(vma))
- return true;
-
- /* Can the mapping track the dirty pages? */
- return vma_fs_can_writeback(vma);
-}
-
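
For illustration, a sketch of how a writenotify decision is typically applied (this mirrors vma_set_page_prot() earlier in the file; the helper name is invented): drop the shared-write bit from the page protection so the first write faults and the filesystem can be notified.

static pgprot_t example_effective_page_prot(struct vm_area_struct *vma)
{
	unsigned long vm_flags = vma->vm_flags;
	pgprot_t prot = vm_get_page_prot(vm_flags);

	if (vma_wants_writenotify(vma, prot))
		prot = vm_get_page_prot(vm_flags & ~VM_SHARED);

	return prot;
}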
/*
* We account for memory if it's a private writeable mapping,
* not hugepages and VM_NORESERVE wasn't set.
@@ -1754,6 +703,18 @@ retry:
}
/*
+ * Determine if the allocation needs to ensure that there is no
+ * existing mapping within its guard gaps, for use as start_gap.
+ */
+static inline unsigned long stack_guard_placement(vm_flags_t vm_flags)
+{
+ if (vm_flags & VM_SHADOW_STACK)
+ return PAGE_SIZE;
+
+ return 0;
+}
+
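
A hedged sketch of how the helper above feeds the new start_gap field (the limits below are placeholders; generic_get_unmapped_area() further down shows the real call sites):

static unsigned long example_find_area(unsigned long len, vm_flags_t vm_flags)
{
	struct vm_unmapped_area_info info = {};

	info.length = len;
	info.low_limit = PAGE_SIZE;
	info.high_limit = TASK_SIZE;
	/* Shadow-stack mappings reserve an extra guard page in front. */
	info.start_gap = stack_guard_placement(vm_flags);

	return vm_unmapped_area(&info);
}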
+/*
* Search for an unmapped address range.
*
* We are looking for a range that:
@@ -1789,7 +750,7 @@ unsigned long vm_unmapped_area(struct vm_unmapped_area_info *info)
unsigned long
generic_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -1814,6 +775,7 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
info.length = len;
info.low_limit = mm->mmap_base;
info.high_limit = mmap_end;
+ info.start_gap = stack_guard_placement(vm_flags);
return vm_unmapped_area(&info);
}
@@ -1821,9 +783,10 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long
arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
- return generic_get_unmapped_area(filp, addr, len, pgoff, flags);
+ return generic_get_unmapped_area(filp, addr, len, pgoff, flags,
+ vm_flags);
}
#endif
@@ -1834,7 +797,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
unsigned long
generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
struct vm_area_struct *vma, *prev;
struct mm_struct *mm = current->mm;
@@ -1862,6 +825,7 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = arch_get_mmap_base(addr, mm->mmap_base);
+ info.start_gap = stack_guard_placement(vm_flags);
addr = vm_unmapped_area(&info);
/*
@@ -1885,26 +849,10 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
- unsigned long flags)
+ unsigned long flags, vm_flags_t vm_flags)
{
- return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
-}
-#endif
-
-#ifndef HAVE_ARCH_UNMAPPED_AREA_VMFLAGS
-unsigned long
-arch_get_unmapped_area_vmflags(struct file *filp, unsigned long addr, unsigned long len,
- unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags)
-{
- return arch_get_unmapped_area(filp, addr, len, pgoff, flags);
-}
-
-unsigned long
-arch_get_unmapped_area_topdown_vmflags(struct file *filp, unsigned long addr,
- unsigned long len, unsigned long pgoff,
- unsigned long flags, vm_flags_t vm_flags)
-{
- return arch_get_unmapped_area_topdown(filp, addr, len, pgoff, flags);
+ return generic_get_unmapped_area_topdown(filp, addr, len, pgoff, flags,
+ vm_flags);
}
#endif
@@ -1914,9 +862,9 @@ unsigned long mm_get_unmapped_area_vmflags(struct mm_struct *mm, struct file *fi
vm_flags_t vm_flags)
{
if (test_bit(MMF_TOPDOWN, &mm->flags))
- return arch_get_unmapped_area_topdown_vmflags(filp, addr, len, pgoff,
- flags, vm_flags);
- return arch_get_unmapped_area_vmflags(filp, addr, len, pgoff, flags, vm_flags);
+ return arch_get_unmapped_area_topdown(filp, addr, len, pgoff,
+ flags, vm_flags);
+ return arch_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags);
}
unsigned long
@@ -1978,8 +926,8 @@ mm_get_unmapped_area(struct mm_struct *mm, struct file *file,
unsigned long pgoff, unsigned long flags)
{
if (test_bit(MMF_TOPDOWN, &mm->flags))
- return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags);
- return arch_get_unmapped_area(file, addr, len, pgoff, flags);
+ return arch_get_unmapped_area_topdown(file, addr, len, pgoff, flags, 0);
+ return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0);
}
EXPORT_SYMBOL(mm_get_unmapped_area);
@@ -2393,443 +1341,6 @@ success:
return vma;
}
-/*
- * Ok - we have the memory areas we should free on a maple tree so release them,
- * and do the vma updates.
- *
- * Called with the mm semaphore held.
- */
-static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas)
-{
- unsigned long nr_accounted = 0;
- struct vm_area_struct *vma;
-
- /* Update high watermark before we lower total_vm */
- update_hiwater_vm(mm);
- mas_for_each(mas, vma, ULONG_MAX) {
- long nrpages = vma_pages(vma);
-
- if (vma->vm_flags & VM_ACCOUNT)
- nr_accounted += nrpages;
- vm_stat_account(mm, vma->vm_flags, -nrpages);
- remove_vma(vma, false);
- }
- vm_unacct_memory(nr_accounted);
-}
-
-/*
- * Get rid of page table information in the indicated region.
- *
- * Called with the mm semaphore held.
- */
-static void unmap_region(struct mm_struct *mm, struct ma_state *mas,
- struct vm_area_struct *vma, struct vm_area_struct *prev,
- struct vm_area_struct *next, unsigned long start,
- unsigned long end, unsigned long tree_end, bool mm_wr_locked)
-{
- struct mmu_gather tlb;
- unsigned long mt_start = mas->index;
-
- lru_add_drain();
- tlb_gather_mmu(&tlb, mm);
- update_hiwater_rss(mm);
- unmap_vmas(&tlb, mas, vma, start, end, tree_end, mm_wr_locked);
- mas_set(mas, mt_start);
- free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
- next ? next->vm_start : USER_PGTABLES_CEILING,
- mm_wr_locked);
- tlb_finish_mmu(&tlb);
-}
-
-/*
- * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
- * has already been checked or doesn't make sense to fail.
- * VMA Iterator will point to the end VMA.
- */
-static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
-{
- struct vma_prepare vp;
- struct vm_area_struct *new;
- int err;
-
- WARN_ON(vma->vm_start >= addr);
- WARN_ON(vma->vm_end <= addr);
-
- if (vma->vm_ops && vma->vm_ops->may_split) {
- err = vma->vm_ops->may_split(vma, addr);
- if (err)
- return err;
- }
-
- new = vm_area_dup(vma);
- if (!new)
- return -ENOMEM;
-
- if (new_below) {
- new->vm_end = addr;
- } else {
- new->vm_start = addr;
- new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
- }
-
- err = -ENOMEM;
- vma_iter_config(vmi, new->vm_start, new->vm_end);
- if (vma_iter_prealloc(vmi, new))
- goto out_free_vma;
-
- err = vma_dup_policy(vma, new);
- if (err)
- goto out_free_vmi;
-
- err = anon_vma_clone(new, vma);
- if (err)
- goto out_free_mpol;
-
- if (new->vm_file)
- get_file(new->vm_file);
-
- if (new->vm_ops && new->vm_ops->open)
- new->vm_ops->open(new);
-
- vma_start_write(vma);
- vma_start_write(new);
-
- init_vma_prep(&vp, vma);
- vp.insert = new;
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
-
- if (new_below) {
- vma->vm_start = addr;
- vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
- } else {
- vma->vm_end = addr;
- }
-
- /* vma_complete stores the new vma */
- vma_complete(&vp, vmi, vma->vm_mm);
-
- /* Success. */
- if (new_below)
- vma_next(vmi);
- return 0;
-
-out_free_mpol:
- mpol_put(vma_policy(new));
-out_free_vmi:
- vma_iter_free(vmi);
-out_free_vma:
- vm_area_free(new);
- return err;
-}
-
-/*
- * Split a vma into two pieces at address 'addr', a new vma is allocated
- * either for the first part or the tail.
- */
-static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long addr, int new_below)
-{
- if (vma->vm_mm->map_count >= sysctl_max_map_count)
- return -ENOMEM;
-
- return __split_vma(vmi, vma, addr, new_below);
-}
-
-/*
- * We are about to modify one or multiple of a VMA's flags, policy, userfaultfd
- * context and anonymous VMA name within the range [start, end).
- *
- * As a result, we might be able to merge the newly modified VMA range with an
- * adjacent VMA with identical properties.
- *
- * If no merge is possible and the range does not span the entirety of the VMA,
- * we then need to split the VMA to accommodate the change.
- *
- * The function returns either the merged VMA, the original VMA if a split was
- * required instead, or an error if the split failed.
- */
-struct vm_area_struct *vma_modify(struct vma_iterator *vmi,
- struct vm_area_struct *prev,
- struct vm_area_struct *vma,
- unsigned long start, unsigned long end,
- unsigned long vm_flags,
- struct mempolicy *policy,
- struct vm_userfaultfd_ctx uffd_ctx,
- struct anon_vma_name *anon_name)
-{
- pgoff_t pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
- struct vm_area_struct *merged;
-
- merged = vma_merge(vmi, prev, vma, start, end, vm_flags,
- pgoff, policy, uffd_ctx, anon_name);
- if (merged)
- return merged;
-
- if (vma->vm_start < start) {
- int err = split_vma(vmi, vma, start, 1);
-
- if (err)
- return ERR_PTR(err);
- }
-
- if (vma->vm_end > end) {
- int err = split_vma(vmi, vma, end, 0);
-
- if (err)
- return ERR_PTR(err);
- }
-
- return vma;
-}
-
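
An illustrative caller pattern for vma_modify() as described above (hypothetical helper; mprotect-style callers follow this shape): merge or split so that [start, end) is covered by exactly one VMA before its properties change.

static struct vm_area_struct *example_prepare_flag_change(struct vma_iterator *vmi,
		struct vm_area_struct *prev, struct vm_area_struct *vma,
		unsigned long start, unsigned long end, unsigned long new_flags)
{
	struct vm_area_struct *ret;

	ret = vma_modify(vmi, prev, vma, start, end, new_flags,
			 vma_policy(vma), vma->vm_userfaultfd_ctx,
			 anon_vma_name(vma));
	if (IS_ERR(ret))
		return ret;		/* a required split failed, e.g. -ENOMEM */

	/* On success, [start, end) now lies within exactly the returned VMA. */
	return ret;
}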
-/*
- * Attempt to merge a newly mapped VMA with those adjacent to it. The caller
- * must ensure that [start, end) does not overlap any existing VMA.
- */
-static struct vm_area_struct
-*vma_merge_new_vma(struct vma_iterator *vmi, struct vm_area_struct *prev,
- struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff)
-{
- return vma_merge(vmi, prev, vma, start, end, vma->vm_flags, pgoff,
- vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-}
-
-/*
- * Expand vma by delta bytes, potentially merging with an immediately adjacent
- * VMA with identical properties.
- */
-struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
- struct vm_area_struct *vma,
- unsigned long delta)
-{
- pgoff_t pgoff = vma->vm_pgoff + vma_pages(vma);
-
- /* vma is specified as prev, so case 1 or 2 will apply. */
- return vma_merge(vmi, vma, vma, vma->vm_end, vma->vm_end + delta,
- vma->vm_flags, pgoff, vma_policy(vma),
- vma->vm_userfaultfd_ctx, anon_vma_name(vma));
-}
-
-/*
- * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
- * @vmi: The vma iterator
- * @vma: The starting vm_area_struct
- * @mm: The mm_struct
- * @start: The aligned start address to munmap.
- * @end: The aligned end address to munmap.
- * @uf: The userfaultfd list_head
- * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
- * success.
- *
- * Return: 0 on success and drops the lock if so directed, error and leaves the
- * lock held otherwise.
- */
-static int
-do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
- struct mm_struct *mm, unsigned long start,
- unsigned long end, struct list_head *uf, bool unlock)
-{
- struct vm_area_struct *prev, *next = NULL;
- struct maple_tree mt_detach;
- int count = 0;
- int error = -ENOMEM;
- unsigned long locked_vm = 0;
- MA_STATE(mas_detach, &mt_detach, 0, 0);
- mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
- mt_on_stack(mt_detach);
-
- /*
- * If we need to split any vma, do it now to save pain later.
- *
- * Note: mremap's move_vma VM_ACCOUNT handling assumes a partially
- * unmapped vm_area_struct will remain in use: so lower split_vma
- * places tmp vma above, and higher split_vma places tmp vma below.
- */
-
- /* Does it split the first one? */
- if (start > vma->vm_start) {
-
- /*
- * Make sure that map_count on return from munmap() will
- * not exceed its limit; but let map_count go just above
- * its limit temporarily, to help free resources as expected.
- */
- if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count)
- goto map_count_exceeded;
-
- error = __split_vma(vmi, vma, start, 1);
- if (error)
- goto start_split_failed;
- }
-
- /*
- * Detach a range of VMAs from the mm. Using next as a temp variable as
- * it is always overwritten.
- */
- next = vma;
- do {
- /* Does it split the end? */
- if (next->vm_end > end) {
- error = __split_vma(vmi, next, end, 0);
- if (error)
- goto end_split_failed;
- }
- vma_start_write(next);
- mas_set(&mas_detach, count);
- error = mas_store_gfp(&mas_detach, next, GFP_KERNEL);
- if (error)
- goto munmap_gather_failed;
- vma_mark_detached(next, true);
- if (next->vm_flags & VM_LOCKED)
- locked_vm += vma_pages(next);
-
- count++;
- if (unlikely(uf)) {
- /*
- * If userfaultfd_unmap_prep returns an error the vmas
- * will remain split, but userland will get a
- * highly unexpected error anyway. This is no
- * different than the case where the first of the two
- * __split_vma fails, but we don't undo the first
- * split, despite we could. This is unlikely enough
- * failure that it's not worth optimizing it for.
- */
- error = userfaultfd_unmap_prep(next, start, end, uf);
-
- if (error)
- goto userfaultfd_error;
- }
-#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
- BUG_ON(next->vm_start < start);
- BUG_ON(next->vm_start > end);
-#endif
- } for_each_vma_range(*vmi, next, end);
-
-#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
- /* Make sure no VMAs are about to be lost. */
- {
- MA_STATE(test, &mt_detach, 0, 0);
- struct vm_area_struct *vma_mas, *vma_test;
- int test_count = 0;
-
- vma_iter_set(vmi, start);
- rcu_read_lock();
- vma_test = mas_find(&test, count - 1);
- for_each_vma_range(*vmi, vma_mas, end) {
- BUG_ON(vma_mas != vma_test);
- test_count++;
- vma_test = mas_next(&test, count - 1);
- }
- rcu_read_unlock();
- BUG_ON(count != test_count);
- }
-#endif
-
- while (vma_iter_addr(vmi) > start)
- vma_iter_prev_range(vmi);
-
- error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
- if (error)
- goto clear_tree_failed;
-
- /* Point of no return */
- mm->locked_vm -= locked_vm;
- mm->map_count -= count;
- if (unlock)
- mmap_write_downgrade(mm);
-
- prev = vma_iter_prev_range(vmi);
- next = vma_next(vmi);
- if (next)
- vma_iter_prev_range(vmi);
-
- /*
- * We can free page tables without write-locking mmap_lock because VMAs
- * were isolated before we downgraded mmap_lock.
- */
- mas_set(&mas_detach, 1);
- unmap_region(mm, &mas_detach, vma, prev, next, start, end, count,
- !unlock);
- /* Statistics and freeing VMAs */
- mas_set(&mas_detach, 0);
- remove_mt(mm, &mas_detach);
- validate_mm(mm);
- if (unlock)
- mmap_read_unlock(mm);
-
- __mt_destroy(&mt_detach);
- return 0;
-
-clear_tree_failed:
-userfaultfd_error:
-munmap_gather_failed:
-end_split_failed:
- mas_set(&mas_detach, 0);
- mas_for_each(&mas_detach, next, end)
- vma_mark_detached(next, false);
-
- __mt_destroy(&mt_detach);
-start_split_failed:
-map_count_exceeded:
- validate_mm(mm);
- return error;
-}
-
-/*
- * do_vmi_munmap() - munmap a given range.
- * @vmi: The vma iterator
- * @mm: The mm_struct
- * @start: The start address to munmap
- * @len: The length of the range to munmap
- * @uf: The userfaultfd list_head
- * @unlock: set to true if the user wants to drop the mmap_lock on success
- *
- * This function takes a @vmi that is either pointing to the previous VMA or set
- * to MA_START and sets it up to remove the mapping(s).  The @len will be
- * aligned and any arch_unmap work will be performed.
- *
- * Return: 0 on success and drops the lock if so directed, error and leaves the
- * lock held otherwise.
- */
-int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
- unsigned long start, size_t len, struct list_head *uf,
- bool unlock)
-{
- unsigned long end;
- struct vm_area_struct *vma;
-
- if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
- return -EINVAL;
-
- end = start + PAGE_ALIGN(len);
- if (end == start)
- return -EINVAL;
-
- /*
- * Check if memory is sealed before arch_unmap.
- * Prevent unmapping a sealed VMA.
- * can_modify_mm assumes we have acquired the lock on MM.
- */
- if (unlikely(!can_modify_mm(mm, start, end)))
- return -EPERM;
-
- /* arch_unmap() might do unmaps itself. */
- arch_unmap(mm, start, end);
-
- /* Find the first overlapping VMA */
- vma = vma_find(vmi, end);
- if (!vma) {
- if (unlock)
- mmap_write_unlock(mm);
- return 0;
- }
-
- return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
-}
-
/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls.
* @mm: The mm_struct
* @start: The start address to munmap
@@ -2852,100 +1363,67 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma = NULL;
- struct vm_area_struct *next, *prev, *merge;
- pgoff_t pglen = len >> PAGE_SHIFT;
+ pgoff_t pglen = PHYS_PFN(len);
+ struct vm_area_struct *merge;
unsigned long charged = 0;
+ struct vma_munmap_struct vms;
+ struct ma_state mas_detach;
+ struct maple_tree mt_detach;
unsigned long end = addr + len;
- unsigned long merge_start = addr, merge_end = end;
bool writable_file_mapping = false;
- pgoff_t vm_pgoff;
- int error;
+ int error = -ENOMEM;
VMA_ITERATOR(vmi, mm, addr);
+ VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff);
- /* Check against address space limit. */
- if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) {
- unsigned long nr_pages;
-
- /*
- * MAP_FIXED may remove pages of mappings that intersects with
- * requested mapping. Account for the pages it would unmap.
- */
- nr_pages = count_vma_pages_range(mm, addr, end);
+ vmg.file = file;
+ /* Find the first overlapping VMA */
+ vma = vma_find(&vmi, end);
+ init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false);
+ if (vma) {
+ mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_on_stack(mt_detach);
+ mas_init(&mas_detach, &mt_detach, /* addr = */ 0);
+ /* Prepare to unmap any existing mapping in the area */
+ error = vms_gather_munmap_vmas(&vms, &mas_detach);
+ if (error)
+ goto gather_failed;
- if (!may_expand_vm(mm, vm_flags,
- (len >> PAGE_SHIFT) - nr_pages))
- return -ENOMEM;
+ vmg.next = vms.next;
+ vmg.prev = vms.prev;
+ vma = NULL;
+ } else {
+ vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);
}
- /* Unmap any existing mapping in the area */
- error = do_vmi_munmap(&vmi, mm, addr, len, uf, false);
- if (error == -EPERM)
- return error;
- else if (error)
- return -ENOMEM;
+ /* Check against address space limit. */
+ if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages))
+ goto abort_munmap;
/*
* Private writable mapping: check memory availability
*/
if (accountable_mapping(file, vm_flags)) {
- charged = len >> PAGE_SHIFT;
- if (security_vm_enough_memory_mm(mm, charged))
- return -ENOMEM;
- vm_flags |= VM_ACCOUNT;
- }
-
- next = vma_next(&vmi);
- prev = vma_prev(&vmi);
- if (vm_flags & VM_SPECIAL) {
- if (prev)
- vma_iter_next_range(&vmi);
- goto cannot_expand;
- }
-
- /* Attempt to expand an old mapping */
- /* Check next */
- if (next && next->vm_start == end && !vma_policy(next) &&
- can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen,
- NULL_VM_UFFD_CTX, NULL)) {
- merge_end = next->vm_end;
- vma = next;
- vm_pgoff = next->vm_pgoff - pglen;
- }
+ charged = pglen;
+ charged -= vms.nr_accounted;
+ if (charged && security_vm_enough_memory_mm(mm, charged))
+ goto abort_munmap;
- /* Check prev */
- if (prev && prev->vm_end == addr && !vma_policy(prev) &&
- (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file,
- pgoff, vma->vm_userfaultfd_ctx, NULL) :
- can_vma_merge_after(prev, vm_flags, NULL, file, pgoff,
- NULL_VM_UFFD_CTX, NULL))) {
- merge_start = prev->vm_start;
- vma = prev;
- vm_pgoff = prev->vm_pgoff;
- } else if (prev) {
- vma_iter_next_range(&vmi);
+ vms.nr_accounted = 0;
+ vm_flags |= VM_ACCOUNT;
+ vmg.flags = vm_flags;
}
- /* Actually expand, if possible */
- if (vma &&
- !vma_expand(&vmi, vma, merge_start, merge_end, vm_pgoff, next)) {
- khugepaged_enter_vma(vma, vm_flags);
+ vma = vma_merge_new_range(&vmg);
+ if (vma)
goto expanded;
- }
-
- if (vma == prev)
- vma_iter_set(&vmi, addr);
-cannot_expand:
-
/*
* Determine the object being mapped and call the appropriate
	 * specific mapper. The address has already been validated but not
	 * unmapped, though the maps are removed from the list.
*/
vma = vm_area_alloc(mm);
- if (!vma) {
- error = -ENOMEM;
+ if (!vma)
goto unacct_error;
- }
vma_iter_config(&vmi, addr, end);
vma_set_range(vma, addr, end, pgoff);
@@ -2954,6 +1432,11 @@ cannot_expand:
if (file) {
vma->vm_file = get_file(file);
+ /*
+		 * call_mmap() may map PTEs, so ensure there are no existing PTEs
+ * and call the vm_ops close function if one exists.
+ */
+ vms_clean_up_area(&vms, &mas_detach);
error = call_mmap(file, vma);
if (error)
goto unmap_and_free_vma;
@@ -2979,10 +1462,11 @@ cannot_expand:
* If vm_flags changed after call_mmap(), we should try merge
* vma again as we may succeed this time.
*/
- if (unlikely(vm_flags != vma->vm_flags && prev)) {
- merge = vma_merge_new_vma(&vmi, prev, vma,
- vma->vm_start, vma->vm_end,
- vma->vm_pgoff);
+ if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) {
+ vmg.flags = vma->vm_flags;
+ /* If this fails, state is reset ready for a reattempt. */
+ merge = vma_merge_new_range(&vmg);
+
if (merge) {
/*
* ->mmap() can change vma->vm_file and fput
@@ -2998,6 +1482,7 @@ cannot_expand:
vm_flags = vma->vm_flags;
goto unmap_writable;
}
+ vma_iter_config(&vmi, addr, end);
}
vm_flags = vma->vm_flags;
@@ -3030,7 +1515,7 @@ cannot_expand:
vma_link_file(vma);
/*
- * vma_merge() calls khugepaged_enter_vma() either, the below
+ * vma_merge_new_range() calls khugepaged_enter_vma() too, the below
* call covers the non-merge case.
*/
khugepaged_enter_vma(vma, vma->vm_flags);
@@ -3044,14 +1529,17 @@ unmap_writable:
expanded:
perf_event_mmap(vma);
- vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
+ /* Unmap any existing mapping in the area */
+ vms_complete_munmap_vmas(&vms, &mas_detach);
+
+ vm_stat_account(mm, vm_flags, pglen);
if (vm_flags & VM_LOCKED) {
if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) ||
is_vm_hugetlb_page(vma) ||
vma == get_gate_vma(current->mm))
vm_flags_clear(vma, VM_LOCKED_MASK);
else
- mm->locked_vm += (len >> PAGE_SHIFT);
+ mm->locked_vm += pglen;
}
if (file)
@@ -3072,7 +1560,7 @@ expanded:
return addr;
close_and_free_vma:
- if (file && vma->vm_ops && vma->vm_ops->close)
+ if (file && !vms.closed_vm_ops && vma->vm_ops && vma->vm_ops->close)
vma->vm_ops->close(vma);
if (file || vma->vm_file) {
@@ -3082,8 +1570,7 @@ unmap_and_free_vma:
vma_iter_set(&vmi, vma->vm_end);
/* Undo any partial mapping done by a device driver. */
- unmap_region(mm, &vmi.mas, vma, prev, next, vma->vm_start,
- vma->vm_end, vma->vm_end, true);
+ unmap_region(&vmi.mas, vma, vmg.prev, vmg.next);
}
if (writable_file_mapping)
mapping_unmap_writable(file->f_mapping);
@@ -3092,6 +1579,10 @@ free_vma:
unacct_error:
if (charged)
vm_unacct_memory(charged);
+
+abort_munmap:
+ vms_abort_munmap_vmas(&vms, &mas_detach);
+gather_failed:
validate_mm(mm);
return error;
}
@@ -3211,39 +1702,6 @@ out:
}
/*
- * do_vma_munmap() - Unmap a full or partial vma.
- * @vmi: The vma iterator pointing at the vma
- * @vma: The first vma to be munmapped
- * @start: the start of the address to unmap
- * @end: The end of the address to unmap
- * @uf: The userfaultfd list_head
- * @unlock: Drop the lock on success
- *
- * unmaps a VMA mapping when the vma iterator is already in position.
- * Does not handle alignment.
- *
- * Return: 0 on success and drops the lock if so directed, error on failure and will
- * still hold the lock.
- */
-int do_vma_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
- unsigned long start, unsigned long end, struct list_head *uf,
- bool unlock)
-{
- struct mm_struct *mm = vma->vm_mm;
-
- /*
- * Check if memory is sealed before arch_unmap.
- * Prevent unmapping a sealed VMA.
- * can_modify_mm assumes we have acquired the lock on MM.
- */
- if (unlikely(!can_modify_mm(mm, start, end)))
- return -EPERM;
-
- arch_unmap(mm, start, end);
- return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
-}
-
-/*
* do_brk_flags() - Increase the brk vma if the flags match.
* @vmi: The vma iterator
* @addr: The start address
@@ -3259,7 +1717,6 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
unsigned long addr, unsigned long len, unsigned long flags)
{
struct mm_struct *mm = current->mm;
- struct vma_prepare vp;
/*
* Check against address space limits by the changed size
@@ -3279,25 +1736,16 @@ static int do_brk_flags(struct vma_iterator *vmi, struct vm_area_struct *vma,
* Expand the existing vma if possible; Note that singular lists do not
* occur after forking, so the expand will only happen on new VMAs.
*/
- if (vma && vma->vm_end == addr && !vma_policy(vma) &&
- can_vma_merge_after(vma, flags, NULL, NULL,
- addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) {
- vma_iter_config(vmi, vma->vm_start, addr + len);
- if (vma_iter_prealloc(vmi, vma))
- goto unacct_fail;
+ if (vma && vma->vm_end == addr) {
+ VMG_STATE(vmg, mm, vmi, addr, addr + len, flags, PHYS_PFN(addr));
- vma_start_write(vma);
-
- init_vma_prep(&vp, vma);
- vma_prepare(&vp);
- vma_adjust_trans_huge(vma, vma->vm_start, addr + len, 0);
- vma->vm_end = addr + len;
- vm_flags_set(vma, VM_SOFTDIRTY);
- vma_iter_store(vmi, vma);
+ vmg.prev = vma;
+ vma_iter_next_range(vmi);
- vma_complete(&vp, vmi, mm);
- khugepaged_enter_vma(vma, flags);
- goto out;
+ if (vma_merge_new_range(&vmg))
+ goto out;
+ else if (vmg_nomem(&vmg))
+ goto unacct_fail;
}
if (vma)
@@ -3433,7 +1881,7 @@ void exit_mmap(struct mm_struct *mm)
do {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
- remove_vma(vma, true);
+ remove_vma(vma, /* unreachable = */ true, /* closed = */ false);
count++;
cond_resched();
vma = vma_next(&vmi);
@@ -3491,92 +1939,6 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
}
/*
- * Copy the vma structure to a new location in the same mm,
- * prior to moving page table entries, to effect an mremap move.
- */
-struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
- unsigned long addr, unsigned long len, pgoff_t pgoff,
- bool *need_rmap_locks)
-{
- struct vm_area_struct *vma = *vmap;
- unsigned long vma_start = vma->vm_start;
- struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *new_vma, *prev;
- bool faulted_in_anon_vma = true;
- VMA_ITERATOR(vmi, mm, addr);
-
- /*
- * If anonymous vma has not yet been faulted, update new pgoff
- * to match new location, to increase its chance of merging.
- */
- if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
- pgoff = addr >> PAGE_SHIFT;
- faulted_in_anon_vma = false;
- }
-
- new_vma = find_vma_prev(mm, addr, &prev);
- if (new_vma && new_vma->vm_start < addr + len)
- return NULL; /* should never get here */
-
- new_vma = vma_merge_new_vma(&vmi, prev, vma, addr, addr + len, pgoff);
- if (new_vma) {
- /*
- * Source vma may have been merged into new_vma
- */
- if (unlikely(vma_start >= new_vma->vm_start &&
- vma_start < new_vma->vm_end)) {
- /*
- * The only way we can get a vma_merge with
- * self during an mremap is if the vma hasn't
- * been faulted in yet and we were allowed to
- * reset the dst vma->vm_pgoff to the
- * destination address of the mremap to allow
- * the merge to happen. mremap must change the
- * vm_pgoff linearity between src and dst vmas
- * (in turn preventing a vma_merge) to be
- * safe. It is only safe to keep the vm_pgoff
- * linear if there are no pages mapped yet.
- */
- VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
- *vmap = vma = new_vma;
- }
- *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
- } else {
- new_vma = vm_area_dup(vma);
- if (!new_vma)
- goto out;
- vma_set_range(new_vma, addr, addr + len, pgoff);
- if (vma_dup_policy(vma, new_vma))
- goto out_free_vma;
- if (anon_vma_clone(new_vma, vma))
- goto out_free_mempol;
- if (new_vma->vm_file)
- get_file(new_vma->vm_file);
- if (new_vma->vm_ops && new_vma->vm_ops->open)
- new_vma->vm_ops->open(new_vma);
- if (vma_link(mm, new_vma))
- goto out_vma_link;
- *need_rmap_locks = false;
- }
- return new_vma;
-
-out_vma_link:
- if (new_vma->vm_ops && new_vma->vm_ops->close)
- new_vma->vm_ops->close(new_vma);
-
- if (new_vma->vm_file)
- fput(new_vma->vm_file);
-
- unlink_anon_vmas(new_vma);
-out_free_mempol:
- mpol_put(vma_policy(new_vma));
-out_free_vma:
- vm_area_free(new_vma);
-out:
- return NULL;
-}
-
-/*
* Return true if the calling process may expand its vm space by the passed
* number of pages
*/
@@ -3620,10 +1982,16 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages)
static vm_fault_t special_mapping_fault(struct vm_fault *vmf);
/*
+ * Close hook, called for unmap() and on the old vma for mremap().
+ *
* Having a close hook prevents vma merging regardless of flags.
*/
static void special_mapping_close(struct vm_area_struct *vma)
{
+ const struct vm_special_mapping *sm = vma->vm_private_data;
+
+ if (sm->close)
+ sm->close(sm, vma);
}
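
A hedged sketch of a possible consumer of the new close hook follows; the names
below are invented for illustration and are not part of this patch:

/* Sketch: an arch special mapping that clears per-mm state on unmap. */
static void example_vdso_close(const struct vm_special_mapping *sm,
			       struct vm_area_struct *vma)
{
	vma->vm_mm->context.vdso = NULL;	/* illustrative field */
}

static struct vm_special_mapping example_vdso_mapping = {
	.name  = "[vdso]",
	.pages = example_vdso_pages,		/* assumed defined elsewhere */
	.close = example_vdso_close,
};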
static const char *special_mapping_name(struct vm_area_struct *vma)
@@ -3665,27 +2033,17 @@ static const struct vm_operations_struct special_mapping_vmops = {
.may_split = special_mapping_split,
};
-static const struct vm_operations_struct legacy_special_mapping_vmops = {
- .close = special_mapping_close,
- .fault = special_mapping_fault,
-};
-
static vm_fault_t special_mapping_fault(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
pgoff_t pgoff;
struct page **pages;
+ struct vm_special_mapping *sm = vma->vm_private_data;
- if (vma->vm_ops == &legacy_special_mapping_vmops) {
- pages = vma->vm_private_data;
- } else {
- struct vm_special_mapping *sm = vma->vm_private_data;
-
- if (sm->fault)
- return sm->fault(sm, vmf->vma, vmf);
+ if (sm->fault)
+ return sm->fault(sm, vmf->vma, vmf);
- pages = sm->pages;
- }
+ pages = sm->pages;
for (pgoff = vmf->pgoff; pgoff && *pages; ++pages)
pgoff--;
@@ -3740,8 +2098,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
const struct vm_special_mapping *sm)
{
return vma->vm_private_data == sm &&
- (vma->vm_ops == &special_mapping_vmops ||
- vma->vm_ops == &legacy_special_mapping_vmops);
+ vma->vm_ops == &special_mapping_vmops;
}
/*
@@ -3762,214 +2119,6 @@ struct vm_area_struct *_install_special_mapping(
&special_mapping_vmops);
}
-int install_special_mapping(struct mm_struct *mm,
- unsigned long addr, unsigned long len,
- unsigned long vm_flags, struct page **pages)
-{
- struct vm_area_struct *vma = __install_special_mapping(
- mm, addr, len, vm_flags, (void *)pages,
- &legacy_special_mapping_vmops);
-
- return PTR_ERR_OR_ZERO(vma);
-}
-
-static DEFINE_MUTEX(mm_all_locks_mutex);
-
-static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
-{
- if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
- /*
- * The LSB of head.next can't change from under us
- * because we hold the mm_all_locks_mutex.
- */
- down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
- /*
- * We can safely modify head.next after taking the
- * anon_vma->root->rwsem. If some other vma in this mm shares
- * the same anon_vma we won't take it again.
- *
- * No need of atomic instructions here, head.next
- * can't change from under us thanks to the
- * anon_vma->root->rwsem.
- */
- if (__test_and_set_bit(0, (unsigned long *)
- &anon_vma->root->rb_root.rb_root.rb_node))
- BUG();
- }
-}
-
-static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
-{
- if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
- /*
- * AS_MM_ALL_LOCKS can't change from under us because
- * we hold the mm_all_locks_mutex.
- *
- * Operations on ->flags have to be atomic because
- * even if AS_MM_ALL_LOCKS is stable thanks to the
- * mm_all_locks_mutex, there may be other cpus
- * changing other bitflags in parallel to us.
- */
- if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
- BUG();
- down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
- }
-}
-
-/*
- * This operation locks against the VM for all pte/vma/mm related
- * operations that could ever happen on a certain mm. This includes
- * vmtruncate, try_to_unmap, and all page faults.
- *
- * The caller must take the mmap_lock in write mode before calling
- * mm_take_all_locks(). The caller isn't allowed to release the
- * mmap_lock until mm_drop_all_locks() returns.
- *
- * mmap_lock in write mode is required in order to block all operations
- * that could modify pagetables and free pages without need of
- * altering the vma layout. It's also needed in write mode to avoid new
- * anon_vmas to be associated with existing vmas.
- *
- * A single task can't take more than one mm_take_all_locks() in a row
- * or it would deadlock.
- *
- * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
- * mapping->flags avoid taking the same lock twice if more than one
- * vma in this mm is backed by the same anon_vma or address_space.
- *
- * We take locks in the following order, according to the comment at the beginning
- * of mm/rmap.c:
- * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
- * hugetlb mapping);
- * - all vmas marked locked
- * - all i_mmap_rwsem locks;
- * - all anon_vma->rwsem locks
- *
- * We can take all locks within these types randomly because the VM code
- * doesn't nest them and we're protected from parallel mm_take_all_locks() by
- * mm_all_locks_mutex.
- *
- * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
- * that may have to take thousands of locks.
- *
- * mm_take_all_locks() can fail if it's interrupted by signals.
- */
-int mm_take_all_locks(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- struct anon_vma_chain *avc;
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_assert_write_locked(mm);
-
- mutex_lock(&mm_all_locks_mutex);
-
- /*
- * vma_start_write() does not have a complement in mm_drop_all_locks()
- * because vma_start_write() is always asymmetrical; it marks a VMA as
- * being written to until mmap_write_unlock() or mmap_write_downgrade()
- * is reached.
- */
- for_each_vma(vmi, vma) {
- if (signal_pending(current))
- goto out_unlock;
- vma_start_write(vma);
- }
-
- vma_iter_init(&vmi, mm, 0);
- for_each_vma(vmi, vma) {
- if (signal_pending(current))
- goto out_unlock;
- if (vma->vm_file && vma->vm_file->f_mapping &&
- is_vm_hugetlb_page(vma))
- vm_lock_mapping(mm, vma->vm_file->f_mapping);
- }
-
- vma_iter_init(&vmi, mm, 0);
- for_each_vma(vmi, vma) {
- if (signal_pending(current))
- goto out_unlock;
- if (vma->vm_file && vma->vm_file->f_mapping &&
- !is_vm_hugetlb_page(vma))
- vm_lock_mapping(mm, vma->vm_file->f_mapping);
- }
-
- vma_iter_init(&vmi, mm, 0);
- for_each_vma(vmi, vma) {
- if (signal_pending(current))
- goto out_unlock;
- if (vma->anon_vma)
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- vm_lock_anon_vma(mm, avc->anon_vma);
- }
-
- return 0;
-
-out_unlock:
- mm_drop_all_locks(mm);
- return -EINTR;
-}
-
-static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
-{
- if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
- /*
- * The LSB of head.next can't change to 0 from under
- * us because we hold the mm_all_locks_mutex.
- *
- * We must however clear the bitflag before unlocking
- * the vma so the users using the anon_vma->rb_root will
- * never see our bitflag.
- *
- * No need of atomic instructions here, head.next
- * can't change from under us until we release the
- * anon_vma->root->rwsem.
- */
- if (!__test_and_clear_bit(0, (unsigned long *)
- &anon_vma->root->rb_root.rb_root.rb_node))
- BUG();
- anon_vma_unlock_write(anon_vma);
- }
-}
-
-static void vm_unlock_mapping(struct address_space *mapping)
-{
- if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
- /*
- * AS_MM_ALL_LOCKS can't change to 0 from under us
- * because we hold the mm_all_locks_mutex.
- */
- i_mmap_unlock_write(mapping);
- if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
- &mapping->flags))
- BUG();
- }
-}
-
-/*
- * The mmap_lock cannot be released by the caller until
- * mm_drop_all_locks() returns.
- */
-void mm_drop_all_locks(struct mm_struct *mm)
-{
- struct vm_area_struct *vma;
- struct anon_vma_chain *avc;
- VMA_ITERATOR(vmi, mm, 0);
-
- mmap_assert_write_locked(mm);
- BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
-
- for_each_vma(vmi, vma) {
- if (vma->anon_vma)
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- vm_unlock_anon_vma(avc->anon_vma);
- if (vma->vm_file && vma->vm_file->f_mapping)
- vm_unlock_mapping(vma->vm_file->f_mapping);
- }
-
- mutex_unlock(&mm_all_locks_mutex);
-}
-
/*
* initialise the percpu counter for VM
*/
@@ -4088,3 +2237,86 @@ static int __meminit init_reserve_notifier(void)
return 0;
}
subsys_initcall(init_reserve_notifier);
+
+/*
+ * Relocate a VMA downwards by shift bytes. There cannot be any VMAs between
+ * this VMA and its relocated range, which will now reside at [vma->vm_start -
+ * shift, vma->vm_end - shift).
+ *
+ * This function is almost certainly NOT what you want for anything other than
+ * early executable temporary stack relocation.
+ */
+int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift)
+{
+ /*
+ * The process proceeds as follows:
+ *
+ * 1) Use shift to calculate the new vma endpoints.
+ * 2) Extend vma to cover both the old and new ranges. This ensures the
+ * arguments passed to subsequent functions are consistent.
+ * 3) Move vma's page tables to the new range.
+ * 4) Free up any cleared pgd range.
+ * 5) Shrink the vma to cover only the new range.
+ */
+
+ struct mm_struct *mm = vma->vm_mm;
+ unsigned long old_start = vma->vm_start;
+ unsigned long old_end = vma->vm_end;
+ unsigned long length = old_end - old_start;
+ unsigned long new_start = old_start - shift;
+ unsigned long new_end = old_end - shift;
+ VMA_ITERATOR(vmi, mm, new_start);
+ VMG_STATE(vmg, mm, &vmi, new_start, old_end, 0, vma->vm_pgoff);
+ struct vm_area_struct *next;
+ struct mmu_gather tlb;
+
+ BUG_ON(new_start > new_end);
+
+ /*
+ * ensure there are no vmas between where we want to go
+ * and where we are
+ */
+ if (vma != vma_next(&vmi))
+ return -EFAULT;
+
+ vma_iter_prev_range(&vmi);
+ /*
+ * cover the whole range: [new_start, old_end)
+ */
+ vmg.vma = vma;
+ if (vma_expand(&vmg))
+ return -ENOMEM;
+
+ /*
+	 * move the page tables downwards; on failure we rely on
+ * process cleanup to remove whatever mess we made.
+ */
+ if (length != move_page_tables(vma, old_start,
+ vma, new_start, length, false, true))
+ return -ENOMEM;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm);
+ next = vma_next(&vmi);
+ if (new_end > old_start) {
+ /*
+		 * when the old and new regions overlap, clear from new_end.
+ */
+ free_pgd_range(&tlb, new_end, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ } else {
+ /*
+		 * otherwise, clean from old_start; this is done to avoid touching
+		 * the address space in [new_end, old_start): some architectures
+		 * have constraints on va-space that make this illegal (IA64);
+		 * for the others it's just a little faster.
+ */
+ free_pgd_range(&tlb, old_start, old_end, new_end,
+ next ? next->vm_start : USER_PGTABLES_CEILING);
+ }
+ tlb_finish_mmu(&tlb);
+
+ vma_prev(&vmi);
+ /* Shrink the vma to just the new range */
+ return vma_shrink(&vmi, vma, new_start, new_end, vma->vm_pgoff);
+}
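
The endpoint arithmetic in steps 1 and 5 is easy to check in isolation; a
minimal userspace sketch with made-up, 32-bit-friendly addresses (not kernel
code):

#include <assert.h>
#include <stdio.h>

int main(void)
{
	unsigned long old_start = 0xbffde000UL;
	unsigned long old_end   = 0xbffff000UL;
	unsigned long shift     = 0x2000UL;	/* relocate down by 8 KiB */
	unsigned long length    = old_end - old_start;
	unsigned long new_start = old_start - shift;
	unsigned long new_end   = old_end - shift;

	assert(new_start <= new_end);
	/* First expand to [new_start, old_end), move the page tables,
	 * then shrink to [new_start, new_end). */
	printf("expand to [%#lx, %#lx), shrink to [%#lx, %#lx), length %#lx\n",
	       new_start, old_end, new_start, new_end, length);
	return 0;
}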
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 8982e6139d07..fc18fe274505 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -19,6 +19,8 @@
#include <linux/sched/mm.h>
#include <linux/slab.h>
+#include "vma.h"
+
/* global SRCU for all MMs */
DEFINE_STATIC_SRCU(srcu);
diff --git a/mm/mmzone.c b/mm/mmzone.c
index c01896eca736..f9baa8882fbf 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -66,7 +66,7 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
z++;
else
while (zonelist_zone_idx(z) > highest_zoneidx ||
- (z->zone && !zref_in_nodemask(z, nodes)))
+ (zonelist_zone(z) && !zref_in_nodemask(z, nodes)))
z++;
return z;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 222ab434da54..0c5d6d06107d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -161,8 +161,7 @@ static long change_pte_range(struct mmu_gather *tlb,
if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_NORMAL) &&
toptier)
continue;
- if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING &&
- !toptier)
+ if (folio_use_access_time(folio))
folio_xchg_access_time(folio,
jiffies_to_msecs(jiffies));
}
@@ -303,8 +302,9 @@ pgtable_split_needed(struct vm_area_struct *vma, unsigned long cp_flags)
{
/*
	 * pte markers only reside at the pte level; if we need pte markers,
- * we need to split. We cannot wr-protect shmem thp because file
- * thp is handled differently when split by erasing the pmd so far.
+ * we need to split. For example, we cannot wr-protect a file thp
+ * (e.g. 2M shmem) because file thp is handled differently when
+ * split by erasing the pmd so far.
*/
return (cp_flags & MM_CP_UFFD_WP) && !vma_is_anonymous(vma);
}
@@ -364,9 +364,6 @@ static inline long change_pmd_range(struct mmu_gather *tlb,
unsigned long next;
long pages = 0;
unsigned long nr_huge_updates = 0;
- struct mmu_notifier_range range;
-
- range.start = 0;
pmd = pmd_offset(pud, addr);
do {
@@ -384,14 +381,6 @@ again:
if (pmd_none(*pmd))
goto next;
- /* invoke the mmu notifier if the pmd is populated */
- if (!range.start) {
- mmu_notifier_range_init(&range,
- MMU_NOTIFY_PROTECTION_VMA, 0,
- vma->vm_mm, addr, end);
- mmu_notifier_invalidate_range_start(&range);
- }
-
_pmd = pmdp_get_lockless(pmd);
if (is_swap_pmd(_pmd) || pmd_trans_huge(_pmd) || pmd_devmap(_pmd)) {
if ((next - addr != HPAGE_PMD_SIZE) ||
@@ -432,9 +421,6 @@ next:
cond_resched();
} while (pmd++, addr = next, addr != end);
- if (range.start)
- mmu_notifier_invalidate_range_end(&range);
-
if (nr_huge_updates)
count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
@@ -444,21 +430,57 @@ static inline long change_pud_range(struct mmu_gather *tlb,
struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr,
unsigned long end, pgprot_t newprot, unsigned long cp_flags)
{
- pud_t *pud;
+ struct mmu_notifier_range range;
+ pud_t *pudp, pud;
unsigned long next;
long pages = 0, ret;
- pud = pud_offset(p4d, addr);
+ range.start = 0;
+
+ pudp = pud_offset(p4d, addr);
do {
+again:
next = pud_addr_end(addr, end);
- ret = change_prepare(vma, pud, pmd, addr, cp_flags);
- if (ret)
- return ret;
- if (pud_none_or_clear_bad(pud))
+ ret = change_prepare(vma, pudp, pmd, addr, cp_flags);
+ if (ret) {
+ pages = ret;
+ break;
+ }
+
+ pud = READ_ONCE(*pudp);
+ if (pud_none(pud))
continue;
- pages += change_pmd_range(tlb, vma, pud, addr, next, newprot,
+
+ if (!range.start) {
+ mmu_notifier_range_init(&range,
+ MMU_NOTIFY_PROTECTION_VMA, 0,
+ vma->vm_mm, addr, end);
+ mmu_notifier_invalidate_range_start(&range);
+ }
+
+ if (pud_leaf(pud)) {
+ if ((next - addr != PUD_SIZE) ||
+ pgtable_split_needed(vma, cp_flags)) {
+ __split_huge_pud(vma, pudp, addr);
+ goto again;
+ } else {
+ ret = change_huge_pud(tlb, vma, pudp,
+ addr, newprot, cp_flags);
+ if (ret == 0)
+ goto again;
+ /* huge pud was handled */
+ if (ret == HPAGE_PUD_NR)
+ pages += HPAGE_PUD_NR;
+ continue;
+ }
+ }
+
+ pages += change_pmd_range(tlb, vma, pudp, addr, next, newprot,
cp_flags);
- } while (pud++, addr = next, addr != end);
+ } while (pudp++, addr = next, addr != end);
+
+ if (range.start)
+ mmu_notifier_invalidate_range_end(&range);
return pages;
}
@@ -589,6 +611,9 @@ mprotect_fixup(struct vma_iterator *vmi, struct mmu_gather *tlb,
unsigned long charged = 0;
int error;
+ if (!can_modify_vma(vma))
+ return -EPERM;
+
if (newflags == oldflags) {
*pprev = vma;
return 0;
@@ -747,15 +772,6 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
}
}
- /*
- * checking if memory is sealed.
- * can_modify_mm assumes we have acquired the lock on MM.
- */
- if (unlikely(!can_modify_mm(current->mm, start, end))) {
- error = -EPERM;
- goto out;
- }
-
prev = vma_prev(&vmi);
if (start > vma->vm_start)
prev = vma;
diff --git a/mm/mremap.c b/mm/mremap.c
index e7ae140fc640..24712f8dbb6b 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -902,19 +902,6 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
if ((mm->map_count + 2) >= sysctl_max_map_count - 3)
return -ENOMEM;
- /*
- * In mremap_to().
- * Move a VMA to another location, check if src addr is sealed.
- *
- * Place can_modify_mm here because mremap_to()
- * does its own checking for address range, and we only
- * check the sealing after passing those checks.
- *
- * can_modify_mm assumes we have acquired the lock on MM.
- */
- if (unlikely(!can_modify_mm(mm, addr, addr + old_len)))
- return -EPERM;
-
if (flags & MREMAP_FIXED) {
/*
* In mremap_to().
@@ -1052,6 +1039,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
goto out;
}
+ /* Don't allow remapping vmas when they have already been sealed */
+ if (!can_modify_vma(vma)) {
+ ret = -EPERM;
+ goto out;
+ }
+
if (is_vm_hugetlb_page(vma)) {
struct hstate *h __maybe_unused = hstate_vma(vma);
@@ -1080,19 +1073,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
}
/*
- * Below is shrink/expand case (not mremap_to())
- * Check if src address is sealed, if so, reject.
- * In other words, prevent shrinking or expanding a sealed VMA.
- *
- * Place can_modify_mm here so we can keep the logic related to
- * shrink/expand together.
- */
- if (unlikely(!can_modify_mm(mm, addr, addr + old_len))) {
- ret = -EPERM;
- goto out;
- }
-
- /*
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
* do_vmi_munmap does all the needed commit accounting, and
diff --git a/mm/mseal.c b/mm/mseal.c
index c8787cc6ba55..ece977bd21e1 100644
--- a/mm/mseal.c
+++ b/mm/mseal.c
@@ -16,28 +16,11 @@
#include <linux/sched.h>
#include "internal.h"
-static inline bool vma_is_sealed(struct vm_area_struct *vma)
-{
- return (vma->vm_flags & VM_SEALED);
-}
-
static inline void set_vma_sealed(struct vm_area_struct *vma)
{
vm_flags_set(vma, VM_SEALED);
}
-/*
- * check if a vma is sealed for modification.
- * return true, if modification is allowed.
- */
-static bool can_modify_vma(struct vm_area_struct *vma)
-{
- if (unlikely(vma_is_sealed(vma)))
- return false;
-
- return true;
-}
-
static bool is_madv_discard(int behavior)
{
switch (behavior) {
@@ -71,45 +54,15 @@ static bool is_ro_anon(struct vm_area_struct *vma)
}
/*
- * Check if the vmas of a memory range are allowed to be modified.
- * the memory range can have a gap (unallocated memory).
- * return true if it is allowed.
- */
-bool can_modify_mm(struct mm_struct *mm, unsigned long start, unsigned long end)
-{
- struct vm_area_struct *vma;
-
- VMA_ITERATOR(vmi, mm, start);
-
- /* going through each vma to check. */
- for_each_vma_range(vmi, vma, end) {
- if (unlikely(!can_modify_vma(vma)))
- return false;
- }
-
- /* Allow by default. */
- return true;
-}
-
-/*
- * Check if the vmas of a memory range are allowed to be modified by madvise.
- * the memory range can have a gap (unallocated memory).
- * return true if it is allowed.
+ * Check if a vma is allowed to be modified by madvise.
*/
-bool can_modify_mm_madv(struct mm_struct *mm, unsigned long start, unsigned long end,
- int behavior)
+bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
{
- struct vm_area_struct *vma;
-
- VMA_ITERATOR(vmi, mm, start);
-
if (!is_madv_discard(behavior))
return true;
- /* going through each vma to check. */
- for_each_vma_range(vmi, vma, end)
- if (unlikely(is_ro_anon(vma) && !can_modify_vma(vma)))
- return false;
+ if (unlikely(!can_modify_vma(vma) && is_ro_anon(vma)))
+ return false;
/* Allow by default. */
return true;
diff --git a/mm/nommu.c b/mm/nommu.c
index 7296e775e04e..385b0c15add8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -126,6 +126,11 @@ void *__vmalloc_noprof(unsigned long size, gfp_t gfp_mask)
}
EXPORT_SYMBOL(__vmalloc_noprof);
+void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ return krealloc_noprof(p, size, (flags | __GFP_COMP) & ~__GFP_HIGHMEM);
+}
+
void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
@@ -1573,12 +1578,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return ret;
}
-struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
- unsigned int foll_flags)
-{
- return NULL;
-}
-
int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn, unsigned long size, pgprot_t prot)
{
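
A sketch of the intended calling pattern for the vrealloc_noprof() addition
above, assuming the usual vrealloc() alloc_hooks wrapper; on !MMU the call
funnels into krealloc(), so ownership semantics mirror krealloc():

#include <linux/vmalloc.h>

/* Sketch only: grow a buffer, leaving the old one untouched on failure. */
static int example_grow_buffer(void **bufp, size_t new_size)
{
	void *tmp = vrealloc(*bufp, new_size, GFP_KERNEL);

	if (!tmp)
		return -ENOMEM;		/* *bufp is still valid */
	*bufp = tmp;
	return 0;
}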
diff --git a/mm/numa.c b/mm/numa.c
new file mode 100644
index 000000000000..e2eec07707d1
--- /dev/null
+++ b/mm/numa.c
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/memblock.h>
+#include <linux/printk.h>
+#include <linux/numa.h>
+#include <linux/numa_memblks.h>
+
+struct pglist_data *node_data[MAX_NUMNODES];
+EXPORT_SYMBOL(node_data);
+
+/* Allocate NODE_DATA for a node on the local memory */
+void __init alloc_node_data(int nid)
+{
+ const size_t nd_size = roundup(sizeof(pg_data_t), SMP_CACHE_BYTES);
+ u64 nd_pa;
+ void *nd;
+ int tnid;
+
+ /* Allocate node data. Try node-local memory and then any node. */
+ nd_pa = memblock_phys_alloc_try_nid(nd_size, SMP_CACHE_BYTES, nid);
+ if (!nd_pa)
+ panic("Cannot allocate %zu bytes for node %d data\n",
+ nd_size, nid);
+ nd = __va(nd_pa);
+
+ /* report and initialize */
+ pr_info("NODE_DATA(%d) allocated [mem %#010Lx-%#010Lx]\n", nid,
+ nd_pa, nd_pa + nd_size - 1);
+ tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
+ if (tnid != nid)
+ pr_info(" NODE_DATA(%d) on node %d\n", nid, tnid);
+
+ node_data[nid] = nd;
+ memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
+}
+
+void __init alloc_offline_node_data(int nid)
+{
+ pg_data_t *pgdat;
+
+ pgdat = memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES);
+ if (!pgdat)
+ panic("Cannot allocate %zuB for node %d.\n",
+ sizeof(*pgdat), nid);
+
+ node_data[nid] = pgdat;
+}
+
+/* Stub functions: */
+
+#ifndef memory_add_physaddr_to_nid
+int memory_add_physaddr_to_nid(u64 start)
+{
+ pr_info_once("Unknown online node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
+#ifndef phys_to_target_node
+int phys_to_target_node(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+#endif
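
A hedged sketch of how an early arch/boot path might use the helpers above;
the loop shape is illustrative only, and real callers decide per node whether
the offline variant is needed:

/* Sketch: give every node parsed from firmware a pg_data_t. */
int nid;

for_each_node_mask(nid, numa_nodes_parsed)
	alloc_node_data(nid);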
diff --git a/mm/numa_emulation.c b/mm/numa_emulation.c
new file mode 100644
index 000000000000..031fb9961bf7
--- /dev/null
+++ b/mm/numa_emulation.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * NUMA emulation
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/topology.h>
+#include <linux/memblock.h>
+#include <linux/numa_memblks.h>
+#include <asm/numa.h>
+
+#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
+#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
+
+static int emu_nid_to_phys[MAX_NUMNODES];
+static char *emu_cmdline __initdata;
+
+int __init numa_emu_cmdline(char *str)
+{
+ emu_cmdline = str;
+ return 0;
+}
+
+static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].nid == nid)
+ return i;
+ return -ENOENT;
+}
+
+static u64 __init mem_hole_size(u64 start, u64 end)
+{
+ unsigned long start_pfn = PFN_UP(start);
+ unsigned long end_pfn = PFN_DOWN(end);
+
+ if (start_pfn < end_pfn)
+ return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn));
+ return 0;
+}
+
+/*
+ * Sets up nid to range from @start to @end. The return value is -errno if
+ * something went wrong, 0 otherwise.
+ */
+static int __init emu_setup_memblk(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ int nid, int phys_blk, u64 size)
+{
+ struct numa_memblk *eb = &ei->blk[ei->nr_blks];
+ struct numa_memblk *pb = &pi->blk[phys_blk];
+
+ if (ei->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("NUMA: Too many emulated memblks, failing emulation\n");
+ return -EINVAL;
+ }
+
+ ei->nr_blks++;
+ eb->start = pb->start;
+ eb->end = pb->start + size;
+ eb->nid = nid;
+
+ if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
+ emu_nid_to_phys[nid] = pb->nid;
+
+ pb->start += size;
+ if (pb->start >= pb->end) {
+ WARN_ON_ONCE(pb->start > pb->end);
+ numa_remove_memblk_from(phys_blk, pi);
+ }
+
+ printk(KERN_INFO "Faking node %d at [mem %#018Lx-%#018Lx] (%LuMB)\n",
+ nid, eb->start, eb->end - 1, (eb->end - eb->start) >> 20);
+ return 0;
+}
+
+/*
+ * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
+ * to max_addr.
+ *
+ * Returns zero on success or negative on error.
+ */
+static int __init split_nodes_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, int nr_nodes)
+{
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ u64 size;
+ int big;
+ int nid = 0;
+ int i, ret;
+
+ if (nr_nodes <= 0)
+ return -1;
+ if (nr_nodes > MAX_NUMNODES) {
+ pr_info("numa=fake=%d too large, reducing to %d\n",
+ nr_nodes, MAX_NUMNODES);
+ nr_nodes = MAX_NUMNODES;
+ }
+
+ /*
+ * Calculate target node size. x86_32 freaks on __udivdi3() so do
+ * the division in ulong number of pages and convert back.
+ */
+ size = max_addr - addr - mem_hole_size(addr, max_addr);
+ size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
+
+ /*
+ * Calculate the number of big nodes that can be allocated as a result
+ * of consolidating the remainder.
+ */
+ big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
+ FAKE_NODE_MIN_SIZE;
+
+ size &= FAKE_NODE_MIN_HASH_MASK;
+ if (!size) {
+ pr_err("Not enough memory for each node. "
+ "NUMA emulation disabled.\n");
+ return -1;
+ }
+
+ /*
+ * Continue to fill physical nodes with fake nodes until there is no
+ * memory left on any of them.
+ */
+ while (!nodes_empty(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = numa_emu_dma_end();
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+ end = start + size;
+
+ if (nid < big)
+ end += FAKE_NODE_MIN_SIZE;
+
+ /*
+ * Continue to add memory to this fake node if its
+ * non-reserved memory is less than the per-node size.
+ */
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > limit) {
+ end = limit;
+ break;
+ }
+ }
+
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if (limit - end - mem_hole_size(end, limit) < size)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return 0;
+}
+
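
The size/'big' computation above is easiest to follow with concrete numbers; a
standalone userspace sketch with illustrative values (plain 64-bit division
here, whereas the kernel divides in pages to stay 32-bit safe):

#include <stdio.h>
#include <stdint.h>

#define FAKE_NODE_MIN_SIZE	((uint64_t)32 << 20)
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1))

int main(void)
{
	uint64_t usable = (uint64_t)10 << 30;	/* assume 10 GiB without holes */
	int nr_nodes = 3;
	uint64_t size = usable / nr_nodes;
	uint64_t big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		       FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	printf("%d nodes of %llu MiB; %llu of them get one extra %llu MiB unit\n",
	       nr_nodes, (unsigned long long)(size >> 20),
	       (unsigned long long)big,
	       (unsigned long long)(FAKE_NODE_MIN_SIZE >> 20));
	return 0;
}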
+/*
+ * Returns the end address of a node so that there is at least `size' amount of
+ * non-reserved memory or `max_addr' is reached.
+ */
+static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
+{
+ u64 end = start + size;
+
+ while (end - start - mem_hole_size(start, end) < size) {
+ end += FAKE_NODE_MIN_SIZE;
+ if (end > max_addr) {
+ end = max_addr;
+ break;
+ }
+ }
+ return end;
+}
+
+static u64 uniform_size(u64 max_addr, u64 base, u64 hole, int nr_nodes)
+{
+ unsigned long max_pfn = PHYS_PFN(max_addr);
+ unsigned long base_pfn = PHYS_PFN(base);
+ unsigned long hole_pfns = PHYS_PFN(hole);
+
+ return PFN_PHYS((max_pfn - base_pfn - hole_pfns) / nr_nodes);
+}
+
+/*
+ * Sets up fake nodes of `size' interleaved over physical nodes ranging from
+ * `addr' to `max_addr'.
+ *
+ * Returns zero on success or negative on error.
+ */
+static int __init split_nodes_size_interleave_uniform(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size,
+ int nr_nodes, struct numa_memblk *pblk,
+ int nid)
+{
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ int i, ret, uniform = 0;
+ u64 min_size;
+
+ if ((!size && !nr_nodes) || (nr_nodes && !pblk))
+ return -1;
+
+ /*
+ * In the 'uniform' case split the passed in physical node by
+ * nr_nodes, in the non-uniform case, ignore the passed in
+ * physical block and try to create nodes of at least size
+ * @size.
+ *
+ * In the uniform case, split the nodes strictly by physical
+ * capacity, i.e. ignore holes. In the non-uniform case account
+ * for holes and treat @size as a minimum floor.
+ */
+ if (!nr_nodes)
+ nr_nodes = MAX_NUMNODES;
+ else {
+ nodes_clear(physnode_mask);
+ node_set(pblk->nid, physnode_mask);
+ uniform = 1;
+ }
+
+ if (uniform) {
+ min_size = uniform_size(max_addr, addr, 0, nr_nodes);
+ size = min_size;
+ } else {
+ /*
+ * The limit on emulated nodes is MAX_NUMNODES, so the
+ * size per node is increased accordingly if the
+ * requested size is too small. This creates a uniform
+ * distribution of node sizes across the entire machine
+ * (but not necessarily over physical nodes).
+ */
+ min_size = uniform_size(max_addr, addr,
+ mem_hole_size(addr, max_addr), nr_nodes);
+ }
+ min_size = ALIGN(max(min_size, FAKE_NODE_MIN_SIZE), FAKE_NODE_MIN_SIZE);
+ if (size < min_size) {
+ pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
+ size >> 20, min_size >> 20);
+ size = min_size;
+ }
+ size = ALIGN_DOWN(size, FAKE_NODE_MIN_SIZE);
+
+ /*
+ * Fill physical nodes with fake nodes of size until there is no memory
+ * left on any of them.
+ */
+ while (!nodes_empty(physnode_mask)) {
+ for_each_node_mask(i, physnode_mask) {
+ u64 dma32_end = numa_emu_dma_end();
+ u64 start, limit, end;
+ int phys_blk;
+
+ phys_blk = emu_find_memblk_by_nid(i, pi);
+ if (phys_blk < 0) {
+ node_clear(i, physnode_mask);
+ continue;
+ }
+
+ start = pi->blk[phys_blk].start;
+ limit = pi->blk[phys_blk].end;
+
+ if (uniform)
+ end = start + size;
+ else
+ end = find_end_of_node(start, limit, size);
+ /*
+ * If there won't be at least FAKE_NODE_MIN_SIZE of
+ * non-reserved memory in ZONE_DMA32 for the next node,
+ * this one must extend to the boundary.
+ */
+ if (end < dma32_end && dma32_end - end -
+ mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
+ end = dma32_end;
+
+ /*
+ * If there won't be enough non-reserved memory for the
+ * next node, this one must extend to the end of the
+ * physical node.
+ */
+ if ((limit - end - mem_hole_size(end, limit) < size)
+ && !uniform)
+ end = limit;
+
+ ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
+ phys_blk,
+ min(end, limit) - start);
+ if (ret < 0)
+ return ret;
+ }
+ }
+ return nid;
+}
+
+static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
+ struct numa_meminfo *pi,
+ u64 addr, u64 max_addr, u64 size)
+{
+ return split_nodes_size_interleave_uniform(ei, pi, addr, max_addr, size,
+ 0, NULL, 0);
+}
+
+static int __init setup_emu2phys_nid(int *dfl_phys_nid)
+{
+ int i, max_emu_nid = 0;
+
+ *dfl_phys_nid = NUMA_NO_NODE;
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
+ if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
+ max_emu_nid = i;
+ if (*dfl_phys_nid == NUMA_NO_NODE)
+ *dfl_phys_nid = emu_nid_to_phys[i];
+ }
+ }
+
+ return max_emu_nid;
+}
+
+/**
+ * numa_emulation - Emulate NUMA nodes
+ * @numa_meminfo: NUMA configuration to massage
+ * @numa_dist_cnt: The size of the physical NUMA distance table
+ *
+ * Emulate NUMA nodes according to the numa=fake kernel parameter.
+ * @numa_meminfo contains the physical memory configuration and is modified
+ * to reflect the emulated configuration on success. @numa_dist_cnt is
+ * used to determine the size of the physical distance table.
+ *
+ * On success, the following modifications are made.
+ *
+ * - @numa_meminfo is updated to reflect the emulated nodes.
+ *
+ * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
+ * emulated nodes.
+ *
+ * - NUMA distance table is rebuilt to represent distances between emulated
+ * nodes. The distances are determined considering how emulated nodes
+ * are mapped to physical nodes and match the actual distances.
+ *
+ * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
+ * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
+ *
+ * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
+ * identity mapping and no other modification is made.
+ */
+void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
+{
+ static struct numa_meminfo ei __initdata;
+ static struct numa_meminfo pi __initdata;
+ const u64 max_addr = PFN_PHYS(max_pfn);
+ u8 *phys_dist = NULL;
+ size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
+ int max_emu_nid, dfl_phys_nid;
+ int i, j, ret;
+
+ if (!emu_cmdline)
+ goto no_emu;
+
+ memset(&ei, 0, sizeof(ei));
+ pi = *numa_meminfo;
+
+ for (i = 0; i < MAX_NUMNODES; i++)
+ emu_nid_to_phys[i] = NUMA_NO_NODE;
+
+ /*
+ * If the numa=fake command-line contains a 'M' or 'G', it represents
+ * the fixed node size. Otherwise, if it is just a single number N,
+ * split the system RAM into N fake nodes.
+ */
+ if (strchr(emu_cmdline, 'U')) {
+ nodemask_t physnode_mask = numa_nodes_parsed;
+ unsigned long n;
+ int nid = 0;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = -1;
+ for_each_node_mask(i, physnode_mask) {
+ /*
+			 * The reason we pass in blk[0] is that
+			 * numa_remove_memblk_from(), called by
+			 * emu_setup_memblk(), will delete entry 0
+			 * and then move everything else up in the pi.blk
+			 * array. Therefore we should always be looking
+			 * at blk[0].
+ */
+ ret = split_nodes_size_interleave_uniform(&ei, &pi,
+ pi.blk[0].start, pi.blk[0].end, 0,
+ n, &pi.blk[0], nid);
+ if (ret < 0)
+ break;
+ if (ret < n) {
+ pr_info("%s: phys: %d only got %d of %ld nodes, failing\n",
+ __func__, i, ret, n);
+ ret = -1;
+ break;
+ }
+ nid = ret;
+ }
+ } else if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
+ u64 size;
+
+ size = memparse(emu_cmdline, &emu_cmdline);
+ ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
+ } else {
+ unsigned long n;
+
+ n = simple_strtoul(emu_cmdline, &emu_cmdline, 0);
+ ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
+ }
+ if (*emu_cmdline == ':')
+ emu_cmdline++;
+
+ if (ret < 0)
+ goto no_emu;
+
+ if (numa_cleanup_meminfo(&ei) < 0) {
+ pr_warn("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
+ goto no_emu;
+ }
+
+ /* copy the physical distance table */
+ if (numa_dist_cnt) {
+ phys_dist = memblock_alloc(phys_size, PAGE_SIZE);
+ if (!phys_dist) {
+ pr_warn("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
+ goto no_emu;
+ }
+
+ for (i = 0; i < numa_dist_cnt; i++)
+ for (j = 0; j < numa_dist_cnt; j++)
+ phys_dist[i * numa_dist_cnt + j] =
+ node_distance(i, j);
+ }
+
+ /*
+ * Determine the max emulated nid and the default phys nid to use
+ * for unmapped nodes.
+ */
+ max_emu_nid = setup_emu2phys_nid(&dfl_phys_nid);
+
+ /* commit */
+ *numa_meminfo = ei;
+
+ /* Make sure numa_nodes_parsed only contains emulated nodes */
+ nodes_clear(numa_nodes_parsed);
+ for (i = 0; i < ARRAY_SIZE(ei.blk); i++)
+ if (ei.blk[i].start != ei.blk[i].end &&
+ ei.blk[i].nid != NUMA_NO_NODE)
+ node_set(ei.blk[i].nid, numa_nodes_parsed);
+
+ numa_emu_update_cpu_to_node(emu_nid_to_phys, ARRAY_SIZE(emu_nid_to_phys));
+
+ /* make sure all emulated nodes are mapped to a physical node */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ if (emu_nid_to_phys[i] == NUMA_NO_NODE)
+ emu_nid_to_phys[i] = dfl_phys_nid;
+
+ /* transform distance table */
+ numa_reset_distance();
+ for (i = 0; i < max_emu_nid + 1; i++) {
+ for (j = 0; j < max_emu_nid + 1; j++) {
+ int physi = emu_nid_to_phys[i];
+ int physj = emu_nid_to_phys[j];
+ int dist;
+
+ if (get_option(&emu_cmdline, &dist) == 2)
+ ;
+ else if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
+ dist = physi == physj ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ else
+ dist = phys_dist[physi * numa_dist_cnt + physj];
+
+ numa_set_distance(i, j, dist);
+ }
+ }
+
+ /* free the copied physical distance table */
+ memblock_free(phys_dist, phys_size);
+ return;
+
+no_emu:
+ /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
+ for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
+ emu_nid_to_phys[i] = i;
+}
+
+#ifndef CONFIG_DEBUG_PER_CPU_MAPS
+void numa_add_cpu(unsigned int cpu)
+{
+ int physnid, nid;
+
+ nid = early_cpu_to_node(cpu);
+ BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
+
+ physnid = emu_nid_to_phys[nid];
+
+ /*
+ * Map the cpu to each emulated node that is allocated on the physical
+ * node of the cpu's apic id.
+ */
+ for_each_online_node(nid)
+ if (emu_nid_to_phys[nid] == physnid)
+ cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
+}
+
+void numa_remove_cpu(unsigned int cpu)
+{
+ int i;
+
+ for_each_online_node(i)
+ cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
+}
+#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
+static void numa_set_cpumask(unsigned int cpu, bool enable)
+{
+ int nid, physnid;
+
+ nid = early_cpu_to_node(cpu);
+ if (nid == NUMA_NO_NODE) {
+ /* early_cpu_to_node() already emits a warning and trace */
+ return;
+ }
+
+ physnid = emu_nid_to_phys[nid];
+
+ for_each_online_node(nid) {
+ if (emu_nid_to_phys[nid] != physnid)
+ continue;
+
+ debug_cpumask_set_cpu(cpu, nid, enable);
+ }
+}
+
+void numa_add_cpu(unsigned int cpu)
+{
+ numa_set_cpumask(cpu, true);
+}
+
+void numa_remove_cpu(unsigned int cpu)
+{
+ numa_set_cpumask(cpu, false);
+}
+#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/mm/numa_memblks.c b/mm/numa_memblks.c
new file mode 100644
index 000000000000..be52b93a9c58
--- /dev/null
+++ b/mm/numa_memblks.c
@@ -0,0 +1,571 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+#include <linux/array_size.h>
+#include <linux/sort.h>
+#include <linux/printk.h>
+#include <linux/memblock.h>
+#include <linux/numa.h>
+#include <linux/numa_memblks.h>
+
+static int numa_distance_cnt;
+static u8 *numa_distance;
+
+nodemask_t numa_nodes_parsed __initdata;
+
+static struct numa_meminfo numa_meminfo __initdata_or_meminfo;
+static struct numa_meminfo numa_reserved_meminfo __initdata_or_meminfo;
+
+/*
+ * Set nodes, which have memory in @mi, in *@nodemask.
+ */
+static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
+ const struct numa_meminfo *mi)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
+ if (mi->blk[i].start != mi->blk[i].end &&
+ mi->blk[i].nid != NUMA_NO_NODE)
+ node_set(mi->blk[i].nid, *nodemask);
+}
+
+/**
+ * numa_reset_distance - Reset NUMA distance table
+ *
+ * The current table is freed. The next numa_set_distance() call will
+ * create a new one.
+ */
+void __init numa_reset_distance(void)
+{
+ size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
+
+ /* numa_distance could be 1LU marking allocation failure, test cnt */
+ if (numa_distance_cnt)
+ memblock_free(numa_distance, size);
+ numa_distance_cnt = 0;
+ numa_distance = NULL; /* enable table creation */
+}
+
+static int __init numa_alloc_distance(void)
+{
+ nodemask_t nodes_parsed;
+ size_t size;
+ int i, j, cnt = 0;
+
+ /* size the new table and allocate it */
+ nodes_parsed = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
+
+ for_each_node_mask(i, nodes_parsed)
+ cnt = i;
+ cnt++;
+ size = cnt * cnt * sizeof(numa_distance[0]);
+
+ numa_distance = memblock_alloc(size, PAGE_SIZE);
+ if (!numa_distance) {
+ pr_warn("Warning: can't allocate distance table!\n");
+ /* don't retry until explicitly reset */
+ numa_distance = (void *)1LU;
+ return -ENOMEM;
+ }
+
+ numa_distance_cnt = cnt;
+
+ /* fill with the default distances */
+ for (i = 0; i < cnt; i++)
+ for (j = 0; j < cnt; j++)
+ numa_distance[i * cnt + j] = i == j ?
+ LOCAL_DISTANCE : REMOTE_DISTANCE;
+ printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
+
+ return 0;
+}
+
+/**
+ * numa_set_distance - Set NUMA distance from one NUMA to another
+ * @from: the 'from' node to set distance
+ * @to: the 'to' node to set distance
+ * @distance: NUMA distance
+ *
+ * Set the distance from node @from to @to to @distance. If distance table
+ * doesn't exist, one which is large enough to accommodate all the currently
+ * known nodes will be created.
+ *
+ * If such a table cannot be allocated, a warning is printed and further
+ * calls are ignored until the distance table is reset with
+ * numa_reset_distance().
+ *
+ * If @from or @to is higher than the highest known node or lower than zero
+ * at the time of table creation or @distance doesn't make sense, the call
+ * is ignored.
+ * This is to allow simplification of specific NUMA config implementations.
+ */
+void __init numa_set_distance(int from, int to, int distance)
+{
+ if (!numa_distance && numa_alloc_distance() < 0)
+ return;
+
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt ||
+ from < 0 || to < 0) {
+ pr_warn_once("Warning: node ids are out of bound, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ if ((u8)distance != distance ||
+ (from == to && distance != LOCAL_DISTANCE)) {
+ pr_warn_once("Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
+ from, to, distance);
+ return;
+ }
+
+ numa_distance[from * numa_distance_cnt + to] = distance;
+}
+
+int __node_distance(int from, int to)
+{
+ if (from >= numa_distance_cnt || to >= numa_distance_cnt)
+ return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
+ return numa_distance[from * numa_distance_cnt + to];
+}
+EXPORT_SYMBOL(__node_distance);
+
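
A hedged sketch of how a firmware distance (SLIT-style) parser might program
and consult the table above; the node ids and distances are made up:

static void __init example_set_distances(void)
{
	numa_set_distance(0, 0, LOCAL_DISTANCE);	/* 10 */
	numa_set_distance(0, 1, 21);
	numa_set_distance(1, 0, 21);
	numa_set_distance(1, 1, LOCAL_DISTANCE);

	/* Later lookups are a single indexed load into the flat table. */
	WARN_ON(node_distance(0, 1) != 21);
}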
+static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
+ struct numa_meminfo *mi)
+{
+ /* ignore zero length blks */
+ if (start == end)
+ return 0;
+
+ /* whine about and ignore invalid blks */
+ if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
+ pr_warn("Warning: invalid memblk node %d [mem %#010Lx-%#010Lx]\n",
+ nid, start, end - 1);
+ return 0;
+ }
+
+ if (mi->nr_blks >= NR_NODE_MEMBLKS) {
+ pr_err("too many memblk ranges\n");
+ return -EINVAL;
+ }
+
+ mi->blk[mi->nr_blks].start = start;
+ mi->blk[mi->nr_blks].end = end;
+ mi->blk[mi->nr_blks].nid = nid;
+ mi->nr_blks++;
+ return 0;
+}
+
+/**
+ * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
+ * @idx: Index of memblk to remove
+ * @mi: numa_meminfo to remove memblk from
+ *
+ * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
+ * decrementing @mi->nr_blks.
+ */
+void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
+{
+ mi->nr_blks--;
+ memmove(&mi->blk[idx], &mi->blk[idx + 1],
+ (mi->nr_blks - idx) * sizeof(mi->blk[0]));
+}
+
+/**
+ * numa_move_tail_memblk - Move a numa_memblk from one numa_meminfo to another
+ * @dst: numa_meminfo to append block to
+ * @idx: Index of memblk to remove
+ * @src: numa_meminfo to remove memblk from
+ */
+static void __init numa_move_tail_memblk(struct numa_meminfo *dst, int idx,
+ struct numa_meminfo *src)
+{
+ dst->blk[dst->nr_blks++] = src->blk[idx];
+ numa_remove_memblk_from(idx, src);
+}
+
+/**
+ * numa_add_memblk - Add one numa_memblk to numa_meminfo
+ * @nid: NUMA node ID of the new memblk
+ * @start: Start address of the new memblk
+ * @end: End address of the new memblk
+ *
+ * Add a new memblk to the default numa_meminfo.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_add_memblk(int nid, u64 start, u64 end)
+{
+ return numa_add_memblk_to(nid, start, end, &numa_meminfo);
+}
+
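
A hedged sketch of the typical caller shape, modelled on firmware affinity
parsing; the function name and error handling are illustrative only:

static int __init example_parse_affinity(int nid, u64 base, u64 length)
{
	if (numa_add_memblk(nid, base, base + length) < 0)
		return -EINVAL;

	node_set(nid, numa_nodes_parsed);
	return 0;
}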
+/**
+ * numa_cleanup_meminfo - Cleanup a numa_meminfo
+ * @mi: numa_meminfo to clean up
+ *
+ * Sanitize @mi by merging and removing unnecessary memblks. Also check for
+ * conflicts and clear unused memblks.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
+{
+ const u64 low = memblock_start_of_DRAM();
+ const u64 high = memblock_end_of_DRAM();
+ int i, j, k;
+
+ /* first, trim all entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ /* move / save reserved memory ranges */
+ if (!memblock_overlaps_region(&memblock.memory,
+ bi->start, bi->end - bi->start)) {
+ numa_move_tail_memblk(&numa_reserved_meminfo, i--, mi);
+ continue;
+ }
+
+ /* make sure all non-reserved blocks are inside the limits */
+ bi->start = max(bi->start, low);
+
+ /* preserve info for non-RAM areas above 'max_pfn': */
+ if (bi->end > high) {
+ numa_add_memblk_to(bi->nid, high, bi->end,
+ &numa_reserved_meminfo);
+ bi->end = high;
+ }
+
+ /* and there's no empty block */
+ if (bi->start >= bi->end)
+ numa_remove_memblk_from(i--, mi);
+ }
+
+ /* merge neighboring / overlapping entries */
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ for (j = i + 1; j < mi->nr_blks; j++) {
+ struct numa_memblk *bj = &mi->blk[j];
+ u64 start, end;
+
+ /*
+ * See whether there are overlapping blocks. Whine
+ * about but allow overlaps of the same nid. They
+ * will be merged below.
+ */
+ if (bi->end > bj->start && bi->start < bj->end) {
+ if (bi->nid != bj->nid) {
+ pr_err("node %d [mem %#010Lx-%#010Lx] overlaps with node %d [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->nid, bj->start, bj->end - 1);
+ return -EINVAL;
+ }
+ pr_warn("Warning: node %d [mem %#010Lx-%#010Lx] overlaps with itself [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1,
+ bj->start, bj->end - 1);
+ }
+
+ /*
+ * Join together blocks on the same node, holes
+ * between which don't overlap with memory on other
+ * nodes.
+ */
+ if (bi->nid != bj->nid)
+ continue;
+ start = min(bi->start, bj->start);
+ end = max(bi->end, bj->end);
+ for (k = 0; k < mi->nr_blks; k++) {
+ struct numa_memblk *bk = &mi->blk[k];
+
+ if (bi->nid == bk->nid)
+ continue;
+ if (start < bk->end && end > bk->start)
+ break;
+ }
+ if (k < mi->nr_blks)
+ continue;
+ pr_info("NUMA: Node %d [mem %#010Lx-%#010Lx] + [mem %#010Lx-%#010Lx] -> [mem %#010Lx-%#010Lx]\n",
+ bi->nid, bi->start, bi->end - 1, bj->start,
+ bj->end - 1, start, end - 1);
+ bi->start = start;
+ bi->end = end;
+ numa_remove_memblk_from(j--, mi);
+ }
+ }
+
+ /* clear unused ones */
+ for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
+ mi->blk[i].start = mi->blk[i].end = 0;
+ mi->blk[i].nid = NUMA_NO_NODE;
+ }
+
+ return 0;
+}
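
The merge pass above joins two blocks of the same node only when the combined span does not overlap memory belonging to any other node. A standalone sketch of that test, with illustrative (non-kernel) types and names:

struct blk { unsigned long long start, end; int nid; };

static int can_merge(const struct blk *b, int n, int i, int j)
{
	unsigned long long start = b[i].start < b[j].start ? b[i].start : b[j].start;
	unsigned long long end = b[i].end > b[j].end ? b[i].end : b[j].end;

	for (int k = 0; k < n; k++) {
		if (b[k].nid == b[i].nid)
			continue;
		if (start < b[k].end && end > b[k].start)
			return 0;	/* merged span would overlap another node */
	}
	return 1;
}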
+
+/*
+ * Mark all currently memblock-reserved physical memory (which covers the
+ * kernel's own memory ranges) as hot-unpluggable.
+ */
+static void __init numa_clear_kernel_node_hotplug(void)
+{
+ nodemask_t reserved_nodemask = NODE_MASK_NONE;
+ struct memblock_region *mb_region;
+ int i;
+
+ /*
+ * We have to do some preprocessing of memblock regions, to
+ * make them suitable for reservation.
+ *
+ * At this time, all memory regions reserved by memblock are
+ * used by the kernel, but those regions are not split up
+ * along node boundaries yet, and don't necessarily have their
+ * node ID set yet either.
+ *
+ * So iterate over all parsed memory blocks and use those ranges to
+ * set the nid in memblock.reserved. This will split up the
+ * memblock regions along node boundaries and will set the node IDs
+ * as well.
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = numa_meminfo.blk + i;
+ int ret;
+
+ ret = memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.reserved, mb->nid);
+ WARN_ON_ONCE(ret);
+ }
+
+ /*
+ * Now go over all reserved memblock regions, to construct a
+ * node mask of all kernel reserved memory areas.
+ *
+ * [ Note, when booting with mem=nn[kMG] or in a kdump kernel,
+ * numa_meminfo might not include all memblock.reserved
+ * memory ranges, because quirks such as trim_snb_memory()
+ * reserve specific pages for Sandy Bridge graphics. ]
+ */
+ for_each_reserved_mem_region(mb_region) {
+ int nid = memblock_get_region_node(mb_region);
+
+ if (nid != MAX_NUMNODES)
+ node_set(nid, reserved_nodemask);
+ }
+
+ /*
+ * Finally, clear the MEMBLOCK_HOTPLUG flag for all memory
+ * belonging to the reserved node mask.
+ *
+ * Note that this will include memory regions that reside
+ * on nodes that contain kernel memory - entire nodes
+ * become hot-unpluggable:
+ */
+ for (i = 0; i < numa_meminfo.nr_blks; i++) {
+ struct numa_memblk *mb = numa_meminfo.blk + i;
+
+ if (!node_isset(mb->nid, reserved_nodemask))
+ continue;
+
+ memblock_clear_hotplug(mb->start, mb->end - mb->start);
+ }
+}
+
+static int __init numa_register_meminfo(struct numa_meminfo *mi)
+{
+ int i;
+
+ /* Account for nodes with cpus and no memory */
+ node_possible_map = numa_nodes_parsed;
+ numa_nodemask_from_meminfo(&node_possible_map, mi);
+ if (WARN_ON(nodes_empty(node_possible_map)))
+ return -EINVAL;
+
+ for (i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *mb = &mi->blk[i];
+
+ memblock_set_node(mb->start, mb->end - mb->start,
+ &memblock.memory, mb->nid);
+ }
+
+ /*
+	 * Very early during boot, the kernel has to use some memory, e.g.
+	 * for loading the kernel image. We cannot prevent this anyway, so
+	 * any node the kernel resides in must remain un-hotpluggable.
+	 *
+	 * And by the time we get here, allocating node data won't fail.
+ */
+ numa_clear_kernel_node_hotplug();
+
+ /*
+	 * If the sections array is going to be used for pfn -> nid mapping,
+	 * check whether its granularity is fine enough.
+ */
+ if (IS_ENABLED(NODE_NOT_IN_PAGE_FLAGS)) {
+ unsigned long pfn_align = node_map_pfn_alignment();
+
+ if (pfn_align && pfn_align < PAGES_PER_SECTION) {
+ unsigned long node_align_mb = PFN_PHYS(pfn_align) >> 20;
+
+ unsigned long sect_align_mb = PFN_PHYS(PAGES_PER_SECTION) >> 20;
+
+ pr_warn("Node alignment %luMB < min %luMB, rejecting NUMA config\n",
+ node_align_mb, sect_align_mb);
+ return -EINVAL;
+ }
+ }
+
+ return 0;
+}
+
+int __init numa_memblks_init(int (*init_func)(void),
+ bool memblock_force_top_down)
+{
+ phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX;
+ int ret;
+
+ nodes_clear(numa_nodes_parsed);
+ nodes_clear(node_possible_map);
+ nodes_clear(node_online_map);
+ memset(&numa_meminfo, 0, sizeof(numa_meminfo));
+ WARN_ON(memblock_set_node(0, max_addr, &memblock.memory, NUMA_NO_NODE));
+ WARN_ON(memblock_set_node(0, max_addr, &memblock.reserved,
+ NUMA_NO_NODE));
+	/* In case parsing SRAT failed. */
+ WARN_ON(memblock_clear_hotplug(0, max_addr));
+ numa_reset_distance();
+
+ ret = init_func();
+ if (ret < 0)
+ return ret;
+
+ /*
+	 * We reset memblock back to the top-down direction here because if
+	 * we configured ACPI_NUMA, we have parsed SRAT in init_func(). It
+	 * is OK to have the reset here even if we didn't configure
+	 * ACPI_NUMA or the ACPI NUMA init failed and fell back to dummy
+	 * NUMA init.
+ */
+ if (memblock_force_top_down)
+ memblock_set_bottom_up(false);
+
+ ret = numa_cleanup_meminfo(&numa_meminfo);
+ if (ret < 0)
+ return ret;
+
+ numa_emulation(&numa_meminfo, numa_distance_cnt);
+
+ return numa_register_meminfo(&numa_meminfo);
+}
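
For illustration, a hedged sketch of the simplest init_func an architecture could pass to numa_memblks_init(): register a single node 0 spanning all of DRAM, the same shape as the dummy-NUMA fallback. It assumes only the numa_add_memblk()/numa_nodes_parsed interfaces shown above.

static int __init one_node_init(void)
{
	/* One node covering all DRAM; mirrors the dummy NUMA fallback. */
	node_set(0, numa_nodes_parsed);
	return numa_add_memblk(0, memblock_start_of_DRAM(),
			       memblock_end_of_DRAM());
}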
+
+static int __init cmp_memblk(const void *a, const void *b)
+{
+ const struct numa_memblk *ma = *(const struct numa_memblk **)a;
+ const struct numa_memblk *mb = *(const struct numa_memblk **)b;
+
+ return (ma->start > mb->start) - (ma->start < mb->start);
+}
+
+static struct numa_memblk *numa_memblk_list[NR_NODE_MEMBLKS] __initdata;
+
+/**
+ * numa_fill_memblks - Fill gaps in numa_meminfo memblks
+ * @start: address to begin fill
+ * @end: address to end fill
+ *
+ * Find and extend numa_meminfo memblks to cover the physical
+ * address range @start-@end
+ *
+ * RETURNS:
+ * 0 : Success
+ * NUMA_NO_MEMBLK : No memblks exist in address range @start-@end
+ */
+
+int __init numa_fill_memblks(u64 start, u64 end)
+{
+ struct numa_memblk **blk = &numa_memblk_list[0];
+ struct numa_meminfo *mi = &numa_meminfo;
+ int count = 0;
+ u64 prev_end;
+
+ /*
+ * Create a list of pointers to numa_meminfo memblks that
+ * overlap start, end. The list is used to make in-place
+ * changes that fill out the numa_meminfo memblks.
+ */
+ for (int i = 0; i < mi->nr_blks; i++) {
+ struct numa_memblk *bi = &mi->blk[i];
+
+ if (memblock_addrs_overlap(start, end - start, bi->start,
+ bi->end - bi->start)) {
+ blk[count] = &mi->blk[i];
+ count++;
+ }
+ }
+ if (!count)
+ return NUMA_NO_MEMBLK;
+
+ /* Sort the list of pointers in memblk->start order */
+ sort(&blk[0], count, sizeof(blk[0]), cmp_memblk, NULL);
+
+ /* Make sure the first/last memblks include start/end */
+ blk[0]->start = min(blk[0]->start, start);
+ blk[count - 1]->end = max(blk[count - 1]->end, end);
+
+ /*
+	 * Fill any gaps by tracking the previous memblk's
+	 * end address and backfilling to it if needed.
+ */
+ prev_end = blk[0]->end;
+ for (int i = 1; i < count; i++) {
+ struct numa_memblk *curr = blk[i];
+
+ if (prev_end >= curr->start) {
+ if (prev_end < curr->end)
+ prev_end = curr->end;
+ } else {
+ curr->start = prev_end;
+ prev_end = curr->end;
+ }
+ }
+ return 0;
+}
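
The backfill loop only relies on the overlapping blocks being sorted by start address. A standalone sketch of the same pass over plain (start, end) pairs; names are illustrative, not kernel API:

struct blk_range { unsigned long long start, end; };

static void fill_gaps(struct blk_range *b, int count)
{
	unsigned long long prev_end = b[0].end;

	for (int i = 1; i < count; i++) {
		if (prev_end >= b[i].start) {
			if (prev_end < b[i].end)
				prev_end = b[i].end;	/* overlap: just advance */
		} else {
			b[i].start = prev_end;		/* gap: backfill to it */
			prev_end = b[i].end;
		}
	}
}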
+
+#ifdef CONFIG_NUMA_KEEP_MEMINFO
+static int meminfo_to_nid(struct numa_meminfo *mi, u64 start)
+{
+ int i;
+
+ for (i = 0; i < mi->nr_blks; i++)
+ if (mi->blk[i].start <= start && mi->blk[i].end > start)
+ return mi->blk[i].nid;
+ return NUMA_NO_NODE;
+}
+
+int phys_to_target_node(u64 start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ /*
+ * Prefer online nodes, but if reserved memory might be
+	 * hot-added, continue the search with reserved ranges.
+ */
+ if (nid != NUMA_NO_NODE)
+ return nid;
+
+ return meminfo_to_nid(&numa_reserved_meminfo, start);
+}
+EXPORT_SYMBOL_GPL(phys_to_target_node);
+
+int memory_add_physaddr_to_nid(u64 start)
+{
+ int nid = meminfo_to_nid(&numa_meminfo, start);
+
+ if (nid == NUMA_NO_NODE)
+ nid = numa_meminfo.blk[0].nid;
+ return nid;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+
+#endif /* CONFIG_NUMA_KEEP_MEMINFO */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7a04cb1918fd..fcd4c1439cb9 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2612,7 +2612,7 @@ struct folio *writeback_iter(struct address_space *mapping,
done:
if (wbc->range_cyclic)
- mapping->writeback_index = folio->index + folio_nr_pages(folio);
+ mapping->writeback_index = folio_next_index(folio);
folio_batch_release(&wbc->fbatch);
return NULL;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 0aefae4a26b2..8afab64814dc 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -286,9 +286,7 @@ EXPORT_SYMBOL(nr_online_nodes);
#endif
static bool page_contains_unaccepted(struct page *page, unsigned int order);
-static void accept_page(struct page *page, unsigned int order);
static bool cond_accept_memory(struct zone *zone, unsigned int order);
-static inline bool has_unaccepted_memory(void);
static bool __free_unaccepted(struct page *page);
int page_group_by_mobility_disabled __read_mostly;
@@ -322,6 +320,11 @@ static inline bool deferred_pages_enabled(void)
{
return false;
}
+
+static inline bool _deferred_grow_zone(struct zone *zone, unsigned int order)
+{
+ return false;
+}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
/* Return a pointer to the bitmap storing bits affecting a block of pages */
@@ -958,8 +961,9 @@ static int free_tail_page_prepare(struct page *head_page, struct page *page)
break;
case 2:
/* the second tail page: deferred_list overlaps ->mapping */
- if (unlikely(!list_empty(&folio->_deferred_list))) {
- bad_page(page, "on deferred list");
+ if (unlikely(!list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio))) {
+ bad_page(page, "partially mapped folio on deferred list");
goto out;
}
break;
@@ -1087,8 +1091,11 @@ __always_inline bool free_pages_prepare(struct page *page,
(page + i)->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
}
}
- if (PageMappingFlags(page))
+ if (PageMappingFlags(page)) {
+ if (PageAnon(page))
+ mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
page->mapping = NULL;
+ }
if (is_check_pages_enabled()) {
if (free_page_is_bad(page))
bad++;
@@ -1199,17 +1206,39 @@ static void free_pcppages_bulk(struct zone *zone, int count,
spin_unlock_irqrestore(&zone->lock, flags);
}
+/* Split a multi-block free page into its individual pageblocks. */
+static void split_large_buddy(struct zone *zone, struct page *page,
+ unsigned long pfn, int order, fpi_t fpi)
+{
+ unsigned long end = pfn + (1 << order);
+
+ VM_WARN_ON_ONCE(!IS_ALIGNED(pfn, 1 << order));
+ /* Caller removed page from freelist, buddy info cleared! */
+ VM_WARN_ON_ONCE(PageBuddy(page));
+
+ if (order > pageblock_order)
+ order = pageblock_order;
+
+ while (pfn != end) {
+ int mt = get_pfnblock_migratetype(page, pfn);
+
+ __free_one_page(page, pfn, zone, order, mt, fpi);
+ pfn += 1 << order;
+ page = pfn_to_page(pfn);
+ }
+}
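
split_large_buddy() walks the range in pageblock-sized steps once the order is clamped. A small userspace sketch of that walk, assuming 4K pages and pageblock_order == 9 purely for illustration:

#include <stdio.h>

int main(void)
{
	unsigned long pfn = 0x40000, order = 10, pageblock_order = 9;
	unsigned long end = pfn + (1UL << order);

	if (order > pageblock_order)
		order = pageblock_order;
	while (pfn != end) {
		printf("free [%#lx, %#lx) at order %lu\n",
		       pfn, pfn + (1UL << order), order);
		pfn += 1UL << order;
	}
	return 0;
}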
+
static void free_one_page(struct zone *zone, struct page *page,
unsigned long pfn, unsigned int order,
fpi_t fpi_flags)
{
unsigned long flags;
- int migratetype;
spin_lock_irqsave(&zone->lock, flags);
- migratetype = get_pfnblock_migratetype(page, pfn);
- __free_one_page(page, pfn, zone, order, migratetype, fpi_flags);
+ split_large_buddy(zone, page, pfn, order, fpi_flags);
spin_unlock_irqrestore(&zone->lock, flags);
+
+ __count_vm_events(PGFREE, 1 << order);
}
static void __free_pages_ok(struct page *page, unsigned int order,
@@ -1218,12 +1247,8 @@ static void __free_pages_ok(struct page *page, unsigned int order,
unsigned long pfn = page_to_pfn(page);
struct zone *zone = page_zone(page);
- if (!free_pages_prepare(page, order))
- return;
-
- free_one_page(zone, page, pfn, order, fpi_flags);
-
- __count_vm_events(PGFREE, 1 << order);
+ if (free_pages_prepare(page, order))
+ free_one_page(zone, page, pfn, order, fpi_flags);
}
void __meminit __free_pages_core(struct page *page, unsigned int order,
@@ -1270,7 +1295,7 @@ void __meminit __free_pages_core(struct page *page, unsigned int order,
if (order == MAX_PAGE_ORDER && __free_unaccepted(page))
return;
- accept_page(page, order);
+ accept_memory(page_to_phys(page), PAGE_SIZE << order);
}
/*
@@ -1346,11 +1371,11 @@ struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
*
* -- nyc
*/
-static inline void expand(struct zone *zone, struct page *page,
- int low, int high, int migratetype)
+static inline unsigned int expand(struct zone *zone, struct page *page, int low,
+ int high, int migratetype)
{
- unsigned long size = 1 << high;
- unsigned long nr_added = 0;
+ unsigned int size = 1 << high;
+ unsigned int nr_added = 0;
while (high > low) {
high--;
@@ -1370,7 +1395,19 @@ static inline void expand(struct zone *zone, struct page *page,
set_buddy_order(&page[size], high);
nr_added += size;
}
- account_freepages(zone, nr_added, migratetype);
+
+ return nr_added;
+}
+
+static __always_inline void page_del_and_expand(struct zone *zone,
+ struct page *page, int low,
+ int high, int migratetype)
+{
+ int nr_pages = 1 << high;
+
+ __del_page_from_free_list(page, zone, high, migratetype);
+ nr_pages -= expand(zone, page, low, high, migratetype);
+ account_freepages(zone, -nr_pages, migratetype);
}
static void check_new_page_bad(struct page *page)
@@ -1540,8 +1577,9 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
page = get_page_from_free_area(area, migratetype);
if (!page)
continue;
- del_page_from_free_list(page, zone, current_order, migratetype);
- expand(zone, page, order, current_order, migratetype);
+
+ page_del_and_expand(zone, page, order, current_order,
+ migratetype);
trace_mm_page_alloc_zone_locked(page, order, migratetype,
pcp_allowed_order(order) &&
migratetype < MIGRATE_PCPTYPES);
@@ -1700,27 +1738,6 @@ static unsigned long find_large_buddy(unsigned long start_pfn)
return start_pfn;
}
-/* Split a multi-block free page into its individual pageblocks */
-static void split_large_buddy(struct zone *zone, struct page *page,
- unsigned long pfn, int order)
-{
- unsigned long end_pfn = pfn + (1 << order);
-
- VM_WARN_ON_ONCE(order <= pageblock_order);
- VM_WARN_ON_ONCE(pfn & (pageblock_nr_pages - 1));
-
- /* Caller removed page from freelist, buddy info cleared! */
- VM_WARN_ON_ONCE(PageBuddy(page));
-
- while (pfn != end_pfn) {
- int mt = get_pfnblock_migratetype(page, pfn);
-
- __free_one_page(page, pfn, zone, pageblock_order, mt, FPI_NONE);
- pfn += pageblock_nr_pages;
- page = pfn_to_page(pfn);
- }
-}
-
/**
* move_freepages_block_isolate - move free pages in block for page isolation
* @zone: the zone
@@ -1761,7 +1778,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
del_page_from_free_list(buddy, zone, order,
get_pfnblock_migratetype(buddy, pfn));
set_pageblock_migratetype(page, migratetype);
- split_large_buddy(zone, buddy, pfn, order);
+ split_large_buddy(zone, buddy, pfn, order, FPI_NONE);
return true;
}
@@ -1772,7 +1789,7 @@ bool move_freepages_block_isolate(struct zone *zone, struct page *page,
del_page_from_free_list(page, zone, order,
get_pfnblock_migratetype(page, pfn));
set_pageblock_migratetype(page, migratetype);
- split_large_buddy(zone, page, pfn, order);
+ split_large_buddy(zone, page, pfn, order, FPI_NONE);
return true;
}
move:
@@ -1892,9 +1909,12 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
+ unsigned int nr_added;
+
del_page_from_free_list(page, zone, current_order, block_type);
change_pageblock_range(page, current_order, start_type);
- expand(zone, page, order, current_order, start_type);
+ nr_added = expand(zone, page, order, current_order, start_type);
+ account_freepages(zone, nr_added, start_type);
return page;
}
@@ -1947,8 +1967,7 @@ steal_suitable_fallback(struct zone *zone, struct page *page,
}
single_page:
- del_page_from_free_list(page, zone, current_order, block_type);
- expand(zone, page, order, current_order, block_type);
+ page_del_and_expand(zone, page, order, current_order, block_type);
return page;
}
@@ -2764,7 +2783,7 @@ void split_page(struct page *page, unsigned int order)
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
split_page_owner(page, order, 0);
- pgalloc_tag_split(page, 1 << order);
+ pgalloc_tag_split(page_folio(page), order, 0);
split_page_memcg(page, order, 0);
}
EXPORT_SYMBOL_GPL(split_page);
@@ -3033,12 +3052,6 @@ struct page *rmqueue(struct zone *preferred_zone,
{
struct page *page;
- /*
- * We most definitely don't want callers attempting to
- * allocate greater than order-1 page units with __GFP_NOFAIL.
- */
- WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1));
-
if (likely(pcp_allowed_order(order))) {
page = rmqueue_pcplist(preferred_zone, zone, order,
migratetype, alloc_flags);
@@ -3357,7 +3370,7 @@ retry:
}
if (no_fallback && nr_online_nodes > 1 &&
- zone != ac->preferred_zoneref->zone) {
+ zone != zonelist_zone(ac->preferred_zoneref)) {
int local_nid;
/*
@@ -3365,7 +3378,7 @@ retry:
* fragmenting fallbacks. Locality is more important
* than fragmentation avoidance.
*/
- local_nid = zone_to_nid(ac->preferred_zoneref->zone);
+ local_nid = zonelist_node_idx(ac->preferred_zoneref);
if (zone_to_nid(zone) != local_nid) {
alloc_flags &= ~ALLOC_NOFRAGMENT;
goto retry;
@@ -3402,7 +3415,6 @@ check_alloc_wmark:
if (cond_accept_memory(zone, order))
goto try_this_zone;
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
* Watermark failed for this zone, but see if we can
* grow this zone if it contains deferred pages.
@@ -3411,14 +3423,13 @@ check_alloc_wmark:
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
-#endif
/* Checked here to keep the fast path fast */
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (alloc_flags & ALLOC_NO_WATERMARKS)
goto try_this_zone;
if (!node_reclaim_enabled() ||
- !zone_allows_reclaim(ac->preferred_zoneref->zone, zone))
+ !zone_allows_reclaim(zonelist_zone(ac->preferred_zoneref), zone))
continue;
ret = node_reclaim(zone->zone_pgdat, gfp_mask, order);
@@ -3440,7 +3451,7 @@ check_alloc_wmark:
}
try_this_zone:
- page = rmqueue(ac->preferred_zoneref->zone, zone, order,
+ page = rmqueue(zonelist_zone(ac->preferred_zoneref), zone, order,
gfp_mask, alloc_flags, ac->migratetype);
if (page) {
prep_new_page(page, order, gfp_mask, alloc_flags);
@@ -3457,13 +3468,11 @@ try_this_zone:
if (cond_accept_memory(zone, order))
goto try_this_zone;
-#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/* Try again if zone has deferred pages */
if (deferred_pages_enabled()) {
if (_deferred_grow_zone(zone, order))
goto try_this_zone;
}
-#endif
}
}
@@ -4100,6 +4109,11 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
unsigned long min_wmark = min_wmark_pages(zone);
bool wmark;
+ if (cpusets_enabled() &&
+ (alloc_flags & ALLOC_CPUSET) &&
+ !__cpuset_zone_allowed(zone, gfp_mask))
+ continue;
+
available = reclaimable = zone_reclaimable_pages(zone);
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
@@ -4175,6 +4189,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
{
bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
bool can_compact = gfp_compaction_allowed(gfp_mask);
+ bool nofail = gfp_mask & __GFP_NOFAIL;
const bool costly_order = order > PAGE_ALLOC_COSTLY_ORDER;
struct page *page = NULL;
unsigned int alloc_flags;
@@ -4187,6 +4202,25 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
unsigned int zonelist_iter_cookie;
int reserve_flags;
+ if (unlikely(nofail)) {
+ /*
+ * We most definitely don't want callers attempting to
+ * allocate greater than order-1 page units with __GFP_NOFAIL.
+ */
+ WARN_ON_ONCE(order > 1);
+ /*
+		 * We also don't support __GFP_NOFAIL without __GFP_DIRECT_RECLAIM;
+		 * otherwise we may end up in a lockup.
+ */
+ WARN_ON_ONCE(!can_direct_reclaim);
+ /*
+ * PF_MEMALLOC request from this context is rather bizarre
+		 * because we cannot reclaim anything and can only loop waiting
+		 * for somebody to do the work for us.
+ */
+ WARN_ON_ONCE(current->flags & PF_MEMALLOC);
+ }
+
restart:
compaction_retries = 0;
no_progress_loops = 0;
@@ -4209,7 +4243,7 @@ restart:
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx, ac->nodemask);
- if (!ac->preferred_zoneref->zone)
+ if (!zonelist_zone(ac->preferred_zoneref))
goto nopage;
/*
@@ -4221,7 +4255,7 @@ restart:
struct zoneref *z = first_zones_zonelist(ac->zonelist,
ac->highest_zoneidx,
&cpuset_current_mems_allowed);
- if (!z->zone)
+ if (!zonelist_zone(z))
goto nopage;
}
@@ -4404,30 +4438,16 @@ nopage:
* Make sure that __GFP_NOFAIL request doesn't leak out and make sure
* we always retry
*/
- if (gfp_mask & __GFP_NOFAIL) {
+ if (unlikely(nofail)) {
/*
- * All existing users of the __GFP_NOFAIL are blockable, so warn
- * of any new users that actually require GFP_NOWAIT
+		 * Lacking direct reclaim we can't do anything to reclaim
+		 * memory, so we disregard these unreasonable nofail requests
+		 * and still return NULL.
*/
- if (WARN_ON_ONCE_GFP(!can_direct_reclaim, gfp_mask))
+ if (!can_direct_reclaim)
goto fail;
/*
- * PF_MEMALLOC request from this context is rather bizarre
- * because we cannot reclaim anything and only can loop waiting
- * for somebody to do a work for us
- */
- WARN_ON_ONCE_GFP(current->flags & PF_MEMALLOC, gfp_mask);
-
- /*
- * non failing costly orders are a hard requirement which we
- * are not prepared for much so let's warn about these users
- * so that we can identify them and convert them to something
- * else.
- */
- WARN_ON_ONCE_GFP(costly_order, gfp_mask);
-
- /*
* Help non-failing allocations by giving some access to memory
* reserves normally used for high priority non-blocking
* allocations but do not use ALLOC_NO_WATERMARKS because this
@@ -4578,17 +4598,28 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
continue;
}
- if (nr_online_nodes > 1 && zone != ac.preferred_zoneref->zone &&
- zone_to_nid(zone) != zone_to_nid(ac.preferred_zoneref->zone)) {
+ if (nr_online_nodes > 1 && zone != zonelist_zone(ac.preferred_zoneref) &&
+ zone_to_nid(zone) != zonelist_node_idx(ac.preferred_zoneref)) {
goto failed;
}
+ cond_accept_memory(zone, 0);
+retry_this_zone:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK) + nr_pages;
if (zone_watermark_fast(zone, 0, mark,
zonelist_zone_idx(ac.preferred_zoneref),
alloc_flags, gfp)) {
break;
}
+
+ if (cond_accept_memory(zone, 0))
+ goto retry_this_zone;
+
+ /* Try again if zone has deferred pages */
+ if (deferred_pages_enabled()) {
+ if (_deferred_grow_zone(zone, 0))
+ goto retry_this_zone;
+ }
}
/*
@@ -4638,7 +4669,7 @@ unsigned long alloc_pages_bulk_noprof(gfp_t gfp, int preferred_nid,
pcp_trylock_finish(UP_flags);
__count_zid_vm_events(PGALLOC, zone_idx(zone), nr_account);
- zone_statistics(ac.preferred_zoneref->zone, zone, nr_account);
+ zone_statistics(zonelist_zone(ac.preferred_zoneref), zone, nr_account);
out:
return nr_populated;
@@ -4696,7 +4727,7 @@ struct page *__alloc_pages_noprof(gfp_t gfp, unsigned int order,
* Forbid the first pass from falling back to types that fragment
* memory until all local zones are considered.
*/
- alloc_flags |= alloc_flags_nofragment(ac.preferred_zoneref->zone, gfp);
+ alloc_flags |= alloc_flags_nofragment(zonelist_zone(ac.preferred_zoneref), gfp);
/* First allocation attempt */
page = get_page_from_freelist(alloc_gfp, order, alloc_flags, &ac);
@@ -4950,7 +4981,7 @@ static void *make_alloc_exact(unsigned long addr, unsigned int order,
struct page *last = page + nr;
split_page_owner(page, order, 0);
- pgalloc_tag_split(page, 1 << order);
+ pgalloc_tag_split(page_folio(page), order, 0);
split_page_memcg(page, order, 0);
while (page < --last)
set_page_refcounted(last);
@@ -5301,7 +5332,7 @@ int local_memory_node(int node)
z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL),
gfp_zone(GFP_KERNEL),
NULL);
- return zone_to_nid(z->zone);
+ return zonelist_node_idx(z);
}
#endif
@@ -6433,6 +6464,31 @@ int __alloc_contig_migrate_range(struct compact_control *cc,
return (ret < 0) ? ret : 0;
}
+static void split_free_pages(struct list_head *list)
+{
+ int order;
+
+ for (order = 0; order < NR_PAGE_ORDERS; order++) {
+ struct page *page, *next;
+ int nr_pages = 1 << order;
+
+ list_for_each_entry_safe(page, next, &list[order], lru) {
+ int i;
+
+ post_alloc_hook(page, order, __GFP_MOVABLE);
+ if (!order)
+ continue;
+
+ split_page(page, order);
+
+ /* Add all subpages to the order-0 head, in sequence. */
+ list_del(&page->lru);
+ for (i = 0; i < nr_pages; i++)
+ list_add_tail(&page[i].lru, &list[0]);
+ }
+ }
+}
+
/**
* alloc_contig_range() -- tries to allocate given range of pages
* @start: start PFN to allocate
@@ -6545,12 +6601,25 @@ int alloc_contig_range_noprof(unsigned long start, unsigned long end,
goto done;
}
- /* Free head and tail (if any) */
- if (start != outer_start)
- free_contig_range(outer_start, start - outer_start);
- if (end != outer_end)
- free_contig_range(end, outer_end - end);
+ if (!(gfp_mask & __GFP_COMP)) {
+ split_free_pages(cc.freepages);
+ /* Free head and tail (if any) */
+ if (start != outer_start)
+ free_contig_range(outer_start, start - outer_start);
+ if (end != outer_end)
+ free_contig_range(end, outer_end - end);
+ } else if (start == outer_start && end == outer_end && is_power_of_2(end - start)) {
+ struct page *head = pfn_to_page(start);
+ int order = ilog2(end - start);
+
+ check_new_pages(head, order);
+ prep_new_page(head, order, gfp_mask, 0);
+ } else {
+ ret = -EINVAL;
+ WARN(true, "PFN range: requested [%lu, %lu), allocated [%lu, %lu)\n",
+ start, end, outer_start, outer_end);
+ }
done:
undo_isolate_page_range(start, end, migratetype);
return ret;
@@ -6659,6 +6728,18 @@ struct page *alloc_contig_pages_noprof(unsigned long nr_pages, gfp_t gfp_mask,
void free_contig_range(unsigned long pfn, unsigned long nr_pages)
{
unsigned long count = 0;
+ struct folio *folio = pfn_folio(pfn);
+
+ if (folio_test_large(folio)) {
+ int expected = folio_nr_pages(folio);
+
+ if (nr_pages == expected)
+ folio_put(folio);
+ else
+ WARN(true, "PFN %lu: nr_pages %lu != expected %d\n",
+ pfn, nr_pages, expected);
+ return;
+ }
for (; nr_pages--; pfn++) {
struct page *page = pfn_to_page(pfn);
@@ -6927,23 +7008,50 @@ early_param("accept_memory", accept_memory_parse);
static bool page_contains_unaccepted(struct page *page, unsigned int order)
{
phys_addr_t start = page_to_phys(page);
- phys_addr_t end = start + (PAGE_SIZE << order);
- return range_contains_unaccepted_memory(start, end);
+ return range_contains_unaccepted_memory(start, PAGE_SIZE << order);
}
-static void accept_page(struct page *page, unsigned int order)
+static void __accept_page(struct zone *zone, unsigned long *flags,
+ struct page *page)
{
- phys_addr_t start = page_to_phys(page);
+ bool last;
+
+ list_del(&page->lru);
+ last = list_empty(&zone->unaccepted_pages);
+
+ account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
+ __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
+ __ClearPageUnaccepted(page);
+ spin_unlock_irqrestore(&zone->lock, *flags);
+
+ accept_memory(page_to_phys(page), PAGE_SIZE << MAX_PAGE_ORDER);
+
+ __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
- accept_memory(start, start + (PAGE_SIZE << order));
+ if (last)
+ static_branch_dec(&zones_with_unaccepted_pages);
+}
+
+void accept_page(struct page *page)
+{
+ struct zone *zone = page_zone(page);
+ unsigned long flags;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ if (!PageUnaccepted(page)) {
+ spin_unlock_irqrestore(&zone->lock, flags);
+ return;
+ }
+
+ /* Unlocks zone->lock */
+ __accept_page(zone, &flags, page);
}
static bool try_to_accept_memory_one(struct zone *zone)
{
unsigned long flags;
struct page *page;
- bool last;
spin_lock_irqsave(&zone->lock, flags);
page = list_first_entry_or_null(&zone->unaccepted_pages,
@@ -6953,23 +7061,17 @@ static bool try_to_accept_memory_one(struct zone *zone)
return false;
}
- list_del(&page->lru);
- last = list_empty(&zone->unaccepted_pages);
-
- account_freepages(zone, -MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
- __mod_zone_page_state(zone, NR_UNACCEPTED, -MAX_ORDER_NR_PAGES);
- spin_unlock_irqrestore(&zone->lock, flags);
-
- accept_page(page, MAX_PAGE_ORDER);
-
- __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL);
-
- if (last)
- static_branch_dec(&zones_with_unaccepted_pages);
+ /* Unlocks zone->lock */
+ __accept_page(zone, &flags, page);
return true;
}
+static inline bool has_unaccepted_memory(void)
+{
+ return static_branch_unlikely(&zones_with_unaccepted_pages);
+}
+
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
long to_accept;
@@ -6981,8 +7083,8 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
if (list_empty(&zone->unaccepted_pages))
return false;
- /* How much to accept to get to high watermark? */
- to_accept = high_wmark_pages(zone) -
+ /* How much to accept to get to promo watermark? */
+ to_accept = promo_wmark_pages(zone) -
(zone_page_state(zone, NR_FREE_PAGES) -
__zone_watermark_unusable_free(zone, order, 0) -
zone_page_state(zone, NR_UNACCEPTED));
@@ -6997,11 +7099,6 @@ static bool cond_accept_memory(struct zone *zone, unsigned int order)
return ret;
}
-static inline bool has_unaccepted_memory(void)
-{
- return static_branch_unlikely(&zones_with_unaccepted_pages);
-}
-
static bool __free_unaccepted(struct page *page)
{
struct zone *zone = page_zone(page);
@@ -7016,6 +7113,7 @@ static bool __free_unaccepted(struct page *page)
list_add_tail(&page->lru, &zone->unaccepted_pages);
account_freepages(zone, MAX_ORDER_NR_PAGES, MIGRATE_MOVABLE);
__mod_zone_page_state(zone, NR_UNACCEPTED, MAX_ORDER_NR_PAGES);
+ __SetPageUnaccepted(page);
spin_unlock_irqrestore(&zone->lock, flags);
if (first)
@@ -7031,20 +7129,11 @@ static bool page_contains_unaccepted(struct page *page, unsigned int order)
return false;
}
-static void accept_page(struct page *page, unsigned int order)
-{
-}
-
static bool cond_accept_memory(struct zone *zone, unsigned int order)
{
return false;
}
-static inline bool has_unaccepted_memory(void)
-{
- return false;
-}
-
static bool __free_unaccepted(struct page *page)
{
BUILD_BUG();
diff --git a/mm/page_counter.c b/mm/page_counter.c
index 0153f5bb3161..b249d15af9dd 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -13,6 +13,11 @@
#include <linux/bug.h>
#include <asm/page.h>
+static bool track_protection(struct page_counter *c)
+{
+ return c->protection_support;
+}
+
static void propagate_protected_usage(struct page_counter *c,
unsigned long usage)
{
@@ -57,7 +62,8 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
new = 0;
atomic_long_set(&counter->usage, new);
}
- propagate_protected_usage(counter, new);
+ if (track_protection(counter))
+ propagate_protected_usage(counter, new);
}
/**
@@ -70,18 +76,33 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages)
void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
{
struct page_counter *c;
+ bool protection = track_protection(counter);
for (c = counter; c; c = c->parent) {
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_protected_usage(c, new);
+ if (protection)
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
+ *
+ * Notably, we have two watermarks to allow for both a globally
+ * visible peak and one that can be reset at a smaller scope.
+ *
+ * Since we reset both watermarks when the global reset occurs,
+ * we can guarantee that watermark >= local_watermark, so we
+ * don't need to do both comparisons every time.
+ *
+ * On systems with branch predictors, the inner condition should
+ * be almost free.
*/
- if (new > READ_ONCE(c->watermark))
- WRITE_ONCE(c->watermark, new);
+ if (new > READ_ONCE(c->local_watermark)) {
+ WRITE_ONCE(c->local_watermark, new);
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
+ }
}
}
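
A standalone sketch of the dual-watermark rule described in the comment above: local_watermark can be reset at a smaller scope, a global reset clears both, so watermark >= local_watermark always holds and the hot path only needs the first compare. Plain, non-atomic code for illustration only:

struct counter { long usage, wm, local_wm; };

static void charge(struct counter *c, long nr)
{
	c->usage += nr;
	if (c->usage > c->local_wm) {
		c->local_wm = c->usage;		/* smaller-scope peak */
		if (c->usage > c->wm)
			c->wm = c->usage;	/* globally visible peak */
	}
}

static void reset_local_peak(struct counter *c)
{
	c->local_wm = c->usage;	/* start tracking a new local peak */
}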
@@ -99,6 +120,7 @@ bool page_counter_try_charge(struct page_counter *counter,
struct page_counter **fail)
{
struct page_counter *c;
+ bool protection = track_protection(counter);
for (c = counter; c; c = c->parent) {
long new;
@@ -128,13 +150,15 @@ bool page_counter_try_charge(struct page_counter *counter,
*fail = c;
goto failed;
}
- propagate_protected_usage(c, new);
- /*
- * Just like with failcnt, we can live with some
- * inaccuracy in the watermark.
- */
- if (new > READ_ONCE(c->watermark))
- WRITE_ONCE(c->watermark, new);
+ if (protection)
+ propagate_protected_usage(c, new);
+
+ /* see comment on page_counter_charge */
+ if (new > READ_ONCE(c->local_watermark)) {
+ WRITE_ONCE(c->local_watermark, new);
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
+ }
}
return true;
@@ -264,6 +288,7 @@ int page_counter_memparse(const char *buf, const char *max,
}
+#ifdef CONFIG_MEMCG
/*
* This function calculates an individual page counter's effective
* protection which is derived from its own memory.min/low, its
@@ -435,3 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root,
atomic_long_read(&parent->children_low_usage),
recursive_protection));
}
+#endif /* CONFIG_MEMCG */
diff --git a/mm/page_io.c b/mm/page_io.c
index ff8c99ee3af7..78bc88acee79 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -172,6 +172,60 @@ bad_bmap:
goto out;
}
+static bool is_folio_zero_filled(struct folio *folio)
+{
+ unsigned int pos, last_pos;
+ unsigned long *data;
+ unsigned int i;
+
+ last_pos = PAGE_SIZE / sizeof(*data) - 1;
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ data = kmap_local_folio(folio, i * PAGE_SIZE);
+ /*
+		 * Check last word first, in case the page is zero-filled at
+ * the start and has non-zero data at the end, which is common
+ * in real-world workloads.
+ */
+ if (data[last_pos]) {
+ kunmap_local(data);
+ return false;
+ }
+ for (pos = 0; pos < last_pos; pos++) {
+ if (data[pos]) {
+ kunmap_local(data);
+ return false;
+ }
+ }
+ kunmap_local(data);
+ }
+
+ return true;
+}
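
The "check the last word first" trick cheaply rejects pages that start zeroed but have data appended near the end. The same scan on a plain buffer, as a hedged standalone sketch:

#include <stdbool.h>
#include <stddef.h>

static bool buf_is_zero_filled(const unsigned long *data, size_t words)
{
	size_t last = words - 1;

	if (data[last])			/* cheap early exit for the common case */
		return false;
	for (size_t i = 0; i < last; i++)
		if (data[i])
			return false;
	return true;
}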
+
+static void swap_zeromap_folio_set(struct folio *folio)
+{
+ struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ swp_entry_t entry;
+ unsigned int i;
+
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ entry = page_swap_entry(folio_page(folio, i));
+ set_bit(swp_offset(entry), sis->zeromap);
+ }
+}
+
+static void swap_zeromap_folio_clear(struct folio *folio)
+{
+ struct swap_info_struct *sis = swp_swap_info(folio->swap);
+ swp_entry_t entry;
+ unsigned int i;
+
+ for (i = 0; i < folio_nr_pages(folio); i++) {
+ entry = page_swap_entry(folio_page(folio, i));
+ clear_bit(swp_offset(entry), sis->zeromap);
+ }
+}
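
Conceptually the zeromap is one bit per swap slot: set when the page written out was entirely zero, cleared otherwise, and tested on read to synthesize zeros without touching the device. A minimal non-kernel sketch of that bitmap:

#include <limits.h>

#define MAP_BITS 4096
#define BPL (sizeof(unsigned long) * CHAR_BIT)

static unsigned long zeromap[MAP_BITS / BPL];

static void zmap_set(unsigned long off)   { zeromap[off / BPL] |=  1UL << (off % BPL); }
static void zmap_clear(unsigned long off) { zeromap[off / BPL] &= ~(1UL << (off % BPL)); }
static int  zmap_test(unsigned long off)  { return (zeromap[off / BPL] >> (off % BPL)) & 1; }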
+
/*
* We may have stale swap cache pages in memory: notice
* them here and get rid of the unnecessary final write.
@@ -195,6 +249,25 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
folio_unlock(folio);
return ret;
}
+
+ /*
+ * Use a bitmap (zeromap) to avoid doing IO for zero-filled pages.
+ * The bits in zeromap are protected by the locked swapcache folio
+ * and atomic updates are used to protect against read-modify-write
+ * corruption due to other zero swap entries seeing concurrent updates.
+ */
+ if (is_folio_zero_filled(folio)) {
+ swap_zeromap_folio_set(folio);
+ folio_unlock(folio);
+ return 0;
+ } else {
+ /*
+ * Clear bits this folio occupies in the zeromap to prevent
+ * zero data being read in from any previous zero writes that
+ * occupied the same swap entries.
+ */
+ swap_zeromap_folio_clear(folio);
+ }
if (zswap_store(folio)) {
folio_unlock(folio);
return 0;
@@ -273,9 +346,7 @@ static void sio_write_complete(struct kiocb *iocb, long ret)
* memory for allocating transmit buffers.
* Mark the page dirty and avoid
* folio_rotate_reclaimable but rate-limit the
- * messages but do not flag PageError like
- * the normal direct-to-bio case as it could
- * be temporary.
+ * messages.
*/
pr_err_ratelimited("Write error %ld on dio swapfile (%llu)\n",
ret, swap_dev_pos(page_swap_entry(page)));
@@ -429,6 +500,28 @@ static void sio_read_complete(struct kiocb *iocb, long ret)
mempool_free(sio, sio_pool);
}
+static bool swap_read_folio_zeromap(struct folio *folio)
+{
+ int nr_pages = folio_nr_pages(folio);
+ bool is_zeromap;
+
+ /*
+ * Swapping in a large folio that is partially in the zeromap is not
+ * currently handled. Return true without marking the folio uptodate so
+ * that an IO error is emitted (e.g. do_swap_page() will sigbus).
+ */
+ if (WARN_ON_ONCE(swap_zeromap_batch(folio->swap, nr_pages,
+ &is_zeromap) != nr_pages))
+ return true;
+
+ if (!is_zeromap)
+ return false;
+
+ folio_zero_range(folio, 0, folio_size(folio));
+ folio_mark_uptodate(folio);
+ return true;
+}
+
static void swap_read_folio_fs(struct folio *folio, struct swap_iocb **plug)
{
struct swap_info_struct *sis = swp_swap_info(folio->swap);
@@ -519,9 +612,18 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
}
delayacct_swapin_start();
- if (zswap_load(folio)) {
+ if (swap_read_folio_zeromap(folio)) {
+ folio_unlock(folio);
+ goto finish;
+ } else if (zswap_load(folio)) {
folio_unlock(folio);
- } else if (data_race(sis->flags & SWP_FS_OPS)) {
+ goto finish;
+ }
+
+ /* We have to read from slower devices. Increase zswap protection. */
+ zswap_folio_swapin(folio);
+
+ if (data_race(sis->flags & SWP_FS_OPS)) {
swap_read_folio_fs(folio, plug);
} else if (synchronous) {
swap_read_folio_bdev_sync(folio, sis);
@@ -529,6 +631,7 @@ void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
swap_read_folio_bdev_async(folio, sis);
}
+finish:
if (workingset) {
delayacct_thrashing_end(&in_thrashing);
psi_memstall_leave(&pflags);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 042937d5abe4..7e04047977cf 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -152,6 +152,9 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
unsigned long flags;
unsigned long check_unmovable_start, check_unmovable_end;
+ if (PageUnaccepted(page))
+ accept_page(page);
+
spin_lock_irqsave(&zone->lock, flags);
/*
@@ -367,6 +370,11 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
VM_BUG_ON(!page);
pfn = page_to_pfn(page);
+ if (PageUnaccepted(page)) {
+ pfn += MAX_ORDER_NR_PAGES;
+ continue;
+ }
+
if (PageBuddy(page)) {
int order = buddy_order(page);
@@ -395,30 +403,8 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
unsigned long head_pfn = page_to_pfn(head);
unsigned long nr_pages = compound_nr(head);
- if (head_pfn + nr_pages <= boundary_pfn) {
- pfn = head_pfn + nr_pages;
- continue;
- }
-
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- if (PageHuge(page)) {
- int page_mt = get_pageblock_migratetype(page);
- struct compact_control cc = {
- .nr_migratepages = 0,
- .order = -1,
- .zone = page_zone(pfn_to_page(head_pfn)),
- .mode = MIGRATE_SYNC,
- .ignore_skip_hint = true,
- .no_set_skip_hint = true,
- .gfp_mask = gfp_flags,
- .alloc_contig = true,
- };
- INIT_LIST_HEAD(&cc.migratepages);
-
- ret = __alloc_contig_migrate_range(&cc, head_pfn,
- head_pfn + nr_pages, page_mt);
- if (ret)
- goto failed;
+ if (head_pfn + nr_pages <= boundary_pfn ||
+ PageHuge(page)) {
pfn = head_pfn + nr_pages;
continue;
}
@@ -432,7 +418,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags,
*/
VM_WARN_ON_ONCE_PAGE(PageLRU(page), page);
VM_WARN_ON_ONCE_PAGE(__PageMovable(page), page);
-#endif
+
goto failed;
}
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ae2f08ce991b..461ea3bbd8d9 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -3,6 +3,8 @@
#include <linux/highmem.h>
#include <linux/sched.h>
#include <linux/hugetlb.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
/*
  * We want to know the real level where an entry is located ignoring any
@@ -654,3 +656,203 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index,
return err;
}
+
+/**
+ * folio_walk_start - walk the page tables to a folio
+ * @fw: filled with information on success.
+ * @vma: the VMA.
+ * @addr: the virtual address to use for the page table walk.
+ * @flags: flags modifying which folios to walk to.
+ *
+ * Walk the page tables using @addr in a given @vma to a mapped folio and
+ * return the folio, making sure that the page table entry referenced by
+ * @addr cannot change until folio_walk_end() was called.
+ *
+ * By default, this function returns only folios that are not special (e.g., not
+ * the zeropage) and never returns folios that are supposed to be ignored by the
+ * VM as documented by vm_normal_page(). If requested, zeropages will be
+ * returned as well.
+ *
+ * By default, this function only considers present page table entries.
+ * If requested, it will also consider migration entries.
+ *
+ * If this function returns NULL it might either indicate "there is nothing" or
+ * "there is nothing suitable".
+ *
+ * On success, @fw is filled and the function returns the folio while the PTL
+ * is still held and folio_walk_end() must be called to clean up,
+ * releasing any held locks. The returned folio must *not* be used after the
+ * call to folio_walk_end(), unless a short-term folio reference is taken before
+ * that call.
+ *
+ * @fw->page will correspond to the page that is effectively referenced by
+ * @addr. However, for migration entries and shared zeropages @fw->page is
+ * set to NULL. Note that large folios might be mapped by multiple page table
+ * entries, and this function will always only look up a single entry as
+ * specified by @addr, which might or might not cover more than a single page of
+ * the returned folio.
+ *
+ * This function must *not* be used as a naive replacement for
+ * get_user_pages() / pin_user_pages(), especially not to perform DMA or
+ * to carelessly modify page content. This function may *only* be used to grab
+ * short-term folio references, never to grab long-term folio references.
+ *
+ * Using the page table entry pointers in @fw for reading or modifying the
+ * entry should be avoided where possible: however, there might be valid
+ * use cases.
+ *
+ * WARNING: Modifying page table entries in hugetlb VMAs requires a lot of care.
+ * For example, PMD page table sharing might require prior unsharing. Also,
+ * logical hugetlb entries might span multiple physical page table entries,
+ * which *must* be modified in a single operation (set_huge_pte_at(),
+ * huge_ptep_set_*, ...). Note that the page table entry stored in @fw might
+ * not correspond to the first physical entry of a logical hugetlb entry.
+ *
+ * The mmap lock must be held in read mode.
+ *
+ * Return: folio pointer on success, otherwise NULL.
+ */
+struct folio *folio_walk_start(struct folio_walk *fw,
+ struct vm_area_struct *vma, unsigned long addr,
+ folio_walk_flags_t flags)
+{
+ unsigned long entry_size;
+ bool expose_page = true;
+ struct page *page;
+ pud_t *pudp, pud;
+ pmd_t *pmdp, pmd;
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ pgd_t *pgdp;
+ p4d_t *p4dp;
+
+ mmap_assert_locked(vma->vm_mm);
+ vma_pgtable_walk_begin(vma);
+
+ if (WARN_ON_ONCE(addr < vma->vm_start || addr >= vma->vm_end))
+ goto not_found;
+
+ pgdp = pgd_offset(vma->vm_mm, addr);
+ if (pgd_none_or_clear_bad(pgdp))
+ goto not_found;
+
+ p4dp = p4d_offset(pgdp, addr);
+ if (p4d_none_or_clear_bad(p4dp))
+ goto not_found;
+
+ pudp = pud_offset(p4dp, addr);
+ pud = pudp_get(pudp);
+ if (pud_none(pud))
+ goto not_found;
+ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pud_leaf(pud)) {
+ ptl = pud_lock(vma->vm_mm, pudp);
+ pud = pudp_get(pudp);
+
+ entry_size = PUD_SIZE;
+ fw->level = FW_LEVEL_PUD;
+ fw->pudp = pudp;
+ fw->pud = pud;
+
+ if (!pud_present(pud) || pud_devmap(pud) || pud_special(pud)) {
+ spin_unlock(ptl);
+ goto not_found;
+ } else if (!pud_leaf(pud)) {
+ spin_unlock(ptl);
+ goto pmd_table;
+ }
+ /*
+ * TODO: vm_normal_page_pud() will be handy once we want to
+ * support PUD mappings in VM_PFNMAP|VM_MIXEDMAP VMAs.
+ */
+ page = pud_page(pud);
+ goto found;
+ }
+
+pmd_table:
+ VM_WARN_ON_ONCE(pud_leaf(*pudp));
+ pmdp = pmd_offset(pudp, addr);
+ pmd = pmdp_get_lockless(pmdp);
+ if (pmd_none(pmd))
+ goto not_found;
+ if (IS_ENABLED(CONFIG_PGTABLE_HAS_HUGE_LEAVES) && pmd_leaf(pmd)) {
+ ptl = pmd_lock(vma->vm_mm, pmdp);
+ pmd = pmdp_get(pmdp);
+
+ entry_size = PMD_SIZE;
+ fw->level = FW_LEVEL_PMD;
+ fw->pmdp = pmdp;
+ fw->pmd = pmd;
+
+ if (pmd_none(pmd)) {
+ spin_unlock(ptl);
+ goto not_found;
+ } else if (!pmd_leaf(pmd)) {
+ spin_unlock(ptl);
+ goto pte_table;
+ } else if (pmd_present(pmd)) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (page) {
+ goto found;
+ } else if ((flags & FW_ZEROPAGE) &&
+ is_huge_zero_pmd(pmd)) {
+ page = pfn_to_page(pmd_pfn(pmd));
+ expose_page = false;
+ goto found;
+ }
+ } else if ((flags & FW_MIGRATION) &&
+ is_pmd_migration_entry(pmd)) {
+ swp_entry_t entry = pmd_to_swp_entry(pmd);
+
+ page = pfn_swap_entry_to_page(entry);
+ expose_page = false;
+ goto found;
+ }
+ spin_unlock(ptl);
+ goto not_found;
+ }
+
+pte_table:
+ VM_WARN_ON_ONCE(pmd_leaf(pmdp_get_lockless(pmdp)));
+ ptep = pte_offset_map_lock(vma->vm_mm, pmdp, addr, &ptl);
+ if (!ptep)
+ goto not_found;
+ pte = ptep_get(ptep);
+
+ entry_size = PAGE_SIZE;
+ fw->level = FW_LEVEL_PTE;
+ fw->ptep = ptep;
+ fw->pte = pte;
+
+ if (pte_present(pte)) {
+ page = vm_normal_page(vma, addr, pte);
+ if (page)
+ goto found;
+ if ((flags & FW_ZEROPAGE) &&
+ is_zero_pfn(pte_pfn(pte))) {
+ page = pfn_to_page(pte_pfn(pte));
+ expose_page = false;
+ goto found;
+ }
+ } else if (!pte_none(pte)) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if ((flags & FW_MIGRATION) &&
+ is_migration_entry(entry)) {
+ page = pfn_swap_entry_to_page(entry);
+ expose_page = false;
+ goto found;
+ }
+ }
+ pte_unmap_unlock(ptep, ptl);
+not_found:
+ vma_pgtable_walk_end(vma);
+ return NULL;
+found:
+ if (expose_page)
+ /* Note: Offset from the mapped page, not the folio start. */
+ fw->page = nth_page(page, (addr & (entry_size - 1)) >> PAGE_SHIFT);
+ else
+ fw->page = NULL;
+ fw->ptl = ptl;
+ return page_folio(page);
+}
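
A hedged usage sketch for the interface documented above: grab a short-term reference to whatever folio is mapped at addr while holding the mmap lock in read mode, assuming folio_walk_end() releases the page table lock taken by folio_walk_start() as described.

static struct folio *grab_mapped_folio(struct vm_area_struct *vma,
				       unsigned long addr)
{
	struct folio_walk fw;
	struct folio *folio;

	folio = folio_walk_start(&fw, vma, addr, 0);
	if (!folio)
		return NULL;		/* nothing (suitable) mapped here */

	folio_get(folio);		/* short-term reference only */
	folio_walk_end(&fw, vma);	/* drops the PTL taken by _start() */
	return folio;
}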
diff --git a/mm/percpu.c b/mm/percpu.c
index 20d91af8c033..da21680ff294 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -2217,37 +2217,6 @@ static void pcpu_balance_workfn(struct work_struct *work)
}
/**
- * pcpu_alloc_size - the size of the dynamic percpu area
- * @ptr: pointer to the dynamic percpu area
- *
- * Returns the size of the @ptr allocation. This is undefined for statically
- * defined percpu variables as there is no corresponding chunk->bound_map.
- *
- * RETURNS:
- * The size of the dynamic percpu area.
- *
- * CONTEXT:
- * Can be called from atomic context.
- */
-size_t pcpu_alloc_size(void __percpu *ptr)
-{
- struct pcpu_chunk *chunk;
- unsigned long bit_off, end;
- void *addr;
-
- if (!ptr)
- return 0;
-
- addr = __pcpu_ptr_to_addr(ptr);
- /* No pcpu_lock here: ptr has not been freed, so chunk is still alive */
- chunk = pcpu_chunk_addr_search(addr);
- bit_off = (addr - chunk->base_addr) / PCPU_MIN_ALLOC_SIZE;
- end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk),
- bit_off + 1);
- return (end - bit_off) * PCPU_MIN_ALLOC_SIZE;
-}
-
-/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
diff --git a/mm/rmap.c b/mm/rmap.c
index 2490e727e2dc..a8797d1b3d49 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -75,6 +75,7 @@
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
#include <linux/mm_inline.h>
+#include <linux/oom.h>
#include <asm/tlbflush.h>
@@ -870,6 +871,20 @@ static bool folio_referenced_one(struct folio *folio,
continue;
}
+ /*
+ * Skip the non-shared swapbacked folio mapped solely by
+ * the exiting or OOM-reaped process. This avoids redundant
+ * swap-out followed by an immediate unmap.
+ */
+ if ((!atomic_read(&vma->vm_mm->mm_users) ||
+ check_stable_address_space(vma->vm_mm)) &&
+ folio_test_anon(folio) && folio_test_swapbacked(folio) &&
+ !folio_likely_mapped_shared(folio)) {
+ pra->referenced = -1;
+ page_vma_mapped_walk_done(&pvmw);
+ return false;
+ }
+
if (pvmw.pte) {
if (lru_gen_enabled() &&
pte_young(ptep_get(pvmw.pte))) {
@@ -1143,25 +1158,25 @@ static __always_inline unsigned int __folio_add_rmap(struct folio *folio,
{
atomic_t *mapped = &folio->_nr_pages_mapped;
const int orig_nr_pages = nr_pages;
- int first, nr = 0;
+ int first = 0, nr = 0;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
switch (level) {
case RMAP_LEVEL_PTE:
if (!folio_test_large(folio)) {
- nr = atomic_inc_and_test(&page->_mapcount);
+ nr = atomic_inc_and_test(&folio->_mapcount);
break;
}
do {
- first = atomic_inc_and_test(&page->_mapcount);
- if (first) {
- first = atomic_inc_return_relaxed(mapped);
- if (first < ENTIRELY_MAPPED)
- nr++;
- }
+ first += atomic_inc_and_test(&page->_mapcount);
} while (page++, --nr_pages > 0);
+
+ if (first &&
+ atomic_add_return_relaxed(first, mapped) < ENTIRELY_MAPPED)
+ nr = first;
+
atomic_add(orig_nr_pages, &folio->_large_mapcount);
break;
case RMAP_LEVEL_PMD:
@@ -1452,6 +1467,7 @@ void folio_add_new_anon_rmap(struct folio *folio, struct vm_area_struct *vma,
}
__folio_mod_stat(folio, nr, nr_pmdmapped);
+ mod_mthp_stat(folio_order(folio), MTHP_STAT_NR_ANON, 1);
}
static __always_inline void __folio_add_file_rmap(struct folio *folio,
@@ -1512,7 +1528,7 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
enum rmap_level level)
{
atomic_t *mapped = &folio->_nr_pages_mapped;
- int last, nr = 0, nr_pmdmapped = 0;
+ int last = 0, nr = 0, nr_pmdmapped = 0;
bool partially_mapped = false;
__folio_rmap_sanity_checks(folio, page, nr_pages, level);
@@ -1520,20 +1536,19 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
switch (level) {
case RMAP_LEVEL_PTE:
if (!folio_test_large(folio)) {
- nr = atomic_add_negative(-1, &page->_mapcount);
+ nr = atomic_add_negative(-1, &folio->_mapcount);
break;
}
atomic_sub(nr_pages, &folio->_large_mapcount);
do {
- last = atomic_add_negative(-1, &page->_mapcount);
- if (last) {
- last = atomic_dec_return_relaxed(mapped);
- if (last < ENTIRELY_MAPPED)
- nr++;
- }
+ last += atomic_add_negative(-1, &page->_mapcount);
} while (page++, --nr_pages > 0);
+ if (last &&
+ atomic_sub_return_relaxed(last, mapped) < ENTIRELY_MAPPED)
+ nr = last;
+
partially_mapped = nr && atomic_read(mapped);
break;
case RMAP_LEVEL_PMD:
@@ -1553,22 +1568,20 @@ static __always_inline void __folio_remove_rmap(struct folio *folio,
}
}
- partially_mapped = nr < nr_pmdmapped;
+ partially_mapped = nr && nr < nr_pmdmapped;
break;
}
- if (nr) {
- /*
- * Queue anon large folio for deferred split if at least one
- * page of the folio is unmapped and at least one page
- * is still mapped.
- *
- * Check partially_mapped first to ensure it is a large folio.
- */
- if (folio_test_anon(folio) && partially_mapped &&
- list_empty(&folio->_deferred_list))
- deferred_split_folio(folio);
- }
+ /*
+ * Queue anon large folio for deferred split if at least one page of
+ * the folio is unmapped and at least one page is still mapped.
+ *
+ * Check partially_mapped first to ensure it is a large folio.
+ */
+ if (partially_mapped && folio_test_anon(folio) &&
+ !folio_test_partially_mapped(folio))
+ deferred_split_folio(folio, true);
+
__folio_mod_stat(folio, -nr, -nr_pmdmapped);
/*
diff --git a/mm/shmem.c b/mm/shmem.c
index b875852df51f..6eff8771d9cb 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -155,7 +155,7 @@ static unsigned long shmem_default_max_inodes(void)
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct mm_struct *fault_mm, vm_fault_t *fault_type);
+ struct vm_area_struct *vma, vm_fault_t *fault_type);
static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
{
@@ -502,8 +502,8 @@ static int shmem_replace_entry(struct address_space *mapping,
* Sometimes, before we decide whether to proceed or to fail, we must check
* that an entry was not already brought back from swap by a racing thread.
*
- * Checking page is not enough: by the time a SwapCache page is locked, it
- * might be reused, and again be SwapCache, using the same swap as before.
+ * Checking folio is not enough: by the time a swapcache folio is locked, it
+ * might be reused, and again be swapcache, using the same swap as before.
*/
static bool shmem_confirm_swap(struct address_space *mapping,
pgoff_t index, swp_entry_t swap)
@@ -548,10 +548,12 @@ static bool shmem_confirm_swap(struct address_space *mapping,
static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER;
-static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
- bool shmem_huge_force, struct mm_struct *mm,
- unsigned long vm_flags)
+static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags)
{
+ struct mm_struct *mm = vma ? vma->vm_mm : NULL;
loff_t i_size;
if (!S_ISREG(inode->i_mode))
@@ -568,7 +570,8 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
return true;
case SHMEM_HUGE_WITHIN_SIZE:
index = round_up(index + 1, HPAGE_PMD_NR);
- i_size = round_up(i_size_read(inode), PAGE_SIZE);
+ i_size = max(write_end, i_size_read(inode));
+ i_size = round_up(i_size, PAGE_SIZE);
if (i_size >> PAGE_SHIFT >= index)
return true;
fallthrough;
@@ -581,14 +584,15 @@ static bool __shmem_is_huge(struct inode *inode, pgoff_t index,
}
}
-bool shmem_is_huge(struct inode *inode, pgoff_t index,
- bool shmem_huge_force, struct mm_struct *mm,
- unsigned long vm_flags)
+static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma, unsigned long vm_flags)
{
if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER)
return false;
- return __shmem_is_huge(inode, index, shmem_huge_force, mm, vm_flags);
+ return __shmem_huge_global_enabled(inode, index, write_end,
+ shmem_huge_force, vma, vm_flags);
}
#if defined(CONFIG_SYSFS)
@@ -634,15 +638,14 @@ static const char *shmem_format_huge(int huge)
#endif
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
- struct shrink_control *sc, unsigned long nr_to_split)
+ struct shrink_control *sc, unsigned long nr_to_free)
{
LIST_HEAD(list), *pos, *next;
- LIST_HEAD(to_remove);
struct inode *inode;
struct shmem_inode_info *info;
struct folio *folio;
unsigned long batch = sc ? sc->nr_to_scan : 128;
- int split = 0;
+ unsigned long split = 0, freed = 0;
if (list_empty(&sbinfo->shrinklist))
return SHRINK_STOP;
@@ -660,13 +663,6 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
goto next;
}
- /* Check if there's anything to gain */
- if (round_up(inode->i_size, PAGE_SIZE) ==
- round_up(inode->i_size, HPAGE_PMD_SIZE)) {
- list_move(&info->shrinklist, &to_remove);
- goto next;
- }
-
list_move(&info->shrinklist, &list);
next:
sbinfo->shrinklist_len--;
@@ -675,34 +671,36 @@ next:
}
spin_unlock(&sbinfo->shrinklist_lock);
- list_for_each_safe(pos, next, &to_remove) {
- info = list_entry(pos, struct shmem_inode_info, shrinklist);
- inode = &info->vfs_inode;
- list_del_init(&info->shrinklist);
- iput(inode);
- }
-
list_for_each_safe(pos, next, &list) {
+ pgoff_t next, end;
+ loff_t i_size;
int ret;
- pgoff_t index;
info = list_entry(pos, struct shmem_inode_info, shrinklist);
inode = &info->vfs_inode;
- if (nr_to_split && split >= nr_to_split)
+ if (nr_to_free && freed >= nr_to_free)
goto move_back;
- index = (inode->i_size & HPAGE_PMD_MASK) >> PAGE_SHIFT;
- folio = filemap_get_folio(inode->i_mapping, index);
- if (IS_ERR(folio))
+ i_size = i_size_read(inode);
+ folio = filemap_get_entry(inode->i_mapping, i_size / PAGE_SIZE);
+ if (!folio || xa_is_value(folio))
goto drop;
- /* No huge page at the end of the file: nothing to split */
+ /* No large folio at the end of the file: nothing to split */
if (!folio_test_large(folio)) {
folio_put(folio);
goto drop;
}
+ /* Check if there is anything to gain from splitting */
+ next = folio_next_index(folio);
+ end = shmem_fallocend(inode, DIV_ROUND_UP(i_size, PAGE_SIZE));
+ if (end <= folio->index || end >= next) {
+ folio_put(folio);
+ goto drop;
+ }
+
/*
* Move the inode on the list back to shrinklist if we failed
* to lock the page at this time.
@@ -723,6 +721,7 @@ next:
if (ret)
goto move_back;
+ freed += next - end;
split++;
drop:
list_del_init(&info->shrinklist);
@@ -767,10 +766,17 @@ static long shmem_unused_huge_count(struct super_block *sb,
#define shmem_huge SHMEM_HUGE_DENY
static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo,
- struct shrink_control *sc, unsigned long nr_to_split)
+ struct shrink_control *sc, unsigned long nr_to_free)
{
return 0;
}
+
+static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index,
+ loff_t write_end, bool shmem_huge_force,
+ struct vm_area_struct *vma, unsigned long vm_flags)
+{
+ return false;
+}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
/*
@@ -786,7 +792,6 @@ static int shmem_add_to_page_cache(struct folio *folio,
VM_BUG_ON_FOLIO(index != round_down(index, nr), folio);
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio);
- VM_BUG_ON(expected && folio_test_large(folio));
folio_ref_add(folio, nr);
folio->mapping = mapping;
@@ -842,23 +847,27 @@ static void shmem_delete_from_page_cache(struct folio *folio, void *radswap)
__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
xa_unlock_irq(&mapping->i_pages);
- folio_put(folio);
+ folio_put_refs(folio, nr);
BUG_ON(error);
}
/*
- * Remove swap entry from page cache, free the swap and its page cache.
+ * Remove swap entry from page cache, free the swap and its page cache. Returns
+ * the number of pages freed; 0 means the entry was not found in the XArray
+ * (so no pages were freed).
*/
-static int shmem_free_swap(struct address_space *mapping,
- pgoff_t index, void *radswap)
+static long shmem_free_swap(struct address_space *mapping,
+ pgoff_t index, void *radswap)
{
+ int order = xa_get_order(&mapping->i_pages, index);
void *old;
old = xa_cmpxchg_irq(&mapping->i_pages, index, radswap, NULL, 0);
if (old != radswap)
- return -ENOENT;
- free_swap_and_cache(radix_to_swp_entry(radswap));
- return 0;
+ return 0;
+ free_swap_and_cache_nr(radix_to_swp_entry(radswap), 1 << order);
+
+ return 1 << order;
}
/*
@@ -881,7 +890,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
if (xas_retry(&xas, page))
continue;
if (xa_is_value(page))
- swapped++;
+ swapped += 1 << xas_get_order(&xas);
if (xas.xa_index == max)
break;
if (need_resched()) {
@@ -971,7 +980,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* (although in some cases this is just a waste of time).
*/
folio = NULL;
- shmem_get_folio(inode, index, &folio, SGP_READ);
+ shmem_get_folio(inode, index, 0, &folio, SGP_READ);
return folio;
}
@@ -1010,7 +1019,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
if (xa_is_value(folio)) {
if (unfalloc)
continue;
- nr_swaps_freed += !shmem_free_swap(mapping,
+ nr_swaps_freed += shmem_free_swap(mapping,
indices[i], folio);
continue;
}
@@ -1077,14 +1086,17 @@ whole_folios:
folio = fbatch.folios[i];
if (xa_is_value(folio)) {
+ long swaps_freed;
+
if (unfalloc)
continue;
- if (shmem_free_swap(mapping, indices[i], folio)) {
+ swaps_freed = shmem_free_swap(mapping, indices[i], folio);
+ if (!swaps_freed) {
/* Swap was replaced by page: retry */
index = indices[i];
break;
}
- nr_swaps_freed++;
+ nr_swaps_freed += swaps_freed;
continue;
}
@@ -1156,7 +1168,7 @@ static int shmem_getattr(struct mnt_idmap *idmap,
STATX_ATTR_NODUMP);
generic_fillattr(idmap, request_mask, inode, stat);
- if (shmem_is_huge(inode, 0, false, NULL, 0))
+ if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0))
stat->blksize = HPAGE_PMD_SIZE;
if (request_mask & STATX_BTIME) {
@@ -1443,6 +1455,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
swp_entry_t swap;
pgoff_t index;
+ int nr_pages;
+ bool split = false;
/*
* Our capabilities prevent regular writeback or sync from ever calling
@@ -1461,20 +1475,33 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
goto redirty;
/*
- * If /sys/kernel/mm/transparent_hugepage/shmem_enabled is "always" or
- * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages,
- * and its shmem_writeback() needs them to be split when swapping.
+ * If CONFIG_THP_SWAP is not enabled, the large folio should be
+ * split when swapping.
+ *
+ * And shrinkage of pages beyond i_size does not split swap, so
+ * swapout of a large folio crossing i_size needs to split too
+ * (unless fallocate has been used to preallocate beyond EOF).
*/
if (folio_test_large(folio)) {
+ index = shmem_fallocend(inode,
+ DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE));
+ if ((index > folio->index && index < folio_next_index(folio)) ||
+ !IS_ENABLED(CONFIG_THP_SWAP))
+ split = true;
+ }
+
+ if (split) {
+try_split:
/* Ensure the subpages are still dirty */
folio_test_set_dirty(folio);
- if (split_huge_page(page) < 0)
+ if (split_huge_page_to_list_to_order(page, wbc->list, 0))
goto redirty;
folio = page_folio(page);
folio_clear_dirty(folio);
}
index = folio->index;
+ nr_pages = folio_nr_pages(folio);
/*
* This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC
@@ -1509,8 +1536,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
}
swap = folio_alloc_swap(folio);
- if (!swap.val)
+ if (!swap.val) {
+ if (nr_pages > 1)
+ goto try_split;
+
goto redirty;
+ }
/*
* Add inode to shmem_unuse()'s list of swapped-out inodes,
@@ -1527,8 +1558,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
if (add_to_swap_cache(folio, swap,
__GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
NULL) == 0) {
- shmem_recalc_inode(inode, 0, 1);
- swap_shmem_alloc(swap);
+ shmem_recalc_inode(inode, 0, nr_pages);
+ swap_shmem_alloc(swap, nr_pages);
shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap));
mutex_unlock(&shmem_swaplist_mutex);
@@ -1624,22 +1655,33 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
unsigned long shmem_allowable_huge_orders(struct inode *inode,
struct vm_area_struct *vma, pgoff_t index,
- bool global_huge)
+ loff_t write_end, bool shmem_huge_force)
{
unsigned long mask = READ_ONCE(huge_shmem_orders_always);
unsigned long within_size_orders = READ_ONCE(huge_shmem_orders_within_size);
- unsigned long vm_flags = vma->vm_flags;
+ unsigned long vm_flags = vma ? vma->vm_flags : 0;
+ bool global_huge;
loff_t i_size;
int order;
- if ((vm_flags & VM_NOHUGEPAGE) ||
- test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))
+ if (vma && ((vm_flags & VM_NOHUGEPAGE) ||
+ test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)))
return 0;
/* If the hardware/firmware marked hugepage support disabled. */
if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_UNSUPPORTED))
return 0;
+ global_huge = shmem_huge_global_enabled(inode, index, write_end,
+ shmem_huge_force, vma, vm_flags);
+ if (!vma || !vma_is_anon_shmem(vma)) {
+ /*
+		 * For tmpfs, we currently only support PMD-sized THP if huge
+		 * pages are enabled; otherwise fall back to order 0.
+ */
+ return global_huge ? BIT(HPAGE_PMD_ORDER) : 0;
+ }
+
/*
* Following the 'deny' semantics of the top level, force the huge
* option off from all mounts.
@@ -1680,20 +1722,30 @@ static unsigned long shmem_suitable_orders(struct inode *inode, struct vm_fault
struct address_space *mapping, pgoff_t index,
unsigned long orders)
{
- struct vm_area_struct *vma = vmf->vma;
+ struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
pgoff_t aligned_index;
unsigned long pages;
int order;
- orders = thp_vma_suitable_orders(vma, vmf->address, orders);
- if (!orders)
- return 0;
+ if (vma) {
+ orders = thp_vma_suitable_orders(vma, vmf->address, orders);
+ if (!orders)
+ return 0;
+ }
/* Find the highest order that can add into the page cache */
order = highest_order(orders);
while (orders) {
pages = 1UL << order;
aligned_index = round_down(index, pages);
+ /*
+ * Check for conflict before waiting on a huge allocation.
+ * Conflict might be that a huge page has just been allocated
+ * and added to page cache by a racing thread, or that there
+ * is already at least one small page in the huge extent.
+ * Be careful to retry when appropriate, but not forever!
+ * Elsewhere -EEXIST would be the right code, but not here.
+ */
if (!xa_find(&mapping->i_pages, &aligned_index,
aligned_index + pages - 1, XA_PRESENT))
break;
@@ -1731,7 +1783,6 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
{
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
- struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
unsigned long suitable_orders = 0;
struct folio *folio = NULL;
long pages;
@@ -1741,26 +1792,8 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
orders = 0;
if (orders > 0) {
- if (vma && vma_is_anon_shmem(vma)) {
- suitable_orders = shmem_suitable_orders(inode, vmf,
+ suitable_orders = shmem_suitable_orders(inode, vmf,
mapping, index, orders);
- } else if (orders & BIT(HPAGE_PMD_ORDER)) {
- pages = HPAGE_PMD_NR;
- suitable_orders = BIT(HPAGE_PMD_ORDER);
- index = round_down(index, HPAGE_PMD_NR);
-
- /*
- * Check for conflict before waiting on a huge allocation.
- * Conflict might be that a huge page has just been allocated
- * and added to page cache by a racing thread, or that there
- * is already at least one small page in the huge extent.
- * Be careful to retry when appropriate, but not forever!
- * Elsewhere -EEXIST would be the right code, but not here.
- */
- if (xa_find(&mapping->i_pages, &index,
- index + HPAGE_PMD_NR - 1, XA_PRESENT))
- return ERR_PTR(-E2BIG);
- }
order = highest_order(suitable_orders);
while (suitable_orders) {
@@ -1772,9 +1805,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
if (pages == HPAGE_PMD_NR)
count_vm_event(THP_FILE_FALLBACK);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_mthp_stat(order, MTHP_STAT_SHMEM_FALLBACK);
-#endif
order = next_order(&suitable_orders, order);
}
} else {
@@ -1799,10 +1830,8 @@ allocated:
count_vm_event(THP_FILE_FALLBACK);
count_vm_event(THP_FILE_FALLBACK_CHARGE);
}
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK);
count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_FALLBACK_CHARGE);
-#endif
}
goto unlock;
}
@@ -1819,7 +1848,7 @@ allocated:
* Try to reclaim some space by splitting a few
* large folios beyond i_size on the filesystem.
*/
- shmem_unused_huge_shrink(sbinfo, NULL, 2);
+ shmem_unused_huge_shrink(sbinfo, NULL, pages);
/*
* And do a shmem_recalc_inode() to account for freed pages:
* except our folio is there in cache, so not quite balanced.
@@ -1867,30 +1896,35 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp)
}
static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
- struct shmem_inode_info *info, pgoff_t index)
+ struct shmem_inode_info *info, pgoff_t index,
+ struct vm_area_struct *vma)
{
- struct folio *old, *new;
- struct address_space *swap_mapping;
- swp_entry_t entry;
- pgoff_t swap_index;
- int error;
-
- old = *foliop;
- entry = old->swap;
- swap_index = swap_cache_index(entry);
- swap_mapping = swap_address_space(entry);
+ struct folio *new, *old = *foliop;
+ swp_entry_t entry = old->swap;
+ struct address_space *swap_mapping = swap_address_space(entry);
+ pgoff_t swap_index = swap_cache_index(entry);
+ XA_STATE(xas, &swap_mapping->i_pages, swap_index);
+ int nr_pages = folio_nr_pages(old);
+ int error = 0, i;
/*
* We have arrived here because our zones are constrained, so don't
* limit chance of success by further cpuset and node constraints.
*/
gfp &= ~GFP_CONSTRAINT_MASK;
- VM_BUG_ON_FOLIO(folio_test_large(old), old);
- new = shmem_alloc_folio(gfp, 0, info, index);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (nr_pages > 1) {
+ gfp_t huge_gfp = vma_thp_gfp_mask(vma);
+
+ gfp = limit_gfp_mask(huge_gfp, gfp);
+ }
+#endif
+
+ new = shmem_alloc_folio(gfp, folio_order(old), info, index);
if (!new)
return -ENOMEM;
- folio_get(new);
+ folio_ref_add(new, nr_pages);
folio_copy(new, old);
flush_dcache_folio(new);
@@ -1900,26 +1934,34 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
new->swap = entry;
folio_set_swapcache(new);
- /*
- * Our caller will very soon move newpage out of swapcache, but it's
- * a nice clean interface for us to replace oldpage by newpage there.
- */
+ /* Swap cache still stores N entries instead of a high-order entry */
xa_lock_irq(&swap_mapping->i_pages);
- error = shmem_replace_entry(swap_mapping, swap_index, old, new);
+ for (i = 0; i < nr_pages; i++) {
+ void *item = xas_load(&xas);
+
+ if (item != old) {
+ error = -ENOENT;
+ break;
+ }
+
+ xas_store(&xas, new);
+ xas_next(&xas);
+ }
if (!error) {
mem_cgroup_replace_folio(old, new);
- __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1);
- __lruvec_stat_mod_folio(new, NR_SHMEM, 1);
- __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1);
- __lruvec_stat_mod_folio(old, NR_SHMEM, -1);
+ __lruvec_stat_mod_folio(new, NR_FILE_PAGES, nr_pages);
+ __lruvec_stat_mod_folio(new, NR_SHMEM, nr_pages);
+ __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -nr_pages);
+ __lruvec_stat_mod_folio(old, NR_SHMEM, -nr_pages);
}
xa_unlock_irq(&swap_mapping->i_pages);
if (unlikely(error)) {
/*
- * Is this possible? I think not, now that our callers check
- * both PageSwapCache and page_private after getting page lock;
- * but be defensive. Reverse old to newpage for clear and free.
+ * Is this possible? I think not, now that our callers
+ * check both the swapcache flag and folio->private
+ * after getting the folio lock; but be defensive.
+ * Reverse old to newpage for clear and free.
*/
old = new;
} else {
@@ -1931,7 +1973,12 @@ static int shmem_replace_folio(struct folio **foliop, gfp_t gfp,
old->private = NULL;
folio_unlock(old);
- folio_put_refs(old, 2);
+ /*
+	 * The old folio has been removed from the swap cache; drop its
+	 * 'nr_pages' references, as well as the one temporary reference
+	 * taken from the swap cache.
+ */
+ folio_put_refs(old, nr_pages + 1);
return error;
}
@@ -1941,6 +1988,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
swp_entry_t swapin_error;
void *old;
+ int nr_pages;
swapin_error = make_poisoned_swp_entry();
old = xa_cmpxchg_irq(&mapping->i_pages, index,
@@ -1949,6 +1997,7 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
if (old != swp_to_radix_entry(swap))
return;
+ nr_pages = folio_nr_pages(folio);
folio_wait_writeback(folio);
delete_from_swap_cache(folio);
/*
@@ -1956,8 +2005,86 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
* won't be 0 when inode is released and thus trigger WARN_ON(i_blocks)
* in shmem_evict_inode().
*/
- shmem_recalc_inode(inode, -1, -1);
- swap_free(swap);
+ shmem_recalc_inode(inode, -nr_pages, -nr_pages);
+ swap_free_nr(swap, nr_pages);
+}
+
+static int shmem_split_large_entry(struct inode *inode, pgoff_t index,
+ swp_entry_t swap, gfp_t gfp)
+{
+ struct address_space *mapping = inode->i_mapping;
+ XA_STATE_ORDER(xas, &mapping->i_pages, index, 0);
+ void *alloced_shadow = NULL;
+ int alloced_order = 0, i;
+
+ /* Convert user data gfp flags to xarray node gfp flags */
+ gfp &= GFP_RECLAIM_MASK;
+
+ for (;;) {
+ int order = -1, split_order = 0;
+ void *old = NULL;
+
+ xas_lock_irq(&xas);
+ old = xas_load(&xas);
+ if (!xa_is_value(old) || swp_to_radix_entry(swap) != old) {
+ xas_set_err(&xas, -EEXIST);
+ goto unlock;
+ }
+
+ order = xas_get_order(&xas);
+
+ /* Swap entry may have changed before we re-acquire the lock */
+ if (alloced_order &&
+ (old != alloced_shadow || order != alloced_order)) {
+ xas_destroy(&xas);
+ alloced_order = 0;
+ }
+
+ /* Try to split large swap entry in pagecache */
+ if (order > 0) {
+ if (!alloced_order) {
+ split_order = order;
+ goto unlock;
+ }
+ xas_split(&xas, old, order);
+
+ /*
+			 * Re-set the swap entries after splitting; the swap
+			 * offsets of the original large entry are contiguous.
+ */
+ for (i = 0; i < 1 << order; i++) {
+ pgoff_t aligned_index = round_down(index, 1 << order);
+ swp_entry_t tmp;
+
+ tmp = swp_entry(swp_type(swap), swp_offset(swap) + i);
+ __xa_store(&mapping->i_pages, aligned_index + i,
+ swp_to_radix_entry(tmp), 0);
+ }
+ }
+
+unlock:
+ xas_unlock_irq(&xas);
+
+ /* split needed, alloc here and retry. */
+ if (split_order) {
+ xas_split_alloc(&xas, old, split_order, gfp);
+ if (xas_error(&xas))
+ goto error;
+ alloced_shadow = old;
+ alloced_order = split_order;
+ xas_reset(&xas);
+ continue;
+ }
+
+ if (!xas_nomem(&xas, gfp))
+ break;
+ }
+
+error:
+ if (xas_error(&xas))
+ return xas_error(&xas);
+
+ return alloced_order;
}
/*
@@ -1968,15 +2095,16 @@ static void shmem_set_folio_swapin_error(struct inode *inode, pgoff_t index,
*/
static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
struct folio **foliop, enum sgp_type sgp,
- gfp_t gfp, struct mm_struct *fault_mm,
+ gfp_t gfp, struct vm_area_struct *vma,
vm_fault_t *fault_type)
{
struct address_space *mapping = inode->i_mapping;
+ struct mm_struct *fault_mm = vma ? vma->vm_mm : NULL;
struct shmem_inode_info *info = SHMEM_I(inode);
struct swap_info_struct *si;
struct folio *folio = NULL;
swp_entry_t swap;
- int error;
+ int error, nr_pages;
VM_BUG_ON(!*foliop || !xa_is_value(*foliop));
swap = radix_to_swp_entry(*foliop);
@@ -1996,12 +2124,37 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
/* Look it up and read it in.. */
folio = swap_cache_get_folio(swap, NULL, 0);
if (!folio) {
+ int split_order;
+
/* Or update major stats only when swapin succeeds?? */
if (fault_type) {
*fault_type |= VM_FAULT_MAJOR;
count_vm_event(PGMAJFAULT);
count_memcg_event_mm(fault_mm, PGMAJFAULT);
}
+
+ /*
+		 * The swap device can currently only swap in order-0 folios,
+		 * so split the large swap entry stored in the pagecache if
+		 * necessary.
+ */
+ split_order = shmem_split_large_entry(inode, index, swap, gfp);
+ if (split_order < 0) {
+ error = split_order;
+ goto failed;
+ }
+
+ /*
+ * If the large swap entry has already been split, it is
+ * necessary to recalculate the new swap entry based on
+ * the old order alignment.
+ */
+ if (split_order > 0) {
+ pgoff_t offset = index - round_down(index, 1 << split_order);
+
+ swap = swp_entry(swp_type(swap), swp_offset(swap) + offset);
+ }
+
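To make the recalculation above concrete, here is a standalone sketch with assumed values (a fault at index 37 into what used to be an order-4, 16-page swap entry); it only demonstrates the offset arithmetic, not kernel code.

#include <stdio.h>

int main(void)
{
	unsigned long index = 37;	/* faulting page index              */
	int split_order = 4;		/* old large entry spanned 16 pages */
	unsigned long base = index & ~((1UL << split_order) - 1);	/* 32 */
	unsigned long offset = index - base;				/*  5 */

	/* The caller advances swp_offset(swap) by this amount. */
	printf("advance swap offset by %lu\n", offset);
	return 0;
}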
/* Here we actually start the io */
folio = shmem_swapin_cluster(swap, gfp, info, index);
if (!folio) {
@@ -2023,6 +2176,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
goto failed;
}
folio_wait_writeback(folio);
+ nr_pages = folio_nr_pages(folio);
/*
* Some architectures may have to restore extra metadata to the
@@ -2031,24 +2185,25 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
arch_swap_restore(folio_swap(swap, folio), folio);
if (shmem_should_replace_folio(folio, gfp)) {
- error = shmem_replace_folio(&folio, gfp, info, index);
+ error = shmem_replace_folio(&folio, gfp, info, index, vma);
if (error)
goto failed;
}
- error = shmem_add_to_page_cache(folio, mapping, index,
+ error = shmem_add_to_page_cache(folio, mapping,
+ round_down(index, nr_pages),
swp_to_radix_entry(swap), gfp);
if (error)
goto failed;
- shmem_recalc_inode(inode, 0, -1);
+ shmem_recalc_inode(inode, 0, -nr_pages);
if (sgp == SGP_WRITE)
folio_mark_accessed(folio);
delete_from_swap_cache(folio);
folio_mark_dirty(folio);
- swap_free(swap);
+ swap_free_nr(swap, nr_pages);
put_swap_device(si);
*foliop = folio;
@@ -2078,14 +2233,14 @@ unlock:
* vmf and fault_type are only supplied by shmem_fault: otherwise they are NULL.
*/
static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
- struct folio **foliop, enum sgp_type sgp, gfp_t gfp,
- struct vm_fault *vmf, vm_fault_t *fault_type)
+ loff_t write_end, struct folio **foliop, enum sgp_type sgp,
+ gfp_t gfp, struct vm_fault *vmf, vm_fault_t *fault_type)
{
struct vm_area_struct *vma = vmf ? vmf->vma : NULL;
struct mm_struct *fault_mm;
struct folio *folio;
int error;
- bool alloced, huge;
+ bool alloced;
unsigned long orders = 0;
if (WARN_ON_ONCE(!shmem_mapping(inode->i_mapping)))
@@ -2111,7 +2266,7 @@ repeat:
if (xa_is_value(folio)) {
error = shmem_swapin_folio(inode, index, &folio,
- sgp, gfp, fault_mm, fault_type);
+ sgp, gfp, vma, fault_type);
if (error == -EEXIST)
goto repeat;
@@ -2158,14 +2313,8 @@ repeat:
return 0;
}
- huge = shmem_is_huge(inode, index, false, fault_mm,
- vma ? vma->vm_flags : 0);
- /* Find hugepage orders that are allowed for anonymous shmem. */
- if (vma && vma_is_anon_shmem(vma))
- orders = shmem_allowable_huge_orders(inode, vma, index, huge);
- else if (huge)
- orders = BIT(HPAGE_PMD_ORDER);
-
+ /* Find hugepage orders that are allowed for anonymous shmem and tmpfs. */
+ orders = shmem_allowable_huge_orders(inode, vma, index, write_end, false);
if (orders > 0) {
gfp_t huge_gfp;
@@ -2176,9 +2325,7 @@ repeat:
if (!IS_ERR(folio)) {
if (folio_test_pmd_mappable(folio))
count_vm_event(THP_FILE_ALLOC);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
count_mthp_stat(folio_order(folio), MTHP_STAT_SHMEM_ALLOC);
-#endif
goto alloced;
}
if (PTR_ERR(folio) == -EEXIST)
@@ -2198,7 +2345,7 @@ alloced:
alloced = true;
if (folio_test_large(folio) &&
DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) <
- folio_next_index(folio) - 1) {
+ folio_next_index(folio)) {
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
struct shmem_inode_info *info = SHMEM_I(inode);
/*
@@ -2268,6 +2415,7 @@ unlock:
* shmem_get_folio - find, and lock a shmem folio.
* @inode: inode to search
* @index: the page index.
+ * @write_end: end of a write, could extend inode size
* @foliop: pointer to the folio if found
* @sgp: SGP_* flags to control behavior
*
@@ -2287,10 +2435,10 @@ unlock:
* Context: May sleep.
* Return: 0 if successful, else a negative error code.
*/
-int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop,
- enum sgp_type sgp)
+int shmem_get_folio(struct inode *inode, pgoff_t index, loff_t write_end,
+ struct folio **foliop, enum sgp_type sgp)
{
- return shmem_get_folio_gfp(inode, index, foliop, sgp,
+ return shmem_get_folio_gfp(inode, index, write_end, foliop, sgp,
mapping_gfp_mask(inode->i_mapping), NULL, NULL);
}
EXPORT_SYMBOL_GPL(shmem_get_folio);
@@ -2385,7 +2533,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf)
}
WARN_ON_ONCE(vmf->page != NULL);
- err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE,
+ err = shmem_get_folio_gfp(inode, vmf->pgoff, 0, &folio, SGP_CACHE,
gfp, vmf, &ret);
if (err)
return vmf_error(err);
@@ -2895,7 +3043,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping,
return -EPERM;
}
- ret = shmem_get_folio(inode, index, &folio, SGP_WRITE);
+ ret = shmem_get_folio(inode, index, pos + len, &folio, SGP_WRITE);
if (ret)
return ret;
@@ -2965,7 +3113,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
break;
}
- error = shmem_get_folio(inode, index, &folio, SGP_READ);
+ error = shmem_get_folio(inode, index, 0, &folio, SGP_READ);
if (error) {
if (error == -EINVAL)
error = 0;
@@ -3141,7 +3289,7 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos,
if (*ppos >= i_size_read(inode))
break;
- error = shmem_get_folio(inode, *ppos / PAGE_SIZE, &folio,
+ error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio,
SGP_READ);
if (error) {
if (error == -EINVAL)
@@ -3331,8 +3479,8 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
error = -ENOMEM;
else
- error = shmem_get_folio(inode, index, &folio,
- SGP_FALLOC);
+ error = shmem_get_folio(inode, index, offset + len,
+ &folio, SGP_FALLOC);
if (error) {
info->fallocend = undo_fallocend;
/* Remove the !uptodate folios we added */
@@ -3683,7 +3831,7 @@ static int shmem_symlink(struct mnt_idmap *idmap, struct inode *dir,
} else {
inode_nohighmem(inode);
inode->i_mapping->a_ops = &shmem_aops;
- error = shmem_get_folio(inode, 0, &folio, SGP_WRITE);
+ error = shmem_get_folio(inode, 0, 0, &folio, SGP_WRITE);
if (error)
goto out_remove_offset;
inode->i_op = &shmem_symlink_inode_operations;
@@ -3729,7 +3877,7 @@ static const char *shmem_get_link(struct dentry *dentry, struct inode *inode,
return ERR_PTR(-ECHILD);
}
} else {
- error = shmem_get_folio(inode, 0, &folio, SGP_READ);
+ error = shmem_get_folio(inode, 0, 0, &folio, SGP_READ);
if (error)
return ERR_PTR(error);
if (!folio)
@@ -5197,7 +5345,7 @@ struct folio *shmem_read_folio_gfp(struct address_space *mapping,
struct folio *folio;
int error;
- error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE,
+ error = shmem_get_folio_gfp(inode, index, 0, &folio, SGP_CACHE,
gfp, NULL, NULL);
if (error)
return ERR_PTR(error);
diff --git a/mm/shmem_quota.c b/mm/shmem_quota.c
index ce514e700d2f..d1e32ac01407 100644
--- a/mm/shmem_quota.c
+++ b/mm/shmem_quota.c
@@ -34,8 +34,6 @@
#include <linux/quotaops.h>
#include <linux/quota.h>
-#ifdef CONFIG_TMPFS_QUOTA
-
/*
* The following constants define the amount of time given a user
* before the soft limits are treated as hard limits (usually resulting
@@ -351,4 +349,3 @@ const struct dquot_operations shmem_quota_operations = {
.mark_dirty = shmem_mark_dquot_dirty,
.get_next_id = shmem_get_next_id,
};
-#endif /* CONFIG_TMPFS_QUOTA */
diff --git a/mm/show_mem.c b/mm/show_mem.c
index bdb439551eef..ec885a398fa0 100644
--- a/mm/show_mem.c
+++ b/mm/show_mem.c
@@ -435,15 +435,18 @@ void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx)
struct codetag *ct = tags[i].ct;
struct alloc_tag *tag = ct_to_alloc_tag(ct);
struct alloc_tag_counters counter = alloc_tag_read(tag);
+ char bytes[10];
+
+ string_get_size(counter.bytes, 1, STRING_UNITS_2, bytes, sizeof(bytes));
/* Same as alloc_tag_to_text() but w/o intermediate buffer */
if (ct->modname)
- pr_notice("%12lli %8llu %s:%u [%s] func:%s\n",
- counter.bytes, counter.calls, ct->filename,
+ pr_notice("%12s %8llu %s:%u [%s] func:%s\n",
+ bytes, counter.calls, ct->filename,
ct->lineno, ct->modname, ct->function);
else
- pr_notice("%12lli %8llu %s:%u func:%s\n",
- counter.bytes, counter.calls, ct->filename,
+ pr_notice("%12s %8llu %s:%u func:%s\n",
+ bytes, counter.calls, ct->filename,
ct->lineno, ct->function);
}
}
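For context on the change above, a minimal sketch of the string_get_size() call as now used in __show_mem(); demo_format() and the byte count are illustrative assumptions, while the helper itself comes from <linux/string_helpers.h>.

#include <linux/printk.h>
#include <linux/string_helpers.h>

static void demo_format(void)
{
	char buf[10];

	/* 3215360 bytes -> roughly "3.06 MiB" with binary (power-of-two) units. */
	string_get_size(3215360, 1, STRING_UNITS_2, buf, sizeof(buf));
	pr_notice("%12s\n", buf);
}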
diff --git a/mm/shrinker_debug.c b/mm/shrinker_debug.c
index 12ea5486a3e9..4a85b94d12ce 100644
--- a/mm/shrinker_debug.c
+++ b/mm/shrinker_debug.c
@@ -114,7 +114,7 @@ static ssize_t shrinker_debugfs_scan_write(struct file *file,
int nid;
char kbuf[72];
- read_len = size < (sizeof(kbuf) - 1) ? size : (sizeof(kbuf) - 1);
+ read_len = min(size, sizeof(kbuf) - 1);
if (copy_from_user(kbuf, buf, read_len))
return -EFAULT;
kbuf[read_len] = '\0';
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 61f32420230a..744324465615 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -1205,6 +1205,13 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
/* If the object still fits, repoison it precisely. */
if (ks >= new_size) {
+ /* Zero out spare memory. */
+ if (want_init_on_alloc(flags)) {
+ kasan_disable_current();
+ memset((void *)p + new_size, 0, ks - new_size);
+ kasan_enable_current();
+ }
+
p = kasan_krealloc((void *)p, new_size, flags);
return (void *)p;
}
@@ -1226,11 +1233,27 @@ __do_krealloc(const void *p, size_t new_size, gfp_t flags)
* @new_size: how many bytes of memory are required.
* @flags: the type of memory to allocate.
*
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored).
* If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
* is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * This is because krealloc() only knows the bucket size of an allocation
+ * (but not the exact size it was allocated with) and hence
+ * implements the following semantics for shrinking and growing buffers with
+ * __GFP_ZERO.
+ *
+ * new bucket
+ * 0 size size
+ * |--------|----------------|
+ * | keep | zero |
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
* Return: pointer to the allocated memory or %NULL in case of error
*/
void *krealloc_noprof(const void *p, size_t new_size, gfp_t flags)
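A minimal usage sketch of the contract documented above (counters_grow() is hypothetical; only kcalloc(), krealloc() and the gfp flags are real): pass __GFP_ZERO on the first allocation and on every later krealloc() of the same buffer, so bytes beyond the previously used region keep reading as zero.

#include <linux/slab.h>

static int *counters_grow(int *counters, size_t new_n)
{
	if (!counters)
		return kcalloc(new_n, sizeof(*counters), GFP_KERNEL);

	/* __GFP_ZERO again, as required by the contract above. */
	return krealloc(counters, new_n * sizeof(*counters),
			GFP_KERNEL | __GFP_ZERO);
}

On failure krealloc() returns NULL and leaves the original buffer intact, so the caller must keep the old pointer rather than overwrite it directly.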
diff --git a/mm/swap.c b/mm/swap.c
index 9caf6b017cf0..835bdf324b76 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -47,31 +47,27 @@
int page_cluster;
const int page_cluster_max = 31;
-/* Protecting only lru_rotate.fbatch which requires disabling interrupts */
-struct lru_rotate {
- local_lock_t lock;
- struct folio_batch fbatch;
-};
-static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
- .lock = INIT_LOCAL_LOCK(lock),
-};
-
-/*
- * The following folio batches are grouped together because they are protected
- * by disabling preemption (and interrupts remain enabled).
- */
struct cpu_fbatches {
+ /*
+ * The following folio batches are grouped together because they are protected
+ * by disabling preemption (and interrupts remain enabled).
+ */
local_lock_t lock;
struct folio_batch lru_add;
struct folio_batch lru_deactivate_file;
struct folio_batch lru_deactivate;
struct folio_batch lru_lazyfree;
#ifdef CONFIG_SMP
- struct folio_batch activate;
+ struct folio_batch lru_activate;
#endif
+ /* Protecting the following batches which require disabling interrupts */
+ local_lock_t lock_irq;
+ struct folio_batch lru_move_tail;
};
+
static DEFINE_PER_CPU(struct cpu_fbatches, cpu_fbatches) = {
.lock = INIT_LOCAL_LOCK(lock),
+ .lock_irq = INIT_LOCAL_LOCK(lock_irq),
};
static void __page_cache_release(struct folio *folio, struct lruvec **lruvecp,
@@ -117,7 +113,9 @@ void __folio_put(struct folio *folio)
if (unlikely(folio_is_zone_device(folio))) {
free_zone_device_folio(folio);
return;
- } else if (folio_test_hugetlb(folio)) {
+ }
+
+ if (folio_test_hugetlb(folio)) {
free_huge_folio(folio);
return;
}
@@ -162,7 +160,7 @@ EXPORT_SYMBOL(put_pages_list);
typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio);
-static void lru_add_fn(struct lruvec *lruvec, struct folio *folio)
+static void lru_add(struct lruvec *lruvec, struct folio *folio)
{
int was_unevictable = folio_test_clear_unevictable(folio);
long nr_pages = folio_nr_pages(folio);
@@ -222,23 +220,50 @@ static void folio_batch_move_lru(struct folio_batch *fbatch, move_fn_t move_fn)
folios_put(fbatch);
}
-static void folio_batch_add_and_move(struct folio_batch *fbatch,
- struct folio *folio, move_fn_t move_fn)
+static void __folio_batch_add_and_move(struct folio_batch __percpu *fbatch,
+ struct folio *folio, move_fn_t move_fn,
+ bool on_lru, bool disable_irq)
{
- if (folio_batch_add(fbatch, folio) && !folio_test_large(folio) &&
- !lru_cache_disabled())
+ unsigned long flags;
+
+ if (on_lru && !folio_test_clear_lru(folio))
return;
- folio_batch_move_lru(fbatch, move_fn);
+
+ folio_get(folio);
+
+ if (disable_irq)
+ local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
+ else
+ local_lock(&cpu_fbatches.lock);
+
+ if (!folio_batch_add(this_cpu_ptr(fbatch), folio) || folio_test_large(folio) ||
+ lru_cache_disabled())
+ folio_batch_move_lru(this_cpu_ptr(fbatch), move_fn);
+
+ if (disable_irq)
+ local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
+ else
+ local_unlock(&cpu_fbatches.lock);
}
-static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
+#define folio_batch_add_and_move(folio, op, on_lru) \
+ __folio_batch_add_and_move( \
+ &cpu_fbatches.op, \
+ folio, \
+ op, \
+ on_lru, \
+ offsetof(struct cpu_fbatches, op) >= offsetof(struct cpu_fbatches, lock_irq) \
+ )
+
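The offsetof() comparison in the macro above is a compile-time way to tell the IRQ-protected batches from the preemption-protected ones: any batch member declared at or after lock_irq in struct cpu_fbatches needs the IRQ-disabling lock. A standalone sketch of the same idea follows (struct layout and names simplified, not the kernel definitions).

#include <stddef.h>
#include <stdio.h>

struct fbatches {
	int lock;		/* preemption-protected batches follow */
	int lru_add;
	int lock_irq;		/* IRQ-protected batches follow        */
	int lru_move_tail;
};

#define NEEDS_IRQ(member) \
	(offsetof(struct fbatches, member) >= offsetof(struct fbatches, lock_irq))

int main(void)
{
	printf("lru_add needs irq: %d\n", (int)NEEDS_IRQ(lru_add));			/* 0 */
	printf("lru_move_tail needs irq: %d\n", (int)NEEDS_IRQ(lru_move_tail));	/* 1 */
	return 0;
}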
+static void lru_move_tail(struct lruvec *lruvec, struct folio *folio)
{
- if (!folio_test_unevictable(folio)) {
- lruvec_del_folio(lruvec, folio);
- folio_clear_active(folio);
- lruvec_add_folio_tail(lruvec, folio);
- __count_vm_events(PGROTATED, folio_nr_pages(folio));
- }
+ if (folio_test_unevictable(folio))
+ return;
+
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ lruvec_add_folio_tail(lruvec, folio);
+ __count_vm_events(PGROTATED, folio_nr_pages(folio));
}
/*
@@ -250,22 +275,11 @@ static void lru_move_tail_fn(struct lruvec *lruvec, struct folio *folio)
*/
void folio_rotate_reclaimable(struct folio *folio)
{
- if (!folio_test_locked(folio) && !folio_test_dirty(folio) &&
- !folio_test_unevictable(folio)) {
- struct folio_batch *fbatch;
- unsigned long flags;
-
- folio_get(folio);
- if (!folio_test_clear_lru(folio)) {
- folio_put(folio);
- return;
- }
+ if (folio_test_locked(folio) || folio_test_dirty(folio) ||
+ folio_test_unevictable(folio))
+ return;
- local_lock_irqsave(&lru_rotate.lock, flags);
- fbatch = this_cpu_ptr(&lru_rotate.fbatch);
- folio_batch_add_and_move(fbatch, folio, lru_move_tail_fn);
- local_unlock_irqrestore(&lru_rotate.lock, flags);
- }
+ folio_batch_add_and_move(folio, lru_move_tail, true);
}
void lru_note_cost(struct lruvec *lruvec, bool file,
@@ -326,47 +340,38 @@ void lru_note_cost_refault(struct folio *folio)
folio_nr_pages(folio), 0);
}
-static void folio_activate_fn(struct lruvec *lruvec, struct folio *folio)
+static void lru_activate(struct lruvec *lruvec, struct folio *folio)
{
- if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
- long nr_pages = folio_nr_pages(folio);
+ long nr_pages = folio_nr_pages(folio);
- lruvec_del_folio(lruvec, folio);
- folio_set_active(folio);
- lruvec_add_folio(lruvec, folio);
- trace_mm_lru_activate(folio);
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
+ return;
- __count_vm_events(PGACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
- nr_pages);
- }
+
+ lruvec_del_folio(lruvec, folio);
+ folio_set_active(folio);
+ lruvec_add_folio(lruvec, folio);
+ trace_mm_lru_activate(folio);
+
+ __count_vm_events(PGACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE, nr_pages);
}
#ifdef CONFIG_SMP
static void folio_activate_drain(int cpu)
{
- struct folio_batch *fbatch = &per_cpu(cpu_fbatches.activate, cpu);
+ struct folio_batch *fbatch = &per_cpu(cpu_fbatches.lru_activate, cpu);
if (folio_batch_count(fbatch))
- folio_batch_move_lru(fbatch, folio_activate_fn);
+ folio_batch_move_lru(fbatch, lru_activate);
}
void folio_activate(struct folio *folio)
{
- if (!folio_test_active(folio) && !folio_test_unevictable(folio)) {
- struct folio_batch *fbatch;
-
- folio_get(folio);
- if (!folio_test_clear_lru(folio)) {
- folio_put(folio);
- return;
- }
+ if (folio_test_active(folio) || folio_test_unevictable(folio))
+ return;
- local_lock(&cpu_fbatches.lock);
- fbatch = this_cpu_ptr(&cpu_fbatches.activate);
- folio_batch_add_and_move(fbatch, folio, folio_activate_fn);
- local_unlock(&cpu_fbatches.lock);
- }
+ folio_batch_add_and_move(folio, lru_activate, true);
}
#else
@@ -378,12 +383,13 @@ void folio_activate(struct folio *folio)
{
struct lruvec *lruvec;
- if (folio_test_clear_lru(folio)) {
- lruvec = folio_lruvec_lock_irq(folio);
- folio_activate_fn(lruvec, folio);
- unlock_page_lruvec_irq(lruvec);
- folio_set_lru(folio);
- }
+ if (!folio_test_clear_lru(folio))
+ return;
+
+ lruvec = folio_lruvec_lock_irq(folio);
+ lru_activate(lruvec, folio);
+ unlock_page_lruvec_irq(lruvec);
+ folio_set_lru(folio);
}
#endif
@@ -482,7 +488,7 @@ void folio_mark_accessed(struct folio *folio)
} else if (!folio_test_active(folio)) {
/*
* If the folio is on the LRU, queue it for activation via
- * cpu_fbatches.activate. Otherwise, assume the folio is in a
+ * cpu_fbatches.lru_activate. Otherwise, assume the folio is in a
* folio_batch, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
@@ -509,8 +515,6 @@ EXPORT_SYMBOL(folio_mark_accessed);
*/
void folio_add_lru(struct folio *folio)
{
- struct folio_batch *fbatch;
-
VM_BUG_ON_FOLIO(folio_test_active(folio) &&
folio_test_unevictable(folio), folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
@@ -520,11 +524,7 @@ void folio_add_lru(struct folio *folio)
lru_gen_in_fault() && !(current->flags & PF_MEMALLOC))
folio_set_active(folio);
- folio_get(folio);
- local_lock(&cpu_fbatches.lock);
- fbatch = this_cpu_ptr(&cpu_fbatches.lru_add);
- folio_batch_add_and_move(fbatch, folio, lru_add_fn);
- local_unlock(&cpu_fbatches.lock);
+ folio_batch_add_and_move(folio, lru_add, false);
}
EXPORT_SYMBOL(folio_add_lru);
@@ -567,7 +567,7 @@ void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma)
* written out by flusher threads as this is much more efficient
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
+static void lru_deactivate_file(struct lruvec *lruvec, struct folio *folio)
{
bool active = folio_test_active(folio);
long nr_pages = folio_nr_pages(folio);
@@ -608,43 +608,43 @@ static void lru_deactivate_file_fn(struct lruvec *lruvec, struct folio *folio)
}
}
-static void lru_deactivate_fn(struct lruvec *lruvec, struct folio *folio)
+static void lru_deactivate(struct lruvec *lruvec, struct folio *folio)
{
- if (!folio_test_unevictable(folio) && (folio_test_active(folio) || lru_gen_enabled())) {
- long nr_pages = folio_nr_pages(folio);
+ long nr_pages = folio_nr_pages(folio);
- lruvec_del_folio(lruvec, folio);
- folio_clear_active(folio);
- folio_clear_referenced(folio);
- lruvec_add_folio(lruvec, folio);
+ if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
+ return;
- __count_vm_events(PGDEACTIVATE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
- nr_pages);
- }
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
+ lruvec_add_folio(lruvec, folio);
+
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_pages);
}
-static void lru_lazyfree_fn(struct lruvec *lruvec, struct folio *folio)
+static void lru_lazyfree(struct lruvec *lruvec, struct folio *folio)
{
- if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
- long nr_pages = folio_nr_pages(folio);
+ long nr_pages = folio_nr_pages(folio);
- lruvec_del_folio(lruvec, folio);
- folio_clear_active(folio);
- folio_clear_referenced(folio);
- /*
- * Lazyfree folios are clean anonymous folios. They have
- * the swapbacked flag cleared, to distinguish them from normal
- * anonymous folios
- */
- folio_clear_swapbacked(folio);
- lruvec_add_folio(lruvec, folio);
+ if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
+ folio_test_swapcache(folio) || folio_test_unevictable(folio))
+ return;
- __count_vm_events(PGLAZYFREE, nr_pages);
- __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
- nr_pages);
- }
+ lruvec_del_folio(lruvec, folio);
+ folio_clear_active(folio);
+ folio_clear_referenced(folio);
+ /*
+ * Lazyfree folios are clean anonymous folios. They have
+ * the swapbacked flag cleared, to distinguish them from normal
+ * anonymous folios
+ */
+ folio_clear_swapbacked(folio);
+ lruvec_add_folio(lruvec, folio);
+
+ __count_vm_events(PGLAZYFREE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE, nr_pages);
}
/*
@@ -658,30 +658,30 @@ void lru_add_drain_cpu(int cpu)
struct folio_batch *fbatch = &fbatches->lru_add;
if (folio_batch_count(fbatch))
- folio_batch_move_lru(fbatch, lru_add_fn);
+ folio_batch_move_lru(fbatch, lru_add);
- fbatch = &per_cpu(lru_rotate.fbatch, cpu);
+ fbatch = &fbatches->lru_move_tail;
/* Disabling interrupts below acts as a compiler barrier. */
if (data_race(folio_batch_count(fbatch))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_lock_irqsave(&lru_rotate.lock, flags);
- folio_batch_move_lru(fbatch, lru_move_tail_fn);
- local_unlock_irqrestore(&lru_rotate.lock, flags);
+ local_lock_irqsave(&cpu_fbatches.lock_irq, flags);
+ folio_batch_move_lru(fbatch, lru_move_tail);
+ local_unlock_irqrestore(&cpu_fbatches.lock_irq, flags);
}
fbatch = &fbatches->lru_deactivate_file;
if (folio_batch_count(fbatch))
- folio_batch_move_lru(fbatch, lru_deactivate_file_fn);
+ folio_batch_move_lru(fbatch, lru_deactivate_file);
fbatch = &fbatches->lru_deactivate;
if (folio_batch_count(fbatch))
- folio_batch_move_lru(fbatch, lru_deactivate_fn);
+ folio_batch_move_lru(fbatch, lru_deactivate);
fbatch = &fbatches->lru_lazyfree;
if (folio_batch_count(fbatch))
- folio_batch_move_lru(fbatch, lru_lazyfree_fn);
+ folio_batch_move_lru(fbatch, lru_lazyfree);
folio_activate_drain(cpu);
}
@@ -698,22 +698,11 @@ void lru_add_drain_cpu(int cpu)
*/
void deactivate_file_folio(struct folio *folio)
{
- struct folio_batch *fbatch;
-
/* Deactivating an unevictable folio will not accelerate reclaim */
if (folio_test_unevictable(folio))
return;
- folio_get(folio);
- if (!folio_test_clear_lru(folio)) {
- folio_put(folio);
- return;
- }
-
- local_lock(&cpu_fbatches.lock);
- fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate_file);
- folio_batch_add_and_move(fbatch, folio, lru_deactivate_file_fn);
- local_unlock(&cpu_fbatches.lock);
+ folio_batch_add_and_move(folio, lru_deactivate_file, true);
}
/*
@@ -726,21 +715,10 @@ void deactivate_file_folio(struct folio *folio)
*/
void folio_deactivate(struct folio *folio)
{
- if (!folio_test_unevictable(folio) && (folio_test_active(folio) ||
- lru_gen_enabled())) {
- struct folio_batch *fbatch;
-
- folio_get(folio);
- if (!folio_test_clear_lru(folio)) {
- folio_put(folio);
- return;
- }
+ if (folio_test_unevictable(folio) || !(folio_test_active(folio) || lru_gen_enabled()))
+ return;
- local_lock(&cpu_fbatches.lock);
- fbatch = this_cpu_ptr(&cpu_fbatches.lru_deactivate);
- folio_batch_add_and_move(fbatch, folio, lru_deactivate_fn);
- local_unlock(&cpu_fbatches.lock);
- }
+ folio_batch_add_and_move(folio, lru_deactivate, true);
}
/**
@@ -752,21 +730,11 @@ void folio_deactivate(struct folio *folio)
*/
void folio_mark_lazyfree(struct folio *folio)
{
- if (folio_test_anon(folio) && folio_test_swapbacked(folio) &&
- !folio_test_swapcache(folio) && !folio_test_unevictable(folio)) {
- struct folio_batch *fbatch;
-
- folio_get(folio);
- if (!folio_test_clear_lru(folio)) {
- folio_put(folio);
- return;
- }
+ if (!folio_test_anon(folio) || !folio_test_swapbacked(folio) ||
+ folio_test_swapcache(folio) || folio_test_unevictable(folio))
+ return;
- local_lock(&cpu_fbatches.lock);
- fbatch = this_cpu_ptr(&cpu_fbatches.lru_lazyfree);
- folio_batch_add_and_move(fbatch, folio, lru_lazyfree_fn);
- local_unlock(&cpu_fbatches.lock);
- }
+ folio_batch_add_and_move(folio, lru_lazyfree, true);
}
void lru_add_drain(void)
@@ -816,11 +784,11 @@ static bool cpu_needs_drain(unsigned int cpu)
/* Check these in order of likelihood that they're not zero */
return folio_batch_count(&fbatches->lru_add) ||
- data_race(folio_batch_count(&per_cpu(lru_rotate.fbatch, cpu))) ||
+ folio_batch_count(&fbatches->lru_move_tail) ||
folio_batch_count(&fbatches->lru_deactivate_file) ||
folio_batch_count(&fbatches->lru_deactivate) ||
folio_batch_count(&fbatches->lru_lazyfree) ||
- folio_batch_count(&fbatches->activate) ||
+ folio_batch_count(&fbatches->lru_activate) ||
need_mlock_drain(cpu) ||
has_bh_in_lru(cpu, NULL);
}
@@ -938,8 +906,8 @@ atomic_t lru_disable_count = ATOMIC_INIT(0);
/*
* lru_cache_disable() needs to be called before we start compiling
- * a list of pages to be migrated using isolate_lru_page().
- * It drains pages on LRU cache and then disable on all cpus until
+ * a list of folios to be migrated using folio_isolate_lru().
+ * It drains folios on the LRU caches and then disables them on all CPUs until
* lru_cache_enable is called.
*
* Must be paired with a call to lru_cache_enable().
diff --git a/mm/swap.h b/mm/swap.h
index baa1fa946b34..ad2f121de970 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -59,7 +59,7 @@ void __delete_from_swap_cache(struct folio *folio,
void delete_from_swap_cache(struct folio *folio);
void clear_shadow_from_swap_cache(int type, unsigned long begin,
unsigned long end);
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry);
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr);
struct folio *swap_cache_get_folio(swp_entry_t entry,
struct vm_area_struct *vma, unsigned long addr);
struct folio *filemap_get_incore_folio(struct address_space *mapping,
@@ -73,13 +73,39 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_flags,
bool skip_if_exists);
struct folio *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
struct mempolicy *mpol, pgoff_t ilx);
-struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
- struct vm_fault *vmf);
+struct folio *swapin_readahead(swp_entry_t entry, gfp_t flag,
+ struct vm_fault *vmf);
static inline unsigned int folio_swap_flags(struct folio *folio)
{
return swp_swap_info(folio->swap)->flags;
}
+
+/*
+ * Return the count of contiguous swap entries that share the same
+ * zeromap status as the starting entry. If is_zeromap is not NULL,
+ * it will return the zeromap status of the starting entry.
+ */
+static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
+ bool *is_zeromap)
+{
+ struct swap_info_struct *sis = swp_swap_info(entry);
+ unsigned long start = swp_offset(entry);
+ unsigned long end = start + max_nr;
+ bool first_bit;
+
+ first_bit = test_bit(start, sis->zeromap);
+ if (is_zeromap)
+ *is_zeromap = first_bit;
+
+ if (max_nr <= 1)
+ return max_nr;
+ if (first_bit)
+ return find_next_zero_bit(sis->zeromap, end, start) - start;
+ else
+ return find_next_bit(sis->zeromap, end, start) - start;
+}
+
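A hypothetical caller sketch for the helper above (swapin_in_zeromap_chunks() is invented for illustration; swap_zeromap_batch(), swp_entry(), swp_type() and swp_offset() are the real mm-internal helpers): it walks a swap-in range in chunks whose zeromap status is uniform, so each chunk can be either zero-filled or read from the device in one go.

static void swapin_in_zeromap_chunks(swp_entry_t entry, int nr)
{
	while (nr > 0) {
		bool is_zeromap;
		int chunk = swap_zeromap_batch(entry, nr, &is_zeromap);

		if (is_zeromap) {
			/* fill these pages with zeros, no device I/O needed */
		} else {
			/* submit one read covering this contiguous range */
		}

		entry = swp_entry(swp_type(entry), swp_offset(entry) + chunk);
		nr -= chunk;
	}
}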
#else /* CONFIG_SWAP */
struct swap_iocb;
static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -109,7 +135,7 @@ static inline struct folio *swap_cluster_readahead(swp_entry_t entry,
return NULL;
}
-static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
+static inline struct folio *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
struct vm_fault *vmf)
{
return NULL;
@@ -120,7 +146,7 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
return 0;
}
-static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+static inline void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
}
@@ -171,5 +197,13 @@ static inline unsigned int folio_swap_flags(struct folio *folio)
{
return 0;
}
+
+static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
+ bool *has_zeromap)
+{
+ return 0;
+}
+
#endif /* CONFIG_SWAP */
+
#endif /* _MM_SWAP_H */
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index db6c4a26cf59..da1278f0563b 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -161,6 +161,8 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id,
*/
unsigned short lookup_swap_cgroup_id(swp_entry_t ent)
{
+ if (mem_cgroup_disabled())
+ return 0;
return lookup_swap_cgroup(ent, NULL)->id;
}
diff --git a/mm/swap_state.c b/mm/swap_state.c
index a1726e49a5eb..4669f29cf555 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -435,6 +435,8 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
{
struct swap_info_struct *si;
struct folio *folio;
+ struct folio *new_folio = NULL;
+ struct folio *result = NULL;
void *shadow = NULL;
*new_page_allocated = false;
@@ -463,27 +465,28 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* else swap_off will be aborted if we return NULL.
*/
if (!swap_swapcount(si, entry) && swap_slot_cache_enabled)
- goto fail_put_swap;
+ goto put_and_return;
/*
- * Get a new folio to read into from swap. Allocate it now,
- * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
- * cause any racers to loop around until we add it to cache.
+ * Get a new folio to read into from swap. Allocate it now if
+	 * new_folio does not exist yet, before marking swap_map
+	 * SWAP_HAS_CACHE, when -EEXIST will cause any racers to loop
+	 * around until we add it to the cache.
*/
- folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
- if (!folio)
- goto fail_put_swap;
+ if (!new_folio) {
+ new_folio = folio_alloc_mpol(gfp_mask, 0, mpol, ilx, numa_node_id());
+ if (!new_folio)
+ goto put_and_return;
+ }
/*
* Swap entry may have been freed since our caller observed it.
*/
- err = swapcache_prepare(entry);
+ err = swapcache_prepare(entry, 1);
if (!err)
break;
-
- folio_put(folio);
- if (err != -EEXIST)
- goto fail_put_swap;
+ else if (err != -EEXIST)
+ goto put_and_return;
/*
* Protect against a recursive call to __read_swap_cache_async()
@@ -494,7 +497,7 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* __read_swap_cache_async() in the writeback path.
*/
if (skip_if_exists)
- goto fail_put_swap;
+ goto put_and_return;
/*
* We might race against __delete_from_swap_cache(), and
@@ -509,36 +512,37 @@ struct folio *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
/*
* The swap entry is ours to swap in. Prepare the new folio.
*/
+ __folio_set_locked(new_folio);
+ __folio_set_swapbacked(new_folio);
- __folio_set_locked(folio);
- __folio_set_swapbacked(folio);
-
- if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry))
+ if (mem_cgroup_swapin_charge_folio(new_folio, NULL, gfp_mask, entry))
goto fail_unlock;
/* May fail (-ENOMEM) if XArray node allocation failed. */
- if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
+ if (add_to_swap_cache(new_folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock;
- mem_cgroup_swapin_uncharge_swap(entry);
+ mem_cgroup_swapin_uncharge_swap(entry, 1);
if (shadow)
- workingset_refault(folio, shadow);
+ workingset_refault(new_folio, shadow);
- /* Caller will initiate read into locked folio */
- folio_add_lru(folio);
+ /* Caller will initiate read into locked new_folio */
+ folio_add_lru(new_folio);
*new_page_allocated = true;
+ folio = new_folio;
got_folio:
- put_swap_device(si);
- return folio;
+ result = folio;
+ goto put_and_return;
fail_unlock:
- put_swap_folio(folio, entry);
- folio_unlock(folio);
- folio_put(folio);
-fail_put_swap:
+ put_swap_folio(new_folio, entry);
+ folio_unlock(new_folio);
+put_and_return:
put_swap_device(si);
- return NULL;
+ if (!(*new_page_allocated) && new_folio)
+ folio_put(new_folio);
+ return result;
}
/*
@@ -698,10 +702,8 @@ skip:
/* The page was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
- if (unlikely(page_allocated)) {
- zswap_folio_swapin(folio);
+ if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
- }
return folio;
}
@@ -850,10 +852,8 @@ skip:
/* The folio was likely read above, so no need for plugging here */
folio = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
&page_allocated, false);
- if (unlikely(page_allocated)) {
- zswap_folio_swapin(folio);
+ if (unlikely(page_allocated))
swap_read_folio(folio, NULL);
- }
return folio;
}
@@ -863,13 +863,13 @@ skip:
* @gfp_mask: memory allocation flags
* @vmf: fault information
*
- * Returns the struct page for entry and addr, after queueing swapin.
+ * Returns the struct folio for entry and addr, after queueing swapin.
*
* It's a main entry function for swap readahead. By the configuration,
* it will read ahead blocks by cluster-based(ie, physical disk based)
* or vma-based(ie, virtual address based on faulty address) readahead.
*/
-struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
+struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
{
struct mempolicy *mpol;
@@ -882,9 +882,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
swap_cluster_readahead(entry, gfp_mask, mpol, ilx);
mpol_cond_put(mpol);
- if (!folio)
- return NULL;
- return folio_file_page(folio, swp_offset(entry));
+ return folio;
}
#ifdef CONFIG_SYSFS
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 38bdc439651a..0cded32414a1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -53,6 +53,15 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
+static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
+ unsigned int nr_pages);
+static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
+ unsigned int nr_entries);
+static bool folio_swapcache_freeable(struct folio *folio);
+static struct swap_cluster_info *lock_cluster_or_swap_info(
+ struct swap_info_struct *si, unsigned long offset);
+static void unlock_cluster_or_swap_info(struct swap_info_struct *si,
+ struct swap_cluster_info *ci);
static DEFINE_SPINLOCK(swap_lock);
static unsigned int nr_swapfiles;
@@ -127,8 +136,44 @@ static inline unsigned char swap_count(unsigned char ent)
* corresponding page
*/
#define TTRS_UNMAPPED 0x2
-/* Reclaim the swap entry if swap is getting full*/
+/* Reclaim the swap entry if swap is getting full */
#define TTRS_FULL 0x4
+/* Reclaim directly, bypass the slot cache and don't touch device lock */
+#define TTRS_DIRECT 0x8
+
+static bool swap_is_has_cache(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages)
+{
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+
+ do {
+ VM_BUG_ON(!(*map & SWAP_HAS_CACHE));
+ if (*map != SWAP_HAS_CACHE)
+ return false;
+ } while (++map < map_end);
+
+ return true;
+}
+
+static bool swap_is_last_map(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages, bool *has_cache)
+{
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+ unsigned char count = *map;
+
+ if (swap_count(count) != 1)
+ return false;
+
+ while (++map < map_end) {
+ if (*map != count)
+ return false;
+ }
+
+ *has_cache = !!(count & SWAP_HAS_CACHE);
+ return true;
+}
/*
* returns number of pages in the folio that backs the swap entry. If positive,
@@ -139,12 +184,22 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
unsigned long offset, unsigned long flags)
{
swp_entry_t entry = swp_entry(si->type, offset);
+ struct address_space *address_space = swap_address_space(entry);
+ struct swap_cluster_info *ci;
struct folio *folio;
- int ret = 0;
+ int ret, nr_pages;
+ bool need_reclaim;
- folio = filemap_get_folio(swap_address_space(entry), swap_cache_index(entry));
+ folio = filemap_get_folio(address_space, swap_cache_index(entry));
if (IS_ERR(folio))
return 0;
+
+ /* offset could point to the middle of a large folio */
+ entry = folio->swap;
+ offset = swp_offset(entry);
+ nr_pages = folio_nr_pages(folio);
+ ret = -nr_pages;
+
/*
* When this function is called from scan_swap_map_slots() and it's
* called by vmscan.c at reclaiming folios. So we hold a folio lock
@@ -152,14 +207,50 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si,
* case and you should use folio_free_swap() with explicit folio_lock()
* in usual operations.
*/
- if (folio_trylock(folio)) {
- if ((flags & TTRS_ANYWAY) ||
- ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
- ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)))
- ret = folio_free_swap(folio);
- folio_unlock(folio);
+ if (!folio_trylock(folio))
+ goto out;
+
+ need_reclaim = ((flags & TTRS_ANYWAY) ||
+ ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) ||
+ ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio)));
+ if (!need_reclaim || !folio_swapcache_freeable(folio))
+ goto out_unlock;
+
+ /*
+ * It's safe to delete the folio from swap cache only if the folio's
+ * swap_map is HAS_CACHE only, which means the slots have no page table
+ * reference or pending writeback, and can't be allocated to others.
+ */
+ ci = lock_cluster_or_swap_info(si, offset);
+ need_reclaim = swap_is_has_cache(si, offset, nr_pages);
+ unlock_cluster_or_swap_info(si, ci);
+ if (!need_reclaim)
+ goto out_unlock;
+
+ if (!(flags & TTRS_DIRECT)) {
+ /* Free through slot cache */
+ delete_from_swap_cache(folio);
+ folio_set_dirty(folio);
+ ret = nr_pages;
+ goto out_unlock;
}
- ret = ret ? folio_nr_pages(folio) : -folio_nr_pages(folio);
+
+ xa_lock_irq(&address_space->i_pages);
+ __delete_from_swap_cache(folio, entry, NULL);
+ xa_unlock_irq(&address_space->i_pages);
+ folio_ref_sub(folio, nr_pages);
+ folio_set_dirty(folio);
+
+ spin_lock(&si->lock);
+	/* Only single page folios can be backed by zswap */
+ if (nr_pages == 1)
+ zswap_invalidate(entry);
+ swap_entry_range_free(si, entry, nr_pages);
+ spin_unlock(&si->lock);
+ ret = nr_pages;
+out_unlock:
+ folio_unlock(folio);
+out:
folio_put(folio);
return ret;
}
@@ -290,62 +381,21 @@ static void discard_swap_cluster(struct swap_info_struct *si,
#endif
#define LATENCY_LIMIT 256
-static inline void cluster_set_flag(struct swap_cluster_info *info,
- unsigned int flag)
-{
- info->flags = flag;
-}
-
-static inline unsigned int cluster_count(struct swap_cluster_info *info)
-{
- return info->data;
-}
-
-static inline void cluster_set_count(struct swap_cluster_info *info,
- unsigned int c)
-{
- info->data = c;
-}
-
-static inline void cluster_set_count_flag(struct swap_cluster_info *info,
- unsigned int c, unsigned int f)
-{
- info->flags = f;
- info->data = c;
-}
-
-static inline unsigned int cluster_next(struct swap_cluster_info *info)
-{
- return info->data;
-}
-
-static inline void cluster_set_next(struct swap_cluster_info *info,
- unsigned int n)
-{
- info->data = n;
-}
-
-static inline void cluster_set_next_flag(struct swap_cluster_info *info,
- unsigned int n, unsigned int f)
-{
- info->flags = f;
- info->data = n;
-}
-
static inline bool cluster_is_free(struct swap_cluster_info *info)
{
return info->flags & CLUSTER_FLAG_FREE;
}
-static inline bool cluster_is_null(struct swap_cluster_info *info)
+static inline unsigned int cluster_index(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
{
- return info->flags & CLUSTER_FLAG_NEXT_NULL;
+ return ci - si->cluster_info;
}
-static inline void cluster_set_null(struct swap_cluster_info *info)
+static inline unsigned int cluster_offset(struct swap_info_struct *si,
+ struct swap_cluster_info *ci)
{
- info->flags = CLUSTER_FLAG_NEXT_NULL;
- info->data = 0;
+ return cluster_index(si, ci) * SWAPFILE_CLUSTER;
}
static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si,
@@ -394,65 +444,11 @@ static inline void unlock_cluster_or_swap_info(struct swap_info_struct *si,
spin_unlock(&si->lock);
}
-static inline bool cluster_list_empty(struct swap_cluster_list *list)
-{
- return cluster_is_null(&list->head);
-}
-
-static inline unsigned int cluster_list_first(struct swap_cluster_list *list)
-{
- return cluster_next(&list->head);
-}
-
-static void cluster_list_init(struct swap_cluster_list *list)
-{
- cluster_set_null(&list->head);
- cluster_set_null(&list->tail);
-}
-
-static void cluster_list_add_tail(struct swap_cluster_list *list,
- struct swap_cluster_info *ci,
- unsigned int idx)
-{
- if (cluster_list_empty(list)) {
- cluster_set_next_flag(&list->head, idx, 0);
- cluster_set_next_flag(&list->tail, idx, 0);
- } else {
- struct swap_cluster_info *ci_tail;
- unsigned int tail = cluster_next(&list->tail);
-
- /*
- * Nested cluster lock, but both cluster locks are
- * only acquired when we held swap_info_struct->lock
- */
- ci_tail = ci + tail;
- spin_lock_nested(&ci_tail->lock, SINGLE_DEPTH_NESTING);
- cluster_set_next(ci_tail, idx);
- spin_unlock(&ci_tail->lock);
- cluster_set_next_flag(&list->tail, idx, 0);
- }
-}
-
-static unsigned int cluster_list_del_first(struct swap_cluster_list *list,
- struct swap_cluster_info *ci)
-{
- unsigned int idx;
-
- idx = cluster_next(&list->head);
- if (cluster_next(&list->tail) == idx) {
- cluster_set_null(&list->head);
- cluster_set_null(&list->tail);
- } else
- cluster_set_next_flag(&list->head,
- cluster_next(&ci[idx]), 0);
-
- return idx;
-}
-
/* Add a cluster to discard list and schedule it to do discard */
static void swap_cluster_schedule_discard(struct swap_info_struct *si,
- unsigned int idx)
+ struct swap_cluster_info *ci)
{
+ unsigned int idx = cluster_index(si, ci);
/*
* If scan_swap_map_slots() can't find a free cluster, it will check
* si->swap_map directly. To make sure the discarding cluster isn't
@@ -462,17 +458,23 @@ static void swap_cluster_schedule_discard(struct swap_info_struct *si,
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
SWAP_MAP_BAD, SWAPFILE_CLUSTER);
- cluster_list_add_tail(&si->discard_clusters, si->cluster_info, idx);
-
+ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
+ list_move_tail(&ci->list, &si->discard_clusters);
+ ci->flags = 0;
schedule_work(&si->discard_work);
}
-static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
+static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- struct swap_cluster_info *ci = si->cluster_info;
+ lockdep_assert_held(&si->lock);
+ lockdep_assert_held(&ci->lock);
- cluster_set_flag(ci + idx, CLUSTER_FLAG_FREE);
- cluster_list_add_tail(&si->free_clusters, ci, idx);
+ if (ci->flags)
+ list_move_tail(&ci->list, &si->free_clusters);
+ else
+ list_add_tail(&ci->list, &si->free_clusters);
+ ci->flags = CLUSTER_FLAG_FREE;
+ ci->order = 0;
}
/*
@@ -481,24 +483,24 @@ static void __free_cluster(struct swap_info_struct *si, unsigned long idx)
*/
static void swap_do_scheduled_discard(struct swap_info_struct *si)
{
- struct swap_cluster_info *info, *ci;
+ struct swap_cluster_info *ci;
unsigned int idx;
- info = si->cluster_info;
-
- while (!cluster_list_empty(&si->discard_clusters)) {
- idx = cluster_list_del_first(&si->discard_clusters, info);
+ while (!list_empty(&si->discard_clusters)) {
+ ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
+ list_del(&ci->list);
+ idx = cluster_index(si, ci);
spin_unlock(&si->lock);
discard_swap_cluster(si, idx * SWAPFILE_CLUSTER,
SWAPFILE_CLUSTER);
spin_lock(&si->lock);
- ci = lock_cluster(si, idx * SWAPFILE_CLUSTER);
- __free_cluster(si, idx);
+ spin_lock(&ci->lock);
+ __free_cluster(si, ci);
memset(si->swap_map + idx * SWAPFILE_CLUSTER,
0, SWAPFILE_CLUSTER);
- unlock_cluster(ci);
+ spin_unlock(&ci->lock);
}
}
@@ -521,20 +523,15 @@ static void swap_users_ref_free(struct percpu_ref *ref)
complete(&si->comp);
}
-static void alloc_cluster(struct swap_info_struct *si, unsigned long idx)
+static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *ci)
{
- struct swap_cluster_info *ci = si->cluster_info;
-
- VM_BUG_ON(cluster_list_first(&si->free_clusters) != idx);
- cluster_list_del_first(&si->free_clusters, ci);
- cluster_set_count_flag(ci + idx, 0, 0);
-}
+ VM_BUG_ON(ci->count != 0);
+ lockdep_assert_held(&si->lock);
+ lockdep_assert_held(&ci->lock);
-static void free_cluster(struct swap_info_struct *si, unsigned long idx)
-{
- struct swap_cluster_info *ci = si->cluster_info + idx;
+ if (ci->flags & CLUSTER_FLAG_FRAG)
+ si->frag_cluster_nr[ci->order]--;
- VM_BUG_ON(cluster_count(ci) != 0);
/*
* If the swap is discardable, prepare discard the cluster
* instead of free it immediately. The cluster will be freed
@@ -542,175 +539,371 @@ static void free_cluster(struct swap_info_struct *si, unsigned long idx)
*/
if ((si->flags & (SWP_WRITEOK | SWP_PAGE_DISCARD)) ==
(SWP_WRITEOK | SWP_PAGE_DISCARD)) {
- swap_cluster_schedule_discard(si, idx);
+ swap_cluster_schedule_discard(si, ci);
return;
}
- __free_cluster(si, idx);
+ __free_cluster(si, ci);
}
/*
- * The cluster corresponding to page_nr will be used. The cluster will be
- * removed from free cluster list and its usage counter will be increased by
- * count.
+ * The cluster corresponding to page_nr will be used. The cluster will not be
+ * added to free cluster list and its usage counter will be increased by 1.
+ * Only used for initialization.
*/
-static void add_cluster_info_page(struct swap_info_struct *p,
- struct swap_cluster_info *cluster_info, unsigned long page_nr,
- unsigned long count)
+static void inc_cluster_info_page(struct swap_info_struct *si,
+ struct swap_cluster_info *cluster_info, unsigned long page_nr)
{
unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+ struct swap_cluster_info *ci;
if (!cluster_info)
return;
- if (cluster_is_free(&cluster_info[idx]))
- alloc_cluster(p, idx);
- VM_BUG_ON(cluster_count(&cluster_info[idx]) + count > SWAPFILE_CLUSTER);
- cluster_set_count(&cluster_info[idx],
- cluster_count(&cluster_info[idx]) + count);
-}
+ ci = cluster_info + idx;
+ ci->count++;
-/*
- * The cluster corresponding to page_nr will be used. The cluster will be
- * removed from free cluster list and its usage counter will be increased by 1.
- */
-static void inc_cluster_info_page(struct swap_info_struct *p,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
-{
- add_cluster_info_page(p, cluster_info, page_nr, 1);
+ VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
+ VM_BUG_ON(ci->flags);
}
/*
- * The cluster corresponding to page_nr decreases one usage. If the usage
- * counter becomes 0, which means no page in the cluster is in using, we can
- * optionally discard the cluster and add it to free cluster list.
+ * Decrease the usage counter of the cluster @ci by @nr_pages. If the counter
+ * becomes 0, which means no page in the cluster is in use, we can optionally
+ * discard the cluster and add it to the free cluster list.
*/
-static void dec_cluster_info_page(struct swap_info_struct *p,
- struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static void dec_cluster_info_page(struct swap_info_struct *si,
+ struct swap_cluster_info *ci, int nr_pages)
{
- unsigned long idx = page_nr / SWAPFILE_CLUSTER;
-
- if (!cluster_info)
+ if (!si->cluster_info)
return;
- VM_BUG_ON(cluster_count(&cluster_info[idx]) == 0);
- cluster_set_count(&cluster_info[idx],
- cluster_count(&cluster_info[idx]) - 1);
+ VM_BUG_ON(ci->count < nr_pages);
+ VM_BUG_ON(cluster_is_free(ci));
+ lockdep_assert_held(&si->lock);
+ lockdep_assert_held(&ci->lock);
+ ci->count -= nr_pages;
- if (cluster_count(&cluster_info[idx]) == 0)
- free_cluster(p, idx);
+ if (!ci->count) {
+ free_cluster(si, ci);
+ return;
+ }
+
+ if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
+ VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
+ if (ci->flags & CLUSTER_FLAG_FRAG)
+ si->frag_cluster_nr[ci->order]--;
+ list_move_tail(&ci->list, &si->nonfull_clusters[ci->order]);
+ ci->flags = CLUSTER_FLAG_NONFULL;
+ }
}
-/*
- * It's possible scan_swap_map_slots() uses a free cluster in the middle of free
- * cluster list. Avoiding such abuse to avoid list corruption.
- */
-static bool
-scan_swap_map_ssd_cluster_conflict(struct swap_info_struct *si,
- unsigned long offset, int order)
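+/*
+ * Try to reclaim swap-cache-only slots in [start, end). Both si->lock and
+ * the cluster lock are dropped during reclaim and retaken afterwards, so the
+ * whole range is rechecked to be empty before reporting success.
+ */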
+static bool cluster_reclaim_range(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long start, unsigned long end)
{
- struct percpu_cluster *percpu_cluster;
- bool conflict;
+ unsigned char *map = si->swap_map;
+ unsigned long offset;
- offset /= SWAPFILE_CLUSTER;
- conflict = !cluster_list_empty(&si->free_clusters) &&
- offset != cluster_list_first(&si->free_clusters) &&
- cluster_is_free(&si->cluster_info[offset]);
+ spin_unlock(&ci->lock);
+ spin_unlock(&si->lock);
- if (!conflict)
- return false;
+ for (offset = start; offset < end; offset++) {
+ switch (READ_ONCE(map[offset])) {
+ case 0:
+ continue;
+ case SWAP_HAS_CACHE:
+ if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
+ continue;
+ goto out;
+ default:
+ goto out;
+ }
+ }
+out:
+ spin_lock(&si->lock);
+ spin_lock(&ci->lock);
+
+ /*
+	 * Recheck the range whether or not reclaim succeeded; a slot could
+	 * have been freed while we were not holding the lock.
+ */
+ for (offset = start; offset < end; offset++)
+ if (READ_ONCE(map[offset]))
+ return false;
- percpu_cluster = this_cpu_ptr(si->percpu_cluster);
- percpu_cluster->next[order] = SWAP_NEXT_INVALID;
return true;
}
-static inline bool swap_range_empty(char *swap_map, unsigned int start,
- unsigned int nr_pages)
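+/*
+ * Check whether [start, start + nr_pages) can be allocated. Slots holding
+ * only SWAP_HAS_CACHE may be reclaimed first when swap is getting full.
+ */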
+static bool cluster_scan_range(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ unsigned long start, unsigned int nr_pages)
{
- unsigned int i;
+ unsigned long offset, end = start + nr_pages;
+ unsigned char *map = si->swap_map;
+ bool need_reclaim = false;
- for (i = 0; i < nr_pages; i++) {
- if (swap_map[start + i])
+ for (offset = start; offset < end; offset++) {
+ switch (READ_ONCE(map[offset])) {
+ case 0:
+ continue;
+ case SWAP_HAS_CACHE:
+ if (!vm_swap_full())
+ return false;
+ need_reclaim = true;
+ continue;
+ default:
return false;
+ }
}
+ if (need_reclaim)
+ return cluster_reclaim_range(si, ci, start, end);
+
return true;
}
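+/*
+ * Mark [start, start + (1 << order)) as allocated with @usage and update the
+ * owning cluster's count, order and list placement.
+ */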
+static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+ unsigned int start, unsigned char usage,
+ unsigned int order)
+{
+ unsigned int nr_pages = 1 << order;
+
+ if (cluster_is_free(ci)) {
+ if (nr_pages < SWAPFILE_CLUSTER) {
+ list_move_tail(&ci->list, &si->nonfull_clusters[order]);
+ ci->flags = CLUSTER_FLAG_NONFULL;
+ }
+ ci->order = order;
+ }
+
+ memset(si->swap_map + start, usage, nr_pages);
+ swap_range_alloc(si, start, nr_pages);
+ ci->count += nr_pages;
+
+ if (ci->count == SWAPFILE_CLUSTER) {
+ VM_BUG_ON(!(ci->flags &
+ (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
+ if (ci->flags & CLUSTER_FLAG_FRAG)
+ si->frag_cluster_nr[ci->order]--;
+ list_move_tail(&ci->list, &si->full_clusters);
+ ci->flags = CLUSTER_FLAG_FULL;
+ }
+}
+
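+/*
+ * Scan the cluster containing @offset for 1 << @order contiguous free slots.
+ * On success *foundp holds the allocated offset. Returns the next offset to
+ * scan from, or SWAP_NEXT_INVALID if the cluster can't serve more requests.
+ */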
+static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigned long offset,
+ unsigned int *foundp, unsigned int order,
+ unsigned char usage)
+{
+ unsigned long start = offset & ~(SWAPFILE_CLUSTER - 1);
+ unsigned long end = min(start + SWAPFILE_CLUSTER, si->max);
+ unsigned int nr_pages = 1 << order;
+ struct swap_cluster_info *ci;
+
+ if (end < nr_pages)
+ return SWAP_NEXT_INVALID;
+ end -= nr_pages;
+
+ ci = lock_cluster(si, offset);
+ if (ci->count + nr_pages > SWAPFILE_CLUSTER) {
+ offset = SWAP_NEXT_INVALID;
+ goto done;
+ }
+
+ while (offset <= end) {
+ if (cluster_scan_range(si, ci, offset, nr_pages)) {
+ cluster_alloc_range(si, ci, offset, usage, order);
+ *foundp = offset;
+ if (ci->count == SWAPFILE_CLUSTER) {
+ offset = SWAP_NEXT_INVALID;
+ goto done;
+ }
+ offset += nr_pages;
+ break;
+ }
+ offset += nr_pages;
+ }
+ if (offset > end)
+ offset = SWAP_NEXT_INVALID;
+done:
+ unlock_cluster(ci);
+ return offset;
+}
+
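+/*
+ * Walk the full cluster list and reclaim swap-cache-only slots directly.
+ * At least one cluster is scanned; more are scanned when swap is nearly
+ * exhausted, and the scan stops early once something has been reclaimed.
+ */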
+static void swap_reclaim_full_clusters(struct swap_info_struct *si)
+{
+ long to_scan = 1;
+ unsigned long offset, end;
+ struct swap_cluster_info *ci;
+ unsigned char *map = si->swap_map;
+ int nr_reclaim, total_reclaimed = 0;
+
+ if (atomic_long_read(&nr_swap_pages) <= SWAPFILE_CLUSTER)
+ to_scan = si->inuse_pages / SWAPFILE_CLUSTER;
+
+ while (!list_empty(&si->full_clusters)) {
+ ci = list_first_entry(&si->full_clusters, struct swap_cluster_info, list);
+ list_move_tail(&ci->list, &si->full_clusters);
+ offset = cluster_offset(si, ci);
+ end = min(si->max, offset + SWAPFILE_CLUSTER);
+ to_scan--;
+
+ while (offset < end) {
+ if (READ_ONCE(map[offset]) == SWAP_HAS_CACHE) {
+ spin_unlock(&si->lock);
+ nr_reclaim = __try_to_reclaim_swap(si, offset,
+ TTRS_ANYWAY | TTRS_DIRECT);
+ spin_lock(&si->lock);
+ if (nr_reclaim > 0) {
+ offset += nr_reclaim;
+ total_reclaimed += nr_reclaim;
+ continue;
+ } else if (nr_reclaim < 0) {
+ offset += -nr_reclaim;
+ continue;
+ }
+ }
+ offset++;
+ }
+ if (to_scan <= 0 || total_reclaimed)
+ break;
+ }
+}
+
/*
* Try to get swap entries with specified order from current cpu's swap entry
* pool (a cluster). This might involve allocating a new cluster for current CPU
* too.
*/
-static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
- unsigned long *offset, unsigned long *scan_base, int order)
+static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
+ unsigned char usage)
{
- unsigned int nr_pages = 1 << order;
struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- unsigned int tmp, max;
+ unsigned int offset, found = 0;
new_cluster:
+ lockdep_assert_held(&si->lock);
cluster = this_cpu_ptr(si->percpu_cluster);
- tmp = cluster->next[order];
- if (tmp == SWAP_NEXT_INVALID) {
- if (!cluster_list_empty(&si->free_clusters)) {
- tmp = cluster_next(&si->free_clusters.head) *
- SWAPFILE_CLUSTER;
- } else if (!cluster_list_empty(&si->discard_clusters)) {
- /*
- * we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them, then
- * reread cluster_next_cpu since we dropped si->lock
- */
- swap_do_scheduled_discard(si);
- *scan_base = this_cpu_read(*si->cluster_next_cpu);
- *offset = *scan_base;
- goto new_cluster;
- } else
- return false;
+ offset = cluster->next[order];
+ if (offset) {
+ offset = alloc_swap_scan_cluster(si, offset, &found, order, usage);
+ if (found)
+ goto done;
}
- /*
- * Other CPUs can use our cluster if they can't find a free cluster,
- * check if there is still free entry in the cluster, maintaining
- * natural alignment.
- */
- max = min_t(unsigned long, si->max, ALIGN(tmp + 1, SWAPFILE_CLUSTER));
- if (tmp < max) {
- ci = lock_cluster(si, tmp);
- while (tmp < max) {
- if (swap_range_empty(si->swap_map, tmp, nr_pages))
+ if (!list_empty(&si->free_clusters)) {
+ ci = list_first_entry(&si->free_clusters, struct swap_cluster_info, list);
+ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found, order, usage);
+ VM_BUG_ON(!found);
+ goto done;
+ }
+
+ if (order < PMD_ORDER) {
+ unsigned int frags = 0;
+
+ while (!list_empty(&si->nonfull_clusters[order])) {
+ ci = list_first_entry(&si->nonfull_clusters[order],
+ struct swap_cluster_info, list);
+ list_move_tail(&ci->list, &si->frag_clusters[order]);
+ ci->flags = CLUSTER_FLAG_FRAG;
+ si->frag_cluster_nr[order]++;
+ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+ &found, order, usage);
+ frags++;
+ if (found)
break;
- tmp += nr_pages;
}
- unlock_cluster(ci);
+
+ if (!found) {
+ /*
+			 * Nonfull clusters were moved to the frag tail if we
+			 * reached here; count them too so the frag list isn't
+			 * over-scanned.
+ */
+ while (frags < si->frag_cluster_nr[order]) {
+ ci = list_first_entry(&si->frag_clusters[order],
+ struct swap_cluster_info, list);
+ /*
+				 * Rotate the frag list to iterate; these clusters all failed
+				 * a high order allocation or were moved here due to per-CPU
+				 * usage. This helps keep usable clusters at the front.
+ */
+ list_move_tail(&ci->list, &si->frag_clusters[order]);
+ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+ &found, order, usage);
+ frags++;
+ if (found)
+ break;
+ }
+ }
}
- if (tmp >= max) {
- cluster->next[order] = SWAP_NEXT_INVALID;
+
+ if (found)
+ goto done;
+
+ if (!list_empty(&si->discard_clusters)) {
+ /*
+ * we don't have free cluster but have some clusters in
+ * discarding, do discard now and reclaim them, then
+ * reread cluster_next_cpu since we dropped si->lock
+ */
+ swap_do_scheduled_discard(si);
goto new_cluster;
}
- *offset = tmp;
- *scan_base = tmp;
- tmp += nr_pages;
- cluster->next[order] = tmp < max ? tmp : SWAP_NEXT_INVALID;
- return true;
+
+ if (order)
+ goto done;
+
+	/* Order 0 allocation: steal slots from higher order clusters */
+ for (int o = 1; o < SWAP_NR_ORDERS; o++) {
+ /*
+		 * Clusters here have at least one usable slot and can't fail an order 0
+ * allocation, but reclaim may drop si->lock and race with another user.
+ */
+ while (!list_empty(&si->frag_clusters[o])) {
+ ci = list_first_entry(&si->frag_clusters[o],
+ struct swap_cluster_info, list);
+ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+ &found, 0, usage);
+ if (found)
+ goto done;
+ }
+
+ while (!list_empty(&si->nonfull_clusters[o])) {
+ ci = list_first_entry(&si->nonfull_clusters[o],
+ struct swap_cluster_info, list);
+ offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+ &found, 0, usage);
+ if (found)
+ goto done;
+ }
+ }
+
+done:
+	/* Try to reclaim from full clusters if the device is nearly full */
+ if (vm_swap_full() && (!found || (si->pages - si->inuse_pages) < SWAPFILE_CLUSTER)) {
+ swap_reclaim_full_clusters(si);
+ if (!found && !order && si->pages != si->inuse_pages)
+ goto new_cluster;
+ }
+
+ cluster->next[order] = offset;
+ return found;
}
-static void __del_from_avail_list(struct swap_info_struct *p)
+static void __del_from_avail_list(struct swap_info_struct *si)
{
int nid;
- assert_spin_locked(&p->lock);
+ assert_spin_locked(&si->lock);
for_each_node(nid)
- plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
}
-static void del_from_avail_list(struct swap_info_struct *p)
+static void del_from_avail_list(struct swap_info_struct *si)
{
spin_lock(&swap_avail_lock);
- __del_from_avail_list(p);
+ __del_from_avail_list(si);
spin_unlock(&swap_avail_lock);
}
@@ -731,13 +924,13 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
}
}
-static void add_to_avail_list(struct swap_info_struct *p)
+static void add_to_avail_list(struct swap_info_struct *si)
{
int nid;
spin_lock(&swap_avail_lock);
for_each_node(nid)
- plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]);
+ plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
spin_unlock(&swap_avail_lock);
}
@@ -747,6 +940,14 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
unsigned long begin = offset;
unsigned long end = offset + nr_entries - 1;
void (*swap_slot_free_notify)(struct block_device *, unsigned long);
+ unsigned int i;
+
+ /*
+	 * Use atomic clear_bit operations on zeromap instead of the non-atomic
+	 * bitmap_clear to prevent corruption of adjacent bits due to
+	 * simultaneous writes.
+ */
+ for (i = 0; i < nr_entries; i++)
+ clear_bit(offset + i, si->zeromap);
if (offset < si->lowest_bit)
si->lowest_bit = offset;
@@ -822,11 +1023,29 @@ static bool swap_offset_available_and_locked(struct swap_info_struct *si,
return false;
}
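+/* Allocate up to @nr swap slots of the given @order using the cluster allocator. */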
+static int cluster_alloc_swap(struct swap_info_struct *si,
+ unsigned char usage, int nr,
+ swp_entry_t slots[], int order)
+{
+ int n_ret = 0;
+
+ VM_BUG_ON(!si->cluster_info);
+
+ while (n_ret < nr) {
+ unsigned long offset = cluster_alloc_swap_entry(si, order, usage);
+
+ if (!offset)
+ break;
+ slots[n_ret++] = swp_entry(si->type, offset);
+ }
+
+ return n_ret;
+}
+
static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned char usage, int nr,
swp_entry_t slots[], int order)
{
- struct swap_cluster_info *ci;
unsigned long offset;
unsigned long scan_base;
unsigned long last_in_cluster = 0;
@@ -865,26 +1084,16 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
return 0;
}
+ if (si->cluster_info)
+ return cluster_alloc_swap(si, usage, nr, slots, order);
+
si->flags += SWP_SCANNING;
- /*
- * Use percpu scan base for SSD to reduce lock contention on
- * cluster and swap cache. For HDD, sequential access is more
- * important.
- */
- if (si->flags & SWP_SOLIDSTATE)
- scan_base = this_cpu_read(*si->cluster_next_cpu);
- else
- scan_base = si->cluster_next;
+
+ /* For HDD, sequential access is more important. */
+ scan_base = si->cluster_next;
offset = scan_base;
- /* SSD algorithm */
- if (si->cluster_info) {
- if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order)) {
- if (order > 0)
- goto no_page;
- goto scan;
- }
- } else if (unlikely(!si->cluster_nr--)) {
+ if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
@@ -895,8 +1104,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
/*
* If seek is expensive, start searching for new cluster from
* start of partition, to minimize the span of allocated swap.
- * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info
- * case, just handled by scan_swap_map_try_ssd_cluster() above.
*/
scan_base = offset = si->lowest_bit;
last_in_cluster = offset + SWAPFILE_CLUSTER - 1;
@@ -924,19 +1131,6 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
}
checks:
- if (si->cluster_info) {
- while (scan_swap_map_ssd_cluster_conflict(si, offset, order)) {
- /* take a break if we already got some slots */
- if (n_ret)
- goto done;
- if (!scan_swap_map_try_ssd_cluster(si, &offset,
- &scan_base, order)) {
- if (order > 0)
- goto no_page;
- goto scan;
- }
- }
- }
if (!(si->flags & SWP_WRITEOK))
goto no_page;
if (!si->highest_bit)
@@ -944,13 +1138,11 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
- ci = lock_cluster(si, offset);
/* reuse swap entry of cache-only swap if not busy. */
if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
- unlock_cluster(ci);
spin_unlock(&si->lock);
- swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY);
+ swap_was_freed = __try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT);
spin_lock(&si->lock);
/* entry was freed successfully, try to use this again */
if (swap_was_freed > 0)
@@ -959,15 +1151,12 @@ checks:
}
if (si->swap_map[offset]) {
- unlock_cluster(ci);
if (!n_ret)
goto scan;
else
goto done;
}
memset(si->swap_map + offset, usage, nr_pages);
- add_cluster_info_page(si, si->cluster_info, offset, nr_pages);
- unlock_cluster(ci);
swap_range_alloc(si, offset, nr_pages);
slots[n_ret++] = swp_entry(si->type, offset);
@@ -988,13 +1177,7 @@ checks:
latency_ration = LATENCY_LIMIT;
}
- /* try to get more slots in cluster */
- if (si->cluster_info) {
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base, order))
- goto checks;
- if (order > 0)
- goto done;
- } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ if (si->cluster_nr && !si->swap_map[++offset]) {
/* non-ssd case, still more slots in cluster? */
--si->cluster_nr;
goto checks;
@@ -1055,19 +1238,6 @@ no_page:
return n_ret;
}
-static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx)
-{
- unsigned long offset = idx * SWAPFILE_CLUSTER;
- struct swap_cluster_info *ci;
-
- ci = lock_cluster(si, offset);
- memset(si->swap_map + offset, 0, SWAPFILE_CLUSTER);
- cluster_set_count_flag(ci, 0, 0);
- free_cluster(si, idx);
- unlock_cluster(ci);
- swap_range_free(si, offset, SWAPFILE_CLUSTER);
-}
-
int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
{
int order = swap_entry_order(entry_order);
@@ -1148,22 +1318,22 @@ noswap:
static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
{
- struct swap_info_struct *p;
+ struct swap_info_struct *si;
unsigned long offset;
if (!entry.val)
goto out;
- p = swp_swap_info(entry);
- if (!p)
+ si = swp_swap_info(entry);
+ if (!si)
goto bad_nofile;
- if (data_race(!(p->flags & SWP_USED)))
+ if (data_race(!(si->flags & SWP_USED)))
goto bad_device;
offset = swp_offset(entry);
- if (offset >= p->max)
+ if (offset >= si->max)
goto bad_offset;
- if (data_race(!p->swap_map[swp_offset(entry)]))
+ if (data_race(!si->swap_map[swp_offset(entry)]))
goto bad_free;
- return p;
+ return si;
bad_free:
pr_err("%s: %s%08lx\n", __func__, Unused_offset, entry.val);
@@ -1196,14 +1366,14 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry,
return p;
}
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
+static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
unsigned long offset,
unsigned char usage)
{
unsigned char count;
unsigned char has_cache;
- count = p->swap_map[offset];
+ count = si->swap_map[offset];
has_cache = count & SWAP_HAS_CACHE;
count &= ~SWAP_HAS_CACHE;
@@ -1219,7 +1389,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
count = 0;
} else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) {
if (count == COUNT_CONTINUED) {
- if (swap_count_continued(p, offset, count))
+ if (swap_count_continued(si, offset, count))
count = SWAP_MAP_MAX | COUNT_CONTINUED;
else
count = SWAP_MAP_MAX;
@@ -1229,9 +1399,9 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
usage = count | has_cache;
if (usage)
- WRITE_ONCE(p->swap_map[offset], usage);
+ WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
+ WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
return usage;
}
@@ -1310,66 +1480,121 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *p,
+static unsigned char __swap_entry_free(struct swap_info_struct *si,
swp_entry_t entry)
{
struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
unsigned char usage;
- ci = lock_cluster_or_swap_info(p, offset);
- usage = __swap_entry_free_locked(p, offset, 1);
- unlock_cluster_or_swap_info(p, ci);
+ ci = lock_cluster_or_swap_info(si, offset);
+ usage = __swap_entry_free_locked(si, offset, 1);
+ unlock_cluster_or_swap_info(si, ci);
if (!usage)
free_swap_slot(entry);
return usage;
}
-static void swap_entry_free(struct swap_info_struct *p, swp_entry_t entry)
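+/*
+ * Batched version of __swap_entry_free(): drop the last map reference of
+ * @nr contiguous entries in one go when they share a cluster and each has a
+ * swap count of 1, otherwise fall back to freeing them one by one. Returns
+ * true if any entry is left pinned by the swap cache.
+ */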
+static bool __swap_entries_free(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
+ unsigned int type = swp_type(entry);
+ struct swap_cluster_info *ci;
+ bool has_cache = false;
unsigned char count;
+ int i;
+
+ if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ goto fallback;
+ /* cross into another cluster */
+ if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ goto fallback;
+
+ ci = lock_cluster_or_swap_info(si, offset);
+ if (!swap_is_last_map(si, offset, nr, &has_cache)) {
+ unlock_cluster_or_swap_info(si, ci);
+ goto fallback;
+ }
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
+ unlock_cluster_or_swap_info(si, ci);
+
+ if (!has_cache) {
+ for (i = 0; i < nr; i++)
+ zswap_invalidate(swp_entry(si->type, offset + i));
+ spin_lock(&si->lock);
+ swap_entry_range_free(si, entry, nr);
+ spin_unlock(&si->lock);
+ }
+ return has_cache;
+
+fallback:
+ for (i = 0; i < nr; i++) {
+ if (data_race(si->swap_map[offset + i])) {
+ count = __swap_entry_free(si, swp_entry(type, offset + i));
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
+ } else {
+ WARN_ON_ONCE(1);
+ }
+ }
+ return has_cache;
+}
- ci = lock_cluster(p, offset);
- count = p->swap_map[offset];
- VM_BUG_ON(count != SWAP_HAS_CACHE);
- p->swap_map[offset] = 0;
- dec_cluster_info_page(p, p->cluster_info, offset);
+/*
+ * Drop the last HAS_CACHE flag of swap entries; the caller has to
+ * ensure all entries belong to the same cgroup.
+ */
+static void swap_entry_range_free(struct swap_info_struct *si, swp_entry_t entry,
+ unsigned int nr_pages)
+{
+ unsigned long offset = swp_offset(entry);
+ unsigned char *map = si->swap_map + offset;
+ unsigned char *map_end = map + nr_pages;
+ struct swap_cluster_info *ci;
+
+ ci = lock_cluster(si, offset);
+ do {
+ VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ *map = 0;
+ } while (++map < map_end);
+ dec_cluster_info_page(si, ci, nr_pages);
unlock_cluster(ci);
- mem_cgroup_uncharge_swap(entry, 1);
- swap_range_free(p, offset, 1);
+ mem_cgroup_uncharge_swap(entry, nr_pages);
+ swap_range_free(si, offset, nr_pages);
}
-static void cluster_swap_free_nr(struct swap_info_struct *sis,
- unsigned long offset, int nr_pages)
+static void cluster_swap_free_nr(struct swap_info_struct *si,
+ unsigned long offset, int nr_pages,
+ unsigned char usage)
{
struct swap_cluster_info *ci;
DECLARE_BITMAP(to_free, BITS_PER_LONG) = { 0 };
int i, nr;
- ci = lock_cluster_or_swap_info(sis, offset);
+ ci = lock_cluster_or_swap_info(si, offset);
while (nr_pages) {
nr = min(BITS_PER_LONG, nr_pages);
for (i = 0; i < nr; i++) {
- if (!__swap_entry_free_locked(sis, offset + i, 1))
+ if (!__swap_entry_free_locked(si, offset + i, usage))
bitmap_set(to_free, i, 1);
}
if (!bitmap_empty(to_free, BITS_PER_LONG)) {
- unlock_cluster_or_swap_info(sis, ci);
+ unlock_cluster_or_swap_info(si, ci);
for_each_set_bit(i, to_free, BITS_PER_LONG)
- free_swap_slot(swp_entry(sis->type, offset + i));
+ free_swap_slot(swp_entry(si->type, offset + i));
if (nr == nr_pages)
return;
bitmap_clear(to_free, 0, BITS_PER_LONG);
- ci = lock_cluster_or_swap_info(sis, offset);
+ ci = lock_cluster_or_swap_info(si, offset);
}
offset += nr;
nr_pages -= nr;
}
- unlock_cluster_or_swap_info(sis, ci);
+ unlock_cluster_or_swap_info(si, ci);
}
/*
@@ -1388,7 +1613,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr);
+ cluster_swap_free_nr(sis, offset, nr, 1);
offset += nr;
nr_pages -= nr;
}
@@ -1400,12 +1625,8 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
unsigned long offset = swp_offset(entry);
- unsigned long idx = offset / SWAPFILE_CLUSTER;
struct swap_cluster_info *ci;
struct swap_info_struct *si;
- unsigned char *map;
- unsigned int i, free_entries = 0;
- unsigned char val;
int size = 1 << swap_entry_order(folio_order(folio));
si = _swap_info_get(entry);
@@ -1413,24 +1634,14 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
return;
ci = lock_cluster_or_swap_info(si, offset);
- if (size == SWAPFILE_CLUSTER) {
- map = si->swap_map + offset;
- for (i = 0; i < SWAPFILE_CLUSTER; i++) {
- val = map[i];
- VM_BUG_ON(!(val & SWAP_HAS_CACHE));
- if (val == SWAP_HAS_CACHE)
- free_entries++;
- }
- if (free_entries == SWAPFILE_CLUSTER) {
- unlock_cluster_or_swap_info(si, ci);
- spin_lock(&si->lock);
- mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER);
- swap_free_cluster(si, idx);
- spin_unlock(&si->lock);
- return;
- }
+ if (size > 1 && swap_is_has_cache(si, offset, size)) {
+ unlock_cluster_or_swap_info(si, ci);
+ spin_lock(&si->lock);
+ swap_entry_range_free(si, entry, size);
+ spin_unlock(&si->lock);
+ return;
}
- for (i = 0; i < size; i++, entry.val++) {
+ for (int i = 0; i < size; i++, entry.val++) {
if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) {
unlock_cluster_or_swap_info(si, ci);
free_swap_slot(entry);
@@ -1470,7 +1681,7 @@ void swapcache_free_entries(swp_entry_t *entries, int n)
for (i = 0; i < n; ++i) {
p = swap_info_get_cont(entries[i], prev);
if (p)
- swap_entry_free(p, entries[i]);
+ swap_entry_range_free(p, entries[i], 1);
prev = p;
}
if (p)
@@ -1509,28 +1720,28 @@ int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry)
int swp_swapcount(swp_entry_t entry)
{
int count, tmp_count, n;
- struct swap_info_struct *p;
+ struct swap_info_struct *si;
struct swap_cluster_info *ci;
struct page *page;
pgoff_t offset;
unsigned char *map;
- p = _swap_info_get(entry);
- if (!p)
+ si = _swap_info_get(entry);
+ if (!si)
return 0;
offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(p, offset);
+ ci = lock_cluster_or_swap_info(si, offset);
- count = swap_count(p->swap_map[offset]);
+ count = swap_count(si->swap_map[offset]);
if (!(count & COUNT_CONTINUED))
goto out;
count &= ~COUNT_CONTINUED;
n = SWAP_MAP_MAX + 1;
- page = vmalloc_to_page(p->swap_map + offset);
+ page = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
VM_BUG_ON(page_private(page) != SWP_CONTINUED);
@@ -1544,7 +1755,7 @@ int swp_swapcount(swp_entry_t entry)
n *= (SWAP_CONT_MAX + 1);
} while (tmp_count & COUNT_CONTINUED);
out:
- unlock_cluster_or_swap_info(p, ci);
+ unlock_cluster_or_swap_info(si, ci);
return count;
}
@@ -1590,16 +1801,7 @@ static bool folio_swapped(struct folio *folio)
return swap_page_trans_huge_swapped(si, entry, folio_order(folio));
}
-/**
- * folio_free_swap() - Free the swap space used for this folio.
- * @folio: The folio to remove.
- *
- * If swap is getting full, or if there are no more mappings of this folio,
- * then call folio_free_swap to free its swap space.
- *
- * Return: true if we were able to release the swap space.
- */
-bool folio_free_swap(struct folio *folio)
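+/*
+ * Check whether a swap cache folio's swap space may be freed: the folio must
+ * be in the swap cache, must not be under writeback, and hibernation must
+ * not have begun snapshotting memory.
+ */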
+static bool folio_swapcache_freeable(struct folio *folio)
{
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
@@ -1607,8 +1809,6 @@ bool folio_free_swap(struct folio *folio)
return false;
if (folio_test_writeback(folio))
return false;
- if (folio_swapped(folio))
- return false;
/*
* Once hibernation has begun to create its image of memory,
@@ -1628,6 +1828,25 @@ bool folio_free_swap(struct folio *folio)
if (pm_suspended_storage())
return false;
+ return true;
+}
+
+/**
+ * folio_free_swap() - Free the swap space used for this folio.
+ * @folio: The folio to remove.
+ *
+ * If swap is getting full, or if there are no more mappings of this folio,
+ * then call folio_free_swap to free its swap space.
+ *
+ * Return: true if we were able to release the swap space.
+ */
+bool folio_free_swap(struct folio *folio)
+{
+ if (!folio_swapcache_freeable(folio))
+ return false;
+ if (folio_swapped(folio))
+ return false;
+
delete_from_swap_cache(folio);
folio_set_dirty(folio);
return true;
@@ -1647,11 +1866,9 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
{
const unsigned long start_offset = swp_offset(entry);
const unsigned long end_offset = start_offset + nr;
- unsigned int type = swp_type(entry);
struct swap_info_struct *si;
bool any_only_cache = false;
unsigned long offset;
- unsigned char count;
if (non_swap_entry(entry))
return;
@@ -1666,15 +1883,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- for (offset = start_offset; offset < end_offset; offset++) {
- if (data_race(si->swap_map[offset])) {
- count = __swap_entry_free(si, swp_entry(type, offset));
- if (count == SWAP_HAS_CACHE)
- any_only_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
- }
+ any_only_cache = __swap_entries_free(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1704,7 +1913,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
* to the next boundary.
*/
nr = __try_to_reclaim_swap(si, offset,
- TTRS_UNMAPPED | TTRS_FULL);
+ TTRS_UNMAPPED | TTRS_FULL);
if (nr == 0)
nr = 1;
else if (nr < 0)
@@ -1979,7 +2188,6 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
folio = swap_cache_get_folio(entry, vma, addr);
if (!folio) {
- struct page *page;
struct vm_fault vmf = {
.vma = vma,
.address = addr,
@@ -1987,10 +2195,8 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
.pmd = pmd,
};
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ folio = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
&vmf);
- if (page)
- folio = page_folio(page);
}
if (!folio) {
swp_count = READ_ONCE(si->swap_map[offset]);
@@ -2397,52 +2603,54 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
return generic_swapfile_activate(sis, swap_file, span);
}
-static int swap_node(struct swap_info_struct *p)
+static int swap_node(struct swap_info_struct *si)
{
struct block_device *bdev;
- if (p->bdev)
- bdev = p->bdev;
+ if (si->bdev)
+ bdev = si->bdev;
else
- bdev = p->swap_file->f_inode->i_sb->s_bdev;
+ bdev = si->swap_file->f_inode->i_sb->s_bdev;
return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
}
-static void setup_swap_info(struct swap_info_struct *p, int prio,
+static void setup_swap_info(struct swap_info_struct *si, int prio,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+ struct swap_cluster_info *cluster_info,
+ unsigned long *zeromap)
{
int i;
if (prio >= 0)
- p->prio = prio;
+ si->prio = prio;
else
- p->prio = --least_priority;
+ si->prio = --least_priority;
/*
* the plist prio is negated because plist ordering is
* low-to-high, while swap ordering is high-to-low
*/
- p->list.prio = -p->prio;
+ si->list.prio = -si->prio;
for_each_node(i) {
- if (p->prio >= 0)
- p->avail_lists[i].prio = -p->prio;
+ if (si->prio >= 0)
+ si->avail_lists[i].prio = -si->prio;
else {
- if (swap_node(p) == i)
- p->avail_lists[i].prio = 1;
+ if (swap_node(si) == i)
+ si->avail_lists[i].prio = 1;
else
- p->avail_lists[i].prio = -p->prio;
+ si->avail_lists[i].prio = -si->prio;
}
}
- p->swap_map = swap_map;
- p->cluster_info = cluster_info;
+ si->swap_map = swap_map;
+ si->cluster_info = cluster_info;
+ si->zeromap = zeromap;
}
-static void _enable_swap_info(struct swap_info_struct *p)
+static void _enable_swap_info(struct swap_info_struct *si)
{
- p->flags |= SWP_WRITEOK;
- atomic_long_add(p->pages, &nr_swap_pages);
- total_swap_pages += p->pages;
+ si->flags |= SWP_WRITEOK;
+ atomic_long_add(si->pages, &nr_swap_pages);
+ total_swap_pages += si->pages;
assert_spin_locked(&swap_lock);
/*
@@ -2455,40 +2663,41 @@ static void _enable_swap_info(struct swap_info_struct *p)
* which allocates swap pages from the highest available priority
* swap_info_struct.
*/
- plist_add(&p->list, &swap_active_head);
+ plist_add(&si->list, &swap_active_head);
/* add to available list iff swap device is not full */
- if (p->highest_bit)
- add_to_avail_list(p);
+ if (si->highest_bit)
+ add_to_avail_list(si);
}
-static void enable_swap_info(struct swap_info_struct *p, int prio,
+static void enable_swap_info(struct swap_info_struct *si, int prio,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info)
+ struct swap_cluster_info *cluster_info,
+ unsigned long *zeromap)
{
spin_lock(&swap_lock);
- spin_lock(&p->lock);
- setup_swap_info(p, prio, swap_map, cluster_info);
- spin_unlock(&p->lock);
+ spin_lock(&si->lock);
+ setup_swap_info(si, prio, swap_map, cluster_info, zeromap);
+ spin_unlock(&si->lock);
spin_unlock(&swap_lock);
/*
* Finished initializing swap device, now it's safe to reference it.
*/
- percpu_ref_resurrect(&p->users);
+ percpu_ref_resurrect(&si->users);
spin_lock(&swap_lock);
- spin_lock(&p->lock);
- _enable_swap_info(p);
- spin_unlock(&p->lock);
+ spin_lock(&si->lock);
+ _enable_swap_info(si);
+ spin_unlock(&si->lock);
spin_unlock(&swap_lock);
}
-static void reinsert_swap_info(struct swap_info_struct *p)
+static void reinsert_swap_info(struct swap_info_struct *si)
{
spin_lock(&swap_lock);
- spin_lock(&p->lock);
- setup_swap_info(p, p->prio, p->swap_map, p->cluster_info);
- _enable_swap_info(p);
- spin_unlock(&p->lock);
+ spin_lock(&si->lock);
+ setup_swap_info(si, si->prio, si->swap_map, si->cluster_info, si->zeromap);
+ _enable_swap_info(si);
+ spin_unlock(&si->lock);
spin_unlock(&swap_lock);
}
@@ -2511,6 +2720,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
{
struct swap_info_struct *p = NULL;
unsigned char *swap_map;
+ unsigned long *zeromap;
struct swap_cluster_info *cluster_info;
struct file *swap_file, *victim;
struct address_space *mapping;
@@ -2633,6 +2843,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->max = 0;
swap_map = p->swap_map;
p->swap_map = NULL;
+ zeromap = p->zeromap;
+ p->zeromap = NULL;
cluster_info = p->cluster_info;
p->cluster_info = NULL;
spin_unlock(&p->lock);
@@ -2645,6 +2857,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
free_percpu(p->cluster_next_cpu);
p->cluster_next_cpu = NULL;
vfree(swap_map);
+ kvfree(zeromap);
kvfree(cluster_info);
/* Destroy swap account information */
swap_cgroup_swapoff(p->type);
@@ -2874,20 +3087,20 @@ static struct swap_info_struct *alloc_swap_info(void)
return p;
}
-static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
+static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
{
if (S_ISBLK(inode->i_mode)) {
- p->bdev = I_BDEV(inode);
+ si->bdev = I_BDEV(inode);
/*
* Zoned block devices contain zones that have a sequential
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (bdev_is_zoned(p->bdev))
+ if (bdev_is_zoned(si->bdev))
return -EINVAL;
- p->flags |= SWP_BLKDEV;
+ si->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
- p->bdev = inode->i_sb->s_bdev;
+ si->bdev = inode->i_sb->s_bdev;
}
return 0;
@@ -2922,7 +3135,7 @@ __weak unsigned long arch_max_swapfile_size(void)
return generic_max_swapfile_size();
}
-static unsigned long read_swap_header(struct swap_info_struct *p,
+static unsigned long read_swap_header(struct swap_info_struct *si,
union swap_header *swap_header,
struct inode *inode)
{
@@ -2953,9 +3166,9 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
return 0;
}
- p->lowest_bit = 1;
- p->cluster_next = 1;
- p->cluster_nr = 0;
+ si->lowest_bit = 1;
+ si->cluster_next = 1;
+ si->cluster_nr = 0;
maxpages = swapfile_maximum_size;
last_page = swap_header->info.last_page;
@@ -2973,7 +3186,7 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
if ((unsigned int)maxpages == 0)
maxpages = UINT_MAX;
}
- p->highest_bit = maxpages - 1;
+ si->highest_bit = maxpages - 1;
if (!maxpages)
return 0;
@@ -2997,25 +3210,18 @@ static unsigned long read_swap_header(struct swap_info_struct *p,
#define SWAP_CLUSTER_COLS \
max_t(unsigned int, SWAP_CLUSTER_INFO_COLS, SWAP_CLUSTER_SPACE_COLS)
-static int setup_swap_map_and_extents(struct swap_info_struct *p,
+static int setup_swap_map_and_extents(struct swap_info_struct *si,
union swap_header *swap_header,
unsigned char *swap_map,
- struct swap_cluster_info *cluster_info,
unsigned long maxpages,
sector_t *span)
{
- unsigned int j, k;
unsigned int nr_good_pages;
+ unsigned long i;
int nr_extents;
- unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
- unsigned long col = p->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
- unsigned long i, idx;
nr_good_pages = maxpages - 1; /* omit header page */
- cluster_list_init(&p->free_clusters);
- cluster_list_init(&p->discard_clusters);
-
for (i = 0; i < swap_header->info.nr_badpages; i++) {
unsigned int page_nr = swap_header->info.badpages[i];
if (page_nr == 0 || page_nr > swap_header->info.last_page)
@@ -3023,40 +3229,87 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
if (page_nr < maxpages) {
swap_map[page_nr] = SWAP_MAP_BAD;
nr_good_pages--;
- /*
- * Haven't marked the cluster free yet, no list
- * operation involved
- */
- inc_cluster_info_page(p, cluster_info, page_nr);
}
}
- /* Haven't marked the cluster free yet, no list operation involved */
- for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
- inc_cluster_info_page(p, cluster_info, i);
-
if (nr_good_pages) {
swap_map[0] = SWAP_MAP_BAD;
- /*
- * Not mark the cluster free yet, no list
- * operation involved
- */
- inc_cluster_info_page(p, cluster_info, 0);
- p->max = maxpages;
- p->pages = nr_good_pages;
- nr_extents = setup_swap_extents(p, span);
+ si->max = maxpages;
+ si->pages = nr_good_pages;
+ nr_extents = setup_swap_extents(si, span);
if (nr_extents < 0)
return nr_extents;
- nr_good_pages = p->pages;
+ nr_good_pages = si->pages;
}
if (!nr_good_pages) {
pr_warn("Empty swap-file\n");
return -EINVAL;
}
+ return nr_extents;
+}
+
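+/*
+ * Allocate and initialize the cluster_info array, the per-CPU cluster state
+ * and the cluster lists for a new swap device. Returns the array on success
+ * or an ERR_PTR() on failure.
+ */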
+static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
+ union swap_header *swap_header,
+ unsigned long maxpages)
+{
+ unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+ unsigned long col = si->cluster_next / SWAPFILE_CLUSTER % SWAP_CLUSTER_COLS;
+ struct swap_cluster_info *cluster_info;
+ unsigned long i, j, k, idx;
+ int cpu, err = -ENOMEM;
+
+ cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
if (!cluster_info)
- return nr_extents;
+ goto err;
+
+ for (i = 0; i < nr_clusters; i++)
+ spin_lock_init(&cluster_info[i].lock);
+
+ si->cluster_next_cpu = alloc_percpu(unsigned int);
+ if (!si->cluster_next_cpu)
+ goto err_free;
+
+ /* Random start position to help with wear leveling */
+ for_each_possible_cpu(cpu)
+ per_cpu(*si->cluster_next_cpu, cpu) =
+ get_random_u32_inclusive(1, si->highest_bit);
+
+ si->percpu_cluster = alloc_percpu(struct percpu_cluster);
+ if (!si->percpu_cluster)
+ goto err_free;
+
+ for_each_possible_cpu(cpu) {
+ struct percpu_cluster *cluster;
+
+ cluster = per_cpu_ptr(si->percpu_cluster, cpu);
+ for (i = 0; i < SWAP_NR_ORDERS; i++)
+ cluster->next[i] = SWAP_NEXT_INVALID;
+ }
+
+ /*
+ * Mark unusable pages as unavailable. The clusters aren't
+ * marked free yet, so no list operations are involved yet.
+ *
+ * See setup_swap_map_and_extents(): header page, bad pages,
+ * and the EOF part of the last cluster.
+ */
+ inc_cluster_info_page(si, cluster_info, 0);
+ for (i = 0; i < swap_header->info.nr_badpages; i++)
+ inc_cluster_info_page(si, cluster_info,
+ swap_header->info.badpages[i]);
+ for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++)
+ inc_cluster_info_page(si, cluster_info, i);
+
+ INIT_LIST_HEAD(&si->free_clusters);
+ INIT_LIST_HEAD(&si->full_clusters);
+ INIT_LIST_HEAD(&si->discard_clusters);
+ for (i = 0; i < SWAP_NR_ORDERS; i++) {
+ INIT_LIST_HEAD(&si->nonfull_clusters[i]);
+ INIT_LIST_HEAD(&si->frag_clusters[i]);
+ si->frag_cluster_nr[i] = 0;
+ }
/*
* Reduce false cache line sharing between cluster_info and
@@ -3065,22 +3318,32 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
for (k = 0; k < SWAP_CLUSTER_COLS; k++) {
j = (k + col) % SWAP_CLUSTER_COLS;
for (i = 0; i < DIV_ROUND_UP(nr_clusters, SWAP_CLUSTER_COLS); i++) {
+ struct swap_cluster_info *ci;
idx = i * SWAP_CLUSTER_COLS + j;
+ ci = cluster_info + idx;
if (idx >= nr_clusters)
continue;
- if (cluster_count(&cluster_info[idx]))
+ if (ci->count) {
+ ci->flags = CLUSTER_FLAG_NONFULL;
+ list_add_tail(&ci->list, &si->nonfull_clusters[0]);
continue;
- cluster_set_flag(&cluster_info[idx], CLUSTER_FLAG_FREE);
- cluster_list_add_tail(&p->free_clusters, cluster_info,
- idx);
+ }
+ ci->flags = CLUSTER_FLAG_FREE;
+ list_add_tail(&ci->list, &si->free_clusters);
}
}
- return nr_extents;
+
+ return cluster_info;
+
+err_free:
+ kvfree(cluster_info);
+err:
+ return ERR_PTR(err);
}
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
- struct swap_info_struct *p;
+ struct swap_info_struct *si;
struct filename *name;
struct file *swap_file = NULL;
struct address_space *mapping;
@@ -3092,8 +3355,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sector_t span;
unsigned long maxpages;
unsigned char *swap_map = NULL;
+ unsigned long *zeromap = NULL;
struct swap_cluster_info *cluster_info = NULL;
- struct page *page = NULL;
+ struct folio *folio = NULL;
struct inode *inode = NULL;
bool inced_nr_rotate_swap = false;
@@ -3106,11 +3370,11 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (!swap_avail_heads)
return -ENOMEM;
- p = alloc_swap_info();
- if (IS_ERR(p))
- return PTR_ERR(p);
+ si = alloc_swap_info();
+ if (IS_ERR(si))
+ return PTR_ERR(si);
- INIT_WORK(&p->discard_work, swap_discard_work);
+ INIT_WORK(&si->discard_work, swap_discard_work);
name = getname(specialfile);
if (IS_ERR(name)) {
@@ -3125,12 +3389,12 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap;
}
- p->swap_file = swap_file;
+ si->swap_file = swap_file;
mapping = swap_file->f_mapping;
dentry = swap_file->f_path.dentry;
inode = mapping->host;
- error = claim_swapfile(p, inode);
+ error = claim_swapfile(si, inode);
if (unlikely(error))
goto bad_swap;
@@ -3151,14 +3415,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = -EINVAL;
goto bad_swap_unlock_inode;
}
- page = read_mapping_page(mapping, 0, swap_file);
- if (IS_ERR(page)) {
- error = PTR_ERR(page);
+ folio = read_mapping_folio(mapping, 0, swap_file);
+ if (IS_ERR(folio)) {
+ error = PTR_ERR(folio);
goto bad_swap_unlock_inode;
}
- swap_header = kmap(page);
+ swap_header = kmap_local_folio(folio, 0);
- maxpages = read_swap_header(p, swap_header, inode);
+ maxpages = read_swap_header(si, swap_header, inode);
if (unlikely(!maxpages)) {
error = -EINVAL;
goto bad_swap_unlock_inode;
@@ -3171,79 +3435,57 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (p->bdev && bdev_stable_writes(p->bdev))
- p->flags |= SWP_STABLE_WRITES;
+ error = swap_cgroup_swapon(si->type, maxpages);
+ if (error)
+ goto bad_swap_unlock_inode;
- if (p->bdev && bdev_synchronous(p->bdev))
- p->flags |= SWP_SYNCHRONOUS_IO;
+ nr_extents = setup_swap_map_and_extents(si, swap_header, swap_map,
+ maxpages, &span);
+ if (unlikely(nr_extents < 0)) {
+ error = nr_extents;
+ goto bad_swap_unlock_inode;
+ }
- if (p->bdev && bdev_nonrot(p->bdev)) {
- int cpu, i;
- unsigned long ci, nr_cluster;
+ /*
+ * Use kvmalloc_array instead of bitmap_zalloc as the allocation order might
+	 * be above MAX_PAGE_ORDER in case of a large swap file.
+ */
+ zeromap = kvmalloc_array(BITS_TO_LONGS(maxpages), sizeof(long),
+ GFP_KERNEL | __GFP_ZERO);
+ if (!zeromap) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
- p->flags |= SWP_SOLIDSTATE;
- p->cluster_next_cpu = alloc_percpu(unsigned int);
- if (!p->cluster_next_cpu) {
- error = -ENOMEM;
- goto bad_swap_unlock_inode;
- }
- /*
- * select a random position to start with to help wear leveling
- * SSD
- */
- for_each_possible_cpu(cpu) {
- per_cpu(*p->cluster_next_cpu, cpu) =
- get_random_u32_inclusive(1, p->highest_bit);
- }
- nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
+ if (si->bdev && bdev_stable_writes(si->bdev))
+ si->flags |= SWP_STABLE_WRITES;
- cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
- GFP_KERNEL);
- if (!cluster_info) {
- error = -ENOMEM;
- goto bad_swap_unlock_inode;
- }
+ if (si->bdev && bdev_synchronous(si->bdev))
+ si->flags |= SWP_SYNCHRONOUS_IO;
- for (ci = 0; ci < nr_cluster; ci++)
- spin_lock_init(&((cluster_info + ci)->lock));
+ if (si->bdev && bdev_nonrot(si->bdev)) {
+ si->flags |= SWP_SOLIDSTATE;
- p->percpu_cluster = alloc_percpu(struct percpu_cluster);
- if (!p->percpu_cluster) {
- error = -ENOMEM;
+ cluster_info = setup_clusters(si, swap_header, maxpages);
+ if (IS_ERR(cluster_info)) {
+ error = PTR_ERR(cluster_info);
+ cluster_info = NULL;
goto bad_swap_unlock_inode;
}
- for_each_possible_cpu(cpu) {
- struct percpu_cluster *cluster;
-
- cluster = per_cpu_ptr(p->percpu_cluster, cpu);
- for (i = 0; i < SWAP_NR_ORDERS; i++)
- cluster->next[i] = SWAP_NEXT_INVALID;
- }
} else {
atomic_inc(&nr_rotate_swap);
inced_nr_rotate_swap = true;
}
- error = swap_cgroup_swapon(p->type, maxpages);
- if (error)
- goto bad_swap_unlock_inode;
-
- nr_extents = setup_swap_map_and_extents(p, swap_header, swap_map,
- cluster_info, maxpages, &span);
- if (unlikely(nr_extents < 0)) {
- error = nr_extents;
- goto bad_swap_unlock_inode;
- }
-
if ((swap_flags & SWAP_FLAG_DISCARD) &&
- p->bdev && bdev_max_discard_sectors(p->bdev)) {
+ si->bdev && bdev_max_discard_sectors(si->bdev)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
* order to sustain backward compatibility with older
* swapon(8) releases.
*/
- p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
+ si->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD |
SWP_PAGE_DISCARD);
/*
@@ -3253,24 +3495,24 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
* Now it's time to adjust the p->flags accordingly.
*/
if (swap_flags & SWAP_FLAG_DISCARD_ONCE)
- p->flags &= ~SWP_PAGE_DISCARD;
+ si->flags &= ~SWP_PAGE_DISCARD;
else if (swap_flags & SWAP_FLAG_DISCARD_PAGES)
- p->flags &= ~SWP_AREA_DISCARD;
+ si->flags &= ~SWP_AREA_DISCARD;
/* issue a swapon-time discard if it's still required */
- if (p->flags & SWP_AREA_DISCARD) {
- int err = discard_swap(p);
+ if (si->flags & SWP_AREA_DISCARD) {
+ int err = discard_swap(si);
if (unlikely(err))
pr_err("swapon: discard_swap(%p): %d\n",
- p, err);
+ si, err);
}
}
- error = init_swap_address_space(p->type, maxpages);
+ error = init_swap_address_space(si->type, maxpages);
if (error)
goto bad_swap_unlock_inode;
- error = zswap_swapon(p->type, maxpages);
+ error = zswap_swapon(si->type, maxpages);
if (error)
goto free_swap_address_space;
@@ -3290,15 +3532,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
if (swap_flags & SWAP_FLAG_PREFER)
prio =
(swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT;
- enable_swap_info(p, prio, swap_map, cluster_info);
+ enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
pr_info("Adding %uk swap on %s. Priority:%d extents:%d across:%lluk %s%s%s%s\n",
- K(p->pages), name->name, p->prio, nr_extents,
+ K(si->pages), name->name, si->prio, nr_extents,
K((unsigned long long)span),
- (p->flags & SWP_SOLIDSTATE) ? "SS" : "",
- (p->flags & SWP_DISCARDABLE) ? "D" : "",
- (p->flags & SWP_AREA_DISCARD) ? "s" : "",
- (p->flags & SWP_PAGE_DISCARD) ? "c" : "");
+ (si->flags & SWP_SOLIDSTATE) ? "SS" : "",
+ (si->flags & SWP_DISCARDABLE) ? "D" : "",
+ (si->flags & SWP_AREA_DISCARD) ? "s" : "",
+ (si->flags & SWP_PAGE_DISCARD) ? "c" : "");
mutex_unlock(&swapon_mutex);
atomic_inc(&proc_poll_event);
@@ -3307,34 +3549,33 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
error = 0;
goto out;
free_swap_zswap:
- zswap_swapoff(p->type);
+ zswap_swapoff(si->type);
free_swap_address_space:
- exit_swap_address_space(p->type);
+ exit_swap_address_space(si->type);
bad_swap_unlock_inode:
inode_unlock(inode);
bad_swap:
- free_percpu(p->percpu_cluster);
- p->percpu_cluster = NULL;
- free_percpu(p->cluster_next_cpu);
- p->cluster_next_cpu = NULL;
+ free_percpu(si->percpu_cluster);
+ si->percpu_cluster = NULL;
+ free_percpu(si->cluster_next_cpu);
+ si->cluster_next_cpu = NULL;
inode = NULL;
- destroy_swap_extents(p);
- swap_cgroup_swapoff(p->type);
+ destroy_swap_extents(si);
+ swap_cgroup_swapoff(si->type);
spin_lock(&swap_lock);
- p->swap_file = NULL;
- p->flags = 0;
+ si->swap_file = NULL;
+ si->flags = 0;
spin_unlock(&swap_lock);
vfree(swap_map);
+ kvfree(zeromap);
kvfree(cluster_info);
if (inced_nr_rotate_swap)
atomic_dec(&nr_rotate_swap);
if (swap_file)
filp_close(swap_file, NULL);
out:
- if (page && !IS_ERR(page)) {
- kunmap(page);
- put_page(page);
- }
+ if (!IS_ERR_OR_NULL(folio))
+ folio_release_kmap(folio, swap_header);
if (name)
putname(name);
if (inode)
@@ -3362,7 +3603,7 @@ void si_swapinfo(struct sysinfo *val)
}
/*
- * Verify that a swap entry is valid and increment its swap map count.
+ * Verify that nr swap entries are valid and increment their swap map counts.
*
* Returns error code in following case.
* - success -> 0
@@ -3372,63 +3613,76 @@ void si_swapinfo(struct sysinfo *val)
* - swap-cache reference is requested but the entry is not used. -> ENOENT
* - swap-mapped reference requested but needs continued swap count. -> ENOMEM
*/
-static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
+static int __swap_duplicate(swp_entry_t entry, unsigned char usage, int nr)
{
- struct swap_info_struct *p;
+ struct swap_info_struct *si;
struct swap_cluster_info *ci;
unsigned long offset;
unsigned char count;
unsigned char has_cache;
- int err;
+ int err, i;
- p = swp_swap_info(entry);
+ si = swp_swap_info(entry);
offset = swp_offset(entry);
- ci = lock_cluster_or_swap_info(p, offset);
-
- count = p->swap_map[offset];
-
- /*
- * swapin_readahead() doesn't check if a swap entry is valid, so the
- * swap entry could be SWAP_MAP_BAD. Check here with lock held.
- */
- if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
- err = -ENOENT;
- goto unlock_out;
- }
+ VM_WARN_ON(nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
+ VM_WARN_ON(usage == 1 && nr > 1);
+ ci = lock_cluster_or_swap_info(si, offset);
- has_cache = count & SWAP_HAS_CACHE;
- count &= ~SWAP_HAS_CACHE;
err = 0;
+ for (i = 0; i < nr; i++) {
+ count = si->swap_map[offset + i];
- if (usage == SWAP_HAS_CACHE) {
+ /*
+ * swapin_readahead() doesn't check if a swap entry is valid, so the
+ * swap entry could be SWAP_MAP_BAD. Check here with lock held.
+ */
+ if (unlikely(swap_count(count) == SWAP_MAP_BAD)) {
+ err = -ENOENT;
+ goto unlock_out;
+ }
- /* set SWAP_HAS_CACHE if there is no cache and entry is used */
- if (!has_cache && count)
- has_cache = SWAP_HAS_CACHE;
- else if (has_cache) /* someone else added cache */
- err = -EEXIST;
- else /* no users remaining */
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
+
+ if (!count && !has_cache) {
err = -ENOENT;
+ } else if (usage == SWAP_HAS_CACHE) {
+ if (has_cache)
+ err = -EEXIST;
+ } else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) {
+ err = -EINVAL;
+ }
+
+ if (err)
+ goto unlock_out;
+ }
- } else if (count || has_cache) {
+ for (i = 0; i < nr; i++) {
+ count = si->swap_map[offset + i];
+ has_cache = count & SWAP_HAS_CACHE;
+ count &= ~SWAP_HAS_CACHE;
- if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
+ if (usage == SWAP_HAS_CACHE)
+ has_cache = SWAP_HAS_CACHE;
+ else if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX)
count += usage;
- else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX)
- err = -EINVAL;
- else if (swap_count_continued(p, offset, count))
+ else if (swap_count_continued(si, offset + i, count))
count = COUNT_CONTINUED;
- else
+ else {
+ /*
+ * Don't need to rollback changes, because if
+ * usage == 1, there must be nr == 1.
+ */
err = -ENOMEM;
- } else
- err = -ENOENT; /* unused swap entry */
+ goto unlock_out;
+ }
- if (!err)
- WRITE_ONCE(p->swap_map[offset], count | has_cache);
+ WRITE_ONCE(si->swap_map[offset + i], count | has_cache);
+ }
unlock_out:
- unlock_cluster_or_swap_info(p, ci);
+ unlock_cluster_or_swap_info(si, ci);
return err;
}
@@ -3436,9 +3690,9 @@ unlock_out:
* Help swapoff by noting that swap entry belongs to shmem/tmpfs
* (in which case its reference count is never incremented).
*/
-void swap_shmem_alloc(swp_entry_t entry)
+void swap_shmem_alloc(swp_entry_t entry, int nr)
{
- __swap_duplicate(entry, SWAP_MAP_SHMEM);
+ __swap_duplicate(entry, SWAP_MAP_SHMEM, nr);
}
/*
@@ -3452,35 +3706,29 @@ int swap_duplicate(swp_entry_t entry)
{
int err = 0;
- while (!err && __swap_duplicate(entry, 1) == -ENOMEM)
+ while (!err && __swap_duplicate(entry, 1, 1) == -ENOMEM)
err = add_swap_count_continuation(entry, GFP_ATOMIC);
return err;
}
/*
- * @entry: swap entry for which we allocate swap cache.
+ * @entry: the first of the nr swap entries for which we allocate swap cache.
*
- * Called when allocating swap cache for existing swap entry,
+ * Called when allocating swap cache for existing swap entries,
* This can return error codes. Returns 0 at success.
* -EEXIST means there is a swap cache.
* Note: return code is different from swap_duplicate().
*/
-int swapcache_prepare(swp_entry_t entry)
+int swapcache_prepare(swp_entry_t entry, int nr)
{
- return __swap_duplicate(entry, SWAP_HAS_CACHE);
+ return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
-void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry)
+void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
- ci = lock_cluster_or_swap_info(si, offset);
- usage = __swap_entry_free_locked(si, offset, SWAP_HAS_CACHE);
- unlock_cluster_or_swap_info(si, ci);
- if (!usage)
- free_swap_slot(entry);
+ cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
}
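The nr-based variants above let one call cover the contiguous swap entries backing a large folio. A minimal caller sketch (not part of the patch; example_add_batch_to_swap_cache() and example_do_io() are hypothetical names) showing the reserve-then-back-out usage:

static int example_add_batch_to_swap_cache(struct folio *folio, swp_entry_t entry)
{
	int nr = folio_nr_pages(folio);
	int err;

	/* Reserve SWAP_HAS_CACHE on all nr contiguous entries at once. */
	err = swapcache_prepare(entry, nr);
	if (err)
		return err;	/* e.g. -EEXIST: another task already owns the cache */

	err = example_do_io(folio);	/* hypothetical follow-up step */
	if (err)
		/* Drop the whole reservation with a single call. */
		swapcache_clear(swp_swap_info(entry), entry, nr);

	return err;
}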
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index acc56c75ba99..ce13c4062647 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -391,7 +391,7 @@ static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
struct page *page;
int ret;
- ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC);
+ ret = shmem_get_folio(inode, pgoff, 0, &folio, SGP_NOALLOC);
/* Our caller expects us to return -EFAULT if we failed to find folio */
if (ret == -ENOENT)
ret = -EFAULT;
@@ -1763,3 +1763,171 @@ out:
VM_WARN_ON(!moved && !err);
return moved ? moved : err;
}
+
+static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
+ vm_flags_t flags)
+{
+ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
+
+ vm_flags_reset(vma, flags);
+ /*
+ * For shared mappings, we want to enable writenotify while
+ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
+ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
+ */
+ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
+ vma_set_page_prot(vma);
+}
+
+static void userfaultfd_set_ctx(struct vm_area_struct *vma,
+ struct userfaultfd_ctx *ctx,
+ unsigned long flags)
+{
+ vma_start_write(vma);
+ vma->vm_userfaultfd_ctx = (struct vm_userfaultfd_ctx){ctx};
+ userfaultfd_set_vm_flags(vma,
+ (vma->vm_flags & ~__VM_UFFD_FLAGS) | flags);
+}
+
+void userfaultfd_reset_ctx(struct vm_area_struct *vma)
+{
+ userfaultfd_set_ctx(vma, NULL, 0);
+}
+
+struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end)
+{
+ struct vm_area_struct *ret;
+
+ /* Reset ptes for the whole vma range if wr-protected */
+ if (userfaultfd_wp(vma))
+ uffd_wp_range(vma, start, end - start, false);
+
+ ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
+ vma->vm_flags & ~__VM_UFFD_FLAGS,
+ NULL_VM_UFFD_CTX);
+
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ if (!IS_ERR(ret))
+ userfaultfd_reset_ctx(ret);
+
+ return ret;
+}
+
+/* Assumes mmap write lock taken, and mm_struct pinned. */
+int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
+ struct vm_area_struct *vma,
+ unsigned long vm_flags,
+ unsigned long start, unsigned long end,
+ bool wp_async)
+{
+ VMA_ITERATOR(vmi, ctx->mm, start);
+ struct vm_area_struct *prev = vma_prev(&vmi);
+ unsigned long vma_end;
+ unsigned long new_flags;
+
+ if (vma->vm_start < start)
+ prev = vma;
+
+ for_each_vma_range(vmi, vma, end) {
+ cond_resched();
+
+ BUG_ON(!vma_can_userfault(vma, vm_flags, wp_async));
+ BUG_ON(vma->vm_userfaultfd_ctx.ctx &&
+ vma->vm_userfaultfd_ctx.ctx != ctx);
+ WARN_ON(!(vma->vm_flags & VM_MAYWRITE));
+
+ /*
+ * Nothing to do: this vma is already registered into this
+ * userfaultfd and with the right tracking mode too.
+ */
+ if (vma->vm_userfaultfd_ctx.ctx == ctx &&
+ (vma->vm_flags & vm_flags) == vm_flags)
+ goto skip;
+
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+ vma_end = min(end, vma->vm_end);
+
+ new_flags = (vma->vm_flags & ~__VM_UFFD_FLAGS) | vm_flags;
+ vma = vma_modify_flags_uffd(&vmi, prev, vma, start, vma_end,
+ new_flags,
+ (struct vm_userfaultfd_ctx){ctx});
+ if (IS_ERR(vma))
+ return PTR_ERR(vma);
+
+ /*
+ * In the vma_merge() successful mprotect-like case 8:
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+ userfaultfd_set_ctx(vma, ctx, vm_flags);
+
+ if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+ hugetlb_unshare_all_pmds(vma);
+
+skip:
+ prev = vma;
+ start = vma->vm_end;
+ }
+
+ return 0;
+}
+
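A rough sketch of driving this helper from the ioctl side, under the function's own assumptions (mmap write lock held, mm pinned, and the range already validated so the BUG_ON()s above cannot trigger); the wrapper name and the VM_UFFD_MISSING mode choice are purely illustrative:

static int example_register(struct userfaultfd_ctx *ctx,
			    unsigned long start, unsigned long end)
{
	struct vm_area_struct *vma;
	int ret = -ENOENT;

	mmap_write_lock(ctx->mm);
	vma = find_vma(ctx->mm, start);
	if (vma && vma->vm_start < end)
		ret = userfaultfd_register_range(ctx, vma, VM_UFFD_MISSING,
						 start, end, false);
	mmap_write_unlock(ctx->mm);
	return ret;
}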
+void userfaultfd_release_new(struct userfaultfd_ctx *ctx)
+{
+ struct mm_struct *mm = ctx->mm;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ /* the various vma->vm_userfaultfd_ctx still points to it */
+ mmap_write_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (vma->vm_userfaultfd_ctx.ctx == ctx)
+ userfaultfd_reset_ctx(vma);
+ }
+ mmap_write_unlock(mm);
+}
+
+void userfaultfd_release_all(struct mm_struct *mm,
+ struct userfaultfd_ctx *ctx)
+{
+ struct vm_area_struct *vma, *prev;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ if (!mmget_not_zero(mm))
+ return;
+
+ /*
+ * Flush page faults out of all CPUs. NOTE: all page faults
+ * must be retried without returning VM_FAULT_SIGBUS if
+	 * userfaultfd_ctx_get() succeeds but vma->vm_userfaultfd_ctx
+ * changes while handle_userfault released the mmap_lock. So
+ * it's critical that released is set to true (above), before
+ * taking the mmap_lock for writing.
+ */
+ mmap_write_lock(mm);
+ prev = NULL;
+ for_each_vma(vmi, vma) {
+ cond_resched();
+ BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^
+ !!(vma->vm_flags & __VM_UFFD_FLAGS));
+ if (vma->vm_userfaultfd_ctx.ctx != ctx) {
+ prev = vma;
+ continue;
+ }
+
+ vma = userfaultfd_clear_vma(&vmi, prev, vma,
+ vma->vm_start, vma->vm_end);
+ prev = vma;
+ }
+ mmap_write_unlock(mm);
+ mmput(mm);
+}
diff --git a/mm/util.c b/mm/util.c
index bd283e2132e0..4f1275023eb7 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -463,7 +463,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack)
if (gap + pad > gap)
gap += pad;
- if (gap < MIN_GAP)
+ if (gap < MIN_GAP && MIN_GAP < MAX_GAP)
gap = MIN_GAP;
else if (gap > MAX_GAP)
gap = MAX_GAP;
@@ -608,6 +608,28 @@ unsigned long vm_mmap(struct file *file, unsigned long addr,
}
EXPORT_SYMBOL(vm_mmap);
+static gfp_t kmalloc_gfp_adjust(gfp_t flags, size_t size)
+{
+ /*
+	 * We want to attempt a large physically contiguous block first because
+	 * it is less likely to fragment multiple larger blocks and therefore
+	 * contributes less to long-term fragmentation than the vmalloc fallback.
+	 * However, make sure that larger requests are not too disruptive - no
+	 * OOM killer and no allocation failure warnings, as we have a fallback.
+ */
+ if (size > PAGE_SIZE) {
+ flags |= __GFP_NOWARN;
+
+ if (!(flags & __GFP_RETRY_MAYFAIL))
+ flags |= __GFP_NORETRY;
+
+ /* nofail semantic is implemented by the vmalloc fallback */
+ flags &= ~__GFP_NOFAIL;
+ }
+
+ return flags;
+}
+
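Factoring the flag rewrite into kmalloc_gfp_adjust() makes the policy easy to see in isolation. A worked illustration of the helper (it is static to this file, so the calls below are for exposition only):

	/*
	 * Multi-page request: keep the kmalloc attempt opportunistic and quiet,
	 * and strip __GFP_NOFAIL - the vmalloc fallback provides that semantic.
	 */
	gfp_t flags = kmalloc_gfp_adjust(GFP_KERNEL | __GFP_NOFAIL, 2 * PAGE_SIZE);
	/* flags is now GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY */

	/* Sub-page request: flags pass through unchanged. */
	flags = kmalloc_gfp_adjust(GFP_KERNEL, 64);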
/**
* __kvmalloc_node - attempt to allocate physically contiguous memory, but upon
* failure, fall back to non-contiguous (vmalloc) allocation.
@@ -627,32 +649,15 @@ EXPORT_SYMBOL(vm_mmap);
*/
void *__kvmalloc_node_noprof(DECL_BUCKET_PARAMS(size, b), gfp_t flags, int node)
{
- gfp_t kmalloc_flags = flags;
void *ret;
/*
- * We want to attempt a large physically contiguous block first because
- * it is less likely to fragment multiple larger blocks and therefore
- * contribute to a long term fragmentation less than vmalloc fallback.
- * However make sure that larger requests are not too disruptive - no
- * OOM killer and no allocation failure warnings as we have a fallback.
- */
- if (size > PAGE_SIZE) {
- kmalloc_flags |= __GFP_NOWARN;
-
- if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL))
- kmalloc_flags |= __GFP_NORETRY;
-
- /* nofail semantic is implemented by the vmalloc fallback */
- kmalloc_flags &= ~__GFP_NOFAIL;
- }
-
- ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b), kmalloc_flags, node);
-
- /*
* It doesn't really make sense to fallback to vmalloc for sub page
* requests
*/
+ ret = __kmalloc_node_noprof(PASS_BUCKET_PARAMS(size, b),
+ kmalloc_gfp_adjust(flags, size),
+ node);
if (ret || size <= PAGE_SIZE)
return ret;
@@ -715,18 +720,53 @@ void kvfree_sensitive(const void *addr, size_t len)
}
EXPORT_SYMBOL(kvfree_sensitive);
-void *kvrealloc_noprof(const void *p, size_t oldsize, size_t newsize, gfp_t flags)
+/**
+ * kvrealloc - reallocate memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, kvrealloc() behaves exactly like kvmalloc(). If @size is 0
+ * and @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or kvfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory or %NULL in case of error
+ */
+void *kvrealloc_noprof(const void *p, size_t size, gfp_t flags)
{
- void *newp;
+ void *n;
- if (oldsize >= newsize)
- return (void *)p;
- newp = kvmalloc_noprof(newsize, flags);
- if (!newp)
- return NULL;
- memcpy(newp, p, oldsize);
- kvfree(p);
- return newp;
+ if (is_vmalloc_addr(p))
+ return vrealloc_noprof(p, size, flags);
+
+ n = krealloc_noprof(p, size, kmalloc_gfp_adjust(flags, size));
+ if (!n) {
+ /* We failed to krealloc(), fall back to kvmalloc(). */
+ n = kvmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ /* We already know that `p` is not a vmalloc address. */
+ kasan_disable_current();
+ memcpy(n, kasan_reset_tag(p), ksize(p));
+ kasan_enable_current();
+
+ kfree(p);
+ }
+ }
+
+ return n;
}
EXPORT_SYMBOL(kvrealloc_noprof);
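With the old-size parameter gone, kvrealloc() follows the familiar realloc() contract: on failure the original buffer is left untouched, so callers should keep the old pointer until the new one is known to be valid. A minimal usage sketch (the wrapper name is hypothetical):

static int example_grow(void **bufp, size_t new_size)
{
	void *n = kvrealloc(*bufp, new_size, GFP_KERNEL);

	if (!n)
		return -ENOMEM;	/* *bufp is untouched and still owned by the caller */

	*bufp = n;
	return 0;
}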
diff --git a/mm/vma.c b/mm/vma.c
new file mode 100644
index 000000000000..4737afcb064c
--- /dev/null
+++ b/mm/vma.c
@@ -0,0 +1,2068 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+
+/*
+ * VMA-specific functions.
+ */
+
+#include "vma_internal.h"
+#include "vma.h"
+
+static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next)
+{
+ struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev;
+
+ if (!mpol_equal(vmg->policy, vma_policy(vma)))
+ return false;
+ /*
+ * VM_SOFTDIRTY should not prevent from VMA merging, if we
+ * match the flags but dirty bit -- the caller should mark
+ * merged VMA as dirty. If dirty bit won't be excluded from
+ * comparison, we increase pressure on the memory system forcing
+ * the kernel to generate new VMAs when old one could be
+ * extended instead.
+ */
+ if ((vma->vm_flags ^ vmg->flags) & ~VM_SOFTDIRTY)
+ return false;
+ if (vma->vm_file != vmg->file)
+ return false;
+ if (!is_mergeable_vm_userfaultfd_ctx(vma, vmg->uffd_ctx))
+ return false;
+ if (!anon_vma_name_eq(anon_vma_name(vma), vmg->anon_name))
+ return false;
+ return true;
+}
+
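As a worked illustration of the VM_SOFTDIRTY carve-out above (flag values illustrative, and VM_SOFTDIRTY is only non-zero with CONFIG_MEM_SOFT_DIRTY):

	vm_flags_t a = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE | VM_SOFTDIRTY;
	vm_flags_t b = VM_READ | VM_WRITE | VM_MAYREAD | VM_MAYWRITE;

	/* (a ^ b) == VM_SOFTDIRTY, so ((a ^ b) & ~VM_SOFTDIRTY) == 0: still mergeable. */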
+static inline bool is_mergeable_anon_vma(struct anon_vma *anon_vma1,
+ struct anon_vma *anon_vma2, struct vm_area_struct *vma)
+{
+ /*
+	 * The list_is_singular() test is to avoid merging VMAs cloned from
+	 * parents. This can improve the scalability problems caused by the
+	 * anon_vma lock.
+ */
+ if ((!anon_vma1 || !anon_vma2) && (!vma ||
+ list_is_singular(&vma->anon_vma_chain)))
+ return true;
+ return anon_vma1 == anon_vma2;
+}
+
+/* Are the anon_vma's belonging to each VMA compatible with one another? */
+static inline bool are_anon_vmas_compatible(struct vm_area_struct *vma1,
+ struct vm_area_struct *vma2)
+{
+ return is_mergeable_anon_vma(vma1->anon_vma, vma2->anon_vma, NULL);
+}
+
+/*
+ * init_multi_vma_prep() - Initializer for struct vma_prepare
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
+ * @next: The next vma if it is to be adjusted
+ * @remove: The first vma to be removed
+ * @remove2: The second vma to be removed
+ */
+static void init_multi_vma_prep(struct vma_prepare *vp,
+ struct vm_area_struct *vma,
+ struct vm_area_struct *next,
+ struct vm_area_struct *remove,
+ struct vm_area_struct *remove2)
+{
+ memset(vp, 0, sizeof(struct vma_prepare));
+ vp->vma = vma;
+ vp->anon_vma = vma->anon_vma;
+ vp->remove = remove;
+ vp->remove2 = remove2;
+ vp->adj_next = next;
+ if (!vp->anon_vma && next)
+ vp->anon_vma = next->anon_vma;
+
+ vp->file = vma->vm_file;
+ if (vp->file)
+ vp->mapping = vma->vm_file->f_mapping;
+
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * in front of (at a lower virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We don't check here for the merged mmap wrapping around the end of pagecache
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
+ * wrap, nor mmaps which cover the final page at index -1UL.
+ *
+ * We assume the vma may be removed as part of the merge.
+ */
+static bool can_vma_merge_before(struct vma_merge_struct *vmg)
+{
+ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
+
+ if (is_mergeable_vma(vmg, /* merge_next = */ true) &&
+ is_mergeable_anon_vma(vmg->anon_vma, vmg->next->anon_vma, vmg->next)) {
+ if (vmg->next->vm_pgoff == vmg->pgoff + pglen)
+ return true;
+ }
+
+ return false;
+}
+
+/*
+ * Return true if we can merge this (vm_flags,anon_vma,file,vm_pgoff)
+ * beyond (at a higher virtual address and file offset than) the vma.
+ *
+ * We cannot merge two vmas if they have differently assigned (non-NULL)
+ * anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
+ *
+ * We assume that vma is not removed as part of the merge.
+ */
+static bool can_vma_merge_after(struct vma_merge_struct *vmg)
+{
+ if (is_mergeable_vma(vmg, /* merge_next = */ false) &&
+ is_mergeable_anon_vma(vmg->anon_vma, vmg->prev->anon_vma, vmg->prev)) {
+ if (vmg->prev->vm_pgoff + vma_pages(vmg->prev) == vmg->pgoff)
+ return true;
+ }
+ return false;
+}
+
+static void __vma_link_file(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma_is_shared_maywrite(vma))
+ mapping_allow_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * Requires inode->i_mapping->i_mmap_rwsem
+ */
+static void __remove_shared_vm_struct(struct vm_area_struct *vma,
+ struct address_space *mapping)
+{
+ if (vma_is_shared_maywrite(vma))
+ mapping_unmap_writable(mapping);
+
+ flush_dcache_mmap_lock(mapping);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
+ flush_dcache_mmap_unlock(mapping);
+}
+
+/*
+ * vma_prepare() - Helper function for handling locking VMAs prior to altering
+ * @vp: The initialized vma_prepare struct
+ */
+static void vma_prepare(struct vma_prepare *vp)
+{
+ if (vp->file) {
+ uprobe_munmap(vp->vma, vp->vma->vm_start, vp->vma->vm_end);
+
+ if (vp->adj_next)
+ uprobe_munmap(vp->adj_next, vp->adj_next->vm_start,
+ vp->adj_next->vm_end);
+
+ i_mmap_lock_write(vp->mapping);
+ if (vp->insert && vp->insert->vm_file) {
+ /*
+ * Put into interval tree now, so instantiated pages
+ * are visible to arm/parisc __flush_dcache_page
+ * throughout; but we cannot insert into address
+ * space until vma start or end is updated.
+ */
+ __vma_link_file(vp->insert,
+ vp->insert->vm_file->f_mapping);
+ }
+ }
+
+ if (vp->anon_vma) {
+ anon_vma_lock_write(vp->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_pre_update_vma(vp->adj_next);
+ }
+
+ if (vp->file) {
+ flush_dcache_mmap_lock(vp->mapping);
+ vma_interval_tree_remove(vp->vma, &vp->mapping->i_mmap);
+ if (vp->adj_next)
+ vma_interval_tree_remove(vp->adj_next,
+ &vp->mapping->i_mmap);
+ }
+
+}
+
+/*
+ * vma_complete() - Helper function for handling the unlocking after altering VMAs,
+ * or for inserting a VMA.
+ *
+ * @vp: The vma_prepare struct
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ */
+static void vma_complete(struct vma_prepare *vp, struct vma_iterator *vmi,
+ struct mm_struct *mm)
+{
+ if (vp->file) {
+ if (vp->adj_next)
+ vma_interval_tree_insert(vp->adj_next,
+ &vp->mapping->i_mmap);
+ vma_interval_tree_insert(vp->vma, &vp->mapping->i_mmap);
+ flush_dcache_mmap_unlock(vp->mapping);
+ }
+
+ if (vp->remove && vp->file) {
+ __remove_shared_vm_struct(vp->remove, vp->mapping);
+ if (vp->remove2)
+ __remove_shared_vm_struct(vp->remove2, vp->mapping);
+ } else if (vp->insert) {
+ /*
+ * split_vma has split insert from vma, and needs
+ * us to insert it before dropping the locks
+ * (it may either follow vma or precede it).
+ */
+ vma_iter_store(vmi, vp->insert);
+ mm->map_count++;
+ }
+
+ if (vp->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vp->vma);
+ if (vp->adj_next)
+ anon_vma_interval_tree_post_update_vma(vp->adj_next);
+ anon_vma_unlock_write(vp->anon_vma);
+ }
+
+ if (vp->file) {
+ i_mmap_unlock_write(vp->mapping);
+ uprobe_mmap(vp->vma);
+
+ if (vp->adj_next)
+ uprobe_mmap(vp->adj_next);
+ }
+
+ if (vp->remove) {
+again:
+ vma_mark_detached(vp->remove, true);
+ if (vp->file) {
+ uprobe_munmap(vp->remove, vp->remove->vm_start,
+ vp->remove->vm_end);
+ fput(vp->file);
+ }
+ if (vp->remove->anon_vma)
+ anon_vma_merge(vp->vma, vp->remove);
+ mm->map_count--;
+ mpol_put(vma_policy(vp->remove));
+ if (!vp->remove2)
+ WARN_ON_ONCE(vp->vma->vm_end < vp->remove->vm_end);
+ vm_area_free(vp->remove);
+
+ /*
+ * In mprotect's case 6 (see comments on vma_merge),
+ * we are removing both mid and next vmas
+ */
+ if (vp->remove2) {
+ vp->remove = vp->remove2;
+ vp->remove2 = NULL;
+ goto again;
+ }
+ }
+ if (vp->insert && vp->file)
+ uprobe_mmap(vp->insert);
+}
+
+/*
+ * init_vma_prep() - Initializer wrapper for vma_prepare struct
+ * @vp: The vma_prepare struct
+ * @vma: The vma that will be altered once locked
+ */
+static void init_vma_prep(struct vma_prepare *vp, struct vm_area_struct *vma)
+{
+ init_multi_vma_prep(vp, vma, NULL, NULL, NULL);
+}
+
+/*
+ * Can the proposed VMA be merged with the left (previous) VMA taking into
+ * account the start position of the proposed range.
+ */
+static bool can_vma_merge_left(struct vma_merge_struct *vmg)
+{
+ return vmg->prev && vmg->prev->vm_end == vmg->start &&
+ can_vma_merge_after(vmg);
+}
+
+/*
+ * Can the proposed VMA be merged with the right (next) VMA taking into
+ * account the end position of the proposed range.
+ *
+ * In addition, if we can merge with the left VMA, ensure that left and right
+ * anon_vma's are also compatible.
+ */
+static bool can_vma_merge_right(struct vma_merge_struct *vmg,
+ bool can_merge_left)
+{
+ if (!vmg->next || vmg->end != vmg->next->vm_start ||
+ !can_vma_merge_before(vmg))
+ return false;
+
+ if (!can_merge_left)
+ return true;
+
+ /*
+ * If we can merge with prev (left) and next (right), indicating that
+ * each VMA's anon_vma is compatible with the proposed anon_vma, this
+ * does not mean prev and next are compatible with EACH OTHER.
+ *
+ * We therefore check this in addition to mergeability to either side.
+ */
+ return are_anon_vmas_compatible(vmg->prev, vmg->next);
+}
+
+/*
+ * Close a vm structure and free it.
+ */
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed)
+{
+ might_sleep();
+ if (!closed && vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ if (vma->vm_file)
+ fput(vma->vm_file);
+ mpol_put(vma_policy(vma));
+ if (unreachable)
+ __vm_area_free(vma);
+ else
+ vm_area_free(vma);
+}
+
+/*
+ * Get rid of page table information in the indicated region.
+ *
+ * Called with the mm semaphore held.
+ */
+void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct vm_area_struct *next)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ struct mmu_gather tlb;
+
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, mm);
+ update_hiwater_rss(mm);
+ unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end,
+ /* mm_wr_locked = */ true);
+ mas_set(mas, vma->vm_end);
+ free_pgtables(&tlb, mas, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS,
+ next ? next->vm_start : USER_PGTABLES_CEILING,
+ /* mm_wr_locked = */ true);
+ tlb_finish_mmu(&tlb);
+}
+
+/*
+ * __split_vma() bypasses sysctl_max_map_count checking. We use this where it
+ * has already been checked or doesn't make sense to fail.
+ * VMA Iterator will point to the original VMA.
+ */
+static int __split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ struct vma_prepare vp;
+ struct vm_area_struct *new;
+ int err;
+
+ WARN_ON(vma->vm_start >= addr);
+ WARN_ON(vma->vm_end <= addr);
+
+ if (vma->vm_ops && vma->vm_ops->may_split) {
+ err = vma->vm_ops->may_split(vma, addr);
+ if (err)
+ return err;
+ }
+
+ new = vm_area_dup(vma);
+ if (!new)
+ return -ENOMEM;
+
+ if (new_below) {
+ new->vm_end = addr;
+ } else {
+ new->vm_start = addr;
+ new->vm_pgoff += ((addr - vma->vm_start) >> PAGE_SHIFT);
+ }
+
+ err = -ENOMEM;
+ vma_iter_config(vmi, new->vm_start, new->vm_end);
+ if (vma_iter_prealloc(vmi, new))
+ goto out_free_vma;
+
+ err = vma_dup_policy(vma, new);
+ if (err)
+ goto out_free_vmi;
+
+ err = anon_vma_clone(new, vma);
+ if (err)
+ goto out_free_mpol;
+
+ if (new->vm_file)
+ get_file(new->vm_file);
+
+ if (new->vm_ops && new->vm_ops->open)
+ new->vm_ops->open(new);
+
+ vma_start_write(vma);
+ vma_start_write(new);
+
+ init_vma_prep(&vp, vma);
+ vp.insert = new;
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, vma->vm_start, addr, 0);
+
+ if (new_below) {
+ vma->vm_start = addr;
+ vma->vm_pgoff += (addr - new->vm_start) >> PAGE_SHIFT;
+ } else {
+ vma->vm_end = addr;
+ }
+
+ /* vma_complete stores the new vma */
+ vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
+
+ /* Success. */
+ if (new_below)
+ vma_next(vmi);
+ else
+ vma_prev(vmi);
+
+ return 0;
+
+out_free_mpol:
+ mpol_put(vma_policy(new));
+out_free_vmi:
+ vma_iter_free(vmi);
+out_free_vma:
+ vm_area_free(new);
+ return err;
+}
+
+/*
+ * Split a vma into two pieces at address 'addr', a new vma is allocated
+ * either for the first part or the tail.
+ */
+static int split_vma(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long addr, int new_below)
+{
+ if (vma->vm_mm->map_count >= sysctl_max_map_count)
+ return -ENOMEM;
+
+ return __split_vma(vmi, vma, addr, new_below);
+}
+
+/*
+ * vma has some anon_vma assigned, and is already inserted on that
+ * anon_vma's interval trees.
+ *
+ * Before updating the vma's vm_start / vm_end / vm_pgoff fields, the
+ * vma must be removed from the anon_vma's interval trees using
+ * anon_vma_interval_tree_pre_update_vma().
+ *
+ * After the update, the vma will be reinserted using
+ * anon_vma_interval_tree_post_update_vma().
+ *
+ * The entire update must be protected by exclusive mmap_lock and by
+ * the root anon_vma's mutex.
+ */
+void
+anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_remove(avc, &avc->anon_vma->rb_root);
+}
+
+void
+anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma_chain *avc;
+
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root);
+}
+
+/*
+ * dup_anon_vma() - Helper function to duplicate anon_vma
+ * @dst: The destination VMA
+ * @src: The source VMA
+ * @dup: Pointer to the destination VMA when successful.
+ *
+ * Returns: 0 on success.
+ */
+static int dup_anon_vma(struct vm_area_struct *dst,
+ struct vm_area_struct *src, struct vm_area_struct **dup)
+{
+ /*
+ * Easily overlooked: when mprotect shifts the boundary, make sure the
+ * expanding vma has anon_vma set if the shrinking vma had, to cover any
+ * anon pages imported.
+ */
+ if (src->anon_vma && !dst->anon_vma) {
+ int ret;
+
+ vma_assert_write_locked(dst);
+ dst->anon_vma = src->anon_vma;
+ ret = anon_vma_clone(dst, src);
+ if (ret)
+ return ret;
+
+ *dup = dst;
+ }
+
+ return 0;
+}
+
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+void validate_mm(struct mm_struct *mm)
+{
+ int bug = 0;
+ int i = 0;
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mt_validate(&mm->mm_mt);
+ for_each_vma(vmi, vma) {
+#ifdef CONFIG_DEBUG_VM_RB
+ struct anon_vma *anon_vma = vma->anon_vma;
+ struct anon_vma_chain *avc;
+#endif
+ unsigned long vmi_start, vmi_end;
+ bool warn = 0;
+
+ vmi_start = vma_iter_addr(&vmi);
+ vmi_end = vma_iter_end(&vmi);
+ if (VM_WARN_ON_ONCE_MM(vma->vm_end != vmi_end, mm))
+ warn = 1;
+
+ if (VM_WARN_ON_ONCE_MM(vma->vm_start != vmi_start, mm))
+ warn = 1;
+
+ if (warn) {
+ pr_emerg("issue in %s\n", current->comm);
+ dump_stack();
+ dump_vma(vma);
+ pr_emerg("tree range: %px start %lx end %lx\n", vma,
+ vmi_start, vmi_end - 1);
+ vma_iter_dump_tree(&vmi);
+ }
+
+#ifdef CONFIG_DEBUG_VM_RB
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+#endif
+ i++;
+ }
+ if (i != mm->map_count) {
+ pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i);
+ bug = 1;
+ }
+ VM_BUG_ON_MM(bug, mm);
+}
+#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */
+
+/* Actually perform the VMA merge operation. */
+static int commit_merge(struct vma_merge_struct *vmg,
+ struct vm_area_struct *adjust,
+ struct vm_area_struct *remove,
+ struct vm_area_struct *remove2,
+ long adj_start,
+ bool expanded)
+{
+ struct vma_prepare vp;
+
+ init_multi_vma_prep(&vp, vmg->vma, adjust, remove, remove2);
+
+ VM_WARN_ON(vp.anon_vma && adjust && adjust->anon_vma &&
+ vp.anon_vma != adjust->anon_vma);
+
+ if (expanded) {
+ /* Note: vma iterator must be pointing to 'start'. */
+ vma_iter_config(vmg->vmi, vmg->start, vmg->end);
+ } else {
+ vma_iter_config(vmg->vmi, adjust->vm_start + adj_start,
+ adjust->vm_end);
+ }
+
+ if (vma_iter_prealloc(vmg->vmi, vmg->vma))
+ return -ENOMEM;
+
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vmg->vma, vmg->start, vmg->end, adj_start);
+ vma_set_range(vmg->vma, vmg->start, vmg->end, vmg->pgoff);
+
+ if (expanded)
+ vma_iter_store(vmg->vmi, vmg->vma);
+
+ if (adj_start) {
+ adjust->vm_start += adj_start;
+ adjust->vm_pgoff += PHYS_PFN(adj_start);
+ if (adj_start < 0) {
+ WARN_ON(expanded);
+ vma_iter_store(vmg->vmi, adjust);
+ }
+ }
+
+ vma_complete(&vp, vmg->vmi, vmg->vma->vm_mm);
+
+ return 0;
+}
+
+/* We can only remove VMAs when merging if they do not have a close hook. */
+static bool can_merge_remove_vma(struct vm_area_struct *vma)
+{
+ return !vma->vm_ops || !vma->vm_ops->close;
+}
+
+/*
+ * vma_merge_existing_range - Attempt to merge VMAs based on a VMA having its
+ * attributes modified.
+ *
+ * @vmg: Describes the modifications being made to a VMA and associated
+ * metadata.
+ *
+ * When the attributes of a range within a VMA change, then it might be possible
+ * for immediately adjacent VMAs to be merged into that VMA due to having
+ * identical properties.
+ *
+ * This function checks for the existence of any such mergeable VMAs and updates
+ * the maple tree describing the @vmg->vma->vm_mm address space to account for
+ * this, as well as any VMAs shrunk/expanded/deleted as a result of this merge.
+ *
+ * As part of this operation, if a merge occurs, the @vmg object will have its
+ * vma, start, end, and pgoff fields modified to execute the merge. Subsequent
+ * calls to this function should reset these fields.
+ *
+ * Returns: The merged VMA if merge succeeds, or NULL otherwise.
+ *
+ * ASSUMPTIONS:
+ * - The caller must assign the VMA to be modified to @vmg->vma.
+ * - The caller must have set @vmg->prev to the previous VMA, if there is one.
+ * - The caller must not set @vmg->next, as we determine this.
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - vmi must be positioned within [@vmg->vma->vm_start, @vmg->vma->vm_end).
+ */
+static struct vm_area_struct *vma_merge_existing_range(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *prev = vmg->prev;
+ struct vm_area_struct *next, *res;
+ struct vm_area_struct *anon_dup = NULL;
+ struct vm_area_struct *adjust = NULL;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
+ bool left_side = vma && start == vma->vm_start;
+ bool right_side = vma && end == vma->vm_end;
+ int err = 0;
+ long adj_start = 0;
+ bool merge_will_delete_vma, merge_will_delete_next;
+ bool merge_left, merge_right, merge_both;
+ bool expanded;
+
+ mmap_assert_write_locked(vmg->mm);
+ VM_WARN_ON(!vma); /* We are modifying a VMA, so caller must specify. */
+ VM_WARN_ON(vmg->next); /* We set this. */
+ VM_WARN_ON(prev && start <= prev->vm_start);
+ VM_WARN_ON(start >= end);
+ /*
+	 * If vma == prev, then we are offset into a VMA. Otherwise we must
+	 * span a portion of the VMA.
+ */
+ VM_WARN_ON(vma && ((vma != prev && vmg->start != vma->vm_start) ||
+ vmg->end > vma->vm_end));
+ /* The vmi must be positioned within vmg->vma. */
+ VM_WARN_ON(vma && !(vma_iter_addr(vmg->vmi) >= vma->vm_start &&
+ vma_iter_addr(vmg->vmi) < vma->vm_end));
+
+ vmg->state = VMA_MERGE_NOMERGE;
+
+ /*
+ * If a special mapping or if the range being modified is neither at the
+ * furthermost left or right side of the VMA, then we have no chance of
+ * merging and should abort.
+ */
+ if (vmg->flags & VM_SPECIAL || (!left_side && !right_side))
+ return NULL;
+
+ if (left_side)
+ merge_left = can_vma_merge_left(vmg);
+ else
+ merge_left = false;
+
+ if (right_side) {
+ next = vmg->next = vma_iter_next_range(vmg->vmi);
+ vma_iter_prev_range(vmg->vmi);
+
+ merge_right = can_vma_merge_right(vmg, merge_left);
+ } else {
+ merge_right = false;
+ next = NULL;
+ }
+
+ if (merge_left) /* If merging prev, position iterator there. */
+ vma_prev(vmg->vmi);
+ else if (!merge_right) /* If we have nothing to merge, abort. */
+ return NULL;
+
+ merge_both = merge_left && merge_right;
+ /* If we span the entire VMA, a merge implies it will be deleted. */
+ merge_will_delete_vma = left_side && right_side;
+
+ /*
+ * If we need to remove vma in its entirety but are unable to do so,
+ * we have no sensible recourse but to abort the merge.
+ */
+ if (merge_will_delete_vma && !can_merge_remove_vma(vma))
+ return NULL;
+
+ /*
+ * If we merge both VMAs, then next is also deleted. This implies
+ * merge_will_delete_vma also.
+ */
+ merge_will_delete_next = merge_both;
+
+ /*
+ * If we cannot delete next, then we can reduce the operation to merging
+ * prev and vma (thereby deleting vma).
+ */
+ if (merge_will_delete_next && !can_merge_remove_vma(next)) {
+ merge_will_delete_next = false;
+ merge_right = false;
+ merge_both = false;
+ }
+
+ /* No matter what happens, we will be adjusting vma. */
+ vma_start_write(vma);
+
+ if (merge_left)
+ vma_start_write(prev);
+
+ if (merge_right)
+ vma_start_write(next);
+
+ if (merge_both) {
+ /*
+ * |<----->|
+ * |-------*********-------|
+ * prev vma next
+ * extend delete delete
+ */
+
+ vmg->vma = prev;
+ vmg->start = prev->vm_start;
+ vmg->end = next->vm_end;
+ vmg->pgoff = prev->vm_pgoff;
+
+ /*
+ * We already ensured anon_vma compatibility above, so now it's
+ * simply a case of, if prev has no anon_vma object, which of
+ * next or vma contains the anon_vma we must duplicate.
+ */
+ err = dup_anon_vma(prev, next->anon_vma ? next : vma, &anon_dup);
+ } else if (merge_left) {
+ /*
+ * |<----->| OR
+ * |<--------->|
+ * |-------*************
+ * prev vma
+ * extend shrink/delete
+ */
+
+ vmg->vma = prev;
+ vmg->start = prev->vm_start;
+ vmg->pgoff = prev->vm_pgoff;
+
+ if (!merge_will_delete_vma) {
+ adjust = vma;
+ adj_start = vmg->end - vma->vm_start;
+ }
+
+ err = dup_anon_vma(prev, vma, &anon_dup);
+ } else { /* merge_right */
+ /*
+ * |<----->| OR
+ * |<--------->|
+ * *************-------|
+ * vma next
+ * shrink/delete extend
+ */
+
+ pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start);
+
+ VM_WARN_ON(!merge_right);
+ /* If we are offset into a VMA, then prev must be vma. */
+ VM_WARN_ON(vmg->start > vma->vm_start && prev && vma != prev);
+
+ if (merge_will_delete_vma) {
+ vmg->vma = next;
+ vmg->end = next->vm_end;
+ vmg->pgoff = next->vm_pgoff - pglen;
+ } else {
+ /*
+ * We shrink vma and expand next.
+ *
+ * IMPORTANT: This is the ONLY case where the final
+ * merged VMA is NOT vmg->vma, but rather vmg->next.
+ */
+
+ vmg->start = vma->vm_start;
+ vmg->end = start;
+ vmg->pgoff = vma->vm_pgoff;
+
+ adjust = next;
+ adj_start = -(vma->vm_end - start);
+ }
+
+ err = dup_anon_vma(next, vma, &anon_dup);
+ }
+
+ if (err)
+ goto abort;
+
+ /*
+ * In nearly all cases, we expand vmg->vma. There is one exception -
+ * merge_right where we partially span the VMA. In this case we shrink
+ * the end of vmg->vma and adjust the start of vmg->next accordingly.
+ */
+ expanded = !merge_right || merge_will_delete_vma;
+
+ if (commit_merge(vmg, adjust,
+ merge_will_delete_vma ? vma : NULL,
+ merge_will_delete_next ? next : NULL,
+ adj_start, expanded)) {
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
+ return NULL;
+ }
+
+ res = merge_left ? prev : next;
+ khugepaged_enter_vma(res, vmg->flags);
+
+ vmg->state = VMA_MERGE_SUCCESS;
+ return res;
+
+abort:
+ vma_iter_set(vmg->vmi, start);
+ vma_iter_load(vmg->vmi);
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
+ return NULL;
+}
+
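To make the adj_start convention above concrete, a worked example (addresses purely illustrative): let prev span [0x1000, 0x3000), vma span [0x3000, 0x8000), and suppose an mprotect() of [0x3000, 0x5000) is mergeable with prev only. Then left_side and merge_left are true, merge_will_delete_vma is false, so adjust = vma and adj_start = vmg->end - vma->vm_start = 0x2000; commit_merge() expands prev to [0x1000, 0x5000) and moves vma's vm_start (and vm_pgoff) forward by 0x2000. In the mirror-image partial merge_right case, adj_start is negative and next's vm_start is pulled back to the start of the modified range instead.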
+/*
+ * vma_merge_new_range - Attempt to merge a new VMA into address space
+ *
+ * @vmg: Describes the VMA we are adding, in the range @vmg->start to @vmg->end
+ * (exclusive), which we try to merge with any adjacent VMAs if possible.
+ *
+ * We are about to add a VMA to the address space starting at @vmg->start and
+ * ending at @vmg->end. There are three different possible scenarios:
+ *
+ * 1. There is a VMA with identical properties immediately adjacent to the
+ * proposed new VMA [@vmg->start, @vmg->end) either before or after it -
+ * EXPAND that VMA:
+ *
+ * Proposed: |-----| or |-----|
+ * Existing: |----| |----|
+ *
+ * 2. There are VMAs with identical properties immediately adjacent to the
+ * proposed new VMA [@vmg->start, @vmg->end) both before AND after it -
+ * EXPAND the former and REMOVE the latter:
+ *
+ * Proposed: |-----|
+ * Existing: |----| |----|
+ *
+ * 3. There are no VMAs immediately adjacent to the proposed new VMA or those
+ * VMAs do not have identical attributes - NO MERGE POSSIBLE.
+ *
+ * In instances where we can merge, this function returns the expanded VMA which
+ * will have its range adjusted accordingly and the underlying maple tree also
+ * adjusted.
+ *
+ * Returns: In instances where no merge was possible, NULL. Otherwise, a pointer
+ * to the VMA we expanded.
+ *
+ * This function adjusts @vmg to provide @vmg->next if not already specified,
+ * and adjusts [@vmg->start, @vmg->end) to span the expanded range.
+ *
+ * ASSUMPTIONS:
+ * - The caller must hold a WRITE lock on the mm_struct->mmap_lock.
+ * - The caller must have determined that [@vmg->start, @vmg->end) is empty,
+ *   other than VMAs that will be unmapped should the operation succeed.
+ * - The caller must have specified the previous vma in @vmg->prev.
+ * - The caller must have specified the next vma in @vmg->next.
+ * - The caller must have positioned the vmi at or before the gap.
+ */
+struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *prev = vmg->prev;
+ struct vm_area_struct *next = vmg->next;
+ unsigned long start = vmg->start;
+ unsigned long end = vmg->end;
+ pgoff_t pgoff = vmg->pgoff;
+ pgoff_t pglen = PHYS_PFN(end - start);
+ bool can_merge_left, can_merge_right;
+
+ mmap_assert_write_locked(vmg->mm);
+ VM_WARN_ON(vmg->vma);
+ /* vmi must point at or before the gap. */
+ VM_WARN_ON(vma_iter_addr(vmg->vmi) > end);
+
+ vmg->state = VMA_MERGE_NOMERGE;
+
+ /* Special VMAs are unmergeable, also if no prev/next. */
+ if ((vmg->flags & VM_SPECIAL) || (!prev && !next))
+ return NULL;
+
+ can_merge_left = can_vma_merge_left(vmg);
+ can_merge_right = can_vma_merge_right(vmg, can_merge_left);
+
+ /* If we can merge with the next VMA, adjust vmg accordingly. */
+ if (can_merge_right) {
+ vmg->end = next->vm_end;
+ vmg->vma = next;
+ vmg->pgoff = next->vm_pgoff - pglen;
+ }
+
+ /* If we can merge with the previous VMA, adjust vmg accordingly. */
+ if (can_merge_left) {
+ vmg->start = prev->vm_start;
+ vmg->vma = prev;
+ vmg->pgoff = prev->vm_pgoff;
+
+ /*
+ * If this merge would result in removal of the next VMA but we
+ * are not permitted to do so, reduce the operation to merging
+ * prev and vma.
+ */
+ if (can_merge_right && !can_merge_remove_vma(next))
+ vmg->end = end;
+
+ vma_prev(vmg->vmi); /* Equivalent to going to the previous range */
+ }
+
+ /*
+ * Now try to expand adjacent VMA(s). This takes care of removing the
+ * following VMA if we have VMAs on both sides.
+ */
+ if (vmg->vma && !vma_expand(vmg)) {
+ khugepaged_enter_vma(vmg->vma, vmg->flags);
+ vmg->state = VMA_MERGE_SUCCESS;
+ return vmg->vma;
+ }
+
+ /* If expansion failed, reset state. Allows us to retry merge later. */
+ vmg->vma = NULL;
+ vmg->start = start;
+ vmg->end = end;
+ vmg->pgoff = pgoff;
+ if (vmg->vma == prev)
+ vma_iter_set(vmg->vmi, start);
+
+ return NULL;
+}
+
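A hypothetical caller sketch (the wrapper and its parameters are illustrative; prev, next and vmi are assumed to have been looked up and positioned per the ASSUMPTIONS above):

static struct vm_area_struct *example_new_anon(struct mm_struct *mm,
		struct vma_iterator *vmi, struct vm_area_struct *prev,
		struct vm_area_struct *next, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags)
{
	struct vma_merge_struct vmg = {
		.mm	= mm,
		.vmi	= vmi,
		.prev	= prev,
		.next	= next,
		.start	= addr,
		.end	= addr + len,
		.flags	= vm_flags,
		.pgoff	= addr >> PAGE_SHIFT,
	};

	/* Returns the expanded neighbour on success, NULL if nothing merged. */
	return vma_merge_new_range(&vmg);
}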
+/*
+ * vma_expand - Expand an existing VMA
+ *
+ * @vmg: Describes a VMA expansion operation.
+ *
+ * Expand @vma to vmg->start and vmg->end. Can expand off the start and end.
+ * Will expand over vmg->next if it's different from vmg->vma and vmg->end ==
+ * vmg->next->vm_end. Checking if the vmg->vma can expand and merge with
+ * vmg->next needs to be handled by the caller.
+ *
+ * Returns: 0 on success.
+ *
+ * ASSUMPTIONS:
+ * - The caller must hold a WRITE lock on vmg->vma->mm->mmap_lock.
+ * - The caller must have set @vmg->vma and @vmg->next.
+ */
+int vma_expand(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *anon_dup = NULL;
+ bool remove_next = false;
+ struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *next = vmg->next;
+
+ mmap_assert_write_locked(vmg->mm);
+
+ vma_start_write(vma);
+ if (next && (vma != next) && (vmg->end == next->vm_end)) {
+ int ret;
+
+ remove_next = true;
+ /* This should already have been checked by this point. */
+ VM_WARN_ON(!can_merge_remove_vma(next));
+ vma_start_write(next);
+ ret = dup_anon_vma(vma, next, &anon_dup);
+ if (ret)
+ return ret;
+ }
+
+ /* Not merging but overwriting any part of next is not handled. */
+ VM_WARN_ON(next && !remove_next &&
+ next != vma && vmg->end > next->vm_start);
+ /* Only handles expanding */
+ VM_WARN_ON(vma->vm_start < vmg->start || vma->vm_end > vmg->end);
+
+ if (commit_merge(vmg, NULL, remove_next ? next : NULL, NULL, 0, true))
+ goto nomem;
+
+ return 0;
+
+nomem:
+ vmg->state = VMA_MERGE_ERROR_NOMEM;
+ if (anon_dup)
+ unlink_anon_vmas(anon_dup);
+ return -ENOMEM;
+}
+
+/*
+ * vma_shrink() - Reduce an existing VMA's memory area
+ * @vmi: The vma iterator
+ * @vma: The VMA to modify
+ * @start: The new start
+ * @end: The new end
+ *
+ * Returns: 0 on success, -ENOMEM otherwise
+ */
+int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff)
+{
+ struct vma_prepare vp;
+
+ WARN_ON((vma->vm_start != start) && (vma->vm_end != end));
+
+ if (vma->vm_start < start)
+ vma_iter_config(vmi, vma->vm_start, start);
+ else
+ vma_iter_config(vmi, end, vma->vm_end);
+
+ if (vma_iter_prealloc(vmi, NULL))
+ return -ENOMEM;
+
+ vma_start_write(vma);
+
+ init_vma_prep(&vp, vma);
+ vma_prepare(&vp);
+ vma_adjust_trans_huge(vma, start, end, 0);
+
+ vma_iter_clear(vmi);
+ vma_set_range(vma, start, end, pgoff);
+ vma_complete(&vp, vmi, vma->vm_mm);
+ validate_mm(vma->vm_mm);
+ return 0;
+}
+
+static inline void vms_clear_ptes(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach, bool mm_wr_locked)
+{
+ struct mmu_gather tlb;
+
+ if (!vms->clear_ptes) /* Nothing to do */
+ return;
+
+ /*
+ * We can free page tables without write-locking mmap_lock because VMAs
+ * were isolated before we downgraded mmap_lock.
+ */
+ mas_set(mas_detach, 1);
+ lru_add_drain();
+ tlb_gather_mmu(&tlb, vms->vma->vm_mm);
+ update_hiwater_rss(vms->vma->vm_mm);
+ unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end,
+ vms->vma_count, mm_wr_locked);
+
+ mas_set(mas_detach, 1);
+ /* start and end may be different if there is no prev or next vma. */
+ free_pgtables(&tlb, mas_detach, vms->vma, vms->unmap_start,
+ vms->unmap_end, mm_wr_locked);
+ tlb_finish_mmu(&tlb);
+ vms->clear_ptes = false;
+}
+
+void vms_clean_up_area(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
+{
+ struct vm_area_struct *vma;
+
+ if (!vms->nr_pages)
+ return;
+
+ vms_clear_ptes(vms, mas_detach, true);
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ if (vma->vm_ops && vma->vm_ops->close)
+ vma->vm_ops->close(vma);
+ vms->closed_vm_ops = true;
+}
+
+/*
+ * vms_complete_munmap_vmas() - Finish the munmap() operation
+ * @vms: The vma munmap struct
+ * @mas_detach: The maple state of the detached vmas
+ *
+ * This updates the mm_struct, unmaps the region, frees the resources
+ * used for the munmap() and may downgrade the lock - if requested. Everything
+ * needed to be done once the vma maple tree is updated.
+ */
+void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm;
+
+ mm = current->mm;
+ mm->map_count -= vms->vma_count;
+ mm->locked_vm -= vms->locked_vm;
+ if (vms->unlock)
+ mmap_write_downgrade(mm);
+
+ if (!vms->nr_pages)
+ return;
+
+ vms_clear_ptes(vms, mas_detach, !vms->unlock);
+ /* Update high watermark before we lower total_vm */
+ update_hiwater_vm(mm);
+ /* Stat accounting */
+ WRITE_ONCE(mm->total_vm, READ_ONCE(mm->total_vm) - vms->nr_pages);
+ /* Paranoid bookkeeping */
+ VM_WARN_ON(vms->exec_vm > mm->exec_vm);
+ VM_WARN_ON(vms->stack_vm > mm->stack_vm);
+ VM_WARN_ON(vms->data_vm > mm->data_vm);
+ mm->exec_vm -= vms->exec_vm;
+ mm->stack_vm -= vms->stack_vm;
+ mm->data_vm -= vms->data_vm;
+
+ /* Remove and clean up vmas */
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+		remove_vma(vma, /* unreachable = */ false, vms->closed_vm_ops);
+
+ vm_unacct_memory(vms->nr_accounted);
+ validate_mm(mm);
+ if (vms->unlock)
+ mmap_read_unlock(mm);
+
+ __mt_destroy(mas_detach->tree);
+}
+
+/*
+ * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree
+ * for removal at a later date. Handles splitting first and last if necessary
+ * and marking the vmas as isolated.
+ *
+ * @vms: The vma munmap struct
+ * @mas_detach: The maple state tracking the detached tree
+ *
+ * Return: 0 on success, error otherwise
+ */
+int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
+{
+ struct vm_area_struct *next = NULL;
+ int error;
+
+ /*
+ * If we need to split any vma, do it now to save pain later.
+ * Does it split the first one?
+ */
+ if (vms->start > vms->vma->vm_start) {
+
+ /*
+ * Make sure that map_count on return from munmap() will
+ * not exceed its limit; but let map_count go just above
+ * its limit temporarily, to help free resources as expected.
+ */
+ if (vms->end < vms->vma->vm_end &&
+ vms->vma->vm_mm->map_count >= sysctl_max_map_count) {
+ error = -ENOMEM;
+ goto map_count_exceeded;
+ }
+
+ /* Don't bother splitting the VMA if we can't unmap it anyway */
+ if (!can_modify_vma(vms->vma)) {
+ error = -EPERM;
+ goto start_split_failed;
+ }
+
+ error = __split_vma(vms->vmi, vms->vma, vms->start, 1);
+ if (error)
+ goto start_split_failed;
+ }
+ vms->prev = vma_prev(vms->vmi);
+ if (vms->prev)
+ vms->unmap_start = vms->prev->vm_end;
+
+ /*
+ * Detach a range of VMAs from the mm. Using next as a temp variable as
+ * it is always overwritten.
+ */
+ for_each_vma_range(*(vms->vmi), next, vms->end) {
+ long nrpages;
+
+ if (!can_modify_vma(next)) {
+ error = -EPERM;
+ goto modify_vma_failed;
+ }
+ /* Does it split the end? */
+ if (next->vm_end > vms->end) {
+ error = __split_vma(vms->vmi, next, vms->end, 0);
+ if (error)
+ goto end_split_failed;
+ }
+ vma_start_write(next);
+ mas_set(mas_detach, vms->vma_count++);
+ error = mas_store_gfp(mas_detach, next, GFP_KERNEL);
+ if (error)
+ goto munmap_gather_failed;
+
+ vma_mark_detached(next, true);
+ nrpages = vma_pages(next);
+
+ vms->nr_pages += nrpages;
+ if (next->vm_flags & VM_LOCKED)
+ vms->locked_vm += nrpages;
+
+ if (next->vm_flags & VM_ACCOUNT)
+ vms->nr_accounted += nrpages;
+
+ if (is_exec_mapping(next->vm_flags))
+ vms->exec_vm += nrpages;
+ else if (is_stack_mapping(next->vm_flags))
+ vms->stack_vm += nrpages;
+ else if (is_data_mapping(next->vm_flags))
+ vms->data_vm += nrpages;
+
+ if (unlikely(vms->uf)) {
+ /*
+ * If userfaultfd_unmap_prep returns an error the vmas
+ * will remain split, but userland will get a
+ * highly unexpected error anyway. This is no
+ * different than the case where the first of the two
+			 * __split_vma fails, but we don't undo the first
+			 * split, even though we could. This failure is
+			 * unlikely enough that it's not worth optimizing for.
+ */
+ error = userfaultfd_unmap_prep(next, vms->start,
+ vms->end, vms->uf);
+ if (error)
+ goto userfaultfd_error;
+ }
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+ BUG_ON(next->vm_start < vms->start);
+ BUG_ON(next->vm_start > vms->end);
+#endif
+ }
+
+ vms->next = vma_next(vms->vmi);
+ if (vms->next)
+ vms->unmap_end = vms->next->vm_start;
+
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ /* Make sure no VMAs are about to be lost. */
+ {
+ MA_STATE(test, mas_detach->tree, 0, 0);
+ struct vm_area_struct *vma_mas, *vma_test;
+ int test_count = 0;
+
+ vma_iter_set(vms->vmi, vms->start);
+ rcu_read_lock();
+ vma_test = mas_find(&test, vms->vma_count - 1);
+ for_each_vma_range(*(vms->vmi), vma_mas, vms->end) {
+ BUG_ON(vma_mas != vma_test);
+ test_count++;
+ vma_test = mas_next(&test, vms->vma_count - 1);
+ }
+ rcu_read_unlock();
+ BUG_ON(vms->vma_count != test_count);
+ }
+#endif
+
+ while (vma_iter_addr(vms->vmi) > vms->start)
+ vma_iter_prev_range(vms->vmi);
+
+ vms->clear_ptes = true;
+ return 0;
+
+userfaultfd_error:
+munmap_gather_failed:
+end_split_failed:
+modify_vma_failed:
+ reattach_vmas(mas_detach);
+start_split_failed:
+map_count_exceeded:
+ return error;
+}
+
+/*
+ * do_vmi_align_munmap() - munmap the aligned region from @start to @end.
+ * @vmi: The vma iterator
+ * @vma: The starting vm_area_struct
+ * @mm: The mm_struct
+ * @start: The aligned start address to munmap.
+ * @end: The aligned end address to munmap.
+ * @uf: The userfaultfd list_head
+ * @unlock: Set to true to drop the mmap_lock. unlocking only happens on
+ * success.
+ *
+ * Return: 0 on success and drops the lock if so directed, error and leaves the
+ * lock held otherwise.
+ */
+int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start, unsigned long end,
+ struct list_head *uf, bool unlock)
+{
+ struct maple_tree mt_detach;
+ MA_STATE(mas_detach, &mt_detach, 0, 0);
+ mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
+ mt_on_stack(mt_detach);
+ struct vma_munmap_struct vms;
+ int error;
+
+ init_vma_munmap(&vms, vmi, vma, start, end, uf, unlock);
+ error = vms_gather_munmap_vmas(&vms, &mas_detach);
+ if (error)
+ goto gather_failed;
+
+ error = vma_iter_clear_gfp(vmi, start, end, GFP_KERNEL);
+ if (error)
+ goto clear_tree_failed;
+
+ /* Point of no return */
+ vms_complete_munmap_vmas(&vms, &mas_detach);
+ return 0;
+
+clear_tree_failed:
+ reattach_vmas(&mas_detach);
+gather_failed:
+ validate_mm(mm);
+ return error;
+}
+
+/*
+ * do_vmi_munmap() - munmap a given range.
+ * @vmi: The vma iterator
+ * @mm: The mm_struct
+ * @start: The start address to munmap
+ * @len: The length of the range to munmap
+ * @uf: The userfaultfd list_head
+ * @unlock: set to true if the user wants to drop the mmap_lock on success
+ *
+ * This function takes a @mas that is either pointing to the previous VMA or set
+ * to MA_START and sets it up to remove the mapping(s). The @len will be
+ * aligned.
+ *
+ * Return: 0 on success and drops the lock if so directed, error and leaves the
+ * lock held otherwise.
+ */
+int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool unlock)
+{
+ unsigned long end;
+ struct vm_area_struct *vma;
+
+ if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start)
+ return -EINVAL;
+
+ end = start + PAGE_ALIGN(len);
+ if (end == start)
+ return -EINVAL;
+
+ /* Find the first overlapping VMA */
+ vma = vma_find(vmi, end);
+ if (!vma) {
+ if (unlock)
+ mmap_write_unlock(mm);
+ return 0;
+ }
+
+ return do_vmi_align_munmap(vmi, vma, mm, start, end, uf, unlock);
+}
+
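Note the asymmetry in the validation above: @start must already be page-aligned (offset_in_page() rejects it otherwise) while @len is rounded up. Purely as an illustration with 4 KiB pages, either of these calls unmaps exactly [0x10000, 0x11000):

	do_vmi_munmap(&vmi, mm, 0x10000, 1, NULL, false);
	do_vmi_munmap(&vmi, mm, 0x10000, 0x1000, NULL, false);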
+/*
+ * We are about to modify one or more of a VMA's flags, policy, userfaultfd
+ * context and anonymous VMA name within the range [start, end).
+ *
+ * As a result, we might be able to merge the newly modified VMA range with an
+ * adjacent VMA with identical properties.
+ *
+ * If no merge is possible and the range does not span the entirety of the VMA,
+ * we then need to split the VMA to accommodate the change.
+ *
+ * The function returns either the merged VMA, the original VMA if a split was
+ * required instead, or an error if the split failed.
+ */
+static struct vm_area_struct *vma_modify(struct vma_merge_struct *vmg)
+{
+ struct vm_area_struct *vma = vmg->vma;
+ struct vm_area_struct *merged;
+
+ /* First, try to merge. */
+ merged = vma_merge_existing_range(vmg);
+ if (merged)
+ return merged;
+
+ /* Split any preceding portion of the VMA. */
+ if (vma->vm_start < vmg->start) {
+ int err = split_vma(vmg->vmi, vma, vmg->start, 1);
+
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ /* Split any trailing portion of the VMA. */
+ if (vma->vm_end > vmg->end) {
+ int err = split_vma(vmg->vmi, vma, vmg->end, 0);
+
+ if (err)
+ return ERR_PTR(err);
+ }
+
+ return vma;
+}
+
+struct vm_area_struct *vma_modify_flags(
+ struct vma_iterator *vmi, struct vm_area_struct *prev,
+ struct vm_area_struct *vma, unsigned long start, unsigned long end,
+ unsigned long new_flags)
+{
+ VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
+
+ vmg.flags = new_flags;
+
+ return vma_modify(&vmg);
+}
+
+struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long new_flags,
+ struct anon_vma_name *new_name)
+{
+ VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
+
+ vmg.flags = new_flags;
+ vmg.anon_name = new_name;
+
+ return vma_modify(&vmg);
+}
+
+struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol)
+{
+ VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
+
+ vmg.policy = new_pol;
+
+ return vma_modify(&vmg);
+}
+
+struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags,
+ struct vm_userfaultfd_ctx new_ctx)
+{
+ VMG_VMA_STATE(vmg, vmi, prev, vma, start, end);
+
+ vmg.flags = new_flags;
+ vmg.uffd_ctx = new_ctx;
+
+ return vma_modify(&vmg);
+}
+
+/*
+ * Expand vma by delta bytes, potentially merging with an immediately adjacent
+ * VMA with identical properties.
+ */
+struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta)
+{
+ VMG_VMA_STATE(vmg, vmi, vma, vma, vma->vm_end, vma->vm_end + delta);
+
+ vmg.next = vma_iter_next_rewind(vmi, NULL);
+ vmg.vma = NULL; /* We use the VMA to populate VMG fields only. */
+
+ return vma_merge_new_range(&vmg);
+}
+
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb)
+{
+ vb->count = 0;
+}
+
+static void unlink_file_vma_batch_process(struct unlink_vma_file_batch *vb)
+{
+ struct address_space *mapping;
+ int i;
+
+ mapping = vb->vmas[0]->vm_file->f_mapping;
+ i_mmap_lock_write(mapping);
+ for (i = 0; i < vb->count; i++) {
+ VM_WARN_ON_ONCE(vb->vmas[i]->vm_file->f_mapping != mapping);
+ __remove_shared_vm_struct(vb->vmas[i], mapping);
+ }
+ i_mmap_unlock_write(mapping);
+
+ unlink_file_vma_batch_init(vb);
+}
+
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
+ struct vm_area_struct *vma)
+{
+ if (vma->vm_file == NULL)
+ return;
+
+ if ((vb->count > 0 && vb->vmas[0]->vm_file != vma->vm_file) ||
+ vb->count == ARRAY_SIZE(vb->vmas))
+ unlink_file_vma_batch_process(vb);
+
+ vb->vmas[vb->count] = vma;
+ vb->count++;
+}
+
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb)
+{
+ if (vb->count > 0)
+ unlink_file_vma_batch_process(vb);
+}
+
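/*
 * Editor's illustration, not part of this patch: the intended call pattern
 * for the batching helpers above (compare free_pgtables());
 * example_unlink_range() and the iteration shape are assumptions.
 */
static void example_unlink_range(struct ma_state *mas, struct vm_area_struct *vma)
{
	struct unlink_vma_file_batch vb;

	unlink_file_vma_batch_init(&vb);
	do {
		/* Flushes automatically when the batch is full or the file changes. */
		unlink_file_vma_batch_add(&vb, vma);
		vma = mas_find(mas, ULONG_MAX);
	} while (vma);
	unlink_file_vma_batch_final(&vb);	/* flush any remainder */
}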
+/*
+ * Unlink a file-based vm structure from its interval tree, to hide
+ * vma from rmap and vmtruncate before freeing its page tables.
+ */
+void unlink_file_vma(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+
+ if (file) {
+ struct address_space *mapping = file->f_mapping;
+
+ i_mmap_lock_write(mapping);
+ __remove_shared_vm_struct(vma, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+}
+
+void vma_link_file(struct vm_area_struct *vma)
+{
+ struct file *file = vma->vm_file;
+ struct address_space *mapping;
+
+ if (file) {
+ mapping = file->f_mapping;
+ i_mmap_lock_write(mapping);
+ __vma_link_file(vma, mapping);
+ i_mmap_unlock_write(mapping);
+ }
+}
+
+int vma_link(struct mm_struct *mm, struct vm_area_struct *vma)
+{
+ VMA_ITERATOR(vmi, mm, 0);
+
+ vma_iter_config(&vmi, vma->vm_start, vma->vm_end);
+ if (vma_iter_prealloc(&vmi, vma))
+ return -ENOMEM;
+
+ vma_start_write(vma);
+ vma_iter_store(&vmi, vma);
+ vma_link_file(vma);
+ mm->map_count++;
+ validate_mm(mm);
+ return 0;
+}
+
+/*
+ * Copy the vma structure to a new location in the same mm,
+ * prior to moving page table entries, to effect an mremap move.
+ */
+struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ unsigned long addr, unsigned long len, pgoff_t pgoff,
+ bool *need_rmap_locks)
+{
+ struct vm_area_struct *vma = *vmap;
+ unsigned long vma_start = vma->vm_start;
+ struct mm_struct *mm = vma->vm_mm;
+ struct vm_area_struct *new_vma;
+ bool faulted_in_anon_vma = true;
+ VMA_ITERATOR(vmi, mm, addr);
+ VMG_VMA_STATE(vmg, &vmi, NULL, vma, addr, addr + len);
+
+ /*
+ * If anonymous vma has not yet been faulted, update new pgoff
+ * to match new location, to increase its chance of merging.
+ */
+ if (unlikely(vma_is_anonymous(vma) && !vma->anon_vma)) {
+ pgoff = addr >> PAGE_SHIFT;
+ faulted_in_anon_vma = false;
+ }
+
+ new_vma = find_vma_prev(mm, addr, &vmg.prev);
+ if (new_vma && new_vma->vm_start < addr + len)
+ return NULL; /* should never get here */
+
+ vmg.vma = NULL; /* New VMA range. */
+ vmg.pgoff = pgoff;
+ vmg.next = vma_iter_next_rewind(&vmi, NULL);
+ new_vma = vma_merge_new_range(&vmg);
+
+ if (new_vma) {
+ /*
+ * Source vma may have been merged into new_vma
+ */
+ if (unlikely(vma_start >= new_vma->vm_start &&
+ vma_start < new_vma->vm_end)) {
+ /*
+ * The only way we can get a vma_merge with
+ * self during an mremap is if the vma hasn't
+ * been faulted in yet and we were allowed to
+ * reset the dst vma->vm_pgoff to the
+ * destination address of the mremap to allow
+ * the merge to happen. mremap must change the
+ * vm_pgoff linearity between src and dst vmas
+ * (in turn preventing a vma_merge) to be
+ * safe. It is only safe to keep the vm_pgoff
+ * linear if there are no pages mapped yet.
+ */
+ VM_BUG_ON_VMA(faulted_in_anon_vma, new_vma);
+ *vmap = vma = new_vma;
+ }
+ *need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
+ } else {
+ new_vma = vm_area_dup(vma);
+ if (!new_vma)
+ goto out;
+ vma_set_range(new_vma, addr, addr + len, pgoff);
+ if (vma_dup_policy(vma, new_vma))
+ goto out_free_vma;
+ if (anon_vma_clone(new_vma, vma))
+ goto out_free_mempol;
+ if (new_vma->vm_file)
+ get_file(new_vma->vm_file);
+ if (new_vma->vm_ops && new_vma->vm_ops->open)
+ new_vma->vm_ops->open(new_vma);
+ if (vma_link(mm, new_vma))
+ goto out_vma_link;
+ *need_rmap_locks = false;
+ }
+ return new_vma;
+
+out_vma_link:
+ if (new_vma->vm_ops && new_vma->vm_ops->close)
+ new_vma->vm_ops->close(new_vma);
+
+ if (new_vma->vm_file)
+ fput(new_vma->vm_file);
+
+ unlink_anon_vmas(new_vma);
+out_free_mempol:
+ mpol_put(vma_policy(new_vma));
+out_free_vma:
+ vm_area_free(new_vma);
+out:
+ return NULL;
+}
+
+/*
+ * Rough compatibility check to quickly see if it's even worth looking
+ * at sharing an anon_vma.
+ *
+ * They need to have the same vm_file, and the flags can only differ
+ * in things that mprotect may change.
+ *
+ * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that
+ * we can merge the two vma's. For example, we refuse to merge a vma if
+ * there is a vm_ops->close() function, because that indicates that the
+ * driver is doing some kind of reference counting. But that doesn't
+ * really matter for the anon_vma sharing case.
+ */
+static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b)
+{
+ return a->vm_end == b->vm_start &&
+ mpol_equal(vma_policy(a), vma_policy(b)) &&
+ a->vm_file == b->vm_file &&
+ !((a->vm_flags ^ b->vm_flags) & ~(VM_ACCESS_FLAGS | VM_SOFTDIRTY)) &&
+ b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT);
+}
+
+/*
+ * Do some basic sanity checking to see if we can re-use the anon_vma
+ * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be
+ * the same as 'old', the other will be the new one that is trying
+ * to share the anon_vma.
+ *
+ * NOTE! This runs with mmap_lock held for reading, so it is possible that
+ * the anon_vma of 'old' is concurrently in the process of being set up
+ * by another page fault trying to merge _that_. But that's ok: if it
+ * is being set up, that automatically means that it will be a singleton
+ * acceptable for merging, so we can do all of this optimistically. But
+ * we do that READ_ONCE() to make sure that we never re-load the pointer.
+ *
+ * IOW: that the "list_is_singular()" test on the anon_vma_chain only
+ * matters for the 'stable anon_vma' case (ie the thing we want to avoid
+ * is to return an anon_vma that is "complex" due to having gone through
+ * a fork).
+ *
+ * We also make sure that the two vma's are compatible (adjacent,
+ * and with the same memory policies). That's all stable, even with just
+ * a read lock on the mmap_lock.
+ */
+static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old,
+ struct vm_area_struct *a,
+ struct vm_area_struct *b)
+{
+ if (anon_vma_compatible(a, b)) {
+ struct anon_vma *anon_vma = READ_ONCE(old->anon_vma);
+
+ if (anon_vma && list_is_singular(&old->anon_vma_chain))
+ return anon_vma;
+ }
+ return NULL;
+}
+
+/*
+ * find_mergeable_anon_vma is used by anon_vma_prepare, to check
+ * neighbouring vmas for a suitable anon_vma, before it goes off
+ * to allocate a new anon_vma. It checks because a repetitive
+ * sequence of mprotects and faults may otherwise lead to distinct
+ * anon_vmas being allocated, preventing vma merge in subsequent
+ * mprotect.
+ */
+struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma)
+{
+ struct anon_vma *anon_vma = NULL;
+ struct vm_area_struct *prev, *next;
+ VMA_ITERATOR(vmi, vma->vm_mm, vma->vm_end);
+
+ /* Try next first. */
+ next = vma_iter_load(&vmi);
+ if (next) {
+ anon_vma = reusable_anon_vma(next, vma, next);
+ if (anon_vma)
+ return anon_vma;
+ }
+
+ prev = vma_prev(&vmi);
+ VM_BUG_ON_VMA(prev != vma, vma);
+ prev = vma_prev(&vmi);
+ /* Try prev next. */
+ if (prev)
+ anon_vma = reusable_anon_vma(prev, prev, vma);
+
+ /*
+ * We might reach here with anon_vma == NULL if we can't find
+ * any reusable anon_vma.
+ * There's no absolute need to look only at touching neighbours:
+ * we could search further afield for "compatible" anon_vmas.
+ * But it would probably just be a waste of time searching,
+ * or lead to too many vmas hanging off the same anon_vma.
+ * We're trying to allow mprotect remerging later on,
+ * not trying to minimize memory used for anon_vmas.
+ */
+ return anon_vma;
+}
+
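/*
 * Editor's illustration, not part of this patch: a simplified sketch of how
 * __anon_vma_prepare()-style code consults find_mergeable_anon_vma() before
 * allocating; example_pick_anon_vma() is an assumed name and anon_vma_alloc()
 * stands in for rmap's internal allocator.
 */
static struct anon_vma *example_pick_anon_vma(struct vm_area_struct *vma)
{
	struct anon_vma *anon_vma = find_mergeable_anon_vma(vma);

	if (!anon_vma)
		anon_vma = anon_vma_alloc();	/* fall back to a fresh anon_vma */
	return anon_vma;
}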
+static bool vm_ops_needs_writenotify(const struct vm_operations_struct *vm_ops)
+{
+ return vm_ops && (vm_ops->page_mkwrite || vm_ops->pfn_mkwrite);
+}
+
+static bool vma_is_shared_writable(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
+ (VM_WRITE | VM_SHARED);
+}
+
+static bool vma_fs_can_writeback(struct vm_area_struct *vma)
+{
+ /* No managed pages to writeback. */
+ if (vma->vm_flags & VM_PFNMAP)
+ return false;
+
+ return vma->vm_file && vma->vm_file->f_mapping &&
+ mapping_can_writeback(vma->vm_file->f_mapping);
+}
+
+/*
+ * Does this VMA require the underlying folios to have their dirty state
+ * tracked?
+ */
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma)
+{
+ /* Only shared, writable VMAs require dirty tracking. */
+ if (!vma_is_shared_writable(vma))
+ return false;
+
+ /* Does the filesystem need to be notified? */
+ if (vm_ops_needs_writenotify(vma->vm_ops))
+ return true;
+
+ /*
+ * Even if the filesystem doesn't indicate a need for writenotify, if it
+ * can writeback, dirty tracking is still required.
+ */
+ return vma_fs_can_writeback(vma);
+}
+
+/*
+ * Some shared mappings will want the pages marked read-only
+ * to track write events. If so, we'll downgrade vm_page_prot
+ * to the private version (using protection_map[] without the
+ * VM_SHARED bit).
+ */
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot)
+{
+ /* If it was private or non-writable, the write bit is already clear */
+ if (!vma_is_shared_writable(vma))
+ return false;
+
+ /* The backer wishes to know when pages are first written to? */
+ if (vm_ops_needs_writenotify(vma->vm_ops))
+ return true;
+
+ /* The open routine did something to the protections that pgprot_modify
+ * won't preserve? */
+ if (pgprot_val(vm_page_prot) !=
+ pgprot_val(vm_pgprot_modify(vm_page_prot, vma->vm_flags)))
+ return false;
+
+ /*
+ * Do we need to track softdirty? hugetlb does not support softdirty
+ * tracking yet.
+ */
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
+ return true;
+
+ /* Do we need write faults for uffd-wp tracking? */
+ if (userfaultfd_wp(vma))
+ return true;
+
+ /* Can the mapping track the dirty pages? */
+ return vma_fs_can_writeback(vma);
+}
+
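/*
 * Editor's illustration, not part of this patch: how the writenotify answer
 * is typically consumed (compare vma_set_page_prot() in mm/mmap.c);
 * example_set_page_prot() is an assumed name.
 */
static void example_set_page_prot(struct vm_area_struct *vma)
{
	unsigned long vm_flags = vma->vm_flags;
	pgprot_t vm_page_prot = vm_pgprot_modify(vma->vm_page_prot, vm_flags);

	if (vma_wants_writenotify(vma, vm_page_prot)) {
		vm_flags &= ~VM_SHARED;	/* downgrade to the private protection map */
		vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
	}
	WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}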
+static DEFINE_MUTEX(mm_all_locks_mutex);
+
+static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
+{
+ if (!test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
+ /*
+ * The LSB of head.next can't change from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
+ /*
+ * We can safely modify head.next after taking the
+ * anon_vma->root->rwsem. If some other vma in this mm shares
+ * the same anon_vma we won't take it again.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us thanks to the
+ * anon_vma->root->rwsem.
+ */
+ if (__test_and_set_bit(0, (unsigned long *)
+ &anon_vma->root->rb_root.rb_root.rb_node))
+ BUG();
+ }
+}
+
+static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
+{
+ if (!test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change from under us because
+ * we hold the mm_all_locks_mutex.
+ *
+ * Operations on ->flags have to be atomic because
+ * even if AS_MM_ALL_LOCKS is stable thanks to the
+ * mm_all_locks_mutex, there may be other cpus
+ * changing other bitflags in parallel to us.
+ */
+ if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
+ BUG();
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
+ }
+}
+
+/*
+ * This operation locks against the VM for all pte/vma/mm related
+ * operations that could ever happen on a certain mm. This includes
+ * vmtruncate, try_to_unmap, and all page faults.
+ *
+ * The caller must take the mmap_lock in write mode before calling
+ * mm_take_all_locks(). The caller isn't allowed to release the
+ * mmap_lock until mm_drop_all_locks() returns.
+ *
+ * mmap_lock in write mode is required in order to block all operations
+ * that could modify pagetables and free pages without need of
+ * altering the vma layout. It's also needed in write mode to avoid new
+ * anon_vmas to be associated with existing vmas.
+ *
+ * A single task can't take more than one mm_take_all_locks() in a row
+ * or it would deadlock.
+ *
+ * The LSB in anon_vma->rb_root.rb_node and the AS_MM_ALL_LOCKS bitflag in
+ * mapping->flags avoid taking the same lock twice, if more than one
+ * vma in this mm is backed by the same anon_vma or address_space.
+ *
+ * We take locks in the following order, according to the comment at the beginning
+ * of mm/rmap.c:
+ * - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
+ * hugetlb mapping);
+ * - all vmas marked locked
+ * - all i_mmap_rwsem locks;
+ * - all anon_vma->rwsem locks
+ *
+ * We can take all locks within these types randomly because the VM code
+ * doesn't nest them and we are protected from parallel mm_take_all_locks() by
+ * mm_all_locks_mutex.
+ *
+ * mm_take_all_locks() and mm_drop_all_locks() are expensive operations
+ * that may have to take thousands of locks.
+ *
+ * mm_take_all_locks() can fail if it's interrupted by signals.
+ */
+int mm_take_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_assert_write_locked(mm);
+
+ mutex_lock(&mm_all_locks_mutex);
+
+ /*
+ * vma_start_write() does not have a complement in mm_drop_all_locks()
+ * because vma_start_write() is always asymmetrical; it marks a VMA as
+ * being written to until mmap_write_unlock() or mmap_write_downgrade()
+ * is reached.
+ */
+ for_each_vma(vmi, vma) {
+ if (signal_pending(current))
+ goto out_unlock;
+ vma_start_write(vma);
+ }
+
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ is_vm_hugetlb_page(vma))
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->vm_file && vma->vm_file->f_mapping &&
+ !is_vm_hugetlb_page(vma))
+ vm_lock_mapping(mm, vma->vm_file->f_mapping);
+ }
+
+ vma_iter_init(&vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ if (signal_pending(current))
+ goto out_unlock;
+ if (vma->anon_vma)
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_lock_anon_vma(mm, avc->anon_vma);
+ }
+
+ return 0;
+
+out_unlock:
+ mm_drop_all_locks(mm);
+ return -EINTR;
+}
+
+static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
+{
+ if (test_bit(0, (unsigned long *) &anon_vma->root->rb_root.rb_root.rb_node)) {
+ /*
+ * The LSB of head.next can't change to 0 from under
+ * us because we hold the mm_all_locks_mutex.
+ *
+ * We must however clear the bitflag before unlocking
+ * the vma so the users using the anon_vma->rb_root will
+ * never see our bitflag.
+ *
+ * No need of atomic instructions here, head.next
+ * can't change from under us until we release the
+ * anon_vma->root->rwsem.
+ */
+ if (!__test_and_clear_bit(0, (unsigned long *)
+ &anon_vma->root->rb_root.rb_root.rb_node))
+ BUG();
+ anon_vma_unlock_write(anon_vma);
+ }
+}
+
+static void vm_unlock_mapping(struct address_space *mapping)
+{
+ if (test_bit(AS_MM_ALL_LOCKS, &mapping->flags)) {
+ /*
+ * AS_MM_ALL_LOCKS can't change to 0 from under us
+ * because we hold the mm_all_locks_mutex.
+ */
+ i_mmap_unlock_write(mapping);
+ if (!test_and_clear_bit(AS_MM_ALL_LOCKS,
+ &mapping->flags))
+ BUG();
+ }
+}
+
+/*
+ * The mmap_lock cannot be released by the caller until
+ * mm_drop_all_locks() returns.
+ */
+void mm_drop_all_locks(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct anon_vma_chain *avc;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ mmap_assert_write_locked(mm);
+ BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
+
+ for_each_vma(vmi, vma) {
+ if (vma->anon_vma)
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ vm_unlock_anon_vma(avc->anon_vma);
+ if (vma->vm_file && vma->vm_file->f_mapping)
+ vm_unlock_mapping(vma->vm_file->f_mapping);
+ }
+
+ mutex_unlock(&mm_all_locks_mutex);
+}
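/*
 * Editor's illustration, not part of this patch: the expected locking
 * discipline around mm_take_all_locks()/mm_drop_all_locks() (mmu_notifier
 * registration is the typical real user); example_freeze_mm() is an assumed
 * name.
 */
static int example_freeze_mm(struct mm_struct *mm)
{
	int ret;

	mmap_write_lock(mm);
	ret = mm_take_all_locks(mm);	/* -EINTR if a signal is pending */
	if (ret)
		goto out;

	/* ... work that must not race with faults, rmap or truncation ... */

	mm_drop_all_locks(mm);
out:
	mmap_write_unlock(mm);
	return ret;
}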
diff --git a/mm/vma.h b/mm/vma.h
new file mode 100644
index 000000000000..819f994cf727
--- /dev/null
+++ b/mm/vma.h
@@ -0,0 +1,558 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * vma.h
+ *
+ * Core VMA manipulation API implemented in vma.c.
+ */
+#ifndef __MM_VMA_H
+#define __MM_VMA_H
+
+/*
+ * VMA lock generalization
+ */
+struct vma_prepare {
+ struct vm_area_struct *vma;
+ struct vm_area_struct *adj_next;
+ struct file *file;
+ struct address_space *mapping;
+ struct anon_vma *anon_vma;
+ struct vm_area_struct *insert;
+ struct vm_area_struct *remove;
+ struct vm_area_struct *remove2;
+};
+
+struct unlink_vma_file_batch {
+ int count;
+ struct vm_area_struct *vmas[8];
+};
+
+/*
+ * vma munmap operation
+ */
+struct vma_munmap_struct {
+ struct vma_iterator *vmi;
+ struct vm_area_struct *vma; /* The first vma to munmap */
+ struct vm_area_struct *prev; /* vma before the munmap area */
+ struct vm_area_struct *next; /* vma after the munmap area */
+ struct list_head *uf; /* Userfaultfd list_head */
+ unsigned long start; /* Aligned start addr (inclusive) */
+ unsigned long end; /* Aligned end addr (exclusive) */
+ unsigned long unmap_start; /* Unmap PTE start */
+ unsigned long unmap_end; /* Unmap PTE end */
+ int vma_count; /* Number of vmas that will be removed */
+ bool unlock; /* Unlock after the munmap */
+ bool clear_ptes; /* If there are outstanding PTE to be cleared */
+ bool closed_vm_ops; /* call_mmap() was encountered, so vmas may be closed */
+ /* 1 byte hole */
+ unsigned long nr_pages; /* Number of pages being removed */
+ unsigned long locked_vm; /* Number of locked pages */
+ unsigned long nr_accounted; /* Number of VM_ACCOUNT pages */
+ unsigned long exec_vm;
+ unsigned long stack_vm;
+ unsigned long data_vm;
+};
+
+enum vma_merge_state {
+ VMA_MERGE_START,
+ VMA_MERGE_ERROR_NOMEM,
+ VMA_MERGE_NOMERGE,
+ VMA_MERGE_SUCCESS,
+};
+
+/* Represents a VMA merge operation. */
+struct vma_merge_struct {
+ struct mm_struct *mm;
+ struct vma_iterator *vmi;
+ pgoff_t pgoff;
+ struct vm_area_struct *prev;
+ struct vm_area_struct *next; /* Modified by vma_merge(). */
+ struct vm_area_struct *vma; /* Either a new VMA or the one being modified. */
+ unsigned long start;
+ unsigned long end;
+ unsigned long flags;
+ struct file *file;
+ struct anon_vma *anon_vma;
+ struct mempolicy *policy;
+ struct vm_userfaultfd_ctx uffd_ctx;
+ struct anon_vma_name *anon_name;
+ enum vma_merge_state state;
+};
+
+static inline bool vmg_nomem(struct vma_merge_struct *vmg)
+{
+ return vmg->state == VMA_MERGE_ERROR_NOMEM;
+}
+
+/* Assumes addr >= vma->vm_start. */
+static inline pgoff_t vma_pgoff_offset(struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ return vma->vm_pgoff + PHYS_PFN(addr - vma->vm_start);
+}
+
+#define VMG_STATE(name, mm_, vmi_, start_, end_, flags_, pgoff_) \
+ struct vma_merge_struct name = { \
+ .mm = mm_, \
+ .vmi = vmi_, \
+ .start = start_, \
+ .end = end_, \
+ .flags = flags_, \
+ .pgoff = pgoff_, \
+ .state = VMA_MERGE_START, \
+ }
+
+#define VMG_VMA_STATE(name, vmi_, prev_, vma_, start_, end_) \
+ struct vma_merge_struct name = { \
+ .mm = vma_->vm_mm, \
+ .vmi = vmi_, \
+ .prev = prev_, \
+ .next = NULL, \
+ .vma = vma_, \
+ .start = start_, \
+ .end = end_, \
+ .flags = vma_->vm_flags, \
+ .pgoff = vma_pgoff_offset(vma_, start_), \
+ .file = vma_->vm_file, \
+ .anon_vma = vma_->anon_vma, \
+ .policy = vma_policy(vma_), \
+ .uffd_ctx = vma_->vm_userfaultfd_ctx, \
+ .anon_name = anon_vma_name(vma_), \
+ .state = VMA_MERGE_START, \
+ }
+
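/*
 * Editor's illustration, not part of this patch: a minimal sketch of
 * declaring a merge descriptor for a brand-new range, in the style of
 * mmap_region(); example_try_merge_new() is an assumed name.
 */
static struct vm_area_struct *example_try_merge_new(struct mm_struct *mm,
		unsigned long addr, unsigned long len,
		unsigned long vm_flags, pgoff_t pgoff)
{
	VMA_ITERATOR(vmi, mm, addr);
	VMG_STATE(vmg, mm, &vmi, addr, addr + len, vm_flags, pgoff);

	/* Populate the neighbours the merge logic will consider. */
	vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev);

	return vma_merge_new_range(&vmg);	/* NULL if no merge was possible */
}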
+#ifdef CONFIG_DEBUG_VM_MAPLE_TREE
+void validate_mm(struct mm_struct *mm);
+#else
+#define validate_mm(mm) do { } while (0)
+#endif
+
+/* Required for expand_downwards(). */
+void anon_vma_interval_tree_pre_update_vma(struct vm_area_struct *vma);
+
+/* Required for expand_downwards(). */
+void anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma);
+
+int vma_expand(struct vma_merge_struct *vmg);
+int vma_shrink(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, pgoff_t pgoff);
+
+static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
+ struct vm_area_struct *vma, gfp_t gfp)
+
+{
+ if (vmi->mas.status != ma_start &&
+ ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+ vma_iter_invalidate(vmi);
+
+ __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_gfp(&vmi->mas, vma, gfp);
+ if (unlikely(mas_is_err(&vmi->mas)))
+ return -ENOMEM;
+
+ return 0;
+}
+
+#ifdef CONFIG_MMU
+/*
+ * init_vma_munmap() - Initializer wrapper for vma_munmap_struct
+ * @vms: The vma munmap struct
+ * @vmi: The vma iterator
+ * @vma: The first vm_area_struct to munmap
+ * @start: The aligned start address to munmap
+ * @end: The aligned end address to munmap
+ * @uf: The userfaultfd list_head
+ * @unlock: Unlock after the operation. Only unlocked on success
+ */
+static inline void init_vma_munmap(struct vma_munmap_struct *vms,
+ struct vma_iterator *vmi, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end, struct list_head *uf,
+ bool unlock)
+{
+ vms->vmi = vmi;
+ vms->vma = vma;
+ if (vma) {
+ vms->start = start;
+ vms->end = end;
+ } else {
+ vms->start = vms->end = 0;
+ }
+ vms->unlock = unlock;
+ vms->uf = uf;
+ vms->vma_count = 0;
+ vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0;
+ vms->exec_vm = vms->stack_vm = vms->data_vm = 0;
+ vms->unmap_start = FIRST_USER_ADDRESS;
+ vms->unmap_end = USER_PGTABLES_CEILING;
+ vms->clear_ptes = false;
+ vms->closed_vm_ops = false;
+}
+#endif
+
+int vms_gather_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach);
+
+void vms_complete_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach);
+
+void vms_clean_up_area(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach);
+
+/*
+ * reattach_vmas() - Undo any munmap work and free resources
+ * @mas_detach: The maple state with the detached maple tree
+ *
+ * Reattach any detached vmas and free up the maple tree used to track the vmas.
+ */
+static inline void reattach_vmas(struct ma_state *mas_detach)
+{
+ struct vm_area_struct *vma;
+
+ mas_set(mas_detach, 0);
+ mas_for_each(mas_detach, vma, ULONG_MAX)
+ vma_mark_detached(vma, false);
+
+ __mt_destroy(mas_detach->tree);
+}
+
+/*
+ * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap()
+ * operation.
+ * @vms: The vma unmap structure
+ * @mas_detach: The maple state with the detached maple tree
+ *
+ * Reattach any detached vmas and free up the maple tree used to track the vmas.
+ * If that's not possible because the ptes are cleared (and vm_ops->close() may
+ * have been called), then a NULL is written over the vmas and the vmas are
+ * removed (munmap() completed).
+ */
+static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms,
+ struct ma_state *mas_detach)
+{
+	struct ma_state *mas = &vms->vmi->mas;
+
+	if (!vms->nr_pages)
+ return;
+
+ if (vms->clear_ptes)
+ return reattach_vmas(mas_detach);
+
+ /*
+ * Aborting cannot just call the vm_ops open() because they are often
+ * not symmetrical and state data has been lost. Resort to the old
+ * failure method of leaving a gap where the MAP_FIXED mapping failed.
+ */
+ mas_set_range(mas, vms->start, vms->end - 1);
+ if (unlikely(mas_store_gfp(mas, NULL, GFP_KERNEL))) {
+ pr_warn_once("%s: (%d) Unable to abort munmap() operation\n",
+ current->comm, current->pid);
+ /* Leaving vmas detached and in-tree may hamper recovery */
+ reattach_vmas(mas_detach);
+ } else {
+ /* Clean up the insertion of the unfortunate gap */
+ vms_complete_munmap_vmas(vms, mas_detach);
+ }
+}
+
+int
+do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
+ struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct list_head *uf, bool unlock);
+
+int do_vmi_munmap(struct vma_iterator *vmi, struct mm_struct *mm,
+ unsigned long start, size_t len, struct list_head *uf,
+ bool unlock);
+
+void remove_vma(struct vm_area_struct *vma, bool unreachable, bool closed);
+
+void unmap_region(struct ma_state *mas, struct vm_area_struct *vma,
+ struct vm_area_struct *prev, struct vm_area_struct *next);
+
+/* We are about to modify the VMA's flags. */
+struct vm_area_struct *vma_modify_flags(struct vma_iterator *vmi,
+ struct vm_area_struct *prev, struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags);
+
+/* We are about to modify the VMA's flags and/or anon_name. */
+struct vm_area_struct
+*vma_modify_flags_name(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start,
+ unsigned long end,
+ unsigned long new_flags,
+ struct anon_vma_name *new_name);
+
+/* We are about to modify the VMA's memory policy. */
+struct vm_area_struct
+*vma_modify_policy(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct mempolicy *new_pol);
+
+/* We are about to modify the VMA's flags and/or uffd context. */
+struct vm_area_struct
+*vma_modify_flags_uffd(struct vma_iterator *vmi,
+ struct vm_area_struct *prev,
+ struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ unsigned long new_flags,
+ struct vm_userfaultfd_ctx new_ctx);
+
+struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg);
+
+struct vm_area_struct *vma_merge_extend(struct vma_iterator *vmi,
+ struct vm_area_struct *vma,
+ unsigned long delta);
+
+void unlink_file_vma_batch_init(struct unlink_vma_file_batch *vb);
+
+void unlink_file_vma_batch_final(struct unlink_vma_file_batch *vb);
+
+void unlink_file_vma_batch_add(struct unlink_vma_file_batch *vb,
+ struct vm_area_struct *vma);
+
+void unlink_file_vma(struct vm_area_struct *vma);
+
+void vma_link_file(struct vm_area_struct *vma);
+
+int vma_link(struct mm_struct *mm, struct vm_area_struct *vma);
+
+struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
+ unsigned long addr, unsigned long len, pgoff_t pgoff,
+ bool *need_rmap_locks);
+
+struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma);
+
+bool vma_needs_dirty_tracking(struct vm_area_struct *vma);
+bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot);
+
+int mm_take_all_locks(struct mm_struct *mm);
+void mm_drop_all_locks(struct mm_struct *mm);
+
+static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma)
+{
+ /*
+ * We want to check manually if we can change individual PTEs writable
+ * if we can't do that automatically for all PTEs in a mapping. For
+ * private mappings, that's always the case when we have write
+ * permissions as we properly have to handle COW.
+ */
+ if (vma->vm_flags & VM_SHARED)
+ return vma_wants_writenotify(vma, vma->vm_page_prot);
+ return !!(vma->vm_flags & VM_WRITE);
+}
+
+#ifdef CONFIG_MMU
+static inline pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags)
+{
+ return pgprot_modify(oldprot, vm_get_page_prot(vm_flags));
+}
+#endif
+
+static inline struct vm_area_struct *vma_prev_limit(struct vma_iterator *vmi,
+ unsigned long min)
+{
+ return mas_prev(&vmi->mas, min);
+}
+
+/*
+ * These three helpers classify VMAs for virtual memory accounting.
+ */
+
+/*
+ * Executable code area - executable, not writable, not stack
+ */
+static inline bool is_exec_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_EXEC | VM_WRITE | VM_STACK)) == VM_EXEC;
+}
+
+/*
+ * Stack area (including shadow stacks)
+ *
+ * VM_GROWSUP / VM_GROWSDOWN VMAs are always private anonymous:
+ * do_mmap() forbids all other combinations.
+ */
+static inline bool is_stack_mapping(vm_flags_t flags)
+{
+ return ((flags & VM_STACK) == VM_STACK) || (flags & VM_SHADOW_STACK);
+}
+
+/*
+ * Data area - private, writable, not stack
+ */
+static inline bool is_data_mapping(vm_flags_t flags)
+{
+ return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE;
+}
+
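/*
 * Editor's illustration, not part of this patch: how the three classifiers
 * feed the per-mm counters (compare vm_stat_account() in mm/mmap.c);
 * example_stat_account() is an assumed name.
 */
static void example_stat_account(struct mm_struct *mm, vm_flags_t flags,
				 long npages)
{
	WRITE_ONCE(mm->total_vm, mm->total_vm + npages);

	if (is_exec_mapping(flags))
		mm->exec_vm += npages;
	else if (is_stack_mapping(flags))
		mm->stack_vm += npages;
	else if (is_data_mapping(flags))
		mm->data_vm += npages;
}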
+
+static inline void vma_iter_config(struct vma_iterator *vmi,
+ unsigned long index, unsigned long last)
+{
+ __mas_set_range(&vmi->mas, index, last - 1);
+}
+
+static inline void vma_iter_reset(struct vma_iterator *vmi)
+{
+ mas_reset(&vmi->mas);
+}
+
+static inline
+struct vm_area_struct *vma_iter_prev_range_limit(struct vma_iterator *vmi, unsigned long min)
+{
+ return mas_prev_range(&vmi->mas, min);
+}
+
+static inline
+struct vm_area_struct *vma_iter_next_range_limit(struct vma_iterator *vmi, unsigned long max)
+{
+ return mas_next_range(&vmi->mas, max);
+}
+
+static inline int vma_iter_area_lowest(struct vma_iterator *vmi, unsigned long min,
+ unsigned long max, unsigned long size)
+{
+ return mas_empty_area(&vmi->mas, min, max - 1, size);
+}
+
+static inline int vma_iter_area_highest(struct vma_iterator *vmi, unsigned long min,
+ unsigned long max, unsigned long size)
+{
+ return mas_empty_area_rev(&vmi->mas, min, max - 1, size);
+}
+
+/*
+ * VMA Iterator functions shared between nommu and mmap
+ */
+static inline int vma_iter_prealloc(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+ return mas_preallocate(&vmi->mas, vma, GFP_KERNEL);
+}
+
+static inline void vma_iter_clear(struct vma_iterator *vmi)
+{
+ mas_store_prealloc(&vmi->mas, NULL);
+}
+
+static inline struct vm_area_struct *vma_iter_load(struct vma_iterator *vmi)
+{
+ return mas_walk(&vmi->mas);
+}
+
+/* Store a VMA with preallocated memory */
+static inline void vma_iter_store(struct vma_iterator *vmi,
+ struct vm_area_struct *vma)
+{
+
+#if defined(CONFIG_DEBUG_VM_MAPLE_TREE)
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
+ vmi->mas.index > vma->vm_start)) {
+ pr_warn("%lx > %lx\n store vma %lx-%lx\n into slot %lx-%lx\n",
+ vmi->mas.index, vma->vm_start, vma->vm_start,
+ vma->vm_end, vmi->mas.index, vmi->mas.last);
+ }
+ if (MAS_WARN_ON(&vmi->mas, vmi->mas.status != ma_start &&
+ vmi->mas.last < vma->vm_start)) {
+ pr_warn("%lx < %lx\nstore vma %lx-%lx\ninto slot %lx-%lx\n",
+ vmi->mas.last, vma->vm_start, vma->vm_start, vma->vm_end,
+ vmi->mas.index, vmi->mas.last);
+ }
+#endif
+
+ if (vmi->mas.status != ma_start &&
+ ((vmi->mas.index > vma->vm_start) || (vmi->mas.last < vma->vm_start)))
+ vma_iter_invalidate(vmi);
+
+ __mas_set_range(&vmi->mas, vma->vm_start, vma->vm_end - 1);
+ mas_store_prealloc(&vmi->mas, vma);
+}
+
+static inline unsigned long vma_iter_addr(struct vma_iterator *vmi)
+{
+ return vmi->mas.index;
+}
+
+static inline unsigned long vma_iter_end(struct vma_iterator *vmi)
+{
+ return vmi->mas.last + 1;
+}
+
+static inline int vma_iter_bulk_alloc(struct vma_iterator *vmi,
+ unsigned long count)
+{
+ return mas_expected_entries(&vmi->mas, count);
+}
+
+static inline
+struct vm_area_struct *vma_iter_prev_range(struct vma_iterator *vmi)
+{
+ return mas_prev_range(&vmi->mas, 0);
+}
+
+/*
+ * Retrieve the next VMA and rewind the iterator to the end of the previous
+ * VMA, or, if there is no previous VMA, to index 0.
+ */
+static inline
+struct vm_area_struct *vma_iter_next_rewind(struct vma_iterator *vmi,
+ struct vm_area_struct **pprev)
+{
+ struct vm_area_struct *next = vma_next(vmi);
+ struct vm_area_struct *prev = vma_prev(vmi);
+
+ /*
+ * Consider the case where no previous VMA exists. We advance to the
+ * next VMA, skipping any gap, then rewind to the start of the range.
+ *
+ * If we were to unconditionally advance to the next range we'd wind up
+ * at the next VMA again, so we check to ensure there is a previous VMA
+ * to skip over.
+ */
+ if (prev)
+ vma_iter_next_range(vmi);
+
+ if (pprev)
+ *pprev = prev;
+
+ return next;
+}
+
+#ifdef CONFIG_64BIT
+
+static inline bool vma_is_sealed(struct vm_area_struct *vma)
+{
+ return (vma->vm_flags & VM_SEALED);
+}
+
+/*
+ * Check if a VMA is sealed for modification.
+ * Return true if modification is allowed.
+ */
+static inline bool can_modify_vma(struct vm_area_struct *vma)
+{
+ if (unlikely(vma_is_sealed(vma)))
+ return false;
+
+ return true;
+}
+
+bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior);
+
+#else
+
+static inline bool can_modify_vma(struct vm_area_struct *vma)
+{
+ return true;
+}
+
+static inline bool can_modify_vma_madv(struct vm_area_struct *vma, int behavior)
+{
+ return true;
+}
+
+#endif
+
+#endif /* __MM_VMA_H */
diff --git a/mm/vma_internal.h b/mm/vma_internal.h
new file mode 100644
index 000000000000..b930ab12a587
--- /dev/null
+++ b/mm/vma_internal.h
@@ -0,0 +1,49 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * vma_internal.h
+ *
+ * Headers required by vma.c, which can be substituted accordingly when testing
+ * VMA functionality.
+ */
+
+#ifndef __MM_VMA_INTERNAL_H
+#define __MM_VMA_INTERNAL_H
+
+#include <linux/backing-dev.h>
+#include <linux/bitops.h>
+#include <linux/bug.h>
+#include <linux/cacheflush.h>
+#include <linux/err.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/huge_mm.h>
+#include <linux/hugetlb_inline.h>
+#include <linux/kernel.h>
+#include <linux/khugepaged.h>
+#include <linux/list.h>
+#include <linux/maple_tree.h>
+#include <linux/mempolicy.h>
+#include <linux/mm.h>
+#include <linux/mm_inline.h>
+#include <linux/mm_types.h>
+#include <linux/mman.h>
+#include <linux/mmap_lock.h>
+#include <linux/mmdebug.h>
+#include <linux/mmu_context.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/pfn.h>
+#include <linux/rcupdate.h>
+#include <linux/rmap.h>
+#include <linux/rwsem.h>
+#include <linux/sched/signal.h>
+#include <linux/swap.h>
+#include <linux/uprobes.h>
+#include <linux/userfaultfd_k.h>
+
+#include <asm/current.h>
+#include <asm/tlb.h>
+
+#include "internal.h"
+
+#endif /* __MM_VMA_INTERNAL_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index a0df1e2e155a..634162271c00 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -105,7 +105,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
if (!pte)
return -ENOMEM;
do {
- if (!pte_none(ptep_get(pte))) {
+ if (unlikely(!pte_none(ptep_get(pte)))) {
if (pfn_valid(pfn)) {
page = pfn_to_page(pfn);
dump_page(page, "remapping already mapped page");
@@ -1940,7 +1940,7 @@ static inline void setup_vmalloc_vm(struct vm_struct *vm,
{
vm->flags = flags;
vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
+ vm->size = va_size(va);
vm->caller = caller;
va->vm = vm;
}
@@ -2018,7 +2018,7 @@ retry:
if (vm) {
vm->addr = (void *)va->va_start;
- vm->size = va->va_end - va->va_start;
+ vm->size = va_size(va);
va->vm = vm;
}
@@ -2131,23 +2131,18 @@ reclaim_list_global(struct list_head *head)
static void
decay_va_pool_node(struct vmap_node *vn, bool full_decay)
{
+ LIST_HEAD(decay_list);
+ struct rb_root decay_root = RB_ROOT;
struct vmap_area *va, *nva;
- struct list_head decay_list;
- struct rb_root decay_root;
unsigned long n_decay;
int i;
- decay_root = RB_ROOT;
- INIT_LIST_HEAD(&decay_list);
-
for (i = 0; i < MAX_VA_SIZE_PAGES; i++) {
- struct list_head tmp_list;
+ LIST_HEAD(tmp_list);
if (list_empty(&vn->pool[i].head))
continue;
- INIT_LIST_HEAD(&tmp_list);
-
/* Detach the pool, so no-one can access it. */
spin_lock(&vn->pool_lock);
list_replace_init(&vn->pool[i].head, &tmp_list);
@@ -2198,7 +2193,7 @@ static void purge_vmap_node(struct work_struct *work)
vn->nr_purged = 0;
list_for_each_entry_safe(va, n_va, &vn->purge_list, list) {
- unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
+ unsigned long nr = va_size(va) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
unsigned int vn_id = decode_vn_id(va->flags);
@@ -2344,8 +2339,8 @@ static void free_vmap_area_noflush(struct vmap_area *va)
if (WARN_ON_ONCE(!list_empty(&va->list)))
return;
- nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
- PAGE_SHIFT, &vmap_lazy_nr);
+ nr_lazy = atomic_long_add_return(va_size(va) >> PAGE_SHIFT,
+ &vmap_lazy_nr);
/*
* If it was request by a certain node we would like to
@@ -2941,8 +2936,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
if (WARN_ON_ONCE(!va))
return;
- debug_check_no_locks_freed((void *)va->va_start,
- (va->va_end - va->va_start));
+ debug_check_no_locks_freed((void *)va->va_start, va_size(va));
free_unmap_vmap_area(va);
}
EXPORT_SYMBOL(vm_unmap_ram);
@@ -3518,8 +3512,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
unsigned int order, unsigned int nr_pages, struct page **pages)
{
unsigned int nr_allocated = 0;
- gfp_t alloc_gfp = gfp;
- bool nofail = gfp & __GFP_NOFAIL;
struct page *page;
int i;
@@ -3530,9 +3522,6 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* more permissive.
*/
if (!order) {
- /* bulk allocator doesn't support nofail req. officially */
- gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL;
-
while (nr_allocated < nr_pages) {
unsigned int nr, nr_pages_request;
@@ -3550,12 +3539,11 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
* but mempolicy wants to alloc memory by interleaving.
*/
if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE)
- nr = alloc_pages_bulk_array_mempolicy_noprof(bulk_gfp,
+ nr = alloc_pages_bulk_array_mempolicy_noprof(gfp,
nr_pages_request,
pages + nr_allocated);
-
else
- nr = alloc_pages_bulk_array_node_noprof(bulk_gfp, nid,
+ nr = alloc_pages_bulk_array_node_noprof(gfp, nid,
nr_pages_request,
pages + nr_allocated);
@@ -3569,30 +3557,24 @@ vm_area_alloc_pages(gfp_t gfp, int nid,
if (nr != nr_pages_request)
break;
}
- } else if (gfp & __GFP_NOFAIL) {
- /*
- * Higher order nofail allocations are really expensive and
- * potentially dangerous (pre-mature OOM, disruptive reclaim
- * and compaction etc.
- */
- alloc_gfp &= ~__GFP_NOFAIL;
}
/* High-order pages or fallback path if "bulk" fails. */
while (nr_allocated < nr_pages) {
- if (!nofail && fatal_signal_pending(current))
+ if (!(gfp & __GFP_NOFAIL) && fatal_signal_pending(current))
break;
if (nid == NUMA_NO_NODE)
- page = alloc_pages_noprof(alloc_gfp, order);
+ page = alloc_pages_noprof(gfp, order);
else
- page = alloc_pages_node_noprof(nid, alloc_gfp, order);
+ page = alloc_pages_node_noprof(nid, gfp, order);
+
if (unlikely(!page))
break;
/*
- * Higher order allocations must be able to be treated as
- * indepdenent small pages by callers (as they can with
+ * High-order allocations must be able to be treated as
+ * independent small pages by callers (as they can with
* small-page vmallocs). Some drivers do their own refcounting
* on vmalloc_to_page() pages, some use page->mapping,
* page->lru, etc.
@@ -3653,7 +3635,16 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
page_order = vm_area_page_order(area);
- area->nr_pages = vm_area_alloc_pages(gfp_mask | __GFP_NOWARN,
+ /*
+ * High-order nofail allocations are really expensive and
+	 * potentially dangerous (premature OOM, disruptive reclaim
+	 * and compaction, etc.).
+	 *
+	 * Please note, __vmalloc_node_range_noprof() falls back
+	 * to order-0 pages if the high-order attempt is unsuccessful.
+ */
+ area->nr_pages = vm_area_alloc_pages((page_order ?
+ gfp_mask & ~__GFP_NOFAIL : gfp_mask) | __GFP_NOWARN,
node, page_order, nr_small_pages, area->pages);
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
@@ -4033,6 +4024,76 @@ void *vzalloc_node_noprof(unsigned long size, int node)
}
EXPORT_SYMBOL(vzalloc_node_noprof);
+/**
+ * vrealloc - reallocate virtually contiguous memory; contents remain unchanged
+ * @p: object to reallocate memory for
+ * @size: the size to reallocate
+ * @flags: the flags for the page level allocator
+ *
+ * If @p is %NULL, vrealloc() behaves exactly like vmalloc(). If @size is 0 and
+ * @p is not a %NULL pointer, the object pointed to is freed.
+ *
+ * If __GFP_ZERO logic is requested, callers must ensure that, starting with the
+ * initial memory allocation, every subsequent call to this API for the same
+ * memory allocation is flagged with __GFP_ZERO. Otherwise, it is possible that
+ * __GFP_ZERO is not fully honored by this API.
+ *
+ * In any case, the contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.
+ *
+ * This function must not be called concurrently with itself or vfree() for the
+ * same memory allocation.
+ *
+ * Return: pointer to the allocated memory; %NULL if @size is zero or in case of
+ * failure
+ */
+void *vrealloc_noprof(const void *p, size_t size, gfp_t flags)
+{
+ size_t old_size = 0;
+ void *n;
+
+ if (!size) {
+ vfree(p);
+ return NULL;
+ }
+
+ if (p) {
+ struct vm_struct *vm;
+
+ vm = find_vm_area(p);
+ if (unlikely(!vm)) {
+ WARN(1, "Trying to vrealloc() nonexistent vm area (%p)\n", p);
+ return NULL;
+ }
+
+ old_size = get_vm_area_size(vm);
+ }
+
+ /*
+ * TODO: Shrink the vm_area, i.e. unmap and free unused pages. What
+ * would be a good heuristic for when to shrink the vm_area?
+ */
+ if (size <= old_size) {
+ /* Zero out spare memory. */
+ if (want_init_on_alloc(flags))
+ memset((void *)p + size, 0, old_size - size);
+
+ return (void *)p;
+ }
+
+ /* TODO: Grow the vm_area, i.e. allocate and map additional pages. */
+ n = __vmalloc_noprof(size, flags);
+ if (!n)
+ return NULL;
+
+ if (p) {
+ memcpy(n, p, old_size);
+ vfree(p);
+ }
+
+ return n;
+}
+
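/*
 * Editor's illustration, not part of this patch: a usage sketch assuming the
 * vrealloc() wrapper that accompanies vrealloc_noprof(); example_grow() and
 * the buffer/size pointers are assumptions.
 */
static int example_grow(void **bufp, size_t *sizep, size_t new_size)
{
	void *n = vrealloc(*bufp, new_size, GFP_KERNEL);

	if (!n)
		return -ENOMEM;	/* the old buffer is left untouched */

	*bufp = n;
	*sizep = new_size;
	return 0;
}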
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
@@ -4873,7 +4934,7 @@ static void show_purge_info(struct seq_file *m)
list_for_each_entry(va, &vn->lazy.head, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ va_size(va));
}
spin_unlock(&vn->lazy.lock);
}
@@ -4895,7 +4956,7 @@ static int vmalloc_info_show(struct seq_file *m, void *p)
if (va->flags & VMAP_RAM)
seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
(void *)va->va_start, (void *)va->va_end,
- va->va_end - va->va_start);
+ va_size(va));
continue;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a8d61a8b6894..749cdc110c74 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -628,7 +628,7 @@ typedef enum {
* Calls ->writepage().
*/
static pageout_t pageout(struct folio *folio, struct address_space *mapping,
- struct swap_iocb **plug)
+ struct swap_iocb **plug, struct list_head *folio_list)
{
/*
* If the folio is dirty, only perform writeback if that write
@@ -676,6 +676,14 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
.swap_plug = plug,
};
+ /*
+ * The large shmem folio can be split if CONFIG_THP_SWAP is
+ * not enabled or contiguous swap entries are failed to
+	 * not enabled or contiguous swap entries cannot be
+	 * allocated.
+ if (shmem_mapping(mapping) && folio_test_large(folio))
+ wbc.list = folio_list;
+
folio_set_reclaim(folio);
res = mapping->a_ops->writepage(&folio->page, &wbc);
if (res < 0)
@@ -863,7 +871,12 @@ static enum folio_references folio_check_references(struct folio *folio,
if (vm_flags & VM_LOCKED)
return FOLIOREF_ACTIVATE;
- /* rmap lock contention: rotate */
+ /*
+ * There are two cases to consider.
+ * 1) Rmap lock contention: rotate.
+ * 2) Skip the non-shared swapbacked folio mapped solely by
+ * the exiting or OOM-reaped process.
+ */
if (referenced_ptes == -1)
return FOLIOREF_KEEP;
@@ -1003,9 +1016,6 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
- mod_node_page_state(pgdat, PGDEMOTE_KSWAPD + reclaimer_offset(),
- nr_succeeded);
-
return nr_succeeded;
}
@@ -1222,13 +1232,14 @@ retry:
goto keep_locked;
if (folio_test_large(folio)) {
/* cannot split folio, skip it */
- if (!can_split_folio(folio, NULL))
+ if (!can_split_folio(folio, 1, NULL))
goto activate_locked;
/*
* Split partially mapped folios right away.
* We can free the unmapped pages without IO.
*/
- if (data_race(!list_empty(&folio->_deferred_list)) &&
+ if (data_race(!list_empty(&folio->_deferred_list) &&
+ folio_test_partially_mapped(folio)) &&
split_folio_to_list(folio, folio_list))
goto activate_locked;
}
@@ -1252,11 +1263,6 @@ retry:
goto activate_locked_split;
}
}
- } else if (folio_test_swapbacked(folio) &&
- folio_test_large(folio)) {
- /* Split shmem folio */
- if (split_folio_to_list(folio, folio_list))
- goto keep_locked;
}
/*
@@ -1357,12 +1363,25 @@ retry:
* starts and then write it out here.
*/
try_to_unmap_flush_dirty();
- switch (pageout(folio, mapping, &plug)) {
+ switch (pageout(folio, mapping, &plug, folio_list)) {
case PAGE_KEEP:
goto keep_locked;
case PAGE_ACTIVATE:
+ /*
+	 * If a shmem folio is split during writeback to swap,
+ * the tail pages will make their own pass through
+ * this function and be accounted then.
+ */
+ if (nr_pages > 1 && !folio_test_large(folio)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
goto activate_locked;
case PAGE_SUCCESS:
+ if (nr_pages > 1 && !folio_test_large(folio)) {
+ sc->nr_scanned -= (nr_pages - 1);
+ nr_pages = 1;
+ }
stat->nr_pageout += nr_pages;
if (folio_test_writeback(folio))
@@ -1495,7 +1514,8 @@ keep:
/* 'folio_list' is always empty here */
/* Migrate folios selected for demotion */
- nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
+ stat->nr_demoted = demote_folio_list(&demote_folios, pgdat);
+ nr_reclaimed += stat->nr_demoted;
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
/* Folios which weren't demoted go back on @folio_list */
@@ -1941,6 +1961,8 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
spin_lock_irq(&lruvec->lru_lock);
move_folios_to_lru(lruvec, &folio_list);
+ __mod_lruvec_state(lruvec, PGDEMOTE_KSWAPD + reclaimer_offset(),
+ stat.nr_demoted);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
@@ -2239,10 +2261,11 @@ static void prepare_scan_control(pg_data_t *pgdat, struct scan_control *sc)
target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat);
/*
- * Flush the memory cgroup stats, so that we read accurate per-memcg
- * lruvec stats for heuristics.
+ * Flush the memory cgroup stats in rate-limited way as we don't need
+ * most accurate stats here. We may switch to regular stats flushing
+ * in the future once it is cheap enough.
*/
- mem_cgroup_flush_stats(sc->target_mem_cgroup);
+ mem_cgroup_flush_stats_ratelimited(sc->target_mem_cgroup);
/*
* Determine the scan balance between anon and file LRUs.
@@ -3456,7 +3479,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long addr, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (should_clear_pmd_young())
+ if (!walk->force_scan && should_clear_pmd_young())
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
}
@@ -3543,7 +3566,7 @@ restart:
walk->mm_stats[MM_NONLEAF_TOTAL]++;
- if (should_clear_pmd_young()) {
+ if (!walk->force_scan && should_clear_pmd_young()) {
if (!pmd_young(val))
continue;
@@ -6644,7 +6667,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
continue;
if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING)
- mark = wmark_pages(zone, WMARK_PROMO);
+ mark = promo_wmark_pages(zone);
else
mark = high_wmark_pages(zone);
if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
@@ -7519,7 +7542,9 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
ret = __node_reclaim(pgdat, gfp_mask, order);
clear_bit(PGDAT_RECLAIM_LOCKED, &pgdat->flags);
- if (!ret)
+ if (ret)
+ count_vm_event(PGSCAN_ZONE_RECLAIM_SUCCESS);
+ else
count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
return ret;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index e875f2a4915f..b5a4cea423e1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1314,6 +1314,7 @@ const char * const vmstat_text[] = {
"pgsteal_file",
#ifdef CONFIG_NUMA
+ "zone_reclaim_success",
"zone_reclaim_failed",
#endif
"pginodesteal",
@@ -1384,6 +1385,7 @@ const char * const vmstat_text[] = {
"thp_split_page",
"thp_split_page_failed",
"thp_deferred_split_page",
+ "thp_underused_split_page",
"thp_split_pmd",
"thp_scan_exceed_none_pte",
"thp_scan_exceed_swap_pte",
@@ -1435,6 +1437,30 @@ const char * const vmstat_text[] = {
"vma_lock_retry",
"vma_lock_miss",
#endif
+#ifdef CONFIG_DEBUG_STACK_USAGE
+ "kstack_1k",
+#if THREAD_SIZE > 1024
+ "kstack_2k",
+#endif
+#if THREAD_SIZE > 2048
+ "kstack_4k",
+#endif
+#if THREAD_SIZE > 4096
+ "kstack_8k",
+#endif
+#if THREAD_SIZE > 8192
+ "kstack_16k",
+#endif
+#if THREAD_SIZE > 16384
+ "kstack_32k",
+#endif
+#if THREAD_SIZE > 32768
+ "kstack_64k",
+#endif
+#if THREAD_SIZE > 65536
+ "kstack_rest",
+#endif
+#endif
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
};
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
@@ -1718,6 +1744,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n min %lu"
"\n low %lu"
"\n high %lu"
+ "\n promo %lu"
"\n spanned %lu"
"\n present %lu"
"\n managed %lu"
@@ -1727,6 +1754,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
+ promo_wmark_pages(zone),
zone->spanned_pages,
zone->present_pages,
zone_managed_pages(zone),
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 2ebfed32871b..379d24b4fef9 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -144,7 +144,7 @@ struct z3fold_pool {
const char *name;
spinlock_t lock;
spinlock_t stale_lock;
- struct list_head *unbuddied;
+ struct list_head __percpu *unbuddied;
struct list_head stale;
atomic64_t pages_nr;
struct kmem_cache *c_handle;
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b572aa84823c..16a07def09c9 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -20,7 +20,7 @@
* page->index: links together all component pages of a zspage
* For the huge page, this is always 0, so we use this field
* to store handle.
- * page->page_type: PG_zsmalloc, lower 16 bit locate the first object
+ * page->page_type: PGTY_zsmalloc, lower 24 bits locate the first object
* offset in a subpage of a zspage
*
* Usage of struct page flags:
@@ -463,13 +463,7 @@ static inline struct page *get_first_page(struct zspage *zspage)
return first_page;
}
-#define FIRST_OBJ_PAGE_TYPE_MASK 0xffff
-
-static inline void reset_first_obj_offset(struct page *page)
-{
- VM_WARN_ON_ONCE(!PageZsmalloc(page));
- page->page_type |= FIRST_OBJ_PAGE_TYPE_MASK;
-}
+#define FIRST_OBJ_PAGE_TYPE_MASK 0xffffff
static inline unsigned int get_first_obj_offset(struct page *page)
{
@@ -479,8 +473,8 @@ static inline unsigned int get_first_obj_offset(struct page *page)
static inline void set_first_obj_offset(struct page *page, unsigned int offset)
{
- /* With 16 bit available, we can support offsets into 64 KiB pages. */
- BUILD_BUG_ON(PAGE_SIZE > SZ_64K);
+ /* With 24 bits available, we can support offsets into 16 MiB pages. */
+ BUILD_BUG_ON(PAGE_SIZE > SZ_16M);
VM_WARN_ON_ONCE(!PageZsmalloc(page));
VM_WARN_ON_ONCE(offset & ~FIRST_OBJ_PAGE_TYPE_MASK);
page->page_type &= ~FIRST_OBJ_PAGE_TYPE_MASK;
@@ -819,7 +813,6 @@ static void reset_page(struct page *page)
ClearPagePrivate(page);
set_page_private(page, 0);
page->index = 0;
- reset_first_obj_offset(page);
__ClearPageZsmalloc(page);
}
diff --git a/mm/zswap.c b/mm/zswap.c
index adeaf9c97fde..449914ea9919 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -44,8 +44,6 @@
**********************************/
/* The number of compressed pages currently stored in zswap */
atomic_t zswap_stored_pages = ATOMIC_INIT(0);
-/* The number of same-value filled pages currently stored in zswap */
-static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0);
/*
* The statistics below are not protected from concurrent access for
@@ -185,8 +183,11 @@ static struct shrinker *zswap_shrinker;
*
* swpentry - associated swap entry, the offset indexes into the red-black tree
* length - the length in bytes of the compressed page data. Needed during
- * decompression. For a same value filled page length is 0, and both
- * pool and lru are invalid and must be ignored.
+ * decompression.
+ * referenced - true if the entry recently entered the zswap pool. Unset by the
+ * writeback logic. The entry is only reclaimed by the writeback
+ * logic if referenced is unset. See comments in the shrinker
+ * section for context.
* pool - the zswap_pool the entry's data is in
* handle - zpool allocation handle that stores the compressed page data
* value - value of the same-value filled pages which have same content
@@ -196,11 +197,9 @@ static struct shrinker *zswap_shrinker;
struct zswap_entry {
swp_entry_t swpentry;
unsigned int length;
+ bool referenced;
struct zswap_pool *pool;
- union {
- unsigned long handle;
- unsigned long value;
- };
+ unsigned long handle;
struct obj_cgroup *objcg;
struct list_head lru;
};
@@ -700,11 +699,8 @@ static inline int entry_to_nid(struct zswap_entry *entry)
static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
{
- atomic_long_t *nr_zswap_protected;
- unsigned long lru_size, old, new;
int nid = entry_to_nid(entry);
struct mem_cgroup *memcg;
- struct lruvec *lruvec;
/*
* Note that it is safe to use rcu_read_lock() here, even in the face of
@@ -722,19 +718,6 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry)
memcg = mem_cgroup_from_entry(entry);
/* will always succeed */
list_lru_add(list_lru, &entry->lru, nid, memcg);
-
- /* Update the protection area */
- lru_size = list_lru_count_one(list_lru, nid, memcg);
- lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
- nr_zswap_protected = &lruvec->zswap_lruvec_state.nr_zswap_protected;
- old = atomic_long_inc_return(nr_zswap_protected);
- /*
- * Decay to avoid overflow and adapt to changing workloads.
- * This is based on LRU reclaim cost decaying heuristics.
- */
- do {
- new = old > lru_size / 4 ? old / 2 : old;
- } while (!atomic_long_try_cmpxchg(nr_zswap_protected, &old, new));
rcu_read_unlock();
}
@@ -752,7 +735,7 @@ static void zswap_lru_del(struct list_lru *list_lru, struct zswap_entry *entry)
void zswap_lruvec_state_init(struct lruvec *lruvec)
{
- atomic_long_set(&lruvec->zswap_lruvec_state.nr_zswap_protected, 0);
+ atomic_long_set(&lruvec->zswap_lruvec_state.nr_disk_swapins, 0);
}
void zswap_folio_swapin(struct folio *folio)
@@ -761,16 +744,29 @@ void zswap_folio_swapin(struct folio *folio)
if (folio) {
lruvec = folio_lruvec(folio);
- atomic_long_inc(&lruvec->zswap_lruvec_state.nr_zswap_protected);
+ atomic_long_inc(&lruvec->zswap_lruvec_state.nr_disk_swapins);
}
}
+/*
+ * This function should be called when a memcg is being offlined.
+ *
+ * Since the global shrinker shrink_worker() may hold a reference to the
+ * memcg, we must check and release the reference held in zswap_next_shrink.
+ *
+ * shrink_worker() must handle the case where this function releases the
+ * reference to the memcg being shrunk.
+ */
void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
/* lock out zswap shrinker walking memcg tree */
spin_lock(&zswap_shrink_lock);
- if (zswap_next_shrink == memcg)
- zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ if (zswap_next_shrink == memcg) {
+ do {
+ zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ } while (zswap_next_shrink && !mem_cgroup_online(zswap_next_shrink));
+ }
spin_unlock(&zswap_shrink_lock);
}
@@ -799,13 +795,9 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
*/
static void zswap_entry_free(struct zswap_entry *entry)
{
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zswap_lru_del(&zswap_list_lru, entry);
- zpool_free(entry->pool->zpool, entry->handle);
- zswap_pool_put(entry->pool);
- }
+ zswap_lru_del(&zswap_list_lru, entry);
+ zpool_free(entry->pool->zpool, entry->handle);
+ zswap_pool_put(entry->pool);
if (entry->objcg) {
obj_cgroup_uncharge_zswap(entry->objcg, entry->length);
obj_cgroup_put(entry->objcg);
@@ -1082,6 +1074,28 @@ static int zswap_writeback_entry(struct zswap_entry *entry,
/*********************************
* shrinker functions
**********************************/
+/*
+ * The dynamic shrinker is modulated by the following factors:
+ *
+ * 1. Each zswap entry has a referenced bit, which the shrinker unsets (giving
+ * the entry a second chance) before rotating it in the LRU list. If the
+ * entry is considered again by the shrinker, with its referenced bit unset,
+ * it is written back. As a result, the writeback rate is dynamically
+ * adjusted by pool activity - if the pool is dominated by new entries
+ * (i.e. lots of recent zswapouts), these entries will be protected and
+ * the writeback rate will slow down. On the other hand, if the pool has a
+ * lot of stagnant entries, these entries will be reclaimed immediately,
+ * effectively increasing the writeback rate.
+ *
+ * 2. Swapins counter: If we observe swapins, it is a sign that we are
+ * overshrinking and should slow down. We maintain a swapins counter, which
+ * is consumed and subtracted from the number of eligible objects on the
+ * LRU in zswap_shrinker_count().
+ *
+ * 3. Compression ratio. The better the workload compresses, the smaller the
+ * gain we can expect from writeback. We scale down the number of objects
+ * available for reclaim by this ratio.
+ */
static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l,
spinlock_t *lock, void *arg)
{
@@ -1092,6 +1106,16 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
int writeback_result;
/*
* Second chance algorithm: if the entry has its referenced bit set, give it
* a second chance. Just clear the referenced bit and rotate it in
* zswap's LRU list.
+ */
+ if (entry->referenced) {
+ entry->referenced = false;
+ return LRU_ROTATE;
+ }
+
+ /*
* As soon as we drop the LRU lock, the entry can be freed by
* a concurrent invalidation. This means the following:
*
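
The referenced bit turns zswap's LRU into a second-chance list: a freshly stored entry survives its first encounter with the shrinker and is written back only if it comes around again untouched. A minimal, self-contained sketch of that policy follows; struct entry, scan_one() and the result enum are invented for illustration, and the real LRU bookkeeping (LRU_ROTATE vs. LRU_REMOVED_RETRY) is omitted.

#include <stdbool.h>
#include <stdio.h>

/* Invented, minimal entry: only the referenced bit matters here. */
struct entry {
	bool referenced;
};

enum scan_result { ROTATE, RECLAIM };

/* Mirrors the new branch in shrink_memcg_cb(): a referenced entry has its
 * bit cleared and is rotated to the LRU tail; an entry that comes around
 * again with the bit still clear is written back. */
static enum scan_result scan_one(struct entry *e)
{
	if (e->referenced) {
		e->referenced = false;	/* second chance */
		return ROTATE;
	}
	return RECLAIM;
}

int main(void)
{
	struct entry fresh = { .referenced = true };

	printf("pass 1: %s\n", scan_one(&fresh) == ROTATE ? "rotate" : "reclaim");
	printf("pass 2: %s\n", scan_one(&fresh) == ROTATE ? "rotate" : "reclaim");
	return 0;
}

Since zswap_store() sets entry->referenced on every new store (see below), a pool dominated by recent zswapouts rotates more than it reclaims, which is the adaptive slowdown described in the comment above.
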
@@ -1157,8 +1181,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o
static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
struct shrink_control *sc)
{
- struct lruvec *lruvec = mem_cgroup_lruvec(sc->memcg, NODE_DATA(sc->nid));
- unsigned long shrink_ret, nr_protected, lru_size;
+ unsigned long shrink_ret;
bool encountered_page_in_swapcache = false;
if (!zswap_shrinker_enabled ||
@@ -1167,25 +1190,6 @@ static unsigned long zswap_shrinker_scan(struct shrinker *shrinker,
return SHRINK_STOP;
}
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
- lru_size = list_lru_shrink_count(&zswap_list_lru, sc);
-
- /*
- * Abort if we are shrinking into the protected region.
- *
- * This short-circuiting is necessary because if we have too many multiple
- * concurrent reclaimers getting the freeable zswap object counts at the
- * same time (before any of them made reasonable progress), the total
- * number of reclaimed objects might be more than the number of unprotected
- * objects (i.e the reclaimers will reclaim into the protected area of the
- * zswap LRU).
- */
- if (nr_protected >= lru_size - sc->nr_to_scan) {
- sc->nr_scanned = 0;
- return SHRINK_STOP;
- }
-
shrink_ret = list_lru_shrink_walk(&zswap_list_lru, sc, &shrink_memcg_cb,
&encountered_page_in_swapcache);
@@ -1200,7 +1204,10 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
{
struct mem_cgroup *memcg = sc->memcg;
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(sc->nid));
- unsigned long nr_backing, nr_stored, nr_freeable, nr_protected;
+ atomic_long_t *nr_disk_swapins =
+ &lruvec->zswap_lruvec_state.nr_disk_swapins;
+ unsigned long nr_backing, nr_stored, nr_freeable, nr_disk_swapins_cur,
+ nr_remain;
if (!zswap_shrinker_enabled || !mem_cgroup_zswap_writeback_enabled(memcg))
return 0;
@@ -1233,25 +1240,33 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker,
if (!nr_stored)
return 0;
- nr_protected =
- atomic_long_read(&lruvec->zswap_lruvec_state.nr_zswap_protected);
nr_freeable = list_lru_shrink_count(&zswap_list_lru, sc);
+ if (!nr_freeable)
+ return 0;
+
/*
- * Subtract the lru size by an estimate of the number of pages
- * that should be protected.
+ * Subtract from the lru size the number of pages that were recently swapped
+ * in from disk. The idea is that had we protected zswap's LRU by this
+ * many pages, these disk swapins would not have happened.
*/
- nr_freeable = nr_freeable > nr_protected ? nr_freeable - nr_protected : 0;
+ nr_disk_swapins_cur = atomic_long_read(nr_disk_swapins);
+ do {
+ if (nr_freeable >= nr_disk_swapins_cur)
+ nr_remain = 0;
+ else
+ nr_remain = nr_disk_swapins_cur - nr_freeable;
+ } while (!atomic_long_try_cmpxchg(
+ nr_disk_swapins, &nr_disk_swapins_cur, nr_remain));
+
+ nr_freeable -= nr_disk_swapins_cur - nr_remain;
+ if (!nr_freeable)
+ return 0;
/*
* Scale the number of freeable pages by the memory saving factor.
* This ensures that the better zswap compresses memory, the fewer
* pages we will evict to swap (as it will otherwise incur IO for
* relatively small memory saving).
- *
- * The memory saving factor calculated here takes same-filled pages into
- * account, but those are not freeable since they almost occupy no
- * space. Hence, we may scale nr_freeable down a little bit more than we
- * should if we have a lot of same-filled pages.
*/
return mult_frac(nr_freeable, nr_backing, nr_stored);
}
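
zswap_shrinker_count() now derives its freeable estimate in two steps: atomically consume the per-lruvec disk-swapin counter and subtract the consumed amount, then scale the remainder by the compression ratio. A hedged userspace sketch of the same arithmetic is below; count_freeable() is invented, mult_frac() is reimplemented naively, and C11 atomics stand in for atomic_long_t.

#include <stdatomic.h>
#include <stdio.h>

/* Naive reimplementation of the kernel's mult_frac(): x * num / den without
 * overflowing the intermediate product. */
static unsigned long mult_frac(unsigned long x, unsigned long num,
			       unsigned long den)
{
	return x / den * num + x % den * num / den;
}

/* Invented helper mirroring the arithmetic in zswap_shrinker_count():
 * consume recent disk swapins from the counter, subtract what was consumed
 * from the freeable count, then scale by the compression ratio. */
static unsigned long count_freeable(unsigned long nr_freeable,
				    atomic_ulong *nr_disk_swapins,
				    unsigned long nr_backing,
				    unsigned long nr_stored)
{
	unsigned long cur, remain;

	cur = atomic_load(nr_disk_swapins);
	do {
		/* consume up to nr_freeable swapins; leave the rest */
		remain = nr_freeable >= cur ? 0 : cur - nr_freeable;
	} while (!atomic_compare_exchange_weak(nr_disk_swapins, &cur, remain));

	nr_freeable -= cur - remain;
	if (!nr_freeable || !nr_stored)
		return 0;

	/* the better the compression, the less writeback is worth */
	return mult_frac(nr_freeable, nr_backing, nr_stored);
}

int main(void)
{
	atomic_ulong swapins = 10;

	/* 100 freeable entries, 10 recent swapins, 2:1 compression ratio */
	printf("%lu\n", count_freeable(100, &swapins, 50, 100));	/* prints 45 */
	return 0;
}

Because the cmpxchg loop subtracts exactly the delta it removed from the counter, concurrent shrinker invocations cannot consume the same swapins twice.
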
@@ -1274,10 +1289,10 @@ static struct shrinker *zswap_alloc_shrinker(void)
static int shrink_memcg(struct mem_cgroup *memcg)
{
- int nid, shrunk = 0;
+ int nid, shrunk = 0, scanned = 0;
if (!mem_cgroup_zswap_writeback_enabled(memcg))
- return -EINVAL;
+ return -ENOENT;
/*
* Skip zombies because their LRUs are reparented and we would be
@@ -1291,63 +1306,94 @@ static int shrink_memcg(struct mem_cgroup *memcg)
shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
&shrink_memcg_cb, NULL, &nr_to_walk);
+ scanned += 1 - nr_to_walk;
}
+
+ if (!scanned)
+ return -ENOENT;
+
return shrunk ? 0 : -EAGAIN;
}
static void shrink_worker(struct work_struct *w)
{
struct mem_cgroup *memcg;
- int ret, failures = 0;
+ int ret, failures = 0, attempts = 0;
unsigned long thr;
/* Reclaim down to the accept threshold */
thr = zswap_accept_thr_pages();
- /* global reclaim will select cgroup in a round-robin fashion. */
+ /*
+ * Global reclaim selects cgroups in a round-robin fashion from all
+ * online memcgs, but memcgs that have no pages in zswap and
+ * writeback-disabled memcgs (memory.zswap.writeback=0) are not
+ * candidates for shrinking.
+ *
+ * Shrinking will be aborted if either of the following happens
+ * MAX_RECLAIM_RETRIES times:
+ * - No writeback-candidate memcg is found in a memcg tree walk.
+ * - Shrinking a writeback-candidate memcg failed.
+ *
+ * We save the iteration cursor memcg in zswap_next_shrink,
+ * which can be modified by the offline memcg cleaner
+ * zswap_memcg_offline_cleanup().
+ *
+ * Since the offline cleaner is called only once, we cannot leave an
+ * offline memcg reference in zswap_next_shrink.
+ * We can rely on the cleaner only if we get an online memcg under the lock.
+ *
+ * If we get an offline memcg, we cannot determine whether the cleaner has
+ * already been called or will be called later. We must put back the
+ * reference before returning from this function. Otherwise, the
+ * offline memcg left in zswap_next_shrink will hold the reference
+ * until the next run of shrink_worker().
+ */
do {
- spin_lock(&zswap_shrink_lock);
- zswap_next_shrink = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
- memcg = zswap_next_shrink;
-
/*
- * We need to retry if we have gone through a full round trip, or if we
- * got an offline memcg (or else we risk undoing the effect of the
- * zswap memcg offlining cleanup callback). This is not catastrophic
- * per se, but it will keep the now offlined memcg hostage for a while.
+ * Start shrinking from the next memcg after zswap_next_shrink.
+ * If the offline cleaner has already advanced the cursor,
+ * advancing it again here skips one memcg, but this
+ * should be negligibly rare.
*
- * Note that if we got an online memcg, we will keep the extra
- * reference in case the original reference obtained by mem_cgroup_iter
- * is dropped by the zswap memcg offlining callback, ensuring that the
- * memcg is not killed when we are reclaiming.
+ * If we get an online memcg, keep the extra reference in case
+ * the original one obtained by mem_cgroup_iter() is dropped by
+ * zswap_memcg_offline_cleanup() while we are shrinking the
+ * memcg.
*/
- if (!memcg) {
- spin_unlock(&zswap_shrink_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
- break;
-
- goto resched;
- }
-
- if (!mem_cgroup_tryget_online(memcg)) {
- /* drop the reference from mem_cgroup_iter() */
- mem_cgroup_iter_break(NULL, memcg);
- zswap_next_shrink = NULL;
- spin_unlock(&zswap_shrink_lock);
+ spin_lock(&zswap_shrink_lock);
+ do {
+ memcg = mem_cgroup_iter(NULL, zswap_next_shrink, NULL);
+ zswap_next_shrink = memcg;
+ } while (memcg && !mem_cgroup_tryget_online(memcg));
+ spin_unlock(&zswap_shrink_lock);
- if (++failures == MAX_RECLAIM_RETRIES)
+ if (!memcg) {
+ /*
+ * Continue shrinking without incrementing failures if
+ * we found candidate memcgs in the last tree walk.
+ */
+ if (!attempts && ++failures == MAX_RECLAIM_RETRIES)
break;
+ attempts = 0;
goto resched;
}
- spin_unlock(&zswap_shrink_lock);
ret = shrink_memcg(memcg);
/* drop the extra reference */
mem_cgroup_put(memcg);
- if (ret == -EINVAL)
- break;
+ /*
+ * There are no writeback-candidate pages in the memcg.
+ * This is not an issue as long as we can find another memcg
+ * with pages in zswap. Skip this without incrementing attempts
+ * and failures.
+ */
+ if (ret == -ENOENT)
+ continue;
+ ++attempts;
+
if (ret && ++failures == MAX_RECLAIM_RETRIES)
break;
resched:
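
The reworked shrink_worker() loop charges a failure only when a full tree walk turns up no writeback candidate, or when shrinking a genuine candidate fails; memcgs that return -ENOENT are skipped without penalty. A toy, self-contained model of that retry policy is sketched below; cgs[], pick_next_cg(), shrink_cg(), MAX_RETRIES and the page counts are all invented for the sketch.

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define MAX_RETRIES 16	/* stand-in for MAX_RECLAIM_RETRIES */

/* Toy cgroups: everything below is invented for the sketch. */
struct cg {
	int zswap_pages;
	bool writeback;
};

static struct cg cgs[] = {
	{ .zswap_pages = 0,  .writeback = true  },	/* nothing stored */
	{ .zswap_pages = 40, .writeback = false },	/* writeback disabled */
	{ .zswap_pages = 90, .writeback = true  },	/* real candidate */
};
static size_t cursor;
static int pool_pages = 200, threshold = 120;

/* NULL marks the end of one round-robin tree walk, like mem_cgroup_iter(). */
static struct cg *pick_next_cg(void)
{
	if (cursor == sizeof(cgs) / sizeof(cgs[0])) {
		cursor = 0;
		return NULL;
	}
	return &cgs[cursor++];
}

/* 0 on progress, -ENOENT when the cgroup is not a writeback candidate. */
static int shrink_cg(struct cg *cg)
{
	if (!cg->writeback || !cg->zswap_pages)
		return -ENOENT;
	cg->zswap_pages--;
	pool_pages--;
	return 0;
}

int main(void)
{
	int ret, failures = 0, attempts = 0;

	do {
		struct cg *cg = pick_next_cg();

		if (!cg) {
			/* End of a walk counts as a failure only if that walk
			 * produced no shrinkable candidate. */
			if (!attempts && ++failures == MAX_RETRIES)
				break;
			attempts = 0;
			continue;
		}
		ret = shrink_cg(cg);
		if (ret == -ENOENT)	/* skip quietly, not a failure */
			continue;
		attempts++;
		if (ret && ++failures == MAX_RETRIES)
			break;
	} while (pool_pages > threshold);

	printf("stopped at %d pages after %d failures\n", pool_pages, failures);
	return 0;
}

Under this policy a tree consisting only of empty or writeback-disabled memcgs still terminates after MAX_RETRIES empty walks, while a single busy memcg keeps the worker progressing without ever tripping the failure counter.
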
@@ -1356,42 +1402,6 @@ resched:
}
/*********************************
-* same-filled functions
-**********************************/
-static bool zswap_is_folio_same_filled(struct folio *folio, unsigned long *value)
-{
- unsigned long *data;
- unsigned long val;
- unsigned int pos, last_pos = PAGE_SIZE / sizeof(*data) - 1;
- bool ret = false;
-
- data = kmap_local_folio(folio, 0);
- val = data[0];
-
- if (val != data[last_pos])
- goto out;
-
- for (pos = 1; pos < last_pos; pos++) {
- if (val != data[pos])
- goto out;
- }
-
- *value = val;
- ret = true;
-out:
- kunmap_local(data);
- return ret;
-}
-
-static void zswap_fill_folio(struct folio *folio, unsigned long value)
-{
- unsigned long *data = kmap_local_folio(folio, 0);
-
- memset_l(data, value, PAGE_SIZE / sizeof(unsigned long));
- kunmap_local(data);
-}
-
-/*********************************
* main API
**********************************/
bool zswap_store(struct folio *folio)
@@ -1402,7 +1412,6 @@ bool zswap_store(struct folio *folio)
struct zswap_entry *entry, *old;
struct obj_cgroup *objcg = NULL;
struct mem_cgroup *memcg = NULL;
- unsigned long value;
VM_WARN_ON_ONCE(!folio_test_locked(folio));
VM_WARN_ON_ONCE(!folio_test_swapcache(folio));
@@ -1435,13 +1444,6 @@ bool zswap_store(struct folio *folio)
goto reject;
}
- if (zswap_is_folio_same_filled(folio, &value)) {
- entry->length = 0;
- entry->value = value;
- atomic_inc(&zswap_same_filled_pages);
- goto store_entry;
- }
-
/* if entry is successfully added, it keeps the reference */
entry->pool = zswap_pool_current_get();
if (!entry->pool)
@@ -1459,9 +1461,9 @@ bool zswap_store(struct folio *folio)
if (!zswap_compress(folio, entry))
goto put_pool;
-store_entry:
entry->swpentry = swp;
entry->objcg = objcg;
+ entry->referenced = true;
old = xa_store(tree, offset, entry, GFP_KERNEL);
if (xa_is_err(old)) {
@@ -1507,13 +1509,9 @@ store_entry:
return true;
store_failed:
- if (!entry->length)
- atomic_dec(&zswap_same_filled_pages);
- else {
- zpool_free(entry->pool->zpool, entry->handle);
+ zpool_free(entry->pool->zpool, entry->handle);
put_pool:
- zswap_pool_put(entry->pool);
- }
+ zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);
reject:
@@ -1576,10 +1574,7 @@ bool zswap_load(struct folio *folio)
if (!entry)
return false;
- if (entry->length)
- zswap_decompress(entry, folio);
- else
- zswap_fill_folio(folio, entry->value);
+ zswap_decompress(entry, folio);
count_vm_event(ZSWPIN);
if (entry->objcg)
@@ -1682,8 +1677,6 @@ static int zswap_debugfs_init(void)
zswap_debugfs_root, NULL, &total_size_fops);
debugfs_create_atomic_t("stored_pages", 0444,
zswap_debugfs_root, &zswap_stored_pages);
- debugfs_create_atomic_t("same_filled_pages", 0444,
- zswap_debugfs_root, &zswap_same_filled_pages);
return 0;
}