Diffstat (limited to 'mm')
-rw-r--r--  mm/backing-dev.c          |  19
-rw-r--r--  mm/filemap.c              |  99
-rw-r--r--  mm/folio-compat.c         |   4
-rw-r--r--  mm/gup.c                  |  29
-rw-r--r--  mm/huge_memory.c          |   7
-rw-r--r--  mm/kfence/.kunitconfig    |   6
-rw-r--r--  mm/kfence/core.c          |  10
-rw-r--r--  mm/kfence/kfence_test.c   |  31
-rw-r--r--  mm/memory-failure.c       |  15
-rw-r--r--  mm/memory.c               |   4
-rw-r--r--  mm/migrate.c              |   2
-rw-r--r--  mm/mremap.c               |   2
-rw-r--r--  mm/page-writeback.c       |  10
-rw-r--r--  mm/page_io.c              |   4
-rw-r--r--  mm/page_owner.c           |   9
-rw-r--r--  mm/readahead.c            |  53
-rw-r--r--  mm/secretmem.c            |   8
-rw-r--r--  mm/shmem.c                |   4
-rw-r--r--  mm/slab.c                 |  29
-rw-r--r--  mm/slab.h                 |   5
-rw-r--r--  mm/slab_common.c          |  23
-rw-r--r--  mm/slub.c                 | 174
-rw-r--r--  mm/swapfile.c             |  34
-rw-r--r--  mm/usercopy.c             |  91
-rw-r--r--  mm/util.c                 |  32
-rw-r--r--  mm/vmscan.c               |  12
26 files changed, 364 insertions(+), 352 deletions(-)
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7176af65b103..ff60bd7d74e0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/blkdev.h>
#include <linux/wait.h>
#include <linux/rbtree.h>
#include <linux/kthread.h>
@@ -390,7 +391,6 @@ static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
release_work);
- struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css);
struct backing_dev_info *bdi = wb->bdi;
mutex_lock(&wb->bdi->cgwb_release_mutex);
@@ -401,7 +401,7 @@ static void cgwb_release_workfn(struct work_struct *work)
mutex_unlock(&wb->bdi->cgwb_release_mutex);
/* triggers blkg destruction if no online users left */
- blkcg_unpin_online(blkcg);
+ blkcg_unpin_online(wb->blkcg_css);
fprop_local_destroy_percpu(&wb->memcg_completions);
@@ -446,7 +446,6 @@ static int cgwb_create(struct backing_dev_info *bdi,
{
struct mem_cgroup *memcg;
struct cgroup_subsys_state *blkcg_css;
- struct blkcg *blkcg;
struct list_head *memcg_cgwb_list, *blkcg_cgwb_list;
struct bdi_writeback *wb;
unsigned long flags;
@@ -454,9 +453,8 @@ static int cgwb_create(struct backing_dev_info *bdi,
memcg = mem_cgroup_from_css(memcg_css);
blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys);
- blkcg = css_to_blkcg(blkcg_css);
memcg_cgwb_list = &memcg->cgwb_list;
- blkcg_cgwb_list = &blkcg->cgwb_list;
+ blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css);
/* look up again under lock and discard on blkcg mismatch */
spin_lock_irqsave(&cgwb_lock, flags);
@@ -511,7 +509,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
list_add(&wb->memcg_node, memcg_cgwb_list);
list_add(&wb->blkcg_node, blkcg_cgwb_list);
- blkcg_pin_online(blkcg);
+ blkcg_pin_online(blkcg_css);
css_get(memcg_css);
css_get(blkcg_css);
}
@@ -724,18 +722,19 @@ void wb_memcg_offline(struct mem_cgroup *memcg)
/**
* wb_blkcg_offline - kill all wb's associated with a blkcg being offlined
- * @blkcg: blkcg being offlined
+ * @css: blkcg being offlined
*
* Also prevents creation of any new wb's associated with @blkcg.
*/
-void wb_blkcg_offline(struct blkcg *blkcg)
+void wb_blkcg_offline(struct cgroup_subsys_state *css)
{
struct bdi_writeback *wb, *next;
+ struct list_head *list = blkcg_get_cgwb_list(css);
spin_lock_irq(&cgwb_lock);
- list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+ list_for_each_entry_safe(wb, next, list, blkcg_node)
cgwb_kill(wb);
- blkcg->cgwb_list.next = NULL; /* prevent new wb's */
+ list->next = NULL; /* prevent new wb's */
spin_unlock_irq(&cgwb_lock);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index de34c6c10384..9daeaab36081 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -225,12 +225,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow)
void filemap_free_folio(struct address_space *mapping, struct folio *folio)
{
- void (*freepage)(struct page *);
+ void (*free_folio)(struct folio *);
int refs = 1;
- freepage = mapping->a_ops->freepage;
- if (freepage)
- freepage(&folio->page);
+ free_folio = mapping->a_ops->free_folio;
+ if (free_folio)
+ free_folio(folio);
if (folio_test_large(folio) && !folio_test_hugetlb(folio))
refs = folio_nr_pages(folio);
@@ -807,7 +807,7 @@ void replace_page_cache_page(struct page *old, struct page *new)
struct folio *fold = page_folio(old);
struct folio *fnew = page_folio(new);
struct address_space *mapping = old->mapping;
- void (*freepage)(struct page *) = mapping->a_ops->freepage;
+ void (*free_folio)(struct folio *) = mapping->a_ops->free_folio;
pgoff_t offset = old->index;
XA_STATE(xas, &mapping->i_pages, offset);
@@ -835,9 +835,9 @@ void replace_page_cache_page(struct page *old, struct page *new)
if (PageSwapBacked(new))
__inc_lruvec_page_state(new, NR_SHMEM);
xas_unlock_irq(&xas);
- if (freepage)
- freepage(old);
- put_page(old);
+ if (free_folio)
+ free_folio(fold);
+ folio_put(fold);
}
EXPORT_SYMBOL_GPL(replace_page_cache_page);
@@ -2414,12 +2414,12 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping,
/*
* A previous I/O error may have been due to temporary failures,
- * eg. multipath errors. PG_error will be set again if readpage
+ * eg. multipath errors. PG_error will be set again if read_folio
* fails.
*/
folio_clear_error(folio);
/* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(file, &folio->page);
+ error = mapping->a_ops->read_folio(file, folio);
if (error)
return error;
@@ -2636,7 +2636,7 @@ err:
* @already_read: Number of bytes already read by the caller.
*
* Copies data from the page cache. If the data is not currently present,
- * uses the readahead and readpage address_space operations to fetch it.
+ * uses the readahead and read_folio address_space operations to fetch it.
*
* Return: Total number of bytes copied, including those already read by
* the caller. If an error happens before any bytes are copied, returns
@@ -3452,7 +3452,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma)
{
struct address_space *mapping = file->f_mapping;
- if (!mapping->a_ops->readpage)
+ if (!mapping->a_ops->read_folio)
return -ENOEXEC;
file_accessed(file);
vma->vm_ops = &generic_file_vm_ops;
@@ -3488,10 +3488,13 @@ EXPORT_SYMBOL(generic_file_mmap);
EXPORT_SYMBOL(generic_file_readonly_mmap);
static struct folio *do_read_cache_folio(struct address_space *mapping,
- pgoff_t index, filler_t filler, void *data, gfp_t gfp)
+ pgoff_t index, filler_t filler, struct file *file, gfp_t gfp)
{
struct folio *folio;
int err;
+
+ if (!filler)
+ filler = mapping->a_ops->read_folio;
repeat:
folio = filemap_get_folio(mapping, index);
if (!folio) {
@@ -3508,11 +3511,7 @@ repeat:
}
filler:
- if (filler)
- err = filler(data, &folio->page);
- else
- err = mapping->a_ops->readpage(data, &folio->page);
-
+ err = filler(file, folio);
if (err < 0) {
folio_put(folio);
return ERR_PTR(err);
@@ -3562,44 +3561,44 @@ out:
}
/**
- * read_cache_folio - read into page cache, fill it if needed
- * @mapping: the page's address_space
- * @index: the page index
- * @filler: function to perform the read
- * @data: first arg to filler(data, page) function, often left as NULL
+ * read_cache_folio - Read into page cache, fill it if needed.
+ * @mapping: The address_space to read from.
+ * @index: The index to read.
+ * @filler: Function to perform the read, or NULL to use aops->read_folio().
+ * @file: Passed to filler function, may be NULL if not required.
*
- * Read into the page cache. If a page already exists, and PageUptodate() is
- * not set, try to fill the page and wait for it to become unlocked.
+ * Read one page into the page cache. If it succeeds, the folio returned
+ * will contain @index, but it may not be the first page of the folio.
*
- * If the page does not get brought uptodate, return -EIO.
- *
- * The function expects mapping->invalidate_lock to be already held.
+ * If the filler function returns an error, it will be returned to the
+ * caller.
*
- * Return: up to date page on success, ERR_PTR() on failure.
+ * Context: May sleep. Expects mapping->invalidate_lock to be held.
+ * Return: An uptodate folio on success, ERR_PTR() on failure.
*/
struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index,
- filler_t filler, void *data)
+ filler_t filler, struct file *file)
{
- return do_read_cache_folio(mapping, index, filler, data,
+ return do_read_cache_folio(mapping, index, filler, file,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_folio);
static struct page *do_read_cache_page(struct address_space *mapping,
- pgoff_t index, filler_t *filler, void *data, gfp_t gfp)
+ pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp)
{
struct folio *folio;
- folio = do_read_cache_folio(mapping, index, filler, data, gfp);
+ folio = do_read_cache_folio(mapping, index, filler, file, gfp);
if (IS_ERR(folio))
return &folio->page;
return folio_file_page(folio, index);
}
struct page *read_cache_page(struct address_space *mapping,
- pgoff_t index, filler_t *filler, void *data)
+ pgoff_t index, filler_t *filler, struct file *file)
{
- return do_read_cache_page(mapping, index, filler, data,
+ return do_read_cache_page(mapping, index, filler, file,
mapping_gfp_mask(mapping));
}
EXPORT_SYMBOL(read_cache_page);
@@ -3627,27 +3626,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping,
}
EXPORT_SYMBOL(read_cache_page_gfp);
-int pagecache_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
- struct page **pagep, void **fsdata)
-{
- const struct address_space_operations *aops = mapping->a_ops;
-
- return aops->write_begin(file, mapping, pos, len, flags,
- pagep, fsdata);
-}
-EXPORT_SYMBOL(pagecache_write_begin);
-
-int pagecache_write_end(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned copied,
- struct page *page, void *fsdata)
-{
- const struct address_space_operations *aops = mapping->a_ops;
-
- return aops->write_end(file, mapping, pos, len, copied, page, fsdata);
-}
-EXPORT_SYMBOL(pagecache_write_end);
-
/*
* Warn about a page cache invalidation failure during a direct I/O write.
*/
@@ -3759,7 +3737,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
const struct address_space_operations *a_ops = mapping->a_ops;
long status = 0;
ssize_t written = 0;
- unsigned int flags = 0;
do {
struct page *page;
@@ -3789,7 +3766,7 @@ again:
break;
}
- status = a_ops->write_begin(file, mapping, pos, bytes, flags,
+ status = a_ops->write_begin(file, mapping, pos, bytes,
&page, &fsdata);
if (unlikely(status < 0))
break;
@@ -3983,8 +3960,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp)
if (folio_test_writeback(folio))
return false;
- if (mapping && mapping->a_ops->releasepage)
- return mapping->a_ops->releasepage(&folio->page, gfp);
- return try_to_free_buffers(&folio->page);
+ if (mapping && mapping->a_ops->release_folio)
+ return mapping->a_ops->release_folio(folio, gfp);
+ return try_to_free_buffers(folio);
}
EXPORT_SYMBOL(filemap_release_folio);
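
A hedged caller sketch (illustrative, not part of this diff): with do_read_cache_folio() now defaulting a NULL filler to the mapping's ->read_folio(), a caller that only needs the aops path can look roughly like this:

	/*
	 * Illustrative only: read one uptodate folio at 'index' using the
	 * mapping's ->read_folio(), via the NULL-filler form added above.
	 */
	static int example_read_one(struct address_space *mapping, pgoff_t index,
				    struct file *file)
	{
		struct folio *folio = read_cache_folio(mapping, index, NULL, file);

		if (IS_ERR(folio))
			return PTR_ERR(folio);
		/* folio contains 'index' but may not be its first page */
		folio_put(folio);
		return 0;
	}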
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index 46fa179e32fb..20bc15b57d93 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -131,12 +131,10 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
EXPORT_SYMBOL(pagecache_get_page);
struct page *grab_cache_page_write_begin(struct address_space *mapping,
- pgoff_t index, unsigned flags)
+ pgoff_t index)
{
unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
- if (flags & AOP_FLAG_NOFS)
- fgp_flags |= FGP_NOFS;
return pagecache_get_page(mapping, index, fgp_flags,
mapping_gfp_mask(mapping));
}
diff --git a/mm/gup.c b/mm/gup.c
index 2e07cff3b31b..551264407624 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -1724,6 +1724,35 @@ out:
}
EXPORT_SYMBOL(fault_in_writeable);
+/**
+ * fault_in_subpage_writeable - fault in an address range for writing
+ * @uaddr: start of address range
+ * @size: size of address range
+ *
+ * Fault in a user address range for writing while checking for permissions at
+ * sub-page granularity (e.g. arm64 MTE). This function should be used when
+ * the caller cannot guarantee forward progress of a copy_to_user() loop.
+ *
+ * Returns the number of bytes not faulted in (like copy_to_user() and
+ * copy_from_user()).
+ */
+size_t fault_in_subpage_writeable(char __user *uaddr, size_t size)
+{
+ size_t faulted_in;
+
+ /*
+ * Attempt faulting in at page granularity first for page table
+ * permission checking. The arch-specific probe_subpage_writeable()
+ * functions may not check for this.
+ */
+ faulted_in = size - fault_in_writeable(uaddr, size);
+ if (faulted_in)
+ faulted_in -= probe_subpage_writeable(uaddr, faulted_in);
+
+ return size - faulted_in;
+}
+EXPORT_SYMBOL(fault_in_subpage_writeable);
+
/*
* fault_in_safe_writeable - fault in an address range for writing
* @uaddr: start of address range
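
A hedged usage sketch (not from this patch): like fault_in_writeable(), the new helper returns the number of bytes it could not fault in, so a copy loop that may stall on sub-page faults (e.g. arm64 MTE) can retry for as long as some progress is possible. The function and buffer names below are illustrative, and a real caller would advance the pointers by the bytes already copied:

	static int example_copy_out(char __user *uaddr, const void *kbuf, size_t len)
	{
		while (len) {
			size_t left = copy_to_user(uaddr, kbuf, len);

			if (!left)
				return 0;		/* everything copied */
			/* Returns bytes NOT faulted in; == len means no progress. */
			if (fault_in_subpage_writeable(uaddr, len) == len)
				return -EFAULT;
			/* Some sub-page range is writable now; try again. */
		}
		return 0;
	}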
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index efd06af61fca..a77c78a2b6b5 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2545,11 +2545,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct address_space *mapping = NULL;
int extra_pins, ret;
pgoff_t end;
+ bool is_hzp;
- VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
VM_BUG_ON_PAGE(!PageLocked(head), head);
VM_BUG_ON_PAGE(!PageCompound(head), head);
+ is_hzp = is_huge_zero_page(head);
+ VM_WARN_ON_ONCE_PAGE(is_hzp, head);
+ if (is_hzp)
+ return -EBUSY;
+
if (PageWriteback(head))
return -EBUSY;
diff --git a/mm/kfence/.kunitconfig b/mm/kfence/.kunitconfig
new file mode 100644
index 000000000000..f3d65e939bfa
--- /dev/null
+++ b/mm/kfence/.kunitconfig
@@ -0,0 +1,6 @@
+CONFIG_KUNIT=y
+CONFIG_KFENCE=y
+CONFIG_KFENCE_KUNIT_TEST=y
+
+# Additional dependencies.
+CONFIG_FTRACE=y
diff --git a/mm/kfence/core.c b/mm/kfence/core.c
index 2b9f89749154..4e7cd4c8e687 100644
--- a/mm/kfence/core.c
+++ b/mm/kfence/core.c
@@ -630,6 +630,16 @@ static bool __init kfence_init_pool_early(void)
* fails for the first page, and therefore expect addr==__kfence_pool in
* most failure cases.
*/
+	for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
+ struct slab *slab = virt_to_slab(p);
+
+ if (!slab)
+ continue;
+#ifdef CONFIG_MEMCG
+ slab->memcg_data = 0;
+#endif
+ __folio_clear_slab(slab_folio(slab));
+ }
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
__kfence_pool = NULL;
return false;
diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c
index 21aafc95f4ab..a97bffe0cc3e 100644
--- a/mm/kfence/kfence_test.c
+++ b/mm/kfence/kfence_test.c
@@ -825,14 +825,6 @@ static void test_exit(struct kunit *test)
test_cache_destroy();
}
-static struct kunit_suite kfence_test_suite = {
- .name = "kfence",
- .test_cases = kfence_test_cases,
- .init = test_init,
- .exit = test_exit,
-};
-static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL };
-
static void register_tracepoints(struct tracepoint *tp, void *ignore)
{
check_trace_callback_type_console(probe_console);
@@ -846,11 +838,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore)
tracepoint_probe_unregister(tp, probe_console, NULL);
}
-/*
- * We only want to do tracepoints setup and teardown once, therefore we have to
- * customize the init and exit functions and cannot rely on kunit_test_suite().
- */
-static int __init kfence_test_init(void)
+static int kfence_suite_init(struct kunit_suite *suite)
{
/*
* Because we want to be able to build the test as a module, we need to
@@ -858,18 +846,25 @@ static int __init kfence_test_init(void)
* won't work here.
*/
for_each_kernel_tracepoint(register_tracepoints, NULL);
- return __kunit_test_suites_init(kfence_test_suites);
+ return 0;
}
-static void kfence_test_exit(void)
+static void kfence_suite_exit(struct kunit_suite *suite)
{
- __kunit_test_suites_exit(kfence_test_suites);
for_each_kernel_tracepoint(unregister_tracepoints, NULL);
tracepoint_synchronize_unregister();
}
-late_initcall_sync(kfence_test_init);
-module_exit(kfence_test_exit);
+static struct kunit_suite kfence_test_suite = {
+ .name = "kfence",
+ .test_cases = kfence_test_cases,
+ .init = test_init,
+ .exit = test_exit,
+ .suite_init = kfence_suite_init,
+ .suite_exit = kfence_suite_exit,
+};
+
+kunit_test_suites(&kfence_test_suite);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>");
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a934ee8124dd..b85661cbdc4a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1272,7 +1272,7 @@ try_again:
}
out:
if (ret == -EIO)
- dump_page(p, "hwpoison: unhandlable page");
+ pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
return ret;
}
@@ -1849,19 +1849,6 @@ try_again:
if (PageTransHuge(hpage)) {
/*
- * Bail out before SetPageHasHWPoisoned() if hpage is
- * huge_zero_page, although PG_has_hwpoisoned is not
- * checked in set_huge_zero_page().
- *
- * TODO: Handle memory failure of huge_zero_page thoroughly.
- */
- if (is_huge_zero_page(hpage)) {
- action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
- res = -EBUSY;
- goto unlock_mutex;
- }
-
- /*
* The flag must be set after the refcount is bumped
* otherwise it may race with THP split.
* And the flag can't be set in get_hwpoison_page() since
diff --git a/mm/memory.c b/mm/memory.c
index 2bf5bca39567..54bcd5327b74 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -558,11 +558,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
dump_page(page, "bad pte");
pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n",
(void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
- pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n",
+ pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n",
vma->vm_file,
vma->vm_ops ? vma->vm_ops->fault : NULL,
vma->vm_file ? vma->vm_file->f_op->mmap : NULL,
- mapping ? mapping->a_ops->readpage : NULL);
+ mapping ? mapping->a_ops->read_folio : NULL);
dump_stack();
add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 1415ea25f7aa..e51588e95f57 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1020,7 +1020,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
if (!page->mapping) {
VM_BUG_ON_PAGE(PageAnon(page), page);
if (page_has_private(page)) {
- try_to_free_buffers(page);
+ try_to_free_buffers(folio);
goto out_unlock_both;
}
} else if (page_mapped(page)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 097002506d2d..b522cd0259a0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -941,7 +941,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
return -EINTR;
vma = vma_lookup(mm, addr);
if (!vma) {
- ret = EFAULT;
+ ret = -EFAULT;
goto out;
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9f459f7f8f83..359dc1da3636 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2609,10 +2609,12 @@ EXPORT_SYMBOL(folio_redirty_for_writepage);
* folio_mark_dirty - Mark a folio as being modified.
* @folio: The folio.
*
- * For folios with a mapping this should be done with the folio lock held
- * for the benefit of asynchronous memory errors who prefer a consistent
- * dirty state. This rule can be broken in some special cases,
- * but should be better not to.
+ * The folio may not be truncated while this function is running.
+ * Holding the folio lock is sufficient to prevent truncation, but some
+ * callers cannot acquire a sleeping lock. These callers instead hold
+ * the page table lock for a page table which contains at least one page
+ * in this folio. Truncation will block on the page table lock as it
+ * unmaps pages before removing the folio from its mapping.
*
* Return: True if the folio was newly dirtied, false if it was already dirty.
*/
diff --git a/mm/page_io.c b/mm/page_io.c
index 1b8075ef3418..68318134dc92 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -496,7 +496,6 @@ int swap_readpage(struct page *page, bool synchronous,
* attempt to access it in the page fault retry time check.
*/
if (synchronous) {
- bio->bi_opf |= REQ_POLLED;
get_task_struct(current);
bio->bi_private = current;
}
@@ -508,8 +507,7 @@ int swap_readpage(struct page *page, bool synchronous,
if (!READ_ONCE(bio->bi_private))
break;
- if (!bio_poll(bio, NULL, 0))
- blk_io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index d808901e27b2..e4c6f3f1695b 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -45,7 +45,12 @@ static void init_early_allocated_pages(void);
static int __init early_page_owner_param(char *buf)
{
- return kstrtobool(buf, &page_owner_enabled);
+ int ret = kstrtobool(buf, &page_owner_enabled);
+
+ if (page_owner_enabled)
+ stack_depot_want_early_init();
+
+ return ret;
}
early_param("page_owner", early_page_owner_param);
@@ -83,8 +88,6 @@ static __init void init_page_owner(void)
if (!page_owner_enabled)
return;
- stack_depot_init();
-
register_dummy_stack();
register_failure_stack();
register_early_stack();
diff --git a/mm/readahead.c b/mm/readahead.c
index 8e3775829513..b78921b54754 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -15,7 +15,7 @@
* explicitly requested by the application. Readahead only ever
* attempts to read folios that are not yet in the page cache. If a
* folio is present but not up-to-date, readahead will not try to read
- * it. In that case a simple ->readpage() will be requested.
+ * it. In that case a simple ->read_folio() will be requested.
*
* Readahead is triggered when an application read request (whether a
* system call or a page fault) finds that the requested folio is not in
@@ -78,7 +78,7 @@
* address space operation, for which mpage_readahead() is a canonical
* implementation. ->readahead() should normally initiate reads on all
* folios, but may fail to read any or all folios without causing an I/O
- * error. The page cache reading code will issue a ->readpage() request
+ * error. The page cache reading code will issue a ->read_folio() request
* for any folio which ->readahead() did not read, and only an error
* from this will be final.
*
@@ -110,9 +110,10 @@
* were not fetched with readahead_folio(). This will allow a
* subsequent synchronous readahead request to try them again. If they
* are left in the page cache, then they will be read individually using
- * ->readpage() which may be less efficient.
+ * ->read_folio() which may be less efficient.
*/
+#include <linux/blkdev.h>
#include <linux/kernel.h>
#include <linux/dax.h>
#include <linux/gfp.h>
@@ -145,7 +146,7 @@ EXPORT_SYMBOL_GPL(file_ra_state_init);
static void read_pages(struct readahead_control *rac)
{
const struct address_space_operations *aops = rac->mapping->a_ops;
- struct page *page;
+ struct folio *folio;
struct blk_plug plug;
if (!readahead_count(rac))
@@ -156,24 +157,23 @@ static void read_pages(struct readahead_control *rac)
if (aops->readahead) {
aops->readahead(rac);
/*
- * Clean up the remaining pages. The sizes in ->ra
+ * Clean up the remaining folios. The sizes in ->ra
* may be used to size the next readahead, so make sure
* they accurately reflect what happened.
*/
- while ((page = readahead_page(rac))) {
- rac->ra->size -= 1;
- if (rac->ra->async_size > 0) {
- rac->ra->async_size -= 1;
- delete_from_page_cache(page);
+ while ((folio = readahead_folio(rac)) != NULL) {
+ unsigned long nr = folio_nr_pages(folio);
+
+ rac->ra->size -= nr;
+ if (rac->ra->async_size >= nr) {
+ rac->ra->async_size -= nr;
+ filemap_remove_folio(folio);
}
- unlock_page(page);
- put_page(page);
+ folio_unlock(folio);
}
} else {
- while ((page = readahead_page(rac))) {
- aops->readpage(rac->file, page);
- put_page(page);
- }
+ while ((folio = readahead_folio(rac)) != NULL)
+ aops->read_folio(rac->file, folio);
}
blk_finish_plug(&plug);
@@ -254,8 +254,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl,
}
/*
- * Now start the IO. We ignore I/O errors - if the page is not
- * uptodate then the caller will launch readpage again, and
+ * Now start the IO. We ignore I/O errors - if the folio is not
+ * uptodate then the caller will launch read_folio again, and
* will then handle the error.
*/
read_pages(ractl);
@@ -303,7 +303,7 @@ void force_page_cache_ra(struct readahead_control *ractl,
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages, index;
- if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead))
+ if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead))
return;
/*
@@ -474,7 +474,8 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
if (!folio)
return -ENOMEM;
- if (mark - index < (1UL << order))
+ mark = round_up(mark, 1UL << order);
+ if (index == mark)
folio_set_readahead(folio);
err = filemap_add_folio(ractl->mapping, folio, index, gfp);
if (err)
@@ -555,8 +556,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
struct file_ra_state *ra = ractl->ra;
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- unsigned long index = readahead_index(ractl);
- pgoff_t prev_index;
+ pgoff_t index = readahead_index(ractl);
+ pgoff_t expected, prev_index;
+ unsigned int order = folio ? folio_order(folio) : 0;
/*
* If the request exceeds the readahead window, allow the read to
@@ -575,8 +577,9 @@ static void ondemand_readahead(struct readahead_control *ractl,
* It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((index == (ra->start + ra->size - ra->async_size) ||
- index == (ra->start + ra->size))) {
+ expected = round_up(ra->start + ra->size - ra->async_size,
+ 1UL << order);
+ if (index == expected || index == (ra->start + ra->size)) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -662,7 +665,7 @@ readit:
}
ractl->_index = ra->start;
- page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0);
+ page_cache_ra_order(ractl, ra, order);
}
void page_cache_sync_ra(struct readahead_control *ractl,
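
A worked example of the new rounding (illustrative): with order = 2 each folio spans 4 pages, so a readahead mark of 5 is rounded up to 8 in ra_alloc_folio() and only the folio allocated at index 8 gets the readahead flag. ondemand_readahead() rounds its 'expected' index the same way, so the async-readahead trigger still matches when allocations are folio-sized rather than page-sized.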
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 3b3cf2892b6a..206ed6b40c1d 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -145,15 +145,15 @@ static int secretmem_migratepage(struct address_space *mapping,
return -EBUSY;
}
-static void secretmem_freepage(struct page *page)
+static void secretmem_free_folio(struct folio *folio)
{
- set_direct_map_default_noflush(page);
- clear_highpage(page);
+ set_direct_map_default_noflush(&folio->page);
+ folio_zero_segment(folio, 0, folio_size(folio));
}
const struct address_space_operations secretmem_aops = {
.dirty_folio = noop_dirty_folio,
- .freepage = secretmem_freepage,
+ .free_folio = secretmem_free_folio,
.migratepage = secretmem_migratepage,
.isolate_page = secretmem_isolate_page,
};
diff --git a/mm/shmem.c b/mm/shmem.c
index b8169beff226..da30c769b376 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2414,7 +2414,7 @@ static int shmem_initxattrs(struct inode *, const struct xattr *, void *);
static int
shmem_write_begin(struct file *file, struct address_space *mapping,
- loff_t pos, unsigned len, unsigned flags,
+ loff_t pos, unsigned len,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
@@ -4145,7 +4145,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
*
* This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)",
* with any new page allocations done using the specified allocation flags.
- * But read_cache_page_gfp() uses the ->readpage() method: which does not
+ * But read_cache_page_gfp() uses the ->read_folio() method: which does not
* suit tmpfs, since it may have pages in swapcache, and needs to find those
* for itself; although drivers/gpu/drm i915 and ttm rely upon this support.
*
diff --git a/mm/slab.c b/mm/slab.c
index 8fb259511305..f8cd00f4ba13 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -619,18 +619,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
return 0;
}
-static inline void *alternate_node_alloc(struct kmem_cache *cachep,
- gfp_t flags)
-{
- return NULL;
-}
-
-static inline void *____cache_alloc_node(struct kmem_cache *cachep,
- gfp_t flags, int nodeid)
-{
- return NULL;
-}
-
static inline gfp_t gfp_exact_node(gfp_t flags)
{
return flags & ~__GFP_NOFAIL;
@@ -638,9 +626,6 @@ static inline gfp_t gfp_exact_node(gfp_t flags)
#else /* CONFIG_NUMA */
-static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
-static void *alternate_node_alloc(struct kmem_cache *, gfp_t);
-
static struct alien_cache *__alloc_alien_cache(int node, int entries,
int batch, gfp_t gfp)
{
@@ -796,7 +781,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
int slab_node = slab_nid(virt_to_slab(objp));
int node = numa_mem_id();
/*
- * Make sure we are not freeing a object from another node to the array
+ * Make sure we are not freeing an object from another node to the array
* cache on this cpu.
*/
if (likely(node == slab_node))
@@ -847,7 +832,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
/*
* The kmem_cache_nodes don't come and go as CPUs
- * come and go. slab_mutex is sufficient
+ * come and go. slab_mutex provides sufficient
* protection here.
*/
cachep->node[node] = n;
@@ -860,7 +845,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp)
* Allocates and initializes node for a node on each slab cache, used for
* either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node
* will be allocated off-node since memory is not yet online for the new node.
- * When hotplugging memory or a cpu, existing node are not replaced if
+ * When hotplugging memory or a cpu, existing nodes are not replaced if
* already in use.
*
* Must hold slab_mutex.
@@ -1061,7 +1046,7 @@ int slab_prepare_cpu(unsigned int cpu)
* offline.
*
* Even if all the cpus of a node are down, we don't free the
- * kmem_cache_node of any cache. This to avoid a race between cpu_down, and
+ * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
* the cpu going down. The kmem_cache_node structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
@@ -1905,7 +1890,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep,
* @flags: SLAB flags
*
* Returns a ptr to the cache on success, NULL on failure.
- * Cannot be called within a int, but can be interrupted.
+ * Cannot be called within an int, but can be interrupted.
* The @ctor is run when new pages are allocated by the cache.
*
* The flags are
@@ -3055,6 +3040,8 @@ out:
}
#ifdef CONFIG_NUMA
+static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int);
+
/*
* Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set.
*
@@ -3150,7 +3137,7 @@ retry:
}
/*
- * A interface to enable slab creation on nodeid
+ * An interface to enable slab creation on nodeid
*/
static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
diff --git a/mm/slab.h b/mm/slab.h
index 95eb34174c1b..db9fb5c8dae7 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -331,7 +331,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
SLAB_ACCOUNT)
#elif defined(CONFIG_SLUB)
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_TEMPORARY | SLAB_ACCOUNT)
+ SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS)
#else
#define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE)
#endif
@@ -350,7 +350,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size,
SLAB_NOLEAKTRACE | \
SLAB_RECLAIM_ACCOUNT | \
SLAB_TEMPORARY | \
- SLAB_ACCOUNT)
+ SLAB_ACCOUNT | \
+ SLAB_NO_USER_FLAGS)
bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 987e6917d850..77c3adf40e50 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -24,6 +24,7 @@
#include <asm/tlbflush.h>
#include <asm/page.h>
#include <linux/memcontrol.h>
+#include <linux/stackdepot.h>
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
@@ -313,9 +314,13 @@ kmem_cache_create_usercopy(const char *name,
* If no slub_debug was enabled globally, the static key is not yet
* enabled by setup_slub_debug(). Enable it if the cache is being
* created with any of the debugging flags passed explicitly.
+ * It's also possible that this is the first cache created with
+ * SLAB_STORE_USER and we should init stack_depot for it.
*/
if (flags & SLAB_DEBUG_FLAGS)
static_branch_enable(&slub_debug_enabled);
+ if (flags & SLAB_STORE_USER)
+ stack_depot_init();
#endif
mutex_lock(&slab_mutex);
@@ -857,6 +862,8 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags)
return;
}
flags |= SLAB_ACCOUNT;
+ } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) {
+ flags |= SLAB_CACHE_DMA;
}
kmalloc_caches[type][idx] = create_kmalloc_cache(
@@ -885,7 +892,7 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/*
* Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined
*/
- for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) {
+ for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) {
for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) {
if (!kmalloc_caches[type][i])
new_kmalloc_cache(i, type, flags);
@@ -906,20 +913,6 @@ void __init create_kmalloc_caches(slab_flags_t flags)
/* Kmalloc array is now usable */
slab_state = UP;
-
-#ifdef CONFIG_ZONE_DMA
- for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) {
- struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i];
-
- if (s) {
- kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
- kmalloc_info[i].name[KMALLOC_DMA],
- kmalloc_info[i].size,
- SLAB_CACHE_DMA | flags, 0,
- kmalloc_info[i].size);
- }
- }
-#endif
}
#endif /* !CONFIG_SLOB */
diff --git a/mm/slub.c b/mm/slub.c
index ed5c2c03a47a..e5535020e0fd 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -26,6 +26,7 @@
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
+#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
@@ -37,6 +38,7 @@
#include <linux/memcontrol.h>
#include <linux/random.h>
#include <kunit/test.h>
+#include <linux/sort.h>
#include <linux/debugfs.h>
#include <trace/events/kmem.h>
@@ -264,8 +266,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
-#ifdef CONFIG_STACKTRACE
- unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
+#ifdef CONFIG_STACKDEPOT
+ depot_stack_handle_t handle;
#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
@@ -724,57 +726,51 @@ static struct track *get_track(struct kmem_cache *s, void *object,
return kasan_reset_tag(p + alloc);
}
-static void set_track(struct kmem_cache *s, void *object,
+static void noinline set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr)
{
struct track *p = get_track(s, object, alloc);
- if (addr) {
-#ifdef CONFIG_STACKTRACE
- unsigned int nr_entries;
-
- metadata_access_enable();
- nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
- TRACK_ADDRS_COUNT, 3);
- metadata_access_disable();
+#ifdef CONFIG_STACKDEPOT
+ unsigned long entries[TRACK_ADDRS_COUNT];
+ unsigned int nr_entries;
- if (nr_entries < TRACK_ADDRS_COUNT)
- p->addrs[nr_entries] = 0;
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);
+ p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT);
#endif
- p->addr = addr;
- p->cpu = smp_processor_id();
- p->pid = current->pid;
- p->when = jiffies;
- } else {
- memset(p, 0, sizeof(struct track));
- }
+
+ p->addr = addr;
+ p->cpu = smp_processor_id();
+ p->pid = current->pid;
+ p->when = jiffies;
}
static void init_tracking(struct kmem_cache *s, void *object)
{
+ struct track *p;
+
if (!(s->flags & SLAB_STORE_USER))
return;
- set_track(s, object, TRACK_FREE, 0UL);
- set_track(s, object, TRACK_ALLOC, 0UL);
+ p = get_track(s, object, TRACK_ALLOC);
+ memset(p, 0, 2*sizeof(struct track));
}
static void print_track(const char *s, struct track *t, unsigned long pr_time)
{
+ depot_stack_handle_t handle __maybe_unused;
+
if (!t->addr)
return;
pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
-#ifdef CONFIG_STACKTRACE
- {
- int i;
- for (i = 0; i < TRACK_ADDRS_COUNT; i++)
- if (t->addrs[i])
- pr_err("\t%pS\n", (void *)t->addrs[i]);
- else
- break;
- }
+#ifdef CONFIG_STACKDEPOT
+ handle = READ_ONCE(t->handle);
+ if (handle)
+ stack_depot_print(handle);
+ else
+ pr_err("object allocation/free stack trace missing\n");
#endif
}
@@ -1021,7 +1017,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p)
}
/* Check the pad bytes at the end of a slab page */
-static int slab_pad_check(struct kmem_cache *s, struct slab *slab)
+static void slab_pad_check(struct kmem_cache *s, struct slab *slab)
{
u8 *start;
u8 *fault;
@@ -1031,21 +1027,21 @@ static int slab_pad_check(struct kmem_cache *s, struct slab *slab)
int remainder;
if (!(s->flags & SLAB_POISON))
- return 1;
+ return;
start = slab_address(slab);
length = slab_size(slab);
end = start + length;
remainder = length % s->size;
if (!remainder)
- return 1;
+ return;
pad = end - remainder;
metadata_access_enable();
fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
- return 1;
+ return;
while (end > fault && end[-1] == POISON_INUSE)
end--;
@@ -1054,7 +1050,6 @@ static int slab_pad_check(struct kmem_cache *s, struct slab *slab)
print_section(KERN_ERR, "Padding ", pad, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, fault, end);
- return 0;
}
static int check_object(struct kmem_cache *s, struct slab *slab,
@@ -1268,8 +1263,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
}
/* Object debug checks for alloc/free paths */
-static void setup_object_debug(struct kmem_cache *s, struct slab *slab,
- void *object)
+static void setup_object_debug(struct kmem_cache *s, void *object)
{
if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
return;
@@ -1534,6 +1528,8 @@ static int __init setup_slub_debug(char *str)
global_slub_debug_changed = true;
} else {
slab_list_specified = true;
+ if (flags & SLAB_STORE_USER)
+ stack_depot_want_early_init();
}
}
@@ -1551,6 +1547,8 @@ static int __init setup_slub_debug(char *str)
}
out:
slub_debug = global_flags;
+ if (slub_debug & SLAB_STORE_USER)
+ stack_depot_want_early_init();
if (slub_debug != 0 || slub_debug_string)
static_branch_enable(&slub_debug_enabled);
else
@@ -1584,6 +1582,9 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
slab_flags_t block_flags;
slab_flags_t slub_debug_local = slub_debug;
+ if (flags & SLAB_NO_USER_FLAGS)
+ return flags;
+
/*
* If the slab cache is for debugging (e.g. kmemleak) then
* don't store user (stack trace) information by default,
@@ -1628,8 +1629,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
return flags | slub_debug_local;
}
#else /* !CONFIG_SLUB_DEBUG */
-static inline void setup_object_debug(struct kmem_cache *s,
- struct slab *slab, void *object) {}
+static inline void setup_object_debug(struct kmem_cache *s, void *object) {}
static inline
void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {}
@@ -1641,8 +1641,7 @@ static inline int free_debug_processing(
void *head, void *tail, int bulk_cnt,
unsigned long addr) { return 0; }
-static inline int slab_pad_check(struct kmem_cache *s, struct slab *slab)
- { return 1; }
+static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {}
static inline int check_object(struct kmem_cache *s, struct slab *slab,
void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
@@ -1772,10 +1771,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
return *head != NULL;
}
-static void *setup_object(struct kmem_cache *s, struct slab *slab,
- void *object)
+static void *setup_object(struct kmem_cache *s, void *object)
{
- setup_object_debug(s, slab, object);
+ setup_object_debug(s, object);
object = kasan_init_slab_obj(s, object);
if (unlikely(s->ctor)) {
kasan_unpoison_object_data(s, object);
@@ -1894,13 +1892,13 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab)
/* First entry is used as the base of the freelist */
cur = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
- cur = setup_object(s, slab, cur);
+ cur = setup_object(s, cur);
slab->freelist = cur;
for (idx = 1; idx < slab->objects; idx++) {
next = next_freelist_entry(s, slab, &pos, start, page_limit,
freelist_count);
- next = setup_object(s, slab, next);
+ next = setup_object(s, next);
set_freepointer(s, cur, next);
cur = next;
}
@@ -1939,7 +1937,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
*/
alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min))
- alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL);
+ alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM;
slab = alloc_slab_page(alloc_gfp, node, oo);
if (unlikely(!slab)) {
@@ -1971,11 +1969,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
if (!shuffle) {
start = fixup_red_left(s, start);
- start = setup_object(s, slab, start);
+ start = setup_object(s, start);
slab->freelist = start;
for (idx = 0, p = start; idx < slab->objects - 1; idx++) {
next = p + s->size;
- next = setup_object(s, slab, next);
+ next = setup_object(s, next);
set_freepointer(s, p, next);
p = next;
}
@@ -2910,7 +2908,6 @@ redo:
*/
if (!node_isset(node, slab_nodes)) {
node = NUMA_NO_NODE;
- goto redo;
} else {
stat(s, ALLOC_NODE_MISMATCH);
goto deactivate_slab;
@@ -4165,8 +4162,6 @@ static int calculate_sizes(struct kmem_cache *s)
*/
s->oo = oo_make(order, size);
s->min = oo_make(get_order(size), size);
- if (oo_objects(s->oo) > oo_objects(s->max))
- s->max = s->oo;
return !!oo_objects(s->oo);
}
@@ -4344,18 +4339,26 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab)
objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
-#ifdef CONFIG_STACKTRACE
- for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
- kpp->kp_stack[i] = (void *)trackp->addrs[i];
- if (!kpp->kp_stack[i])
- break;
- }
+#ifdef CONFIG_STACKDEPOT
+ {
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
- trackp = get_track(s, objp, TRACK_FREE);
- for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
- kpp->kp_free_stack[i] = (void *)trackp->addrs[i];
- if (!kpp->kp_free_stack[i])
- break;
+ handle = READ_ONCE(trackp->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+ kpp->kp_stack[i] = (void *)entries[i];
+ }
+
+ trackp = get_track(s, objp, TRACK_FREE);
+ handle = READ_ONCE(trackp->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+ kpp->kp_free_stack[i] = (void *)entries[i];
+ }
}
#endif
#endif
@@ -5057,6 +5060,7 @@ EXPORT_SYMBOL(validate_slab_cache);
*/
struct location {
+ depot_stack_handle_t handle;
unsigned long count;
unsigned long addr;
long long sum_time;
@@ -5109,9 +5113,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
{
long start, end, pos;
struct location *l;
- unsigned long caddr;
+ unsigned long caddr, chandle;
unsigned long age = jiffies - track->when;
+ depot_stack_handle_t handle = 0;
+#ifdef CONFIG_STACKDEPOT
+ handle = READ_ONCE(track->handle);
+#endif
start = -1;
end = t->count;
@@ -5126,7 +5134,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
break;
caddr = t->loc[pos].addr;
- if (track->addr == caddr) {
+ chandle = t->loc[pos].handle;
+ if ((track->addr == caddr) && (handle == chandle)) {
l = &t->loc[pos];
l->count++;
@@ -5151,6 +5160,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
if (track->addr < caddr)
end = pos;
+ else if (track->addr == caddr && handle < chandle)
+ end = pos;
else
start = pos;
}
@@ -5173,6 +5184,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
l->max_time = age;
l->min_pid = track->pid;
l->max_pid = track->pid;
+ l->handle = handle;
cpumask_clear(to_cpumask(l->cpus));
cpumask_set_cpu(track->cpu, to_cpumask(l->cpus));
nodes_clear(l->nodes);
@@ -6082,6 +6094,21 @@ static int slab_debugfs_show(struct seq_file *seq, void *v)
seq_printf(seq, " nodes=%*pbl",
nodemask_pr_args(&l->nodes));
+#ifdef CONFIG_STACKDEPOT
+ {
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries, j;
+
+ handle = READ_ONCE(l->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ seq_puts(seq, "\n");
+ for (j = 0; j < nr_entries; j++)
+ seq_printf(seq, " %pS\n", (void *)entries[j]);
+ }
+ }
+#endif
seq_puts(seq, "\n");
}
@@ -6106,6 +6133,17 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos)
return NULL;
}
+static int cmp_loc_by_count(const void *a, const void *b, const void *data)
+{
+ struct location *loc1 = (struct location *)a;
+ struct location *loc2 = (struct location *)b;
+
+ if (loc1->count > loc2->count)
+ return -1;
+ else
+ return 1;
+}
+
static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos)
{
struct loc_track *t = seq->private;
@@ -6167,6 +6205,10 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep)
spin_unlock_irqrestore(&n->list_lock, flags);
}
+ /* Sort locations by count */
+ sort_r(t->loc, t->count, sizeof(struct location),
+ cmp_loc_by_count, NULL, NULL);
+
bitmap_free(obj_map);
return 0;
}
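
For reference, a minimal sketch of the stack depot round-trip that set_track() and print_track() now perform (illustrative, assuming CONFIG_STACKDEPOT; helper names are hypothetical):

	static depot_stack_handle_t example_save_stack(void)
	{
		unsigned long entries[TRACK_ADDRS_COUNT];
		unsigned int nr_entries;

		nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3);  /* skip 3 frames */
		return stack_depot_save(entries, nr_entries, GFP_NOWAIT);        /* 0 on failure */
	}

	static void example_print_stack(depot_stack_handle_t handle)
	{
		if (handle)
			stack_depot_print(handle);  /* replaces the old per-entry %pS loop */
	}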
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a0eb690d9926..94b4ff43ead0 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -6,6 +6,7 @@
* Swap reorganised 29.12.95, Stephen Tweedie
*/
+#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/sched/mm.h>
#include <linux/sched/task.h>
@@ -180,7 +181,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
return err;
cond_resched();
@@ -191,7 +192,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL, 0);
+ nr_blocks, GFP_KERNEL);
if (err)
break;
@@ -255,7 +256,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, 0))
+ nr_blocks, GFP_NOIO))
break;
se = next_se(se);
@@ -2472,7 +2473,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
if (p->flags & SWP_CONTINUED)
free_swap_count_continuations(p);
- if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev)))
+ if (!p->bdev || !bdev_nonrot(p->bdev))
atomic_dec(&nr_rotate_swap);
mutex_lock(&swapon_mutex);
@@ -2767,7 +2768,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
+ if (bdev_is_zoned(p->bdev))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
@@ -2963,20 +2964,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
return nr_extents;
}
-/*
- * Helper to sys_swapon determining if a given swap
- * backing device queue supports DISCARD operations.
- */
-static bool swap_discardable(struct swap_info_struct *si)
-{
- struct request_queue *q = bdev_get_queue(si->bdev);
-
- if (!blk_queue_discard(q))
- return false;
-
- return true;
-}
-
SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
{
struct swap_info_struct *p;
@@ -3047,7 +3034,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
/*
* Read the swap header.
*/
- if (!mapping->a_ops->readpage) {
+ if (!mapping->a_ops->read_folio) {
error = -EINVAL;
goto bad_swap_unlock_inode;
}
@@ -3071,13 +3058,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
goto bad_swap_unlock_inode;
}
- if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue))
+ if (p->bdev && bdev_stable_writes(p->bdev))
p->flags |= SWP_STABLE_WRITES;
if (p->bdev && p->bdev->bd_disk->fops->rw_page)
p->flags |= SWP_SYNCHRONOUS_IO;
- if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) {
+ if (p->bdev && bdev_nonrot(p->bdev)) {
int cpu;
unsigned long ci, nr_cluster;
@@ -3138,7 +3125,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
sizeof(long),
GFP_KERNEL);
- if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) {
+ if ((swap_flags & SWAP_FLAG_DISCARD) &&
+ p->bdev && bdev_max_discard_sectors(p->bdev)) {
/*
* When discard is enabled for swap with no particular
* policy flagged, we set all swap discard flags here in
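
The swap changes above are mostly mechanical conversions from request_queue based helpers to bdev based ones; a hedged summary of the substitutions visible in these hunks:

	blk_queue_nonrot(bdev_get_queue(bdev))          ->  bdev_nonrot(bdev)
	blk_queue_stable_writes(q)                      ->  bdev_stable_writes(bdev)
	blk_queue_is_zoned(q)                           ->  bdev_is_zoned(bdev)
	blk_queue_discard(q)                            ->  bdev_max_discard_sectors(bdev) != 0
	blkdev_issue_discard(bdev, sector, nr, gfp, 0)  ->  blkdev_issue_discard(bdev, sector, nr, gfp)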
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 2c235d5c2364..baeacc735b83 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -17,6 +17,7 @@
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/thread_info.h>
+#include <linux/vmalloc.h>
#include <linux/atomic.h>
#include <linux/jump_label.h>
#include <asm/sections.h>
@@ -157,91 +158,47 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n,
usercopy_abort("null address", NULL, to_user, ptr, n);
}
-/* Checks for allocs that are marked in some way as spanning multiple pages. */
-static inline void check_page_span(const void *ptr, unsigned long n,
- struct page *page, bool to_user)
+static inline void check_heap_object(const void *ptr, unsigned long n,
+ bool to_user)
{
-#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN
- const void *end = ptr + n - 1;
- struct page *endpage;
- bool is_reserved, is_cma;
+ struct folio *folio;
- /*
- * Sometimes the kernel data regions are not marked Reserved (see
- * check below). And sometimes [_sdata,_edata) does not cover
- * rodata and/or bss, so check each range explicitly.
- */
+ if (is_kmap_addr(ptr)) {
+ unsigned long page_end = (unsigned long)ptr | (PAGE_SIZE - 1);
- /* Allow reads of kernel rodata region (if not marked as Reserved). */
- if (ptr >= (const void *)__start_rodata &&
- end <= (const void *)__end_rodata) {
- if (!to_user)
- usercopy_abort("rodata", NULL, to_user, 0, n);
+ if ((unsigned long)ptr + n - 1 > page_end)
+ usercopy_abort("kmap", NULL, to_user,
+ offset_in_page(ptr), n);
return;
}
- /* Allow kernel data region (if not marked as Reserved). */
- if (ptr >= (const void *)_sdata && end <= (const void *)_edata)
- return;
+ if (is_vmalloc_addr(ptr)) {
+ struct vm_struct *area = find_vm_area(ptr);
+ unsigned long offset;
- /* Allow kernel bss region (if not marked as Reserved). */
- if (ptr >= (const void *)__bss_start &&
- end <= (const void *)__bss_stop)
- return;
-
- /* Is the object wholly within one base page? */
- if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) ==
- ((unsigned long)end & (unsigned long)PAGE_MASK)))
- return;
+ if (!area) {
+ usercopy_abort("vmalloc", "no area", to_user, 0, n);
+ return;
+ }
- /* Allow if fully inside the same compound (__GFP_COMP) page. */
- endpage = virt_to_head_page(end);
- if (likely(endpage == page))
+ offset = ptr - area->addr;
+ if (offset + n > get_vm_area_size(area))
+ usercopy_abort("vmalloc", NULL, to_user, offset, n);
return;
-
- /*
- * Reject if range is entirely either Reserved (i.e. special or
- * device memory), or CMA. Otherwise, reject since the object spans
- * several independently allocated pages.
- */
- is_reserved = PageReserved(page);
- is_cma = is_migrate_cma_page(page);
- if (!is_reserved && !is_cma)
- usercopy_abort("spans multiple pages", NULL, to_user, 0, n);
-
- for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) {
- page = virt_to_head_page(ptr);
- if (is_reserved && !PageReserved(page))
- usercopy_abort("spans Reserved and non-Reserved pages",
- NULL, to_user, 0, n);
- if (is_cma && !is_migrate_cma_page(page))
- usercopy_abort("spans CMA and non-CMA pages", NULL,
- to_user, 0, n);
}
-#endif
-}
-
-static inline void check_heap_object(const void *ptr, unsigned long n,
- bool to_user)
-{
- struct folio *folio;
if (!virt_addr_valid(ptr))
return;
- /*
- * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the
- * highmem page or fallback to virt_to_page(). The following
- * is effectively a highmem-aware virt_to_slab().
- */
- folio = page_folio(kmap_to_page((void *)ptr));
+ folio = virt_to_folio(ptr);
if (folio_test_slab(folio)) {
/* Check slab allocator for flags and size. */
__check_heap_object(ptr, n, folio_slab(folio), to_user);
- } else {
- /* Verify object does not incorrectly span multiple pages. */
- check_page_span(ptr, n, folio_page(folio, 0), to_user);
+ } else if (folio_test_large(folio)) {
+ unsigned long offset = ptr - folio_address(folio);
+ if (offset + n > folio_size(folio))
+ usercopy_abort("page alloc", NULL, to_user, offset, n);
}
}
diff --git a/mm/util.c b/mm/util.c
index f27c796239b1..29f4f773dc7b 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -344,6 +344,38 @@ unsigned long randomize_stack_top(unsigned long stack_top)
#endif
}
+/**
+ * randomize_page - Generate a random, page aligned address
+ * @start: The smallest acceptable address the caller will take.
+ * @range: The size of the area, starting at @start, within which the
+ * random address must fall.
+ *
+ * If @start + @range would overflow, @range is capped.
+ *
+ * NOTE: Historical use of randomize_range, which this replaces, presumed that
+ * @start was already page aligned. We now align it regardless.
+ *
+ * Return: A page aligned address within [start, start + range). On error,
+ * @start is returned.
+ */
+unsigned long randomize_page(unsigned long start, unsigned long range)
+{
+ if (!PAGE_ALIGNED(start)) {
+ range -= PAGE_ALIGN(start) - start;
+ start = PAGE_ALIGN(start);
+ }
+
+ if (start > ULONG_MAX - range)
+ range = ULONG_MAX - start;
+
+ range >>= PAGE_SHIFT;
+
+ if (range == 0)
+ return start;
+
+ return start + (get_random_long() % range << PAGE_SHIFT);
+}
+
#ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT
unsigned long arch_randomize_brk(struct mm_struct *mm)
{
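
A hedged usage sketch (the window size is illustrative): randomize_page() is the kind of helper per-arch ASLR code calls to pick a page-aligned address inside a window:

	/*
	 * Illustrative: choose a page-aligned address in a 32 MiB window
	 * above 'base'; if the range collapses to zero, 'base' is returned.
	 */
	unsigned long addr = randomize_page(base, SZ_32M);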
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 76bca20679d9..f7d9a683e3a7 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1178,7 +1178,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping,
* folio->mapping == NULL while being dirty with clean buffers.
*/
if (folio_test_private(folio)) {
- if (try_to_free_buffers(&folio->page)) {
+ if (try_to_free_buffers(folio)) {
folio_clear_dirty(folio);
pr_info("%s: orphaned folio\n", __func__);
return PAGE_CLEAN;
@@ -1280,9 +1280,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
xa_unlock_irq(&mapping->i_pages);
put_swap_page(&folio->page, swap);
} else {
- void (*freepage)(struct page *);
+ void (*free_folio)(struct folio *);
- freepage = mapping->a_ops->freepage;
+ free_folio = mapping->a_ops->free_folio;
/*
* Remember a shadow entry for reclaimed file cache in
* order to detect refaults, thus thrashing, later on.
@@ -1308,8 +1308,8 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
inode_add_lru(mapping->host);
spin_unlock(&mapping->host->i_lock);
- if (freepage != NULL)
- freepage(&folio->page);
+ if (free_folio)
+ free_folio(folio);
}
return 1;
@@ -1456,7 +1456,7 @@ static void folio_check_dirty_writeback(struct folio *folio,
mapping = folio_mapping(folio);
if (mapping && mapping->a_ops->is_dirty_writeback)
- mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback);
+ mapping->a_ops->is_dirty_writeback(folio, dirty, writeback);
}
static struct page *alloc_demote_page(struct page *page, unsigned long node)