diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 19 | ||||
-rw-r--r-- | mm/filemap.c | 99 | ||||
-rw-r--r-- | mm/folio-compat.c | 4 | ||||
-rw-r--r-- | mm/gup.c | 29 | ||||
-rw-r--r-- | mm/huge_memory.c | 7 | ||||
-rw-r--r-- | mm/kfence/.kunitconfig | 6 | ||||
-rw-r--r-- | mm/kfence/core.c | 10 | ||||
-rw-r--r-- | mm/kfence/kfence_test.c | 31 | ||||
-rw-r--r-- | mm/memory-failure.c | 15 | ||||
-rw-r--r-- | mm/memory.c | 4 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 10 | ||||
-rw-r--r-- | mm/page_io.c | 4 | ||||
-rw-r--r-- | mm/page_owner.c | 9 | ||||
-rw-r--r-- | mm/readahead.c | 53 | ||||
-rw-r--r-- | mm/secretmem.c | 8 | ||||
-rw-r--r-- | mm/shmem.c | 4 | ||||
-rw-r--r-- | mm/slab.c | 29 | ||||
-rw-r--r-- | mm/slab.h | 5 | ||||
-rw-r--r-- | mm/slab_common.c | 23 | ||||
-rw-r--r-- | mm/slub.c | 174 | ||||
-rw-r--r-- | mm/swapfile.c | 34 | ||||
-rw-r--r-- | mm/usercopy.c | 91 | ||||
-rw-r--r-- | mm/util.c | 32 | ||||
-rw-r--r-- | mm/vmscan.c | 12 |
26 files changed, 364 insertions, 352 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7176af65b103..ff60bd7d74e0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include <linux/blkdev.h> #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/kthread.h> @@ -390,7 +391,6 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); - struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); struct backing_dev_info *bdi = wb->bdi; mutex_lock(&wb->bdi->cgwb_release_mutex); @@ -401,7 +401,7 @@ static void cgwb_release_workfn(struct work_struct *work) mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ - blkcg_unpin_online(blkcg); + blkcg_unpin_online(wb->blkcg_css); fprop_local_destroy_percpu(&wb->memcg_completions); @@ -446,7 +446,6 @@ static int cgwb_create(struct backing_dev_info *bdi, { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; - struct blkcg *blkcg; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; @@ -454,9 +453,8 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); - blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = &memcg->cgwb_list; - blkcg_cgwb_list = &blkcg->cgwb_list; + blkcg_cgwb_list = blkcg_get_cgwb_list(blkcg_css); /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); @@ -511,7 +509,7 @@ static int cgwb_create(struct backing_dev_info *bdi, list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); - blkcg_pin_online(blkcg); + blkcg_pin_online(blkcg_css); css_get(memcg_css); css_get(blkcg_css); } @@ -724,18 +722,19 @@ void wb_memcg_offline(struct mem_cgroup *memcg) /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined - * @blkcg: blkcg being offlined + * @css: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. */ -void wb_blkcg_offline(struct blkcg *blkcg) +void wb_blkcg_offline(struct cgroup_subsys_state *css) { struct bdi_writeback *wb, *next; + struct list_head *list = blkcg_get_cgwb_list(css); spin_lock_irq(&cgwb_lock); - list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) + list_for_each_entry_safe(wb, next, list, blkcg_node) cgwb_kill(wb); - blkcg->cgwb_list.next = NULL; /* prevent new wb's */ + list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } diff --git a/mm/filemap.c b/mm/filemap.c index de34c6c10384..9daeaab36081 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -225,12 +225,12 @@ void __filemap_remove_folio(struct folio *folio, void *shadow) void filemap_free_folio(struct address_space *mapping, struct folio *folio) { - void (*freepage)(struct page *); + void (*free_folio)(struct folio *); int refs = 1; - freepage = mapping->a_ops->freepage; - if (freepage) - freepage(&folio->page); + free_folio = mapping->a_ops->free_folio; + if (free_folio) + free_folio(folio); if (folio_test_large(folio) && !folio_test_hugetlb(folio)) refs = folio_nr_pages(folio); @@ -807,7 +807,7 @@ void replace_page_cache_page(struct page *old, struct page *new) struct folio *fold = page_folio(old); struct folio *fnew = page_folio(new); struct address_space *mapping = old->mapping; - void (*freepage)(struct page *) = mapping->a_ops->freepage; + void (*free_folio)(struct folio *) = mapping->a_ops->free_folio; pgoff_t offset = old->index; XA_STATE(xas, &mapping->i_pages, offset); @@ -835,9 +835,9 @@ void replace_page_cache_page(struct page *old, struct page *new) if (PageSwapBacked(new)) __inc_lruvec_page_state(new, NR_SHMEM); xas_unlock_irq(&xas); - if (freepage) - freepage(old); - put_page(old); + if (free_folio) + free_folio(fold); + folio_put(fold); } EXPORT_SYMBOL_GPL(replace_page_cache_page); @@ -2414,12 +2414,12 @@ static int filemap_read_folio(struct file *file, struct address_space *mapping, /* * A previous I/O error may have been due to temporary failures, - * eg. multipath errors. PG_error will be set again if readpage + * eg. multipath errors. PG_error will be set again if read_folio * fails. */ folio_clear_error(folio); /* Start the actual read. The read will unlock the page. */ - error = mapping->a_ops->readpage(file, &folio->page); + error = mapping->a_ops->read_folio(file, folio); if (error) return error; @@ -2636,7 +2636,7 @@ err: * @already_read: Number of bytes already read by the caller. * * Copies data from the page cache. If the data is not currently present, - * uses the readahead and readpage address_space operations to fetch it. + * uses the readahead and read_folio address_space operations to fetch it. * * Return: Total number of bytes copied, including those already read by * the caller. If an error happens before any bytes are copied, returns @@ -3452,7 +3452,7 @@ int generic_file_mmap(struct file *file, struct vm_area_struct *vma) { struct address_space *mapping = file->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(file); vma->vm_ops = &generic_file_vm_ops; @@ -3488,10 +3488,13 @@ EXPORT_SYMBOL(generic_file_mmap); EXPORT_SYMBOL(generic_file_readonly_mmap); static struct folio *do_read_cache_folio(struct address_space *mapping, - pgoff_t index, filler_t filler, void *data, gfp_t gfp) + pgoff_t index, filler_t filler, struct file *file, gfp_t gfp) { struct folio *folio; int err; + + if (!filler) + filler = mapping->a_ops->read_folio; repeat: folio = filemap_get_folio(mapping, index); if (!folio) { @@ -3508,11 +3511,7 @@ repeat: } filler: - if (filler) - err = filler(data, &folio->page); - else - err = mapping->a_ops->readpage(data, &folio->page); - + err = filler(file, folio); if (err < 0) { folio_put(folio); return ERR_PTR(err); @@ -3562,44 +3561,44 @@ out: } /** - * read_cache_folio - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: first arg to filler(data, page) function, often left as NULL + * read_cache_folio - Read into page cache, fill it if needed. + * @mapping: The address_space to read from. + * @index: The index to read. + * @filler: Function to perform the read, or NULL to use aops->read_folio(). + * @file: Passed to filler function, may be NULL if not required. * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page and wait for it to become unlocked. + * Read one page into the page cache. If it succeeds, the folio returned + * will contain @index, but it may not be the first page of the folio. * - * If the page does not get brought uptodate, return -EIO. - * - * The function expects mapping->invalidate_lock to be already held. + * If the filler function returns an error, it will be returned to the + * caller. * - * Return: up to date page on success, ERR_PTR() on failure. + * Context: May sleep. Expects mapping->invalidate_lock to be held. + * Return: An uptodate folio on success, ERR_PTR() on failure. */ struct folio *read_cache_folio(struct address_space *mapping, pgoff_t index, - filler_t filler, void *data) + filler_t filler, struct file *file) { - return do_read_cache_folio(mapping, index, filler, data, + return do_read_cache_folio(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_folio); static struct page *do_read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, void *data, gfp_t gfp) + pgoff_t index, filler_t *filler, struct file *file, gfp_t gfp) { struct folio *folio; - folio = do_read_cache_folio(mapping, index, filler, data, gfp); + folio = do_read_cache_folio(mapping, index, filler, file, gfp); if (IS_ERR(folio)) return &folio->page; return folio_file_page(folio, index); } struct page *read_cache_page(struct address_space *mapping, - pgoff_t index, filler_t *filler, void *data) + pgoff_t index, filler_t *filler, struct file *file) { - return do_read_cache_page(mapping, index, filler, data, + return do_read_cache_page(mapping, index, filler, file, mapping_gfp_mask(mapping)); } EXPORT_SYMBOL(read_cache_page); @@ -3627,27 +3626,6 @@ struct page *read_cache_page_gfp(struct address_space *mapping, } EXPORT_SYMBOL(read_cache_page_gfp); -int pagecache_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - const struct address_space_operations *aops = mapping->a_ops; - - return aops->write_begin(file, mapping, pos, len, flags, - pagep, fsdata); -} -EXPORT_SYMBOL(pagecache_write_begin); - -int pagecache_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ - const struct address_space_operations *aops = mapping->a_ops; - - return aops->write_end(file, mapping, pos, len, copied, page, fsdata); -} -EXPORT_SYMBOL(pagecache_write_end); - /* * Warn about a page cache invalidation failure during a direct I/O write. */ @@ -3759,7 +3737,6 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) const struct address_space_operations *a_ops = mapping->a_ops; long status = 0; ssize_t written = 0; - unsigned int flags = 0; do { struct page *page; @@ -3789,7 +3766,7 @@ again: break; } - status = a_ops->write_begin(file, mapping, pos, bytes, flags, + status = a_ops->write_begin(file, mapping, pos, bytes, &page, &fsdata); if (unlikely(status < 0)) break; @@ -3983,8 +3960,8 @@ bool filemap_release_folio(struct folio *folio, gfp_t gfp) if (folio_test_writeback(folio)) return false; - if (mapping && mapping->a_ops->releasepage) - return mapping->a_ops->releasepage(&folio->page, gfp); - return try_to_free_buffers(&folio->page); + if (mapping && mapping->a_ops->release_folio) + return mapping->a_ops->release_folio(folio, gfp); + return try_to_free_buffers(folio); } EXPORT_SYMBOL(filemap_release_folio); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 46fa179e32fb..20bc15b57d93 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -131,12 +131,10 @@ struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index, EXPORT_SYMBOL(pagecache_get_page); struct page *grab_cache_page_write_begin(struct address_space *mapping, - pgoff_t index, unsigned flags) + pgoff_t index) { unsigned fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; - if (flags & AOP_FLAG_NOFS) - fgp_flags |= FGP_NOFS; return pagecache_get_page(mapping, index, fgp_flags, mapping_gfp_mask(mapping)); } @@ -1724,6 +1724,35 @@ out: } EXPORT_SYMBOL(fault_in_writeable); +/** + * fault_in_subpage_writeable - fault in an address range for writing + * @uaddr: start of address range + * @size: size of address range + * + * Fault in a user address range for writing while checking for permissions at + * sub-page granularity (e.g. arm64 MTE). This function should be used when + * the caller cannot guarantee forward progress of a copy_to_user() loop. + * + * Returns the number of bytes not faulted in (like copy_to_user() and + * copy_from_user()). + */ +size_t fault_in_subpage_writeable(char __user *uaddr, size_t size) +{ + size_t faulted_in; + + /* + * Attempt faulting in at page granularity first for page table + * permission checking. The arch-specific probe_subpage_writeable() + * functions may not check for this. + */ + faulted_in = size - fault_in_writeable(uaddr, size); + if (faulted_in) + faulted_in -= probe_subpage_writeable(uaddr, faulted_in); + + return size - faulted_in; +} +EXPORT_SYMBOL(fault_in_subpage_writeable); + /* * fault_in_safe_writeable - fault in an address range for writing * @uaddr: start of address range diff --git a/mm/huge_memory.c b/mm/huge_memory.c index efd06af61fca..a77c78a2b6b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2545,11 +2545,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; + bool is_hzp; - VM_BUG_ON_PAGE(is_huge_zero_page(head), head); VM_BUG_ON_PAGE(!PageLocked(head), head); VM_BUG_ON_PAGE(!PageCompound(head), head); + is_hzp = is_huge_zero_page(head); + VM_WARN_ON_ONCE_PAGE(is_hzp, head); + if (is_hzp) + return -EBUSY; + if (PageWriteback(head)) return -EBUSY; diff --git a/mm/kfence/.kunitconfig b/mm/kfence/.kunitconfig new file mode 100644 index 000000000000..f3d65e939bfa --- /dev/null +++ b/mm/kfence/.kunitconfig @@ -0,0 +1,6 @@ +CONFIG_KUNIT=y +CONFIG_KFENCE=y +CONFIG_KFENCE_KUNIT_TEST=y + +# Additional dependencies. +CONFIG_FTRACE=y diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 2b9f89749154..4e7cd4c8e687 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -630,6 +630,16 @@ static bool __init kfence_init_pool_early(void) * fails for the first page, and therefore expect addr==__kfence_pool in * most failure cases. */ + for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) { + struct slab *slab = virt_to_slab(p); + + if (!slab) + continue; +#ifdef CONFIG_MEMCG + slab->memcg_data = 0; +#endif + __folio_clear_slab(slab_folio(slab)); + } memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool)); __kfence_pool = NULL; return false; diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 21aafc95f4ab..a97bffe0cc3e 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -825,14 +825,6 @@ static void test_exit(struct kunit *test) test_cache_destroy(); } -static struct kunit_suite kfence_test_suite = { - .name = "kfence", - .test_cases = kfence_test_cases, - .init = test_init, - .exit = test_exit, -}; -static struct kunit_suite *kfence_test_suites[] = { &kfence_test_suite, NULL }; - static void register_tracepoints(struct tracepoint *tp, void *ignore) { check_trace_callback_type_console(probe_console); @@ -846,11 +838,7 @@ static void unregister_tracepoints(struct tracepoint *tp, void *ignore) tracepoint_probe_unregister(tp, probe_console, NULL); } -/* - * We only want to do tracepoints setup and teardown once, therefore we have to - * customize the init and exit functions and cannot rely on kunit_test_suite(). - */ -static int __init kfence_test_init(void) +static int kfence_suite_init(struct kunit_suite *suite) { /* * Because we want to be able to build the test as a module, we need to @@ -858,18 +846,25 @@ static int __init kfence_test_init(void) * won't work here. */ for_each_kernel_tracepoint(register_tracepoints, NULL); - return __kunit_test_suites_init(kfence_test_suites); + return 0; } -static void kfence_test_exit(void) +static void kfence_suite_exit(struct kunit_suite *suite) { - __kunit_test_suites_exit(kfence_test_suites); for_each_kernel_tracepoint(unregister_tracepoints, NULL); tracepoint_synchronize_unregister(); } -late_initcall_sync(kfence_test_init); -module_exit(kfence_test_exit); +static struct kunit_suite kfence_test_suite = { + .name = "kfence", + .test_cases = kfence_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kfence_suite_init, + .suite_exit = kfence_suite_exit, +}; + +kunit_test_suites(&kfence_test_suite); MODULE_LICENSE("GPL v2"); MODULE_AUTHOR("Alexander Potapenko <glider@google.com>, Marco Elver <elver@google.com>"); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a934ee8124dd..b85661cbdc4a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1272,7 +1272,7 @@ try_again: } out: if (ret == -EIO) - dump_page(p, "hwpoison: unhandlable page"); + pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p)); return ret; } @@ -1849,19 +1849,6 @@ try_again: if (PageTransHuge(hpage)) { /* - * Bail out before SetPageHasHWPoisoned() if hpage is - * huge_zero_page, although PG_has_hwpoisoned is not - * checked in set_huge_zero_page(). - * - * TODO: Handle memory failure of huge_zero_page thoroughly. - */ - if (is_huge_zero_page(hpage)) { - action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); - res = -EBUSY; - goto unlock_mutex; - } - - /* * The flag must be set after the refcount is bumped * otherwise it may race with THP split. * And the flag can't be set in get_hwpoison_page() since diff --git a/mm/memory.c b/mm/memory.c index 2bf5bca39567..54bcd5327b74 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -558,11 +558,11 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); - pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", + pr_alert("file:%pD fault:%ps mmap:%ps read_folio:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, - mapping ? mapping->a_ops->readpage : NULL); + mapping ? mapping->a_ops->read_folio : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } diff --git a/mm/migrate.c b/mm/migrate.c index 1415ea25f7aa..e51588e95f57 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1020,7 +1020,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (!page->mapping) { VM_BUG_ON_PAGE(PageAnon(page), page); if (page_has_private(page)) { - try_to_free_buffers(page); + try_to_free_buffers(folio); goto out_unlock_both; } } else if (page_mapped(page)) { diff --git a/mm/mremap.c b/mm/mremap.c index 097002506d2d..b522cd0259a0 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -941,7 +941,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, return -EINTR; vma = vma_lookup(mm, addr); if (!vma) { - ret = EFAULT; + ret = -EFAULT; goto out; } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9f459f7f8f83..359dc1da3636 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2609,10 +2609,12 @@ EXPORT_SYMBOL(folio_redirty_for_writepage); * folio_mark_dirty - Mark a folio as being modified. * @folio: The folio. * - * For folios with a mapping this should be done with the folio lock held - * for the benefit of asynchronous memory errors who prefer a consistent - * dirty state. This rule can be broken in some special cases, - * but should be better not to. + * The folio may not be truncated while this function is running. + * Holding the folio lock is sufficient to prevent truncation, but some + * callers cannot acquire a sleeping lock. These callers instead hold + * the page table lock for a page table which contains at least one page + * in this folio. Truncation will block on the page table lock as it + * unmaps pages before removing the folio from its mapping. * * Return: True if the folio was newly dirtied, false if it was already dirty. */ diff --git a/mm/page_io.c b/mm/page_io.c index 1b8075ef3418..68318134dc92 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -496,7 +496,6 @@ int swap_readpage(struct page *page, bool synchronous, * attempt to access it in the page fault retry time check. */ if (synchronous) { - bio->bi_opf |= REQ_POLLED; get_task_struct(current); bio->bi_private = current; } @@ -508,8 +507,7 @@ int swap_readpage(struct page *page, bool synchronous, if (!READ_ONCE(bio->bi_private)) break; - if (!bio_poll(bio, NULL, 0)) - blk_io_schedule(); + blk_io_schedule(); } __set_current_state(TASK_RUNNING); bio_put(bio); diff --git a/mm/page_owner.c b/mm/page_owner.c index d808901e27b2..e4c6f3f1695b 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -45,7 +45,12 @@ static void init_early_allocated_pages(void); static int __init early_page_owner_param(char *buf) { - return kstrtobool(buf, &page_owner_enabled); + int ret = kstrtobool(buf, &page_owner_enabled); + + if (page_owner_enabled) + stack_depot_want_early_init(); + + return ret; } early_param("page_owner", early_page_owner_param); @@ -83,8 +88,6 @@ static __init void init_page_owner(void) if (!page_owner_enabled) return; - stack_depot_init(); - register_dummy_stack(); register_failure_stack(); register_early_stack(); diff --git a/mm/readahead.c b/mm/readahead.c index 8e3775829513..b78921b54754 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -15,7 +15,7 @@ * explicitly requested by the application. Readahead only ever * attempts to read folios that are not yet in the page cache. If a * folio is present but not up-to-date, readahead will not try to read - * it. In that case a simple ->readpage() will be requested. + * it. In that case a simple ->read_folio() will be requested. * * Readahead is triggered when an application read request (whether a * system call or a page fault) finds that the requested folio is not in @@ -78,7 +78,7 @@ * address space operation, for which mpage_readahead() is a canonical * implementation. ->readahead() should normally initiate reads on all * folios, but may fail to read any or all folios without causing an I/O - * error. The page cache reading code will issue a ->readpage() request + * error. The page cache reading code will issue a ->read_folio() request * for any folio which ->readahead() did not read, and only an error * from this will be final. * @@ -110,9 +110,10 @@ * were not fetched with readahead_folio(). This will allow a * subsequent synchronous readahead request to try them again. If they * are left in the page cache, then they will be read individually using - * ->readpage() which may be less efficient. + * ->read_folio() which may be less efficient. */ +#include <linux/blkdev.h> #include <linux/kernel.h> #include <linux/dax.h> #include <linux/gfp.h> @@ -145,7 +146,7 @@ EXPORT_SYMBOL_GPL(file_ra_state_init); static void read_pages(struct readahead_control *rac) { const struct address_space_operations *aops = rac->mapping->a_ops; - struct page *page; + struct folio *folio; struct blk_plug plug; if (!readahead_count(rac)) @@ -156,24 +157,23 @@ static void read_pages(struct readahead_control *rac) if (aops->readahead) { aops->readahead(rac); /* - * Clean up the remaining pages. The sizes in ->ra + * Clean up the remaining folios. The sizes in ->ra * may be used to size the next readahead, so make sure * they accurately reflect what happened. */ - while ((page = readahead_page(rac))) { - rac->ra->size -= 1; - if (rac->ra->async_size > 0) { - rac->ra->async_size -= 1; - delete_from_page_cache(page); + while ((folio = readahead_folio(rac)) != NULL) { + unsigned long nr = folio_nr_pages(folio); + + rac->ra->size -= nr; + if (rac->ra->async_size >= nr) { + rac->ra->async_size -= nr; + filemap_remove_folio(folio); } - unlock_page(page); - put_page(page); + folio_unlock(folio); } } else { - while ((page = readahead_page(rac))) { - aops->readpage(rac->file, page); - put_page(page); - } + while ((folio = readahead_folio(rac)) != NULL) + aops->read_folio(rac->file, folio); } blk_finish_plug(&plug); @@ -254,8 +254,8 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, } /* - * Now start the IO. We ignore I/O errors - if the page is not - * uptodate then the caller will launch readpage again, and + * Now start the IO. We ignore I/O errors - if the folio is not + * uptodate then the caller will launch read_folio again, and * will then handle the error. */ read_pages(ractl); @@ -303,7 +303,7 @@ void force_page_cache_ra(struct readahead_control *ractl, struct backing_dev_info *bdi = inode_to_bdi(mapping->host); unsigned long max_pages, index; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readahead)) + if (unlikely(!mapping->a_ops->read_folio && !mapping->a_ops->readahead)) return; /* @@ -474,7 +474,8 @@ static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index, if (!folio) return -ENOMEM; - if (mark - index < (1UL << order)) + mark = round_up(mark, 1UL << order); + if (index == mark) folio_set_readahead(folio); err = filemap_add_folio(ractl->mapping, folio, index, gfp); if (err) @@ -555,8 +556,9 @@ static void ondemand_readahead(struct readahead_control *ractl, struct file_ra_state *ra = ractl->ra; unsigned long max_pages = ra->ra_pages; unsigned long add_pages; - unsigned long index = readahead_index(ractl); - pgoff_t prev_index; + pgoff_t index = readahead_index(ractl); + pgoff_t expected, prev_index; + unsigned int order = folio ? folio_order(folio) : 0; /* * If the request exceeds the readahead window, allow the read to @@ -575,8 +577,9 @@ static void ondemand_readahead(struct readahead_control *ractl, * It's the expected callback index, assume sequential access. * Ramp up sizes, and push forward the readahead window. */ - if ((index == (ra->start + ra->size - ra->async_size) || - index == (ra->start + ra->size))) { + expected = round_up(ra->start + ra->size - ra->async_size, + 1UL << order); + if (index == expected || index == (ra->start + ra->size)) { ra->start += ra->size; ra->size = get_next_ra_size(ra, max_pages); ra->async_size = ra->size; @@ -662,7 +665,7 @@ readit: } ractl->_index = ra->start; - page_cache_ra_order(ractl, ra, folio ? folio_order(folio) : 0); + page_cache_ra_order(ractl, ra, order); } void page_cache_sync_ra(struct readahead_control *ractl, diff --git a/mm/secretmem.c b/mm/secretmem.c index 3b3cf2892b6a..206ed6b40c1d 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -145,15 +145,15 @@ static int secretmem_migratepage(struct address_space *mapping, return -EBUSY; } -static void secretmem_freepage(struct page *page) +static void secretmem_free_folio(struct folio *folio) { - set_direct_map_default_noflush(page); - clear_highpage(page); + set_direct_map_default_noflush(&folio->page); + folio_zero_segment(folio, 0, folio_size(folio)); } const struct address_space_operations secretmem_aops = { .dirty_folio = noop_dirty_folio, - .freepage = secretmem_freepage, + .free_folio = secretmem_free_folio, .migratepage = secretmem_migratepage, .isolate_page = secretmem_isolate_page, }; diff --git a/mm/shmem.c b/mm/shmem.c index b8169beff226..da30c769b376 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2414,7 +2414,7 @@ static int shmem_initxattrs(struct inode *, const struct xattr *, void *); static int shmem_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; @@ -4145,7 +4145,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) * * This behaves as a tmpfs "read_cache_page_gfp(mapping, index, gfp)", * with any new page allocations done using the specified allocation flags. - * But read_cache_page_gfp() uses the ->readpage() method: which does not + * But read_cache_page_gfp() uses the ->read_folio() method: which does not * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * diff --git a/mm/slab.c b/mm/slab.c index 8fb259511305..f8cd00f4ba13 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -619,18 +619,6 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) return 0; } -static inline void *alternate_node_alloc(struct kmem_cache *cachep, - gfp_t flags) -{ - return NULL; -} - -static inline void *____cache_alloc_node(struct kmem_cache *cachep, - gfp_t flags, int nodeid) -{ - return NULL; -} - static inline gfp_t gfp_exact_node(gfp_t flags) { return flags & ~__GFP_NOFAIL; @@ -638,9 +626,6 @@ static inline gfp_t gfp_exact_node(gfp_t flags) #else /* CONFIG_NUMA */ -static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); -static void *alternate_node_alloc(struct kmem_cache *, gfp_t); - static struct alien_cache *__alloc_alien_cache(int node, int entries, int batch, gfp_t gfp) { @@ -796,7 +781,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) int slab_node = slab_nid(virt_to_slab(objp)); int node = numa_mem_id(); /* - * Make sure we are not freeing a object from another node to the array + * Make sure we are not freeing an object from another node to the array * cache on this cpu. */ if (likely(node == slab_node)) @@ -847,7 +832,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) /* * The kmem_cache_nodes don't come and go as CPUs - * come and go. slab_mutex is sufficient + * come and go. slab_mutex provides sufficient * protection here. */ cachep->node[node] = n; @@ -860,7 +845,7 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) * Allocates and initializes node for a node on each slab cache, used for * either memory or cpu hotplug. If memory is being hot-added, the kmem_cache_node * will be allocated off-node since memory is not yet online for the new node. - * When hotplugging memory or a cpu, existing node are not replaced if + * When hotplugging memory or a cpu, existing nodes are not replaced if * already in use. * * Must hold slab_mutex. @@ -1061,7 +1046,7 @@ int slab_prepare_cpu(unsigned int cpu) * offline. * * Even if all the cpus of a node are down, we don't free the - * kmem_cache_node of any cache. This to avoid a race between cpu_down, and + * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and * a kmalloc allocation from another cpu for memory from the node of * the cpu going down. The kmem_cache_node structure is usually allocated from * kmem_cache_create() and gets destroyed at kmem_cache_destroy(). @@ -1905,7 +1890,7 @@ static bool set_on_slab_cache(struct kmem_cache *cachep, * @flags: SLAB flags * * Returns a ptr to the cache on success, NULL on failure. - * Cannot be called within a int, but can be interrupted. + * Cannot be called within an int, but can be interrupted. * The @ctor is run when new pages are allocated by the cache. * * The flags are @@ -3055,6 +3040,8 @@ out: } #ifdef CONFIG_NUMA +static void *____cache_alloc_node(struct kmem_cache *, gfp_t, int); + /* * Try allocating on another node if PFA_SPREAD_SLAB is a mempolicy is set. * @@ -3150,7 +3137,7 @@ retry: } /* - * A interface to enable slab creation on nodeid + * An interface to enable slab creation on nodeid */ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) diff --git a/mm/slab.h b/mm/slab.h index 95eb34174c1b..db9fb5c8dae7 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -331,7 +331,7 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_ACCOUNT) #elif defined(CONFIG_SLUB) #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \ - SLAB_TEMPORARY | SLAB_ACCOUNT) + SLAB_TEMPORARY | SLAB_ACCOUNT | SLAB_NO_USER_FLAGS) #else #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE) #endif @@ -350,7 +350,8 @@ static inline slab_flags_t kmem_cache_flags(unsigned int object_size, SLAB_NOLEAKTRACE | \ SLAB_RECLAIM_ACCOUNT | \ SLAB_TEMPORARY | \ - SLAB_ACCOUNT) + SLAB_ACCOUNT | \ + SLAB_NO_USER_FLAGS) bool __kmem_cache_empty(struct kmem_cache *); int __kmem_cache_shutdown(struct kmem_cache *); diff --git a/mm/slab_common.c b/mm/slab_common.c index 987e6917d850..77c3adf40e50 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -24,6 +24,7 @@ #include <asm/tlbflush.h> #include <asm/page.h> #include <linux/memcontrol.h> +#include <linux/stackdepot.h> #define CREATE_TRACE_POINTS #include <trace/events/kmem.h> @@ -313,9 +314,13 @@ kmem_cache_create_usercopy(const char *name, * If no slub_debug was enabled globally, the static key is not yet * enabled by setup_slub_debug(). Enable it if the cache is being * created with any of the debugging flags passed explicitly. + * It's also possible that this is the first cache created with + * SLAB_STORE_USER and we should init stack_depot for it. */ if (flags & SLAB_DEBUG_FLAGS) static_branch_enable(&slub_debug_enabled); + if (flags & SLAB_STORE_USER) + stack_depot_init(); #endif mutex_lock(&slab_mutex); @@ -857,6 +862,8 @@ new_kmalloc_cache(int idx, enum kmalloc_cache_type type, slab_flags_t flags) return; } flags |= SLAB_ACCOUNT; + } else if (IS_ENABLED(CONFIG_ZONE_DMA) && (type == KMALLOC_DMA)) { + flags |= SLAB_CACHE_DMA; } kmalloc_caches[type][idx] = create_kmalloc_cache( @@ -885,7 +892,7 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* * Including KMALLOC_CGROUP if CONFIG_MEMCG_KMEM defined */ - for (type = KMALLOC_NORMAL; type <= KMALLOC_RECLAIM; type++) { + for (type = KMALLOC_NORMAL; type < NR_KMALLOC_TYPES; type++) { for (i = KMALLOC_SHIFT_LOW; i <= KMALLOC_SHIFT_HIGH; i++) { if (!kmalloc_caches[type][i]) new_kmalloc_cache(i, type, flags); @@ -906,20 +913,6 @@ void __init create_kmalloc_caches(slab_flags_t flags) /* Kmalloc array is now usable */ slab_state = UP; - -#ifdef CONFIG_ZONE_DMA - for (i = 0; i <= KMALLOC_SHIFT_HIGH; i++) { - struct kmem_cache *s = kmalloc_caches[KMALLOC_NORMAL][i]; - - if (s) { - kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache( - kmalloc_info[i].name[KMALLOC_DMA], - kmalloc_info[i].size, - SLAB_CACHE_DMA | flags, 0, - kmalloc_info[i].size); - } - } -#endif } #endif /* !CONFIG_SLOB */ diff --git a/mm/slub.c b/mm/slub.c index ed5c2c03a47a..e5535020e0fd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -26,6 +26,7 @@ #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/stackdepot.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/kfence.h> @@ -37,6 +38,7 @@ #include <linux/memcontrol.h> #include <linux/random.h> #include <kunit/test.h> +#include <linux/sort.h> #include <linux/debugfs.h> #include <trace/events/kmem.h> @@ -264,8 +266,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#ifdef CONFIG_STACKDEPOT + depot_stack_handle_t handle; #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -724,57 +726,51 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } -static void set_track(struct kmem_cache *s, void *object, +static void noinline set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { struct track *p = get_track(s, object, alloc); - if (addr) { -#ifdef CONFIG_STACKTRACE - unsigned int nr_entries; - - metadata_access_enable(); - nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), - TRACK_ADDRS_COUNT, 3); - metadata_access_disable(); +#ifdef CONFIG_STACKDEPOT + unsigned long entries[TRACK_ADDRS_COUNT]; + unsigned int nr_entries; - if (nr_entries < TRACK_ADDRS_COUNT) - p->addrs[nr_entries] = 0; + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 3); + p->handle = stack_depot_save(entries, nr_entries, GFP_NOWAIT); #endif - p->addr = addr; - p->cpu = smp_processor_id(); - p->pid = current->pid; - p->when = jiffies; - } else { - memset(p, 0, sizeof(struct track)); - } + + p->addr = addr; + p->cpu = smp_processor_id(); + p->pid = current->pid; + p->when = jiffies; } static void init_tracking(struct kmem_cache *s, void *object) { + struct track *p; + if (!(s->flags & SLAB_STORE_USER)) return; - set_track(s, object, TRACK_FREE, 0UL); - set_track(s, object, TRACK_ALLOC, 0UL); + p = get_track(s, object, TRACK_ALLOC); + memset(p, 0, 2*sizeof(struct track)); } static void print_track(const char *s, struct track *t, unsigned long pr_time) { + depot_stack_handle_t handle __maybe_unused; + if (!t->addr) return; pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKTRACE - { - int i; - for (i = 0; i < TRACK_ADDRS_COUNT; i++) - if (t->addrs[i]) - pr_err("\t%pS\n", (void *)t->addrs[i]); - else - break; - } +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(t->handle); + if (handle) + stack_depot_print(handle); + else + pr_err("object allocation/free stack trace missing\n"); #endif } @@ -1021,7 +1017,7 @@ static int check_pad_bytes(struct kmem_cache *s, struct slab *slab, u8 *p) } /* Check the pad bytes at the end of a slab page */ -static int slab_pad_check(struct kmem_cache *s, struct slab *slab) +static void slab_pad_check(struct kmem_cache *s, struct slab *slab) { u8 *start; u8 *fault; @@ -1031,21 +1027,21 @@ static int slab_pad_check(struct kmem_cache *s, struct slab *slab) int remainder; if (!(s->flags & SLAB_POISON)) - return 1; + return; start = slab_address(slab); length = slab_size(slab); end = start + length; remainder = length % s->size; if (!remainder) - return 1; + return; pad = end - remainder; metadata_access_enable(); fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder); metadata_access_disable(); if (!fault) - return 1; + return; while (end > fault && end[-1] == POISON_INUSE) end--; @@ -1054,7 +1050,6 @@ static int slab_pad_check(struct kmem_cache *s, struct slab *slab) print_section(KERN_ERR, "Padding ", pad, remainder); restore_bytes(s, "slab padding", POISON_INUSE, fault, end); - return 0; } static int check_object(struct kmem_cache *s, struct slab *slab, @@ -1268,8 +1263,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) } /* Object debug checks for alloc/free paths */ -static void setup_object_debug(struct kmem_cache *s, struct slab *slab, - void *object) +static void setup_object_debug(struct kmem_cache *s, void *object) { if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)) return; @@ -1534,6 +1528,8 @@ static int __init setup_slub_debug(char *str) global_slub_debug_changed = true; } else { slab_list_specified = true; + if (flags & SLAB_STORE_USER) + stack_depot_want_early_init(); } } @@ -1551,6 +1547,8 @@ static int __init setup_slub_debug(char *str) } out: slub_debug = global_flags; + if (slub_debug & SLAB_STORE_USER) + stack_depot_want_early_init(); if (slub_debug != 0 || slub_debug_string) static_branch_enable(&slub_debug_enabled); else @@ -1584,6 +1582,9 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, slab_flags_t block_flags; slab_flags_t slub_debug_local = slub_debug; + if (flags & SLAB_NO_USER_FLAGS) + return flags; + /* * If the slab cache is for debugging (e.g. kmemleak) then * don't store user (stack trace) information by default, @@ -1628,8 +1629,7 @@ slab_flags_t kmem_cache_flags(unsigned int object_size, return flags | slub_debug_local; } #else /* !CONFIG_SLUB_DEBUG */ -static inline void setup_object_debug(struct kmem_cache *s, - struct slab *slab, void *object) {} +static inline void setup_object_debug(struct kmem_cache *s, void *object) {} static inline void setup_slab_debug(struct kmem_cache *s, struct slab *slab, void *addr) {} @@ -1641,8 +1641,7 @@ static inline int free_debug_processing( void *head, void *tail, int bulk_cnt, unsigned long addr) { return 0; } -static inline int slab_pad_check(struct kmem_cache *s, struct slab *slab) - { return 1; } +static inline void slab_pad_check(struct kmem_cache *s, struct slab *slab) {} static inline int check_object(struct kmem_cache *s, struct slab *slab, void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n, @@ -1772,10 +1771,9 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s, return *head != NULL; } -static void *setup_object(struct kmem_cache *s, struct slab *slab, - void *object) +static void *setup_object(struct kmem_cache *s, void *object) { - setup_object_debug(s, slab, object); + setup_object_debug(s, object); object = kasan_init_slab_obj(s, object); if (unlikely(s->ctor)) { kasan_unpoison_object_data(s, object); @@ -1894,13 +1892,13 @@ static bool shuffle_freelist(struct kmem_cache *s, struct slab *slab) /* First entry is used as the base of the freelist */ cur = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); - cur = setup_object(s, slab, cur); + cur = setup_object(s, cur); slab->freelist = cur; for (idx = 1; idx < slab->objects; idx++) { next = next_freelist_entry(s, slab, &pos, start, page_limit, freelist_count); - next = setup_object(s, slab, next); + next = setup_object(s, next); set_freepointer(s, cur, next); cur = next; } @@ -1939,7 +1937,7 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; if ((alloc_gfp & __GFP_DIRECT_RECLAIM) && oo_order(oo) > oo_order(s->min)) - alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~(__GFP_RECLAIM|__GFP_NOFAIL); + alloc_gfp = (alloc_gfp | __GFP_NOMEMALLOC) & ~__GFP_RECLAIM; slab = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!slab)) { @@ -1971,11 +1969,11 @@ static struct slab *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!shuffle) { start = fixup_red_left(s, start); - start = setup_object(s, slab, start); + start = setup_object(s, start); slab->freelist = start; for (idx = 0, p = start; idx < slab->objects - 1; idx++) { next = p + s->size; - next = setup_object(s, slab, next); + next = setup_object(s, next); set_freepointer(s, p, next); p = next; } @@ -2910,7 +2908,6 @@ redo: */ if (!node_isset(node, slab_nodes)) { node = NUMA_NO_NODE; - goto redo; } else { stat(s, ALLOC_NODE_MISMATCH); goto deactivate_slab; @@ -4165,8 +4162,6 @@ static int calculate_sizes(struct kmem_cache *s) */ s->oo = oo_make(order, size); s->min = oo_make(get_order(size), size); - if (oo_objects(s->oo) > oo_objects(s->max)) - s->max = s->oo; return !!oo_objects(s->oo); } @@ -4344,18 +4339,26 @@ void __kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct slab *slab) objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; -#ifdef CONFIG_STACKTRACE - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_stack[i]) - break; - } +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; - trackp = get_track(s, objp, TRACK_FREE); - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_free_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_free_stack[i]) - break; + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } + + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } } #endif #endif @@ -5057,6 +5060,7 @@ EXPORT_SYMBOL(validate_slab_cache); */ struct location { + depot_stack_handle_t handle; unsigned long count; unsigned long addr; long long sum_time; @@ -5109,9 +5113,13 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, { long start, end, pos; struct location *l; - unsigned long caddr; + unsigned long caddr, chandle; unsigned long age = jiffies - track->when; + depot_stack_handle_t handle = 0; +#ifdef CONFIG_STACKDEPOT + handle = READ_ONCE(track->handle); +#endif start = -1; end = t->count; @@ -5126,7 +5134,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, break; caddr = t->loc[pos].addr; - if (track->addr == caddr) { + chandle = t->loc[pos].handle; + if ((track->addr == caddr) && (handle == chandle)) { l = &t->loc[pos]; l->count++; @@ -5151,6 +5160,8 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, if (track->addr < caddr) end = pos; + else if (track->addr == caddr && handle < chandle) + end = pos; else start = pos; } @@ -5173,6 +5184,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, l->max_time = age; l->min_pid = track->pid; l->max_pid = track->pid; + l->handle = handle; cpumask_clear(to_cpumask(l->cpus)); cpumask_set_cpu(track->cpu, to_cpumask(l->cpus)); nodes_clear(l->nodes); @@ -6082,6 +6094,21 @@ static int slab_debugfs_show(struct seq_file *seq, void *v) seq_printf(seq, " nodes=%*pbl", nodemask_pr_args(&l->nodes)); +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries, j; + + handle = READ_ONCE(l->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + seq_puts(seq, "\n"); + for (j = 0; j < nr_entries; j++) + seq_printf(seq, " %pS\n", (void *)entries[j]); + } + } +#endif seq_puts(seq, "\n"); } @@ -6106,6 +6133,17 @@ static void *slab_debugfs_next(struct seq_file *seq, void *v, loff_t *ppos) return NULL; } +static int cmp_loc_by_count(const void *a, const void *b, const void *data) +{ + struct location *loc1 = (struct location *)a; + struct location *loc2 = (struct location *)b; + + if (loc1->count > loc2->count) + return -1; + else + return 1; +} + static void *slab_debugfs_start(struct seq_file *seq, loff_t *ppos) { struct loc_track *t = seq->private; @@ -6167,6 +6205,10 @@ static int slab_debug_trace_open(struct inode *inode, struct file *filep) spin_unlock_irqrestore(&n->list_lock, flags); } + /* Sort locations by count */ + sort_r(t->loc, t->count, sizeof(struct location), + cmp_loc_by_count, NULL, NULL); + bitmap_free(obj_map); return 0; } diff --git a/mm/swapfile.c b/mm/swapfile.c index a0eb690d9926..94b4ff43ead0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -6,6 +6,7 @@ * Swap reorganised 29.12.95, Stephen Tweedie */ +#include <linux/blkdev.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/task.h> @@ -180,7 +181,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); if (nr_blocks) { err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, 0); + nr_blocks, GFP_KERNEL); if (err) return err; cond_resched(); @@ -191,7 +192,7 @@ static int discard_swap(struct swap_info_struct *si) nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL, 0); + nr_blocks, GFP_KERNEL); if (err) break; @@ -255,7 +256,7 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO, 0)) + nr_blocks, GFP_NOIO)) break; se = next_se(se); @@ -2472,7 +2473,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); - if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + if (!p->bdev || !bdev_nonrot(p->bdev)) atomic_dec(&nr_rotate_swap); mutex_lock(&swapon_mutex); @@ -2767,7 +2768,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) * write only restriction. Hence zoned block devices are not * suitable for swapping. Disallow them here. */ - if (blk_queue_is_zoned(p->bdev->bd_disk->queue)) + if (bdev_is_zoned(p->bdev)) return -EINVAL; p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { @@ -2963,20 +2964,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } -/* - * Helper to sys_swapon determining if a given swap - * backing device queue supports DISCARD operations. - */ -static bool swap_discardable(struct swap_info_struct *si) -{ - struct request_queue *q = bdev_get_queue(si->bdev); - - if (!blk_queue_discard(q)) - return false; - - return true; -} - SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -3047,7 +3034,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* * Read the swap header. */ - if (!mapping->a_ops->readpage) { + if (!mapping->a_ops->read_folio) { error = -EINVAL; goto bad_swap_unlock_inode; } @@ -3071,13 +3058,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap_unlock_inode; } - if (p->bdev && blk_queue_stable_writes(p->bdev->bd_disk->queue)) + if (p->bdev && bdev_stable_writes(p->bdev)) p->flags |= SWP_STABLE_WRITES; if (p->bdev && p->bdev->bd_disk->fops->rw_page) p->flags |= SWP_SYNCHRONOUS_IO; - if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + if (p->bdev && bdev_nonrot(p->bdev)) { int cpu; unsigned long ci, nr_cluster; @@ -3138,7 +3125,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sizeof(long), GFP_KERNEL); - if (p->bdev && (swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + if ((swap_flags & SWAP_FLAG_DISCARD) && + p->bdev && bdev_max_discard_sectors(p->bdev)) { /* * When discard is enabled for swap with no particular * policy flagged, we set all swap discard flags here in diff --git a/mm/usercopy.c b/mm/usercopy.c index 2c235d5c2364..baeacc735b83 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -17,6 +17,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/thread_info.h> +#include <linux/vmalloc.h> #include <linux/atomic.h> #include <linux/jump_label.h> #include <asm/sections.h> @@ -157,91 +158,47 @@ static inline void check_bogus_address(const unsigned long ptr, unsigned long n, usercopy_abort("null address", NULL, to_user, ptr, n); } -/* Checks for allocs that are marked in some way as spanning multiple pages. */ -static inline void check_page_span(const void *ptr, unsigned long n, - struct page *page, bool to_user) +static inline void check_heap_object(const void *ptr, unsigned long n, + bool to_user) { -#ifdef CONFIG_HARDENED_USERCOPY_PAGESPAN - const void *end = ptr + n - 1; - struct page *endpage; - bool is_reserved, is_cma; + struct folio *folio; - /* - * Sometimes the kernel data regions are not marked Reserved (see - * check below). And sometimes [_sdata,_edata) does not cover - * rodata and/or bss, so check each range explicitly. - */ + if (is_kmap_addr(ptr)) { + unsigned long page_end = (unsigned long)ptr | (PAGE_SIZE - 1); - /* Allow reads of kernel rodata region (if not marked as Reserved). */ - if (ptr >= (const void *)__start_rodata && - end <= (const void *)__end_rodata) { - if (!to_user) - usercopy_abort("rodata", NULL, to_user, 0, n); + if ((unsigned long)ptr + n - 1 > page_end) + usercopy_abort("kmap", NULL, to_user, + offset_in_page(ptr), n); return; } - /* Allow kernel data region (if not marked as Reserved). */ - if (ptr >= (const void *)_sdata && end <= (const void *)_edata) - return; + if (is_vmalloc_addr(ptr)) { + struct vm_struct *area = find_vm_area(ptr); + unsigned long offset; - /* Allow kernel bss region (if not marked as Reserved). */ - if (ptr >= (const void *)__bss_start && - end <= (const void *)__bss_stop) - return; - - /* Is the object wholly within one base page? */ - if (likely(((unsigned long)ptr & (unsigned long)PAGE_MASK) == - ((unsigned long)end & (unsigned long)PAGE_MASK))) - return; + if (!area) { + usercopy_abort("vmalloc", "no area", to_user, 0, n); + return; + } - /* Allow if fully inside the same compound (__GFP_COMP) page. */ - endpage = virt_to_head_page(end); - if (likely(endpage == page)) + offset = ptr - area->addr; + if (offset + n > get_vm_area_size(area)) + usercopy_abort("vmalloc", NULL, to_user, offset, n); return; - - /* - * Reject if range is entirely either Reserved (i.e. special or - * device memory), or CMA. Otherwise, reject since the object spans - * several independently allocated pages. - */ - is_reserved = PageReserved(page); - is_cma = is_migrate_cma_page(page); - if (!is_reserved && !is_cma) - usercopy_abort("spans multiple pages", NULL, to_user, 0, n); - - for (ptr += PAGE_SIZE; ptr <= end; ptr += PAGE_SIZE) { - page = virt_to_head_page(ptr); - if (is_reserved && !PageReserved(page)) - usercopy_abort("spans Reserved and non-Reserved pages", - NULL, to_user, 0, n); - if (is_cma && !is_migrate_cma_page(page)) - usercopy_abort("spans CMA and non-CMA pages", NULL, - to_user, 0, n); } -#endif -} - -static inline void check_heap_object(const void *ptr, unsigned long n, - bool to_user) -{ - struct folio *folio; if (!virt_addr_valid(ptr)) return; - /* - * When CONFIG_HIGHMEM=y, kmap_to_page() will give either the - * highmem page or fallback to virt_to_page(). The following - * is effectively a highmem-aware virt_to_slab(). - */ - folio = page_folio(kmap_to_page((void *)ptr)); + folio = virt_to_folio(ptr); if (folio_test_slab(folio)) { /* Check slab allocator for flags and size. */ __check_heap_object(ptr, n, folio_slab(folio), to_user); - } else { - /* Verify object does not incorrectly span multiple pages. */ - check_page_span(ptr, n, folio_page(folio, 0), to_user); + } else if (folio_test_large(folio)) { + unsigned long offset = ptr - folio_address(folio); + if (offset + n > folio_size(folio)) + usercopy_abort("page alloc", NULL, to_user, offset, n); } } diff --git a/mm/util.c b/mm/util.c index f27c796239b1..29f4f773dc7b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -344,6 +344,38 @@ unsigned long randomize_stack_top(unsigned long stack_top) #endif } +/** + * randomize_page - Generate a random, page aligned address + * @start: The smallest acceptable address the caller will take. + * @range: The size of the area, starting at @start, within which the + * random address must fall. + * + * If @start + @range would overflow, @range is capped. + * + * NOTE: Historical use of randomize_range, which this replaces, presumed that + * @start was already page aligned. We now align it regardless. + * + * Return: A page aligned address within [start, start + range). On error, + * @start is returned. + */ +unsigned long randomize_page(unsigned long start, unsigned long range) +{ + if (!PAGE_ALIGNED(start)) { + range -= PAGE_ALIGN(start) - start; + start = PAGE_ALIGN(start); + } + + if (start > ULONG_MAX - range) + range = ULONG_MAX - start; + + range >>= PAGE_SHIFT; + + if (range == 0) + return start; + + return start + (get_random_long() % range << PAGE_SHIFT); +} + #ifdef CONFIG_ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT unsigned long arch_randomize_brk(struct mm_struct *mm) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 76bca20679d9..f7d9a683e3a7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1178,7 +1178,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, * folio->mapping == NULL while being dirty with clean buffers. */ if (folio_test_private(folio)) { - if (try_to_free_buffers(&folio->page)) { + if (try_to_free_buffers(folio)) { folio_clear_dirty(folio); pr_info("%s: orphaned folio\n", __func__); return PAGE_CLEAN; @@ -1280,9 +1280,9 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, xa_unlock_irq(&mapping->i_pages); put_swap_page(&folio->page, swap); } else { - void (*freepage)(struct page *); + void (*free_folio)(struct folio *); - freepage = mapping->a_ops->freepage; + free_folio = mapping->a_ops->free_folio; /* * Remember a shadow entry for reclaimed file cache in * order to detect refaults, thus thrashing, later on. @@ -1308,8 +1308,8 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); - if (freepage != NULL) - freepage(&folio->page); + if (free_folio) + free_folio(folio); } return 1; @@ -1456,7 +1456,7 @@ static void folio_check_dirty_writeback(struct folio *folio, mapping = folio_mapping(folio); if (mapping && mapping->a_ops->is_dirty_writeback) - mapping->a_ops->is_dirty_writeback(&folio->page, dirty, writeback); + mapping->a_ops->is_dirty_writeback(folio, dirty, writeback); } static struct page *alloc_demote_page(struct page *page, unsigned long node) |