Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig       |   4
-rw-r--r-- | mm/Makefile      |   1
-rw-r--r-- | mm/filemap.c     |  36
-rw-r--r-- | mm/gup.c         |  12
-rw-r--r-- | mm/init-mm.c     |   9
-rw-r--r-- | mm/internal.h    |   3
-rw-r--r-- | mm/memblock.c    |  26
-rw-r--r-- | mm/mlock.c       |   3
-rw-r--r-- | mm/mmap.c        |   5
-rw-r--r-- | mm/mremap.c      | 108
-rw-r--r-- | mm/oom_kill.c    |   2
-rw-r--r-- | mm/secretmem.c   | 254
-rw-r--r-- | mm/slab.h        |   1
-rw-r--r-- | mm/slab_common.c |  12
-rw-r--r-- | mm/slub.c        |  75
-rw-r--r-- | mm/util.c        |   2
16 files changed, 474 insertions, 79 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index a02498c0e13d..40a9bfcd5062 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -885,4 +885,8 @@ config KMAP_LOCAL # struct io_mapping based helper. Selected by drivers that need them config IO_MAPPING bool + +config SECRETMEM + def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED + endmenu diff --git a/mm/Makefile b/mm/Makefile index 74b47c354682..e3436741d539 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -113,6 +113,7 @@ obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o +obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o diff --git a/mm/filemap.c b/mm/filemap.c index ac82a93d4f38..d1458ecf2f51 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3642,10 +3642,6 @@ again: * Otherwise there's a nasty deadlock on copying from the * same page as we're writing to, without it being marked * up-to-date. - * - * Not only is this an optimisation, but it is also required - * to check that the address is actually valid, when atomic - * usercopies are used, below. */ if (unlikely(iov_iter_fault_in_readable(i, bytes))) { status = -EFAULT; @@ -3665,33 +3661,31 @@ again: if (mapping_writably_mapped(mapping)) flush_dcache_page(page); - copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); + copied = copy_page_from_iter_atomic(page, offset, bytes, i); flush_dcache_page(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); - if (unlikely(status < 0)) - break; - copied = status; - + if (unlikely(status != copied)) { + iov_iter_revert(i, copied - max(status, 0L)); + if (unlikely(status < 0)) + break; + } cond_resched(); - iov_iter_advance(i, copied); - if (unlikely(copied == 0)) { + if (unlikely(status == 0)) { /* - * If we were unable to copy any data at all, we must - * fall back to a single segment length write. - * - * If we didn't fallback here, we could livelock - * because not all segments in the iov can be copied at - * once without a pagefault. + * A short copy made ->write_end() reject the + * thing entirely. Might be memory poisoning + * halfway through, might be a race with munmap, + * might be severe memory pressure. 
*/ - bytes = min_t(unsigned long, PAGE_SIZE - offset, - iov_iter_single_seg_count(i)); + if (copied) + bytes = copied; goto again; } - pos += copied; - written += copied; + pos += status; + written += status; balance_dirty_pages_ratelimited(mapping); } while (iov_iter_count(i)); @@ -10,6 +10,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/secretmem.h> #include <linux/sched/signal.h> #include <linux/rwsem.h> @@ -855,6 +856,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, struct follow_page_context ctx = { NULL }; struct page *page; + if (vma_is_secretmem(vma)) + return NULL; + page = follow_page_mask(vma, address, foll_flags, &ctx); if (ctx.pgmap) put_dev_pagemap(ctx.pgmap); @@ -988,6 +992,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma)) return -EOPNOTSUPP; + if (vma_is_secretmem(vma)) + return -EFAULT; + if (write) { if (!(vm_flags & VM_WRITE)) { if (!(gup_flags & FOLL_FORCE)) @@ -2170,6 +2177,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, if (!head) goto pte_unmap; + if (unlikely(page_is_secretmem(page))) { + put_compound_head(head, 1, flags); + goto pte_unmap; + } + if (unlikely(pte_val(pte) != pte_val(*ptep))) { put_compound_head(head, 1, flags); goto pte_unmap; diff --git a/mm/init-mm.c b/mm/init-mm.c index 153162669f80..b4a6f38fb51d 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -40,3 +40,12 @@ struct mm_struct init_mm = { .cpu_bitmap = CPU_BITS_NONE, INIT_MM_CONTEXT(init_mm) }; + +void setup_initial_init_mm(void *start_code, void *end_code, + void *end_data, void *brk) +{ + init_mm.start_code = (unsigned long)start_code; + init_mm.end_code = (unsigned long)end_code; + init_mm.end_data = (unsigned long)end_data; + init_mm.brk = (unsigned long)brk; +} diff --git a/mm/internal.h b/mm/internal.h index 2d7c9a2e0118..31ff935b2547 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -360,6 +360,9 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) extern void mlock_vma_page(struct page *page); extern unsigned int munlock_vma_page(struct page *page); +extern int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len); + /* * Clear the page's PageMlocked(). This can be useful in a situation where * we want to unconditionally remove a page from the pagecache -- e.g., diff --git a/mm/memblock.c b/mm/memblock.c index 3e4acbf03ab7..0041ff62c584 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -182,6 +182,8 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type, { unsigned long i; + memblock_cap_size(base, &size); + for (i = 0; i < type->cnt; i++) if (memblock_addrs_overlap(base, size, type->regions[i].base, type->regions[i].size)) @@ -1799,7 +1801,6 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz */ bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) { - memblock_cap_size(base, &size); return memblock_overlaps_region(&memblock.reserved, base, size); } @@ -1946,14 +1947,13 @@ static void __init free_unused_memmap(void) * due to SPARSEMEM sections which aren't present. */ start = min(start, ALIGN(prev_end, PAGES_PER_SECTION)); -#else +#endif /* - * Align down here since the VM subsystem insists that the - * memmap entries are valid from the bank start aligned to - * MAX_ORDER_NR_PAGES. 
+ * Align down here since many operations in VM subsystem + * presume that there are no holes in the memory map inside + * a pageblock */ - start = round_down(start, MAX_ORDER_NR_PAGES); -#endif + start = round_down(start, pageblock_nr_pages); /* * If we had a previous bank, and there is a space @@ -1963,16 +1963,18 @@ static void __init free_unused_memmap(void) free_memmap(prev_end, start); /* - * Align up here since the VM subsystem insists that the - * memmap entries are valid from the bank end aligned to - * MAX_ORDER_NR_PAGES. + * Align up here since many operations in VM subsystem + * presume that there are no holes in the memory map inside + * a pageblock */ - prev_end = ALIGN(end, MAX_ORDER_NR_PAGES); + prev_end = ALIGN(end, pageblock_nr_pages); } #ifdef CONFIG_SPARSEMEM - if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) + if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { + prev_end = ALIGN(end, pageblock_nr_pages); free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); + } #endif } diff --git a/mm/mlock.c b/mm/mlock.c index 0d639bf48794..16d2ee160d43 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -23,6 +23,7 @@ #include <linux/hugetlb.h> #include <linux/memcontrol.h> #include <linux/mm_inline.h> +#include <linux/secretmem.h> #include "internal.h" @@ -503,7 +504,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) || - vma_is_dax(vma)) + vma_is_dax(vma) || vma_is_secretmem(vma)) /* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */ goto out; diff --git a/mm/mmap.c b/mm/mmap.c index aa9de981b659..ca54d36d203a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1352,9 +1352,8 @@ static inline unsigned long round_hint_to_min(unsigned long hint) return hint; } -static inline int mlock_future_check(struct mm_struct *mm, - unsigned long flags, - unsigned long len) +int mlock_future_check(struct mm_struct *mm, unsigned long flags, + unsigned long len) { unsigned long locked, lock_limit; diff --git a/mm/mremap.c b/mm/mremap.c index a369a6100698..5989d3990020 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -25,7 +25,8 @@ #include <linux/userfaultfd_k.h> #include <asm/cacheflush.h> -#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/pgalloc.h> #include "internal.h" @@ -209,6 +210,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, drop_rmap_locks(vma); } +#ifndef arch_supports_page_table_move +#define arch_supports_page_table_move arch_supports_page_table_move +static inline bool arch_supports_page_table_move(void) +{ + return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) || + IS_ENABLED(CONFIG_HAVE_MOVE_PUD); +} +#endif + #ifdef CONFIG_HAVE_MOVE_PMD static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd) @@ -217,6 +227,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, struct mm_struct *mm = vma->vm_mm; pmd_t pmd; + if (!arch_supports_page_table_move()) + return false; /* * The destination pmd shouldn't be established, free_pgtables() * should have released it. 
@@ -258,8 +270,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr, VM_BUG_ON(!pmd_none(*new_pmd)); - /* Set the new pmd */ - set_pmd_at(mm, new_addr, new_pmd, pmd); + pmd_populate(mm, new_pmd, pmd_pgtable(pmd)); flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); @@ -276,7 +287,7 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma, } #endif -#ifdef CONFIG_HAVE_MOVE_PUD +#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD) static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) { @@ -284,6 +295,8 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, struct mm_struct *mm = vma->vm_mm; pud_t pud; + if (!arch_supports_page_table_move()) + return false; /* * The destination pud shouldn't be established, free_pgtables() * should have released it. @@ -306,8 +319,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr, VM_BUG_ON(!pud_none(*new_pud)); - /* Set the new pud */ - set_pud_at(mm, new_addr, new_pud, pud); + pud_populate(mm, new_pud, pud_pgtable(pud)); flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE); if (new_ptl != old_ptl) spin_unlock(new_ptl); @@ -324,10 +336,61 @@ static inline bool move_normal_pud(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD +static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + spinlock_t *old_ptl, *new_ptl; + struct mm_struct *mm = vma->vm_mm; + pud_t pud; + + /* + * The destination pud shouldn't be established, free_pgtables() + * should have released it. + */ + if (WARN_ON_ONCE(!pud_none(*new_pud))) + return false; + + /* + * We don't have to worry about the ordering of src and dst + * ptlocks because exclusive mmap_lock prevents deadlock. 
+ */ + old_ptl = pud_lock(vma->vm_mm, old_pud); + new_ptl = pud_lockptr(mm, new_pud); + if (new_ptl != old_ptl) + spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); + + /* Clear the pud */ + pud = *old_pud; + pud_clear(old_pud); + + VM_BUG_ON(!pud_none(*new_pud)); + + /* Set the new pud */ + /* mark soft_ditry when we add pud level soft dirty support */ + set_pud_at(mm, new_addr, new_pud, pud); + flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE); + if (new_ptl != old_ptl) + spin_unlock(new_ptl); + spin_unlock(old_ptl); + + return true; +} +#else +static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr, + unsigned long new_addr, pud_t *old_pud, pud_t *new_pud) +{ + WARN_ON_ONCE(1); + return false; + +} +#endif + enum pgt_entry { NORMAL_PMD, HPAGE_PMD, NORMAL_PUD, + HPAGE_PUD, }; /* @@ -347,6 +410,7 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry, mask = PMD_MASK; size = PMD_SIZE; break; + case HPAGE_PUD: case NORMAL_PUD: mask = PUD_MASK; size = PUD_SIZE; @@ -395,6 +459,12 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma, move_huge_pmd(vma, old_addr, new_addr, old_entry, new_entry); break; + case HPAGE_PUD: + moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + move_huge_pud(vma, old_addr, new_addr, old_entry, + new_entry); + break; + default: WARN_ON_ONCE(1); break; @@ -414,6 +484,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, unsigned long extent, old_end; struct mmu_notifier_range range; pmd_t *old_pmd, *new_pmd; + pud_t *old_pud, *new_pud; old_end = old_addr + len; flush_cache_range(vma, old_addr, old_end); @@ -429,17 +500,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * PUD level if possible. */ extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr); - if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { - pud_t *old_pud, *new_pud; - old_pud = get_old_pud(vma->vm_mm, old_addr); - if (!old_pud) + old_pud = get_old_pud(vma->vm_mm, old_addr); + if (!old_pud) + continue; + new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); + if (!new_pud) + break; + if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) { + if (extent == HPAGE_PUD_SIZE) { + move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr, + old_pud, new_pud, need_rmap_locks); + /* We ignore and continue on error? */ continue; - new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr); - if (!new_pud) - break; + } + } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) { + if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr, - old_pud, new_pud, need_rmap_locks)) + old_pud, new_pud, true)) continue; } @@ -466,7 +544,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, * moving at the PMD level if possible. */ if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr, - old_pmd, new_pmd, need_rmap_locks)) + old_pmd, new_pmd, true)) continue; } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index fcc29e9a3064..c729a4c4a1ac 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -922,7 +922,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message) continue; } /* - * No kthead_use_mm() user needs to read from the userspace so + * No kthread_use_mm() user needs to read from the userspace so * we are ok to reap it. 
*/ if (unlikely(p->flags & PF_KTHREAD)) diff --git a/mm/secretmem.c b/mm/secretmem.c new file mode 100644 index 000000000000..f77d25467a14 --- /dev/null +++ b/mm/secretmem.c @@ -0,0 +1,254 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright IBM Corporation, 2021 + * + * Author: Mike Rapoport <rppt@linux.ibm.com> + */ + +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/swap.h> +#include <linux/mount.h> +#include <linux/memfd.h> +#include <linux/bitops.h> +#include <linux/printk.h> +#include <linux/pagemap.h> +#include <linux/syscalls.h> +#include <linux/pseudo_fs.h> +#include <linux/secretmem.h> +#include <linux/set_memory.h> +#include <linux/sched/signal.h> + +#include <uapi/linux/magic.h> + +#include <asm/tlbflush.h> + +#include "internal.h" + +#undef pr_fmt +#define pr_fmt(fmt) "secretmem: " fmt + +/* + * Define mode and flag masks to allow validation of the system call + * parameters. + */ +#define SECRETMEM_MODE_MASK (0x0) +#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK + +static bool secretmem_enable __ro_after_init; +module_param_named(enable, secretmem_enable, bool, 0400); +MODULE_PARM_DESC(secretmem_enable, + "Enable secretmem and memfd_secret(2) system call"); + +static atomic_t secretmem_users; + +bool secretmem_active(void) +{ + return !!atomic_read(&secretmem_users); +} + +static vm_fault_t secretmem_fault(struct vm_fault *vmf) +{ + struct address_space *mapping = vmf->vma->vm_file->f_mapping; + struct inode *inode = file_inode(vmf->vma->vm_file); + pgoff_t offset = vmf->pgoff; + gfp_t gfp = vmf->gfp_mask; + unsigned long addr; + struct page *page; + int err; + + if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode)) + return vmf_error(-EINVAL); + +retry: + page = find_lock_page(mapping, offset); + if (!page) { + page = alloc_page(gfp | __GFP_ZERO); + if (!page) + return VM_FAULT_OOM; + + err = set_direct_map_invalid_noflush(page); + if (err) { + put_page(page); + return vmf_error(err); + } + + __SetPageUptodate(page); + err = add_to_page_cache_lru(page, mapping, offset, gfp); + if (unlikely(err)) { + put_page(page); + /* + * If a split of large page was required, it + * already happened when we marked the page invalid + * which guarantees that this call won't fail + */ + set_direct_map_default_noflush(page); + if (err == -EEXIST) + goto retry; + + return vmf_error(err); + } + + addr = (unsigned long)page_address(page); + flush_tlb_kernel_range(addr, addr + PAGE_SIZE); + } + + vmf->page = page; + return VM_FAULT_LOCKED; +} + +static const struct vm_operations_struct secretmem_vm_ops = { + .fault = secretmem_fault, +}; + +static int secretmem_release(struct inode *inode, struct file *file) +{ + atomic_dec(&secretmem_users); + return 0; +} + +static int secretmem_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long len = vma->vm_end - vma->vm_start; + + if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0) + return -EINVAL; + + if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len)) + return -EAGAIN; + + vma->vm_flags |= VM_LOCKED | VM_DONTDUMP; + vma->vm_ops = &secretmem_vm_ops; + + return 0; +} + +bool vma_is_secretmem(struct vm_area_struct *vma) +{ + return vma->vm_ops == &secretmem_vm_ops; +} + +static const struct file_operations secretmem_fops = { + .release = secretmem_release, + .mmap = secretmem_mmap, +}; + +static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode) +{ + return false; +} + +static int secretmem_migratepage(struct address_space *mapping, + struct page *newpage, struct page *page, + 
enum migrate_mode mode) +{ + return -EBUSY; +} + +static void secretmem_freepage(struct page *page) +{ + set_direct_map_default_noflush(page); + clear_highpage(page); +} + +const struct address_space_operations secretmem_aops = { + .freepage = secretmem_freepage, + .migratepage = secretmem_migratepage, + .isolate_page = secretmem_isolate_page, +}; + +static struct vfsmount *secretmem_mnt; + +static struct file *secretmem_file_create(unsigned long flags) +{ + struct file *file = ERR_PTR(-ENOMEM); + struct inode *inode; + + inode = alloc_anon_inode(secretmem_mnt->mnt_sb); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem", + O_RDWR, &secretmem_fops); + if (IS_ERR(file)) + goto err_free_inode; + + mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); + mapping_set_unevictable(inode->i_mapping); + + inode->i_mapping->a_ops = &secretmem_aops; + + /* pretend we are a normal file with zero size */ + inode->i_mode |= S_IFREG; + inode->i_size = 0; + + return file; + +err_free_inode: + iput(inode); + return file; +} + +SYSCALL_DEFINE1(memfd_secret, unsigned int, flags) +{ + struct file *file; + int fd, err; + + /* make sure local flags do not confict with global fcntl.h */ + BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC); + + if (!secretmem_enable) + return -ENOSYS; + + if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC)) + return -EINVAL; + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + file = secretmem_file_create(flags); + if (IS_ERR(file)) { + err = PTR_ERR(file); + goto err_put_fd; + } + + file->f_flags |= O_LARGEFILE; + + fd_install(fd, file); + atomic_inc(&secretmem_users); + return fd; + +err_put_fd: + put_unused_fd(fd); + return err; +} + +static int secretmem_init_fs_context(struct fs_context *fc) +{ + return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM; +} + +static struct file_system_type secretmem_fs = { + .name = "secretmem", + .init_fs_context = secretmem_init_fs_context, + .kill_sb = kill_anon_super, +}; + +static int secretmem_init(void) +{ + int ret = 0; + + if (!secretmem_enable) + return ret; + + secretmem_mnt = kern_mount(&secretmem_fs); + if (IS_ERR(secretmem_mnt)) + ret = PTR_ERR(secretmem_mnt); + + /* prevent secretmem mappings from ever getting PROT_EXEC */ + secretmem_mnt->mnt_flags |= MNT_NOEXEC; + + return ret; +} +fs_initcall(secretmem_init); diff --git a/mm/slab.h b/mm/slab.h index 7b60ef2f32c3..67e06637ff2e 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -634,6 +634,7 @@ struct kmem_obj_info { struct kmem_cache *kp_slab_cache; void *kp_ret; void *kp_stack[KS_ADDRS_COUNT]; + void *kp_free_stack[KS_ADDRS_COUNT]; }; void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page); #endif diff --git a/mm/slab_common.c b/mm/slab_common.c index c126e6f6b5a5..1c673c323baf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(kmem_valid_obj); * depends on the type of object and on how much debugging is enabled. * For a slab-cache object, the fact that it is a slab object is printed, * and, if available, the slab name, return address, and stack trace from - * the allocation of that object. + * the allocation and last free path of that object. * * This function will splat if passed a pointer to a non-slab object. 
* If you are not sure what type of object you have, you should instead @@ -620,6 +620,16 @@ void kmem_dump_obj(void *object) break; pr_info(" %pS\n", kp.kp_stack[i]); } + + if (kp.kp_free_stack[0]) + pr_cont(" Free path:\n"); + + for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) { + if (!kp.kp_free_stack[i]) + break; + pr_info(" %pS\n", kp.kp_free_stack[i]); + } + } EXPORT_SYMBOL_GPL(kmem_dump_obj); #endif diff --git a/mm/slub.c b/mm/slub.c index 3bc8b940c933..dc863c1ea324 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -26,6 +26,7 @@ #include <linux/cpuset.h> #include <linux/mempolicy.h> #include <linux/ctype.h> +#include <linux/stackdepot.h> #include <linux/debugobjects.h> #include <linux/kallsyms.h> #include <linux/kfence.h> @@ -220,8 +221,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) #define TRACK_ADDRS_COUNT 16 struct track { unsigned long addr; /* Called from address */ -#ifdef CONFIG_STACKTRACE - unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */ +#ifdef CONFIG_STACKDEPOT + depot_stack_handle_t handle; #endif int cpu; /* Was running on cpu */ int pid; /* Pid context */ @@ -625,22 +626,27 @@ static struct track *get_track(struct kmem_cache *s, void *object, return kasan_reset_tag(p + alloc); } +#ifdef CONFIG_STACKDEPOT +static depot_stack_handle_t save_stack_depot_trace(gfp_t flags) +{ + unsigned long entries[TRACK_ADDRS_COUNT]; + depot_stack_handle_t handle; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 4); + handle = stack_depot_save(entries, nr_entries, flags); + return handle; +} +#endif + static void set_track(struct kmem_cache *s, void *object, enum track_item alloc, unsigned long addr) { struct track *p = get_track(s, object, alloc); if (addr) { -#ifdef CONFIG_STACKTRACE - unsigned int nr_entries; - - metadata_access_enable(); - nr_entries = stack_trace_save(kasan_reset_tag(p->addrs), - TRACK_ADDRS_COUNT, 3); - metadata_access_disable(); - - if (nr_entries < TRACK_ADDRS_COUNT) - p->addrs[nr_entries] = 0; +#ifdef CONFIG_STACKDEPOT + p->handle = save_stack_depot_trace(GFP_NOWAIT); #endif p->addr = addr; p->cpu = smp_processor_id(); @@ -667,14 +673,19 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time) pr_err("%s in %pS age=%lu cpu=%u pid=%d\n", s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid); -#ifdef CONFIG_STACKTRACE +#ifdef CONFIG_STACKDEPOT { - int i; - for (i = 0; i < TRACK_ADDRS_COUNT; i++) - if (t->addrs[i]) - pr_err("\t%pS\n", (void *)t->addrs[i]); - else - break; + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; + + handle = READ_ONCE(t->handle); + if (!handle) { + pr_err("object allocation/free stack trace missing\n"); + } else { + nr_entries = stack_depot_fetch(handle, &entries); + stack_trace_print(entries, nr_entries, 0); + } } #endif } @@ -4045,13 +4056,29 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page) !(s->flags & SLAB_STORE_USER)) return; #ifdef CONFIG_SLUB_DEBUG + objp = fixup_red_left(s, objp); trackp = get_track(s, objp, TRACK_ALLOC); kpp->kp_ret = (void *)trackp->addr; -#ifdef CONFIG_STACKTRACE - for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) { - kpp->kp_stack[i] = (void *)trackp->addrs[i]; - if (!kpp->kp_stack[i]) - break; +#ifdef CONFIG_STACKDEPOT + { + depot_stack_handle_t handle; + unsigned long *entries; + unsigned int nr_entries; + + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < 
KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_stack[i] = (void *)entries[i]; + } + + trackp = get_track(s, objp, TRACK_FREE); + handle = READ_ONCE(trackp->handle); + if (handle) { + nr_entries = stack_depot_fetch(handle, &entries); + for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++) + kpp->kp_free_stack[i] = (void *)entries[i]; + } } #endif #endif diff --git a/mm/util.c b/mm/util.c index a034525e7ba2..99c6cc77de9e 100644 --- a/mm/util.c +++ b/mm/util.c @@ -983,7 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) * depends on the type of object and on how much debugging is enabled. * For example, for a slab-cache object, the slab name is printed, and, * if available, the return address and stack trace from the allocation - * of that object. + * and last free path of that object. */ void mem_dump_obj(void *object) { |
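
The new mm/secretmem.c above adds a memfd_secret(2) system call whose pages are removed from the kernel direct map when they are faulted in. A minimal user-space sketch of driving it follows; the __NR_memfd_secret number and the secretmem.enable=1 boot parameter come from other parts of the series and are assumptions here, not something shown in this mm/ diff.

/*
 * Hedged sketch: exercises the memfd_secret(2) syscall added by
 * mm/secretmem.c above.  Assumes the syscall number is wired up by the
 * rest of the series (__NR_memfd_secret) and that the kernel was booted
 * with secretmem.enable=1; neither is part of this mm/ diff.
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

#ifndef __NR_memfd_secret
#define __NR_memfd_secret 447	/* assumption: number used by the series on x86-64 */
#endif

int main(void)
{
	long fd = syscall(__NR_memfd_secret, 0);	/* flags: only O_CLOEXEC is accepted */
	if (fd < 0) {
		perror("memfd_secret");			/* -ENOSYS unless secretmem.enable=1 */
		return 1;
	}

	/* secretmem_fault() rejects faults past i_size, so size the file first */
	if (ftruncate(fd, getpagesize()) < 0) {
		perror("ftruncate");
		return 1;
	}

	/* secretmem_mmap() insists on a shared mapping and charges it as mlocked */
	char *p = mmap(NULL, getpagesize(), PROT_READ | PROT_WRITE,
		       MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");				/* -EAGAIN if RLIMIT_MEMLOCK is too small */
		return 1;
	}

	strcpy(p, "not in the direct map");		/* faulting the page drops it from the kernel linear mapping */

	munmap(p, getpagesize());
	close(fd);
	return 0;
}

With the gup.c hunks above, such a mapping also refuses get_user_pages(): check_vma_flags() returns -EFAULT for secretmem VMAs, so anything that needs to pin these pages (O_DIRECT reads into the mapping, for example) is expected to fail.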
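
The mm/slub.c conversion above replaces the fixed addrs[TRACK_ADDRS_COUNT] arrays with stack depot handles. The record-once/print-later pattern it relies on looks roughly like the hedged sketch below; CONFIG_STACKDEPOT is assumed, and record_caller()/report_caller() are illustrative names, not functions from the diff.

/*
 * Hedged sketch of the stackdepot pattern the SLUB hunks switch to:
 * capture a trace once, store only a small handle, expand it on demand.
 */
#include <linux/kernel.h>
#include <linux/printk.h>
#include <linux/gfp.h>
#include <linux/stacktrace.h>
#include <linux/stackdepot.h>

static depot_stack_handle_t record_caller(void)
{
	unsigned long entries[16];
	unsigned int nr;

	/* skip a couple of frames so the trace starts at the interesting caller */
	nr = stack_trace_save(entries, ARRAY_SIZE(entries), 2);

	/* deduplicated storage; returns 0 if it cannot allocate under GFP_NOWAIT */
	return stack_depot_save(entries, nr, GFP_NOWAIT);
}

static void report_caller(depot_stack_handle_t handle)
{
	unsigned long *entries;
	unsigned int nr;

	if (!handle) {
		pr_err("stack trace missing\n");
		return;
	}

	nr = stack_depot_fetch(handle, &entries);
	stack_trace_print(entries, nr, 0);
}

Storing a depot_stack_handle_t instead of sixteen addresses per object is what lets the diff drop the per-track addrs[] array while still letting print_track() and kmem_obj_info() recover the full allocation and free traces.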