Diffstat (limited to 'mm')
 mm/Kconfig       |   4
 mm/Makefile      |   1
 mm/filemap.c     |  36
 mm/gup.c         |  12
 mm/init-mm.c     |   9
 mm/internal.h    |   3
 mm/memblock.c    |  26
 mm/mlock.c       |   3
 mm/mmap.c        |   5
 mm/mremap.c      | 108
 mm/oom_kill.c    |   2
 mm/secretmem.c   | 254
 mm/slab.h        |   1
 mm/slab_common.c |  12
 mm/slub.c        |  75
 mm/util.c        |   2
 16 files changed, 474 insertions(+), 79 deletions(-)
diff --git a/mm/Kconfig b/mm/Kconfig
index a02498c0e13d..40a9bfcd5062 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -885,4 +885,8 @@ config KMAP_LOCAL
# struct io_mapping based helper. Selected by drivers that need them
config IO_MAPPING
bool
+
+config SECRETMEM
+ def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
+
endmenu
diff --git a/mm/Makefile b/mm/Makefile
index 74b47c354682..e3436741d539 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -113,6 +113,7 @@ obj-$(CONFIG_CMA) += cma.o
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
+obj-$(CONFIG_SECRETMEM) += secretmem.o
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
diff --git a/mm/filemap.c b/mm/filemap.c
index ac82a93d4f38..d1458ecf2f51 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3642,10 +3642,6 @@ again:
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
- *
- * Not only is this an optimisation, but it is also required
- * to check that the address is actually valid, when atomic
- * usercopies are used, below.
*/
if (unlikely(iov_iter_fault_in_readable(i, bytes))) {
status = -EFAULT;
@@ -3665,33 +3661,31 @@ again:
if (mapping_writably_mapped(mapping))
flush_dcache_page(page);
- copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes);
+ copied = copy_page_from_iter_atomic(page, offset, bytes, i);
flush_dcache_page(page);
status = a_ops->write_end(file, mapping, pos, bytes, copied,
page, fsdata);
- if (unlikely(status < 0))
- break;
- copied = status;
-
+ if (unlikely(status != copied)) {
+ iov_iter_revert(i, copied - max(status, 0L));
+ if (unlikely(status < 0))
+ break;
+ }
cond_resched();
- iov_iter_advance(i, copied);
- if (unlikely(copied == 0)) {
+ if (unlikely(status == 0)) {
/*
- * If we were unable to copy any data at all, we must
- * fall back to a single segment length write.
- *
- * If we didn't fallback here, we could livelock
- * because not all segments in the iov can be copied at
- * once without a pagefault.
+ * A short copy made ->write_end() reject the
+ * thing entirely. Might be memory poisoning
+ * halfway through, might be a race with munmap,
+ * might be severe memory pressure.
*/
- bytes = min_t(unsigned long, PAGE_SIZE - offset,
- iov_iter_single_seg_count(i));
+ if (copied)
+ bytes = copied;
goto again;
}
- pos += copied;
- written += copied;
+ pos += status;
+ written += status;
balance_dirty_pages_ratelimited(mapping);
} while (iov_iter_count(i));
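
For readability, the resulting body of the copy loop (reconstructed from the hunk above, with unchanged context elided) now reads roughly as follows; note that the iterator is only rewound by the part ->write_end() did not accept, and that 'status' rather than 'copied' drives the position/written accounting:

	copied = copy_page_from_iter_atomic(page, offset, bytes, i);
	flush_dcache_page(page);

	status = a_ops->write_end(file, mapping, pos, bytes, copied,
				  page, fsdata);
	if (unlikely(status != copied)) {
		/* give the unaccepted tail back to the iterator */
		iov_iter_revert(i, copied - max(status, 0L));
		if (unlikely(status < 0))
			break;			/* hard error from ->write_end() */
	}
	cond_resched();

	if (unlikely(status == 0)) {
		/* nothing accepted: retry, capped to what we managed to copy */
		if (copied)
			bytes = copied;
		goto again;
	}
	pos += status;				/* only what ->write_end() accepted counts */
	written += status;
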
diff --git a/mm/gup.c b/mm/gup.c
index 728d996767cb..42b8b1fa6521 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -10,6 +10,7 @@
#include <linux/rmap.h>
#include <linux/swap.h>
#include <linux/swapops.h>
+#include <linux/secretmem.h>
#include <linux/sched/signal.h>
#include <linux/rwsem.h>
@@ -855,6 +856,9 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
struct follow_page_context ctx = { NULL };
struct page *page;
+ if (vma_is_secretmem(vma))
+ return NULL;
+
page = follow_page_mask(vma, address, foll_flags, &ctx);
if (ctx.pgmap)
put_dev_pagemap(ctx.pgmap);
@@ -988,6 +992,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if (vma_is_secretmem(vma))
+ return -EFAULT;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
@@ -2170,6 +2177,11 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
if (!head)
goto pte_unmap;
+ if (unlikely(page_is_secretmem(page))) {
+ put_compound_head(head, 1, flags);
+ goto pte_unmap;
+ }
+
if (unlikely(pte_val(pte) != pte_val(*ptep))) {
put_compound_head(head, 1, flags);
goto pte_unmap;
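
Illustrative sketch (not part of the patch) of the caller-visible effect of the three checks above: any GUP or pin attempt on a secretmem mapping now fails up front instead of handing the page out. The helper below is hypothetical and only shows the expected return value.

#include <linux/mm.h>

static int try_pin_secretmem_page(unsigned long addr)
{
	struct page *page;
	int ret;

	/*
	 * The fast path bails out via page_is_secretmem(), the slow path
	 * via check_vma_flags(), so the pin fails with -EFAULT.
	 */
	ret = pin_user_pages_fast(addr, 1, FOLL_WRITE, &page);
	if (ret == 1)
		unpin_user_page(page);	/* not reached for secretmem VMAs */
	return ret;
}
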
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 153162669f80..b4a6f38fb51d 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -40,3 +40,12 @@ struct mm_struct init_mm = {
.cpu_bitmap = CPU_BITS_NONE,
INIT_MM_CONTEXT(init_mm)
};
+
+void setup_initial_init_mm(void *start_code, void *end_code,
+ void *end_data, void *brk)
+{
+ init_mm.start_code = (unsigned long)start_code;
+ init_mm.end_code = (unsigned long)end_code;
+ init_mm.end_data = (unsigned long)end_data;
+ init_mm.brk = (unsigned long)brk;
+}
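
A minimal sketch of how architecture setup code can use the new helper (assuming the matching declaration, added elsewhere in this series, is visible via linux/mm.h; the exact linker symbols, _text vs _stext, vary by architecture):

#include <linux/init.h>
#include <linux/mm.h>
#include <asm/sections.h>	/* _text, _etext, _edata, _end */

static void __init example_setup_init_mm(void)
{
	/* replaces four open-coded init_mm.start_code/end_code/... assignments */
	setup_initial_init_mm(_text, _etext, _edata, _end);
}
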
diff --git a/mm/internal.h b/mm/internal.h
index 2d7c9a2e0118..31ff935b2547 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -360,6 +360,9 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);
+extern int mlock_future_check(struct mm_struct *mm, unsigned long flags,
+ unsigned long len);
+
/*
* Clear the page's PageMlocked(). This can be useful in a situation where
* we want to unconditionally remove a page from the pagecache -- e.g.,
diff --git a/mm/memblock.c b/mm/memblock.c
index 3e4acbf03ab7..0041ff62c584 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -182,6 +182,8 @@ bool __init_memblock memblock_overlaps_region(struct memblock_type *type,
{
unsigned long i;
+ memblock_cap_size(base, &size);
+
for (i = 0; i < type->cnt; i++)
if (memblock_addrs_overlap(base, size, type->regions[i].base,
type->regions[i].size))
@@ -1799,7 +1801,6 @@ bool __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t siz
*/
bool __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
{
- memblock_cap_size(base, &size);
return memblock_overlaps_region(&memblock.reserved, base, size);
}
@@ -1946,14 +1947,13 @@ static void __init free_unused_memmap(void)
* due to SPARSEMEM sections which aren't present.
*/
start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
-#else
+#endif
/*
- * Align down here since the VM subsystem insists that the
- * memmap entries are valid from the bank start aligned to
- * MAX_ORDER_NR_PAGES.
+ * Align down here since many operations in VM subsystem
+ * presume that there are no holes in the memory map inside
+ * a pageblock
*/
- start = round_down(start, MAX_ORDER_NR_PAGES);
-#endif
+ start = round_down(start, pageblock_nr_pages);
/*
* If we had a previous bank, and there is a space
@@ -1963,16 +1963,18 @@ static void __init free_unused_memmap(void)
free_memmap(prev_end, start);
/*
- * Align up here since the VM subsystem insists that the
- * memmap entries are valid from the bank end aligned to
- * MAX_ORDER_NR_PAGES.
+ * Align up here since many operations in VM subsystem
+ * presume that there are no holes in the memory map inside
+ * a pageblock
*/
- prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
+ prev_end = ALIGN(end, pageblock_nr_pages);
}
#ifdef CONFIG_SPARSEMEM
- if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
+ if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) {
+ prev_end = ALIGN(end, pageblock_nr_pages);
free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
+ }
#endif
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 0d639bf48794..16d2ee160d43 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -23,6 +23,7 @@
#include <linux/hugetlb.h>
#include <linux/memcontrol.h>
#include <linux/mm_inline.h>
+#include <linux/secretmem.h>
#include "internal.h"
@@ -503,7 +504,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) ||
is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm) ||
- vma_is_dax(vma))
+ vma_is_dax(vma) || vma_is_secretmem(vma))
/* don't set VM_LOCKED or VM_LOCKONFAULT and don't count */
goto out;
diff --git a/mm/mmap.c b/mm/mmap.c
index aa9de981b659..ca54d36d203a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1352,9 +1352,8 @@ static inline unsigned long round_hint_to_min(unsigned long hint)
return hint;
}
-static inline int mlock_future_check(struct mm_struct *mm,
- unsigned long flags,
- unsigned long len)
+int mlock_future_check(struct mm_struct *mm, unsigned long flags,
+ unsigned long len)
{
unsigned long locked, lock_limit;
diff --git a/mm/mremap.c b/mm/mremap.c
index a369a6100698..5989d3990020 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -25,7 +25,8 @@
#include <linux/userfaultfd_k.h>
#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
+#include <asm/tlb.h>
+#include <asm/pgalloc.h>
#include "internal.h"
@@ -209,6 +210,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
drop_rmap_locks(vma);
}
+#ifndef arch_supports_page_table_move
+#define arch_supports_page_table_move arch_supports_page_table_move
+static inline bool arch_supports_page_table_move(void)
+{
+ return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
+ IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
+}
+#endif
+
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
@@ -217,6 +227,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
struct mm_struct *mm = vma->vm_mm;
pmd_t pmd;
+ if (!arch_supports_page_table_move())
+ return false;
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have released it.
@@ -258,8 +270,7 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pmd_none(*new_pmd));
- /* Set the new pmd */
- set_pmd_at(mm, new_addr, new_pmd, pmd);
+ pmd_populate(mm, new_pmd, pmd_pgtable(pmd));
flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -276,7 +287,7 @@ static inline bool move_normal_pmd(struct vm_area_struct *vma,
}
#endif
-#ifdef CONFIG_HAVE_MOVE_PUD
+#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
{
@@ -284,6 +295,8 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
struct mm_struct *mm = vma->vm_mm;
pud_t pud;
+ if (!arch_supports_page_table_move())
+ return false;
/*
* The destination pud shouldn't be established, free_pgtables()
* should have released it.
@@ -306,8 +319,7 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
VM_BUG_ON(!pud_none(*new_pud));
- /* Set the new pud */
- set_pud_at(mm, new_addr, new_pud, pud);
+ pud_populate(mm, new_pud, pud_pgtable(pud));
flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
if (new_ptl != old_ptl)
spin_unlock(new_ptl);
@@ -324,10 +336,61 @@ static inline bool move_normal_pud(struct vm_area_struct *vma,
}
#endif
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t pud;
+
+ /*
+ * The destination pud shouldn't be established, free_pgtables()
+ * should have released it.
+ */
+ if (WARN_ON_ONCE(!pud_none(*new_pud)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_lock prevents deadlock.
+ */
+ old_ptl = pud_lock(vma->vm_mm, old_pud);
+ new_ptl = pud_lockptr(mm, new_pud);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pud */
+ pud = *old_pud;
+ pud_clear(old_pud);
+
+ VM_BUG_ON(!pud_none(*new_pud));
+
+ /* Set the new pud */
+ /* mark soft_dirty when we add pud level soft dirty support */
+ set_pud_at(mm, new_addr, new_pud, pud);
+ flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#else
+static bool move_huge_pud(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+ WARN_ON_ONCE(1);
+ return false;
+
+}
+#endif
+
enum pgt_entry {
NORMAL_PMD,
HPAGE_PMD,
NORMAL_PUD,
+ HPAGE_PUD,
};
/*
@@ -347,6 +410,7 @@ static __always_inline unsigned long get_extent(enum pgt_entry entry,
mask = PMD_MASK;
size = PMD_SIZE;
break;
+ case HPAGE_PUD:
case NORMAL_PUD:
mask = PUD_MASK;
size = PUD_SIZE;
@@ -395,6 +459,12 @@ static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
move_huge_pmd(vma, old_addr, new_addr, old_entry,
new_entry);
break;
+ case HPAGE_PUD:
+ moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ move_huge_pud(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+
default:
WARN_ON_ONCE(1);
break;
@@ -414,6 +484,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long extent, old_end;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
+ pud_t *old_pud, *new_pud;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
@@ -429,17 +500,24 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
* PUD level if possible.
*/
extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
- if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
- pud_t *old_pud, *new_pud;
- old_pud = get_old_pud(vma->vm_mm, old_addr);
- if (!old_pud)
+ old_pud = get_old_pud(vma->vm_mm, old_addr);
+ if (!old_pud)
+ continue;
+ new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ if (!new_pud)
+ break;
+ if (pud_trans_huge(*old_pud) || pud_devmap(*old_pud)) {
+ if (extent == HPAGE_PUD_SIZE) {
+ move_pgt_entry(HPAGE_PUD, vma, old_addr, new_addr,
+ old_pud, new_pud, need_rmap_locks);
+ /* We ignore and continue on error? */
continue;
- new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
- if (!new_pud)
- break;
+ }
+ } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+
if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
- old_pud, new_pud, need_rmap_locks))
+ old_pud, new_pud, true))
continue;
}
@@ -466,7 +544,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
* moving at the PMD level if possible.
*/
if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
- old_pmd, new_pmd, need_rmap_locks))
+ old_pmd, new_pmd, true))
continue;
}
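
The #ifndef guard around arch_supports_page_table_move() above lets an architecture veto PMD/PUD-level moves at runtime. A hypothetical override, placed in that architecture's <asm/tlb.h> (which mremap.c now includes), might look like the sketch below; the predicate name is made up for illustration.

bool example_mmu_can_move_page_tables(void);	/* hypothetical arch predicate */

#define arch_supports_page_table_move arch_supports_page_table_move
static inline bool arch_supports_page_table_move(void)
{
	/* e.g. only one of the supported MMU modes can tolerate moved tables */
	return example_mmu_can_move_page_tables();
}
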
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index fcc29e9a3064..c729a4c4a1ac 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -922,7 +922,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
}
/*
- * No kthead_use_mm() user needs to read from the userspace so
+ * No kthread_use_mm() user needs to read from the userspace so
* we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
diff --git a/mm/secretmem.c b/mm/secretmem.c
new file mode 100644
index 000000000000..f77d25467a14
--- /dev/null
+++ b/mm/secretmem.c
@@ -0,0 +1,254 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright IBM Corporation, 2021
+ *
+ * Author: Mike Rapoport <rppt@linux.ibm.com>
+ */
+
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/swap.h>
+#include <linux/mount.h>
+#include <linux/memfd.h>
+#include <linux/bitops.h>
+#include <linux/printk.h>
+#include <linux/pagemap.h>
+#include <linux/syscalls.h>
+#include <linux/pseudo_fs.h>
+#include <linux/secretmem.h>
+#include <linux/set_memory.h>
+#include <linux/sched/signal.h>
+
+#include <uapi/linux/magic.h>
+
+#include <asm/tlbflush.h>
+
+#include "internal.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt) "secretmem: " fmt
+
+/*
+ * Define mode and flag masks to allow validation of the system call
+ * parameters.
+ */
+#define SECRETMEM_MODE_MASK (0x0)
+#define SECRETMEM_FLAGS_MASK SECRETMEM_MODE_MASK
+
+static bool secretmem_enable __ro_after_init;
+module_param_named(enable, secretmem_enable, bool, 0400);
+MODULE_PARM_DESC(secretmem_enable,
+ "Enable secretmem and memfd_secret(2) system call");
+
+static atomic_t secretmem_users;
+
+bool secretmem_active(void)
+{
+ return !!atomic_read(&secretmem_users);
+}
+
+static vm_fault_t secretmem_fault(struct vm_fault *vmf)
+{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
+ struct inode *inode = file_inode(vmf->vma->vm_file);
+ pgoff_t offset = vmf->pgoff;
+ gfp_t gfp = vmf->gfp_mask;
+ unsigned long addr;
+ struct page *page;
+ int err;
+
+ if (((loff_t)vmf->pgoff << PAGE_SHIFT) >= i_size_read(inode))
+ return vmf_error(-EINVAL);
+
+retry:
+ page = find_lock_page(mapping, offset);
+ if (!page) {
+ page = alloc_page(gfp | __GFP_ZERO);
+ if (!page)
+ return VM_FAULT_OOM;
+
+ err = set_direct_map_invalid_noflush(page);
+ if (err) {
+ put_page(page);
+ return vmf_error(err);
+ }
+
+ __SetPageUptodate(page);
+ err = add_to_page_cache_lru(page, mapping, offset, gfp);
+ if (unlikely(err)) {
+ put_page(page);
+ /*
+ * If a split of large page was required, it
+ * already happened when we marked the page invalid
+ * which guarantees that this call won't fail
+ */
+ set_direct_map_default_noflush(page);
+ if (err == -EEXIST)
+ goto retry;
+
+ return vmf_error(err);
+ }
+
+ addr = (unsigned long)page_address(page);
+ flush_tlb_kernel_range(addr, addr + PAGE_SIZE);
+ }
+
+ vmf->page = page;
+ return VM_FAULT_LOCKED;
+}
+
+static const struct vm_operations_struct secretmem_vm_ops = {
+ .fault = secretmem_fault,
+};
+
+static int secretmem_release(struct inode *inode, struct file *file)
+{
+ atomic_dec(&secretmem_users);
+ return 0;
+}
+
+static int secretmem_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ unsigned long len = vma->vm_end - vma->vm_start;
+
+ if ((vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) == 0)
+ return -EINVAL;
+
+ if (mlock_future_check(vma->vm_mm, vma->vm_flags | VM_LOCKED, len))
+ return -EAGAIN;
+
+ vma->vm_flags |= VM_LOCKED | VM_DONTDUMP;
+ vma->vm_ops = &secretmem_vm_ops;
+
+ return 0;
+}
+
+bool vma_is_secretmem(struct vm_area_struct *vma)
+{
+ return vma->vm_ops == &secretmem_vm_ops;
+}
+
+static const struct file_operations secretmem_fops = {
+ .release = secretmem_release,
+ .mmap = secretmem_mmap,
+};
+
+static bool secretmem_isolate_page(struct page *page, isolate_mode_t mode)
+{
+ return false;
+}
+
+static int secretmem_migratepage(struct address_space *mapping,
+ struct page *newpage, struct page *page,
+ enum migrate_mode mode)
+{
+ return -EBUSY;
+}
+
+static void secretmem_freepage(struct page *page)
+{
+ set_direct_map_default_noflush(page);
+ clear_highpage(page);
+}
+
+const struct address_space_operations secretmem_aops = {
+ .freepage = secretmem_freepage,
+ .migratepage = secretmem_migratepage,
+ .isolate_page = secretmem_isolate_page,
+};
+
+static struct vfsmount *secretmem_mnt;
+
+static struct file *secretmem_file_create(unsigned long flags)
+{
+ struct file *file = ERR_PTR(-ENOMEM);
+ struct inode *inode;
+
+ inode = alloc_anon_inode(secretmem_mnt->mnt_sb);
+ if (IS_ERR(inode))
+ return ERR_CAST(inode);
+
+ file = alloc_file_pseudo(inode, secretmem_mnt, "secretmem",
+ O_RDWR, &secretmem_fops);
+ if (IS_ERR(file))
+ goto err_free_inode;
+
+ mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
+ mapping_set_unevictable(inode->i_mapping);
+
+ inode->i_mapping->a_ops = &secretmem_aops;
+
+ /* pretend we are a normal file with zero size */
+ inode->i_mode |= S_IFREG;
+ inode->i_size = 0;
+
+ return file;
+
+err_free_inode:
+ iput(inode);
+ return file;
+}
+
+SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
+{
+ struct file *file;
+ int fd, err;
+
+ /* make sure local flags do not conflict with global fcntl.h */
+ BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
+
+ if (!secretmem_enable)
+ return -ENOSYS;
+
+ if (flags & ~(SECRETMEM_FLAGS_MASK | O_CLOEXEC))
+ return -EINVAL;
+
+ fd = get_unused_fd_flags(flags & O_CLOEXEC);
+ if (fd < 0)
+ return fd;
+
+ file = secretmem_file_create(flags);
+ if (IS_ERR(file)) {
+ err = PTR_ERR(file);
+ goto err_put_fd;
+ }
+
+ file->f_flags |= O_LARGEFILE;
+
+ fd_install(fd, file);
+ atomic_inc(&secretmem_users);
+ return fd;
+
+err_put_fd:
+ put_unused_fd(fd);
+ return err;
+}
+
+static int secretmem_init_fs_context(struct fs_context *fc)
+{
+ return init_pseudo(fc, SECRETMEM_MAGIC) ? 0 : -ENOMEM;
+}
+
+static struct file_system_type secretmem_fs = {
+ .name = "secretmem",
+ .init_fs_context = secretmem_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+static int secretmem_init(void)
+{
+ int ret = 0;
+
+ if (!secretmem_enable)
+ return ret;
+
+ secretmem_mnt = kern_mount(&secretmem_fs);
+ if (IS_ERR(secretmem_mnt))
+ ret = PTR_ERR(secretmem_mnt);
+
+ /* prevent secretmem mappings from ever getting PROT_EXEC */
+ secretmem_mnt->mnt_flags |= MNT_NOEXEC;
+
+ return ret;
+}
+fs_initcall(secretmem_init);
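
For reference, a minimal userspace sketch of the new interface (assumptions: kernel headers that define __NR_memfd_secret, a kernel booted with secretmem.enable=1, and enough RLIMIT_MEMLOCK headroom, since the mapping is implicitly mlocked). The fd must be sized with ftruncate() before its pages can be faulted in:

#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/syscall.h>

int main(void)
{
	long fd = syscall(__NR_memfd_secret, 0);
	if (fd < 0) {
		perror("memfd_secret");		/* ENOSYS unless secretmem.enable=1 */
		return 1;
	}
	if (ftruncate(fd, 4096) < 0) {		/* size the area before faulting */
		perror("ftruncate");
		return 1;
	}
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED) {
		perror("mmap");			/* EAGAIN if RLIMIT_MEMLOCK is too low */
		return 1;
	}
	strcpy(p, "not in the kernel direct map");
	munmap(p, 4096);
	close(fd);
	return 0;
}
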
diff --git a/mm/slab.h b/mm/slab.h
index 7b60ef2f32c3..67e06637ff2e 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -634,6 +634,7 @@ struct kmem_obj_info {
struct kmem_cache *kp_slab_cache;
void *kp_ret;
void *kp_stack[KS_ADDRS_COUNT];
+ void *kp_free_stack[KS_ADDRS_COUNT];
};
void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page);
#endif
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c126e6f6b5a5..1c673c323baf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -575,7 +575,7 @@ EXPORT_SYMBOL_GPL(kmem_valid_obj);
* depends on the type of object and on how much debugging is enabled.
* For a slab-cache object, the fact that it is a slab object is printed,
* and, if available, the slab name, return address, and stack trace from
- * the allocation of that object.
+ * the allocation and last free path of that object.
*
* This function will splat if passed a pointer to a non-slab object.
* If you are not sure what type of object you have, you should instead
@@ -620,6 +620,16 @@ void kmem_dump_obj(void *object)
break;
pr_info(" %pS\n", kp.kp_stack[i]);
}
+
+ if (kp.kp_free_stack[0])
+ pr_cont(" Free path:\n");
+
+ for (i = 0; i < ARRAY_SIZE(kp.kp_free_stack); i++) {
+ if (!kp.kp_free_stack[i])
+ break;
+ pr_info(" %pS\n", kp.kp_free_stack[i]);
+ }
+
}
EXPORT_SYMBOL_GPL(kmem_dump_obj);
#endif
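
Illustrative only (not part of the patch): with CONFIG_SLUB_DEBUG and a cache that stores user tracking (for example after booting with slub_debug=U), dumping a recently freed object now reports its last free path in addition to the allocation stack.

#include <linux/mm.h>
#include <linux/slab.h>

static void dump_freed_object_example(void)
{
	void *p = kmalloc(64, GFP_KERNEL);

	kfree(p);
	mem_dump_obj(p);	/* prints the alloc trace, then the "Free path:" entries */
}
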
diff --git a/mm/slub.c b/mm/slub.c
index 3bc8b940c933..dc863c1ea324 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -26,6 +26,7 @@
#include <linux/cpuset.h>
#include <linux/mempolicy.h>
#include <linux/ctype.h>
+#include <linux/stackdepot.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/kfence.h>
@@ -220,8 +221,8 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
#define TRACK_ADDRS_COUNT 16
struct track {
unsigned long addr; /* Called from address */
-#ifdef CONFIG_STACKTRACE
- unsigned long addrs[TRACK_ADDRS_COUNT]; /* Called from address */
+#ifdef CONFIG_STACKDEPOT
+ depot_stack_handle_t handle;
#endif
int cpu; /* Was running on cpu */
int pid; /* Pid context */
@@ -625,22 +626,27 @@ static struct track *get_track(struct kmem_cache *s, void *object,
return kasan_reset_tag(p + alloc);
}
+#ifdef CONFIG_STACKDEPOT
+static depot_stack_handle_t save_stack_depot_trace(gfp_t flags)
+{
+ unsigned long entries[TRACK_ADDRS_COUNT];
+ depot_stack_handle_t handle;
+ unsigned int nr_entries;
+
+ nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 4);
+ handle = stack_depot_save(entries, nr_entries, flags);
+ return handle;
+}
+#endif
+
static void set_track(struct kmem_cache *s, void *object,
enum track_item alloc, unsigned long addr)
{
struct track *p = get_track(s, object, alloc);
if (addr) {
-#ifdef CONFIG_STACKTRACE
- unsigned int nr_entries;
-
- metadata_access_enable();
- nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
- TRACK_ADDRS_COUNT, 3);
- metadata_access_disable();
-
- if (nr_entries < TRACK_ADDRS_COUNT)
- p->addrs[nr_entries] = 0;
+#ifdef CONFIG_STACKDEPOT
+ p->handle = save_stack_depot_trace(GFP_NOWAIT);
#endif
p->addr = addr;
p->cpu = smp_processor_id();
@@ -667,14 +673,19 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
pr_err("%s in %pS age=%lu cpu=%u pid=%d\n",
s, (void *)t->addr, pr_time - t->when, t->cpu, t->pid);
-#ifdef CONFIG_STACKTRACE
+#ifdef CONFIG_STACKDEPOT
{
- int i;
- for (i = 0; i < TRACK_ADDRS_COUNT; i++)
- if (t->addrs[i])
- pr_err("\t%pS\n", (void *)t->addrs[i]);
- else
- break;
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ handle = READ_ONCE(t->handle);
+ if (!handle) {
+ pr_err("object allocation/free stack trace missing\n");
+ } else {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ stack_trace_print(entries, nr_entries, 0);
+ }
}
#endif
}
@@ -4045,13 +4056,29 @@ void kmem_obj_info(struct kmem_obj_info *kpp, void *object, struct page *page)
!(s->flags & SLAB_STORE_USER))
return;
#ifdef CONFIG_SLUB_DEBUG
+ objp = fixup_red_left(s, objp);
trackp = get_track(s, objp, TRACK_ALLOC);
kpp->kp_ret = (void *)trackp->addr;
-#ifdef CONFIG_STACKTRACE
- for (i = 0; i < KS_ADDRS_COUNT && i < TRACK_ADDRS_COUNT; i++) {
- kpp->kp_stack[i] = (void *)trackp->addrs[i];
- if (!kpp->kp_stack[i])
- break;
+#ifdef CONFIG_STACKDEPOT
+ {
+ depot_stack_handle_t handle;
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ handle = READ_ONCE(trackp->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+ kpp->kp_stack[i] = (void *)entries[i];
+ }
+
+ trackp = get_track(s, objp, TRACK_FREE);
+ handle = READ_ONCE(trackp->handle);
+ if (handle) {
+ nr_entries = stack_depot_fetch(handle, &entries);
+ for (i = 0; i < KS_ADDRS_COUNT && i < nr_entries; i++)
+ kpp->kp_free_stack[i] = (void *)entries[i];
+ }
}
#endif
#endif
diff --git a/mm/util.c b/mm/util.c
index a034525e7ba2..99c6cc77de9e 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -983,7 +983,7 @@ int __weak memcmp_pages(struct page *page1, struct page *page2)
* depends on the type of object and on how much debugging is enabled.
* For example, for a slab-cache object, the slab name is printed, and,
* if available, the return address and stack trace from the allocation
- * of that object.
+ * and last free path of that object.
*/
void mem_dump_obj(void *object)
{