author     Linus Torvalds <torvalds@linux-foundation.org>   2023-04-27 19:42:02 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2023-04-27 19:42:02 -0700
commit     7fa8a8ee9400fe8ec188426e40e481717bc5e924 (patch)
tree       cc8fd6b4f936ec01e73238643757451e20478c07 /fs/proc
parent     91ec4b0d11fe115581ce2835300558802ce55e6c (diff)
parent     4d4b6d66db63ceed399f1fb1a4b24081d2590eb1 (diff)
Merge tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton:
- Nick Piggin's "shoot lazy tlbs" series, to improve the performance of
  switching from a user process to a kernel thread.
- More folio conversions from Kefeng Wang, Zhang Peng and Pankaj
Raghav.
- zsmalloc performance improvements from Sergey Senozhatsky.
- Yue Zhao has found and fixed some data race issues around the
alteration of memcg userspace tunables.
- VFS rationalizations from Christoph Hellwig:
- removal of most of the callers of write_one_page()
- make __filemap_get_folio()'s return value more useful
- Luis Chamberlain has changed tmpfs so that its use of swap backing can
  be disabled at mount time. Use `mount -o noswap' (see the mount(2)
  sketch after this list).
- Qi Zheng has made the slab shrinkers operate locklessly, providing
some scalability benefits.
- Keith Busch has improved dmapool's performance, making part of its
operations O(1) rather than O(n).
- Peter Xu adds the UFFD_FEATURE_WP_UNPOPULATED feature to userfaultfd,
  permitting userspace to write-protect unpopulated anonymous PTEs (see
  the userfaultfd sketch after this list).
- Kirill Shutemov has changed MAX_ORDER's meaning to be inclusive
rather than exclusive, and has fixed a bunch of errors which were
caused by its unintuitive meaning.
- Axel Rasmussen gives userfaultfd the UFFDIO_CONTINUE_MODE_WP feature,
  which causes minor faults to install a write-protected PTE.
- Vlastimil Babka has done some maintenance work on vma_merge():
cleanups to the kernel code and improvements to our userspace test
harness.
- Cleanups to do_fault_around() by Lorenzo Stoakes.
- Mike Rapoport has moved a lot of initialization code out of various
mm/ files and into mm/mm_init.c.
- Lorenzo Stoakes removed vmf_insert_mixed_prot(), which was added for
  DRM, but DRM doesn't use it any more.
- Lorenzo has also converted read_kcore() and vread() to use iterators
  and has thereby removed the use of bounce buffers in some cases.
- Lorenzo has also contributed further cleanups of vma_merge().
- Chaitanya Prakash provides some fixes to the mmap selftesting code.
- Matthew Wilcox changes xfs and afs so they no longer take sleeping
  locks in ->map_pages(), a step towards RCUification of pagefaults.
- Suren Baghdasaryan has improved mmap_lock scalability by switching to
per-VMA locking.
- Frederic Weisbecker has reworked the percpu cache draining so that it
  no longer causes latency glitches on CPU-isolated workloads.
- Mike Rapoport cleans up and corrects the ARCH_FORCE_MAX_ORDER Kconfig
logic.
- Liu Shixin has changed zswap's initialization so we no longer waste a
chunk of memory if zswap is not being used.
- Yosry Ahmed has improved the performance of memcg statistics
flushing.
- David Stevens has fixed several issues involving khugepaged,
userfaultfd and shmem.
- Christoph Hellwig has provided some cleanup work to zram's IO-related
code paths.
- David Hildenbrand has fixed up some issues in the selftest code's
testing of our pte state changing.
- Pankaj Raghav has made page_endio() unneeded and has removed it.
- Peter Xu contributed some rationalizations of the userfaultfd
selftests.
- Yosry Ahmed has fixed an issue around memcg's page reclaim
  accounting.
- Chaitanya Prakash has fixed some arm-related issues in the
selftests/mm code.
- Longlong Xia has improved the way in which KSM handles hwpoisoned
pages.
- Peter Xu fixes a few issues with uffd-wp at fork() time.
- Stefan Roesch has changed KSM so that it may now be used on a
  per-process and per-cgroup basis (see the prctl() sketch after this
  list).
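
A minimal sketch of the new tmpfs option via mount(2), assuming an
arbitrary /mnt/tmp mountpoint (the call needs CAP_SYS_ADMIN):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Mount a tmpfs instance that will never fall back to swap. */
	if (mount("tmpfs", "/mnt/tmp", "tmpfs", 0, "noswap") < 0) {
		perror("mount");
		return 1;
	}
	return 0;
}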
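A minimal sketch of the UFFD_FEATURE_WP_UNPOPULATED handshake, assuming
the 6.4-era uapi <linux/userfaultfd.h>; it write-protects a deliberately
never-touched anonymous mapping without pre-faulting it, which is what
the feature permits (unprivileged use may additionally depend on the
vm.unprivileged_userfaultfd sysctl):

#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) { perror("userfaultfd"); return 1; }

	/* Negotiate the API, asking for the new feature. */
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_WP_UNPOPULATED,
	};
	if (ioctl(uffd, UFFDIO_API, &api) < 0) { perror("UFFDIO_API"); return 1; }

	/* Anonymous mapping that we never touch, so its PTEs stay unpopulated. */
	char *area = mmap(NULL, page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (area == MAP_FAILED) { perror("mmap"); return 1; }

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) < 0) { perror("UFFDIO_REGISTER"); return 1; }

	/* With WP_UNPOPULATED this takes effect without faulting pages in first. */
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)area, .len = page },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp) < 0) { perror("UFFDIO_WRITEPROTECT"); return 1; }

	return 0;
}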
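And a minimal sketch of the per-process KSM knob, assuming the
PR_SET_MEMORY_MERGE/PR_GET_MEMORY_MERGE values (67/68) from the new uapi
headers; the set operation needs CAP_SYS_RESOURCE:

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_MEMORY_MERGE
#define PR_SET_MEMORY_MERGE 67	/* value from the 6.4 uapi headers */
#endif
#ifndef PR_GET_MEMORY_MERGE
#define PR_GET_MEMORY_MERGE 68
#endif

int main(void)
{
	/* Mark every eligible anonymous VMA of this process KSM-mergeable. */
	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0) < 0) {
		perror("PR_SET_MEMORY_MERGE");
		return 1;
	}
	printf("ksm enabled: %d\n", (int)prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));
	return 0;
}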
* tag 'mm-stable-2023-04-27-15-30' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (369 commits)
mm,unmap: avoid flushing TLB in batch if PTE is inaccessible
shmem: restrict noswap option to initial user namespace
mm/khugepaged: fix conflicting mods to collapse_file()
sparse: remove unnecessary 0 values from rc
mm: move 'mmap_min_addr' logic from callers into vm_unmapped_area()
hugetlb: pte_alloc_huge() to replace huge pte_alloc_map()
maple_tree: fix allocation in mas_sparse_area()
mm: do not increment pgfault stats when page fault handler retries
zsmalloc: allow only one active pool compaction context
selftests/mm: add new selftests for KSM
mm: add new KSM process and sysfs knobs
mm: add new api to enable ksm per process
mm: shrinkers: fix debugfs file permissions
mm: don't check VMA write permissions if the PTE/PMD indicates write permissions
migrate_pages_batch: fix statistics for longterm pin retry
userfaultfd: use helper function range_in_vma()
lib/show_mem.c: use for_each_populated_zone() simplify code
mm: correct arg in reclaim_pages()/reclaim_clean_pages_from_list()
fs/buffer: convert create_page_buffers to folio_create_buffers
fs/buffer: add folio_create_empty_buffers helper
...
Diffstat (limited to 'fs/proc')
-rw-r--r--   fs/proc/base.c     |  3
-rw-r--r--   fs/proc/kcore.c    | 85
-rw-r--r--   fs/proc/meminfo.c  | 13
-rw-r--r--   fs/proc/task_mmu.c |  3
4 files changed, 57 insertions, 47 deletions
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 5e0e0ccd47aa..96a6a08c8235 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -96,6 +96,7 @@
 #include <linux/time_namespace.h>
 #include <linux/resctrl.h>
 #include <linux/cn_proc.h>
+#include <linux/ksm.h>
 #include <trace/events/oom.h>
 #include "internal.h"
 #include "fd.h"
@@ -3207,6 +3208,8 @@ static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns,
 	mm = get_task_mm(task);
 	if (mm) {
 		seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items);
+		seq_printf(m, "ksm_merging_pages %lu\n", mm->ksm_merging_pages);
+		seq_printf(m, "ksm_process_profit %ld\n", ksm_process_profit(mm));
 		mmput(mm);
 	}
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 71157ee35c1a..25b44b303b35 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -24,7 +24,7 @@
 #include <linux/memblock.h>
 #include <linux/init.h>
 #include <linux/slab.h>
-#include <linux/uaccess.h>
+#include <linux/uio.h>
 #include <asm/io.h>
 #include <linux/list.h>
 #include <linux/ioport.h>
@@ -307,10 +307,9 @@ static void append_kcore_note(char *notes, size_t *i, const char *name,
 	*i = ALIGN(*i + descsz, 4);
 }
 
-static ssize_t
-read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
+static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter)
 {
-	char *buf = file->private_data;
+	loff_t *fpos = &iocb->ki_pos;
 	size_t phdrs_offset, notes_offset, data_offset;
 	size_t page_offline_frozen = 1;
 	size_t phdrs_len, notes_len;
@@ -318,6 +317,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 	size_t tsz;
 	int nphdr;
 	unsigned long start;
+	size_t buflen = iov_iter_count(iter);
 	size_t orig_buflen = buflen;
 	int ret = 0;
 
@@ -356,12 +356,11 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		};
 
 		tsz = min_t(size_t, buflen, sizeof(struct elfhdr) - *fpos);
-		if (copy_to_user(buffer, (char *)&ehdr + *fpos, tsz)) {
+		if (copy_to_iter((char *)&ehdr + *fpos, tsz, iter) != tsz) {
 			ret = -EFAULT;
 			goto out;
 		}
 
-		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
 	}
@@ -398,15 +397,14 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 
 		tsz = min_t(size_t, buflen, phdrs_offset + phdrs_len - *fpos);
-		if (copy_to_user(buffer, (char *)phdrs + *fpos - phdrs_offset,
-				 tsz)) {
+		if (copy_to_iter((char *)phdrs + *fpos - phdrs_offset, tsz,
+				 iter) != tsz) {
 			kfree(phdrs);
 			ret = -EFAULT;
 			goto out;
 		}
 		kfree(phdrs);
 
-		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
 	}
@@ -448,14 +446,13 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 				  min(vmcoreinfo_size, notes_len - i));
 
 		tsz = min_t(size_t, buflen, notes_offset + notes_len - *fpos);
-		if (copy_to_user(buffer, notes + *fpos - notes_offset, tsz)) {
+		if (copy_to_iter(notes + *fpos - notes_offset, tsz, iter) != tsz) {
 			kfree(notes);
 			ret = -EFAULT;
 			goto out;
 		}
 		kfree(notes);
 
-		buffer += tsz;
 		buflen -= tsz;
 		*fpos += tsz;
 	}
@@ -497,7 +494,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		}
 
 		if (!m) {
-			if (clear_user(buffer, tsz)) {
+			if (iov_iter_zero(tsz, iter) != tsz) {
 				ret = -EFAULT;
 				goto out;
 			}
@@ -506,16 +503,33 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 
 		switch (m->type) {
 		case KCORE_VMALLOC:
-			vread(buf, (char *)start, tsz);
-			/* we have to zero-fill user buffer even if no read */
-			if (copy_to_user(buffer, buf, tsz)) {
-				ret = -EFAULT;
-				goto out;
+		{
+			const char *src = (char *)start;
+			size_t read = 0, left = tsz;
+
+			/*
+			 * vmalloc uses spinlocks, so we optimistically try to
+			 * read memory. If this fails, fault pages in and try
+			 * again until we are done.
+			 */
+			while (true) {
+				read += vread_iter(iter, src, left);
+				if (read == tsz)
+					break;
+
+				src += read;
+				left -= read;
+
+				if (fault_in_iov_iter_writeable(iter, left)) {
+					ret = -EFAULT;
+					goto out;
+				}
 			}
 			break;
+		}
 		case KCORE_USER:
 			/* User page is handled prior to normal kernel page: */
-			if (copy_to_user(buffer, (char *)start, tsz)) {
+			if (copy_to_iter((char *)start, tsz, iter) != tsz) {
 				ret = -EFAULT;
 				goto out;
 			}
@@ -531,7 +545,7 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 			 */
 			if (!page || PageOffline(page) ||
 			    is_page_hwpoison(page) || !pfn_is_ram(pfn)) {
-				if (clear_user(buffer, tsz)) {
+				if (iov_iter_zero(tsz, iter) != tsz) {
 					ret = -EFAULT;
 					goto out;
 				}
@@ -541,24 +555,17 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 		case KCORE_VMEMMAP:
 		case KCORE_TEXT:
 			/*
-			 * Using bounce buffer to bypass the
-			 * hardened user copy kernel text checks.
+			 * We use _copy_to_iter() to bypass usermode hardening
+			 * which would otherwise prevent this operation.
 			 */
-			if (copy_from_kernel_nofault(buf, (void *)start, tsz)) {
-				if (clear_user(buffer, tsz)) {
-					ret = -EFAULT;
-					goto out;
-				}
-			} else {
-				if (copy_to_user(buffer, buf, tsz)) {
-					ret = -EFAULT;
-					goto out;
-				}
+			if (_copy_to_iter((char *)start, tsz, iter) != tsz) {
+				ret = -EFAULT;
+				goto out;
 			}
 			break;
 		default:
 			pr_warn_once("Unhandled KCORE type: %d\n", m->type);
-			if (clear_user(buffer, tsz)) {
+			if (iov_iter_zero(tsz, iter) != tsz) {
 				ret = -EFAULT;
 				goto out;
 			}
@@ -566,7 +573,6 @@ read_kcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos)
 skip:
 		buflen -= tsz;
 		*fpos += tsz;
-		buffer += tsz;
 		start += tsz;
 		tsz = (buflen > PAGE_SIZE ? PAGE_SIZE : buflen);
 	}
@@ -589,10 +595,6 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	if (ret)
 		return ret;
 
-	filp->private_data = kmalloc(PAGE_SIZE, GFP_KERNEL);
-	if (!filp->private_data)
-		return -ENOMEM;
-
 	if (kcore_need_update)
 		kcore_update_ram();
 	if (i_size_read(inode) != proc_root_kcore->size) {
@@ -603,16 +605,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static int release_kcore(struct inode *inode, struct file *file)
-{
-	kfree(file->private_data);
-	return 0;
-}
-
 static const struct proc_ops kcore_proc_ops = {
-	.proc_read	= read_kcore,
+	.proc_read_iter	= read_kcore_iter,
 	.proc_open	= open_kcore,
-	.proc_release	= release_kcore,
 	.proc_lseek	= default_llseek,
 };
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 440960110a42..b43d0bd42762 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -6,6 +6,7 @@
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
 #include <linux/mmzone.h>
+#include <linux/memblock.h>
 #include <linux/proc_fs.h>
 #include <linux/percpu.h>
 #include <linux/seq_file.h>
@@ -131,6 +132,18 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	show_val_kb(m, "VmallocChunk:   ", 0ul);
 	show_val_kb(m, "Percpu:         ", pcpu_nr_pages());
 
+#ifdef CONFIG_MEMTEST
+	if (early_memtest_done) {
+		unsigned long early_memtest_bad_size_kb;
+
+		early_memtest_bad_size_kb = early_memtest_bad_size >> 10;
+		if (early_memtest_bad_size && !early_memtest_bad_size_kb)
+			early_memtest_bad_size_kb = 1;
+		/* When 0 is reported, it means there actually was a successful test */
+		seq_printf(m, "EarlyMemtestBad:   %5lu kB\n", early_memtest_bad_size_kb);
+	}
+#endif
+
 #ifdef CONFIG_MEMORY_FAILURE
 	seq_printf(m, "HardwareCorrupted: %5lu kB\n",
 		   atomic_long_read(&num_poisoned_pages) << (PAGE_SHIFT - 10));
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6a96e1713fd5..cb49479acd2e 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -782,7 +782,6 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 	if (start >= vma->vm_end)
 		return;
 
-#ifdef CONFIG_SHMEM
 	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
 		/*
 		 * For shared or readonly shmem mappings we know that all
@@ -803,7 +802,7 @@ static void smap_gather_stats(struct vm_area_struct *vma,
 			ops = &smaps_shmem_walk_ops;
 		}
 	}
-#endif
+
 	/* mmap_lock is held in m_start */
 	if (!start)
 		walk_page_vma(vma, ops, mss);