 arch/s390/mm/gmap.c                                | 20
 fs/coredump.c                                      |  1
 include/linux/ksm.h                                |  7
 include/linux/uio.h                                | 16
 kernel/sys.c                                       | 12
 lib/iov_iter.c                                     | 17
 mm/damon/paddr.c                                   | 26
 mm/ksm.c                                           | 70
 mm/page_alloc.c                                    |  9
 tools/testing/selftests/mm/ksm_functional_tests.c  | 46
 10 files changed, 172 insertions(+), 52 deletions(-)
diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c
index 0949811761e6..dfe905c7bd8e 100644
--- a/arch/s390/mm/gmap.c
+++ b/arch/s390/mm/gmap.c
@@ -2585,30 +2585,12 @@ EXPORT_SYMBOL_GPL(s390_enable_sie);
int gmap_mark_unmergeable(void)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
- unsigned long vm_flags;
- int ret;
- VMA_ITERATOR(vmi, mm, 0);
-
/*
* Make sure to disable KSM (if enabled for the whole process or
* individual VMAs). Note that nothing currently hinders user space
* from re-enabling it.
*/
- clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
-
- for_each_vma(vmi, vma) {
- /* Copy vm_flags to avoid partial modifications in ksm_madvise */
- vm_flags = vma->vm_flags;
- ret = ksm_madvise(vma, vma->vm_start, vma->vm_end,
- MADV_UNMERGEABLE, &vm_flags);
- if (ret)
- return ret;
- vm_flags_reset(vma, vm_flags);
- }
- mm->def_flags &= ~VM_MERGEABLE;
- return 0;
+ return ksm_disable(current->mm);
}
EXPORT_SYMBOL_GPL(gmap_mark_unmergeable);
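Note: gmap_mark_unmergeable() previously open-coded a VMA walk with ksm_madvise(); it now delegates to the new ksm_disable() helper added in mm/ksm.c below. A minimal caller-side sketch, assuming the mmap write-lock convention that ksm_disable() asserts (the wrapper name here is hypothetical, not part of the patch):

	/* Hypothetical wrapper; ksm_disable() asserts the write lock. */
	static int disable_ksm_for_mm(struct mm_struct *mm)
	{
		int ret;

		mmap_write_lock(mm);
		ret = ksm_disable(mm);	/* unmerges pages, clears VM_MERGEABLE */
		mmap_write_unlock(mm);
		return ret;
	}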
diff --git a/fs/coredump.c b/fs/coredump.c
index 5df1e6e1eb2b..ece7badf701b 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -882,6 +882,7 @@ static int dump_emit_page(struct coredump_params *cprm, struct page *page)
pos = file->f_pos;
bvec_set_page(&bvec, page, PAGE_SIZE, 0);
iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
+ iov_iter_set_copy_mc(&iter);
n = __kernel_write_iter(cprm->file, &iter, &pos);
if (n != PAGE_SIZE)
return 0;
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7a9b76fb6c3f..899a314bc487 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -21,6 +21,8 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
void ksm_add_vma(struct vm_area_struct *vma);
int ksm_enable_merge_any(struct mm_struct *mm);
+int ksm_disable_merge_any(struct mm_struct *mm);
+int ksm_disable(struct mm_struct *mm);
int __ksm_enter(struct mm_struct *mm);
void __ksm_exit(struct mm_struct *mm);
@@ -79,6 +81,11 @@ static inline void ksm_add_vma(struct vm_area_struct *vma)
{
}
+static inline int ksm_disable(struct mm_struct *mm)
+{
+ return 0;
+}
+
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
return 0;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 3d386849a758..044c1d8c230c 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -42,6 +42,7 @@ struct iov_iter_state {
struct iov_iter {
u8 iter_type;
+ bool copy_mc;
bool nofault;
bool data_source;
bool user_backed;
@@ -256,8 +257,22 @@ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i);
#ifdef CONFIG_ARCH_HAS_COPY_MC
size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i);
+static inline void iov_iter_set_copy_mc(struct iov_iter *i)
+{
+ i->copy_mc = true;
+}
+
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+ return i->copy_mc;
+}
#else
#define _copy_mc_to_iter _copy_to_iter
+static inline void iov_iter_set_copy_mc(struct iov_iter *i) { }
+static inline bool iov_iter_is_copy_mc(const struct iov_iter *i)
+{
+ return false;
+}
#endif
size_t iov_iter_zero(size_t bytes, struct iov_iter *);
@@ -380,6 +395,7 @@ static inline void iov_iter_ubuf(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_UBUF,
+ .copy_mc = false,
.user_backed = true,
.data_source = direction,
.ubuf = buf,
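Note: the new copy_mc flag lets a source iterator opt in to machine-check-aware copying; when CONFIG_ARCH_HAS_COPY_MC is off, the setter compiles to nothing and the getter is constant false, so generic code can call both unconditionally. A minimal usage sketch mirroring the coredump hunk above (page is assumed to be the source page); a consumer then sees a poisoned page as a short copy rather than a kernel panic:

	struct bio_vec bvec;
	struct iov_iter iter;

	bvec_set_page(&bvec, page, PAGE_SIZE, 0);
	iov_iter_bvec(&iter, ITER_SOURCE, &bvec, 1, PAGE_SIZE);
	iov_iter_set_copy_mc(&iter);	/* copies may now return short on #MC */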
diff --git a/kernel/sys.c b/kernel/sys.c
index 72cdb16e2636..339fee3eff6a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -2695,16 +2695,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
if (mmap_write_lock_killable(me->mm))
return -EINTR;
- if (arg2) {
+ if (arg2)
error = ksm_enable_merge_any(me->mm);
- } else {
- /*
- * TODO: we might want disable KSM on all VMAs and
- * trigger unsharing to completely disable KSM.
- */
- clear_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
- error = 0;
- }
+ else
+ error = ksm_disable_merge_any(me->mm);
mmap_write_unlock(me->mm);
break;
case PR_GET_MEMORY_MERGE:
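Note: with this change, PR_SET_MEMORY_MERGE=0 actually unmerges previously merged pages instead of only clearing MMF_VM_MERGE_ANY. The userspace interface is unchanged; a small sketch (the constant is 67 in the uapi headers of this era, but verify against your prctl.h):

	#include <stdio.h>
	#include <sys/prctl.h>

	#ifndef PR_SET_MEMORY_MERGE
	#define PR_SET_MEMORY_MERGE	67
	#endif

	int main(void)
	{
		if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
			perror("PR_SET_MEMORY_MERGE=1");	/* enable merging */
		if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0))
			perror("PR_SET_MEMORY_MERGE=0");	/* disable + unmerge */
		return 0;
	}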
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index c3dbe994112c..960223ed9199 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -434,6 +434,7 @@ void iov_iter_init(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter) {
.iter_type = ITER_IOVEC,
+ .copy_mc = false,
.nofault = false,
.user_backed = true,
.data_source = direction,
@@ -630,6 +631,14 @@ size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
EXPORT_SYMBOL_GPL(_copy_mc_to_iter);
#endif /* CONFIG_ARCH_HAS_COPY_MC */
+static void *memcpy_from_iter(struct iov_iter *i, void *to, const void *from,
+ size_t size)
+{
+ if (iov_iter_is_copy_mc(i))
+ return (void *)copy_mc_to_kernel(to, from, size);
+ return memcpy(to, from, size);
+}
+
size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
{
if (WARN_ON_ONCE(!i->data_source))
@@ -639,7 +648,7 @@ size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
might_fault();
iterate_and_advance(i, bytes, base, len, off,
copyin(addr + off, base, len),
- memcpy(addr + off, base, len)
+ memcpy_from_iter(i, addr + off, base, len)
)
return bytes;
@@ -862,7 +871,7 @@ size_t copy_page_from_iter_atomic(struct page *page, unsigned offset, size_t bytes,
}
iterate_and_advance(i, bytes, base, len, off,
copyin(p + off, base, len),
- memcpy(p + off, base, len)
+ memcpy_from_iter(i, p + off, base, len)
)
kunmap_atomic(kaddr);
return bytes;
@@ -1043,6 +1052,7 @@ void iov_iter_kvec(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter){
.iter_type = ITER_KVEC,
+ .copy_mc = false,
.data_source = direction,
.kvec = kvec,
.nr_segs = nr_segs,
@@ -1059,6 +1069,7 @@ void iov_iter_bvec(struct iov_iter *i, unsigned int direction,
WARN_ON(direction & ~(READ | WRITE));
*i = (struct iov_iter){
.iter_type = ITER_BVEC,
+ .copy_mc = false,
.data_source = direction,
.bvec = bvec,
.nr_segs = nr_segs,
@@ -1105,6 +1116,7 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
BUG_ON(direction & ~1);
*i = (struct iov_iter) {
.iter_type = ITER_XARRAY,
+ .copy_mc = false,
.data_source = direction,
.xarray = xarray,
.xarray_start = start,
@@ -1128,6 +1140,7 @@ void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count)
BUG_ON(direction != READ);
*i = (struct iov_iter){
.iter_type = ITER_DISCARD,
+ .copy_mc = false,
.data_source = false,
.count = count,
.iov_offset = 0
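Note: memcpy_from_iter() is the only behavioral change in this file; every constructor hunk merely zero-initializes the new flag explicitly. When the flag is set, the kernel-memory leg of _copy_from_iter() and copy_page_from_iter_atomic() goes through copy_mc_to_kernel(), whose contract (it returns the number of bytes *not* copied, 0 on success) turns a machine check into a short copy. A hedged sketch of that contract, with dst/src/len as hypothetical buffers:

	size_t rem = copy_mc_to_kernel(dst, src, len);

	if (rem)	/* #MC consumed: only len - rem bytes are valid */
		pr_warn("short copy: %zu of %zu bytes\n", len - rem, len);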
diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c
index dd9c33fbe805..467b99166b43 100644
--- a/mm/damon/paddr.c
+++ b/mm/damon/paddr.c
@@ -134,10 +134,8 @@ static bool damon_pa_young(unsigned long paddr, unsigned long *folio_sz)
}
need_lock = !folio_test_anon(folio) || folio_test_ksm(folio);
- if (need_lock && !folio_trylock(folio)) {
- folio_put(folio);
- return false;
- }
+ if (need_lock && !folio_trylock(folio))
+ goto out;
rmap_walk(folio, &rwc);
@@ -238,21 +236,18 @@ static unsigned long damon_pa_pageout(struct damon_region *r, struct damos *s)
if (!folio)
continue;
- if (damos_pa_filter_out(s, folio)) {
- folio_put(folio);
- continue;
- }
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
folio_clear_referenced(folio);
folio_test_clear_young(folio);
- if (!folio_isolate_lru(folio)) {
- folio_put(folio);
- continue;
- }
+ if (!folio_isolate_lru(folio))
+ goto put_folio;
if (folio_test_unevictable(folio))
folio_putback_lru(folio);
else
list_add(&folio->lru, &folio_list);
+put_folio:
folio_put(folio);
}
applied = reclaim_pages(&folio_list);
@@ -271,16 +266,15 @@ static inline unsigned long damon_pa_mark_accessed_or_deactivate(
if (!folio)
continue;
- if (damos_pa_filter_out(s, folio)) {
- folio_put(folio);
- continue;
- }
+ if (damos_pa_filter_out(s, folio))
+ goto put_folio;
if (mark_accessed)
folio_mark_accessed(folio);
else
folio_deactivate(folio);
applied += folio_nr_pages(folio);
+put_folio:
folio_put(folio);
}
return applied * PAGE_SIZE;
diff --git a/mm/ksm.c b/mm/ksm.c
index 9e48258985d2..0156bded3a66 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2520,6 +2520,22 @@ static void __ksm_add_vma(struct vm_area_struct *vma)
vm_flags_set(vma, VM_MERGEABLE);
}
+static int __ksm_del_vma(struct vm_area_struct *vma)
+{
+ int err;
+
+ if (!(vma->vm_flags & VM_MERGEABLE))
+ return 0;
+
+ if (vma->anon_vma) {
+ err = unmerge_ksm_pages(vma, vma->vm_start, vma->vm_end);
+ if (err)
+ return err;
+ }
+
+ vm_flags_clear(vma, VM_MERGEABLE);
+ return 0;
+}
/**
* ksm_add_vma - Mark vma as mergeable if compatible
*
@@ -2542,6 +2558,20 @@ static void ksm_add_vmas(struct mm_struct *mm)
__ksm_add_vma(vma);
}
+static int ksm_del_vmas(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ int err;
+
+ VMA_ITERATOR(vmi, mm, 0);
+ for_each_vma(vmi, vma) {
+ err = __ksm_del_vma(vma);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
/**
* ksm_enable_merge_any - Add mm to mm ksm list and enable merging on all
* compatible VMA's
@@ -2569,6 +2599,46 @@ int ksm_enable_merge_any(struct mm_struct *mm)
return 0;
}
+/**
+ * ksm_disable_merge_any - Disable merging on all compatible VMA's of the mm,
+ * previously enabled via ksm_enable_merge_any().
+ *
+ * Disabling merging implies unmerging any merged pages, like setting
+ * MADV_UNMERGEABLE would. If unmerging fails, the whole operation fails and
+ * merging on all compatible VMA's remains enabled.
+ *
+ * @mm: Pointer to mm
+ *
+ * Returns 0 on success, otherwise error code
+ */
+int ksm_disable_merge_any(struct mm_struct *mm)
+{
+ int err;
+
+ if (!test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ return 0;
+
+ err = ksm_del_vmas(mm);
+ if (err) {
+ ksm_add_vmas(mm);
+ return err;
+ }
+
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ return 0;
+}
+
+int ksm_disable(struct mm_struct *mm)
+{
+ mmap_assert_write_locked(mm);
+
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ return 0;
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ return ksm_disable_merge_any(mm);
+ return ksm_del_vmas(mm);
+}
+
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 9de2a18519a1..47421bedc12b 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1502,6 +1502,15 @@ void __free_pages_core(struct page *page, unsigned int order)
* interleaving within a single pageblock. It is therefore sufficient to check
* the first and last page of a pageblock and avoid checking each individual
* page in a pageblock.
+ *
+ * Note: the function may return a non-NULL struct page even for a pageblock
+ * that contains a memory hole (i.e. there is no physical memory for a subset
+ * of the pfn range). For example, a pageblock of order MAX_ORDER spans two
+ * sub-sections, so the end pfn of the pageblock may be a hole even though
+ * the start pfn is online and valid. This should be safe most of the time
+ * because struct pages are still initialized via init_unavailable_range(),
+ * and pfn walkers shouldn't touch any physical memory range for which they
+ * do not recognize any specific metadata in struct pages.
*/
struct page *__pageblock_pfn_to_page(unsigned long start_pfn,
unsigned long end_pfn, struct zone *zone)
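Note: the added comment documents a pre-existing subtlety rather than new behavior. A hedged sketch of the defensive pattern the comment expects from pfn walkers, validating each pfn before trusting its struct page, since the tail of a returned pageblock may sit in a hole:

	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
		struct page *page = pfn_to_online_page(pfn);

		if (!page)		/* hole or offline section */
			continue;
		/* ... only now inspect the page's metadata ... */
	}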
diff --git a/tools/testing/selftests/mm/ksm_functional_tests.c b/tools/testing/selftests/mm/ksm_functional_tests.c
index 7bc9fc17c9f0..26853badae70 100644
--- a/tools/testing/selftests/mm/ksm_functional_tests.c
+++ b/tools/testing/selftests/mm/ksm_functional_tests.c
@@ -91,9 +91,10 @@ static int ksm_merge(void)
return 0;
}
-static char *mmap_and_merge_range(char val, unsigned long size)
+static char *mmap_and_merge_range(char val, unsigned long size, bool use_prctl)
{
char *map;
+ int ret;
map = mmap(NULL, size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANON, -1, 0);
@@ -110,7 +111,17 @@ static char *mmap_and_merge_range(char val, unsigned long size)
/* Make sure each page contains the same values to merge them. */
memset(map, val, size);
- if (madvise(map, size, MADV_MERGEABLE)) {
+
+ if (use_prctl) {
+ ret = prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0);
+ if (ret < 0 && errno == EINVAL) {
+ ksft_test_result_skip("PR_SET_MEMORY_MERGE not supported\n");
+ goto unmap;
+ } else if (ret) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=1 failed\n");
+ goto unmap;
+ }
+ } else if (madvise(map, size, MADV_MERGEABLE)) {
ksft_test_result_fail("MADV_MERGEABLE failed\n");
goto unmap;
}
@@ -133,7 +144,7 @@ static void test_unmerge(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size);
+ map = mmap_and_merge_range(0xcf, size, false);
if (map == MAP_FAILED)
return;
@@ -155,7 +166,7 @@ static void test_unmerge_discarded(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size);
+ map = mmap_and_merge_range(0xcf, size, false);
if (map == MAP_FAILED)
return;
@@ -187,7 +198,7 @@ static void test_unmerge_uffd_wp(void)
ksft_print_msg("[RUN] %s\n", __func__);
- map = mmap_and_merge_range(0xcf, size);
+ map = mmap_and_merge_range(0xcf, size, false);
if (map == MAP_FAILED)
return;
@@ -323,9 +334,31 @@ static void test_prctl_fork(void)
ksft_test_result_pass("PR_SET_MEMORY_MERGE value is inherited\n");
}
+static void test_prctl_unmerge(void)
+{
+ const unsigned int size = 2 * MiB;
+ char *map;
+
+ ksft_print_msg("[RUN] %s\n", __func__);
+
+ map = mmap_and_merge_range(0xcf, size, true);
+ if (map == MAP_FAILED)
+ return;
+
+ if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0)) {
+ ksft_test_result_fail("PR_SET_MEMORY_MERGE=0 failed\n");
+ goto unmap;
+ }
+
+ ksft_test_result(!range_maps_duplicates(map, size),
+ "Pages were unmerged\n");
+unmap:
+ munmap(map, size);
+}
+
int main(int argc, char **argv)
{
- unsigned int tests = 4;
+ unsigned int tests = 5;
int err;
#ifdef __NR_userfaultfd
@@ -355,6 +388,7 @@ int main(int argc, char **argv)
test_prctl();
test_prctl_fork();
+ test_prctl_unmerge();
err = ksft_get_fail_cnt();
if (err)
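Note: test_prctl_unmerge() closes the loop: merge a range via PR_SET_MEMORY_MERGE=1, disable with =0, then assert that no two pages in the range still map the same page frame. range_maps_duplicates(), defined earlier in this selftest, does that comparison through /proc/self/pagemap; a hedged sketch of the pagemap read it relies on, assuming the usual pagemap layout (one 64-bit entry per virtual page, PFN in bits 0..54, real PFNs visible only with CAP_SYS_ADMIN):

	static uint64_t pagemap_pfn(int pagemap_fd, const char *addr)
	{
		uint64_t entry = 0;
		off_t off = ((uintptr_t)addr / getpagesize()) * sizeof(entry);

		if (pread(pagemap_fd, &entry, sizeof(entry), off) != sizeof(entry))
			return 0;
		return entry & ((1ULL << 55) - 1);	/* bits 0..54: PFN */
	}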