summaryrefslogtreecommitdiff
path: root/mm/rmap.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/rmap.c')
-rw-r--r--mm/rmap.c409
1 files changed, 285 insertions, 124 deletions
diff --git a/mm/rmap.c b/mm/rmap.c
index fedb82371efe..5bcb334cd6f2 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -73,6 +73,7 @@
#include <linux/page_idle.h>
#include <linux/memremap.h>
#include <linux/userfaultfd_k.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
@@ -298,7 +299,7 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
* Reuse existing anon_vma if its degree lower than two,
* that means it has no vma and only one anon_vma child.
*
- * Do not chose parent anon_vma, otherwise first child
+ * Do not choose parent anon_vma, otherwise first child
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
@@ -526,9 +527,11 @@ out:
*
* Its a little more complex as it tries to keep the fast path to a single
* atomic op -- the trylock. If we fail the trylock, we fall back to getting a
- * reference like with page_get_anon_vma() and then block on the mutex.
+ * reference like with page_get_anon_vma() and then block on the mutex
+ * on !rwc->try_lock case.
*/
-struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
+struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma = NULL;
struct anon_vma *root_anon_vma;
@@ -556,6 +559,12 @@ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio)
goto out;
}
+ if (rwc && rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
/* trylock failed, we got to sleep */
if (!atomic_inc_not_zero(&anon_vma->refcount)) {
anon_vma = NULL;
@@ -882,7 +891,8 @@ static bool invalid_folio_referenced_vma(struct vm_area_struct *vma, void *arg)
*
* Quick test_and_clear_referenced for all mappings of a folio,
*
- * Return: The number of mappings which referenced the folio.
+ * Return: The number of mappings which referenced the folio. Return -1 if
+ * the function bailed out due to rmap lock contention.
*/
int folio_referenced(struct folio *folio, int is_locked,
struct mem_cgroup *memcg, unsigned long *vm_flags)
@@ -896,6 +906,7 @@ int folio_referenced(struct folio *folio, int is_locked,
.rmap_one = folio_referenced_one,
.arg = (void *)&pra,
.anon_lock = folio_lock_anon_vma_read,
+ .try_lock = true,
};
*vm_flags = 0;
@@ -926,15 +937,15 @@ int folio_referenced(struct folio *folio, int is_locked,
if (we_locked)
folio_unlock(folio);
- return pra.referenced;
+ return rwc.contended ? -1 : pra.referenced;
}
-static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
- unsigned long address, void *arg)
+static int page_vma_mkclean_one(struct page_vma_mapped_walk *pvmw)
{
- DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+ int cleaned = 0;
+ struct vm_area_struct *vma = pvmw->vma;
struct mmu_notifier_range range;
- int *cleaned = arg;
+ unsigned long address = pvmw->address;
/*
* We have to assume the worse case ie pmd for invalidation. Note that
@@ -942,16 +953,16 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
*/
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, vma, vma->vm_mm, address,
- vma_address_end(&pvmw));
+ vma_address_end(pvmw));
mmu_notifier_invalidate_range_start(&range);
- while (page_vma_mapped_walk(&pvmw)) {
+ while (page_vma_mapped_walk(pvmw)) {
int ret = 0;
- address = pvmw.address;
- if (pvmw.pte) {
+ address = pvmw->address;
+ if (pvmw->pte) {
pte_t entry;
- pte_t *pte = pvmw.pte;
+ pte_t *pte = pvmw->pte;
if (!pte_dirty(*pte) && !pte_write(*pte))
continue;
@@ -964,13 +975,14 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
ret = 1;
} else {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- pmd_t *pmd = pvmw.pmd;
+ pmd_t *pmd = pvmw->pmd;
pmd_t entry;
if (!pmd_dirty(*pmd) && !pmd_write(*pmd))
continue;
- flush_cache_page(vma, address, folio_pfn(folio));
+ flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
entry = pmdp_invalidate(vma, address, pmd);
entry = pmd_wrprotect(entry);
entry = pmd_mkclean(entry);
@@ -990,11 +1002,22 @@ static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
* See Documentation/vm/mmu_notifier.rst
*/
if (ret)
- (*cleaned)++;
+ cleaned++;
}
mmu_notifier_invalidate_range_end(&range);
+ return cleaned;
+}
+
+static bool page_mkclean_one(struct folio *folio, struct vm_area_struct *vma,
+ unsigned long address, void *arg)
+{
+ DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, PVMW_SYNC);
+ int *cleaned = arg;
+
+ *cleaned += page_vma_mkclean_one(&pvmw);
+
return true;
}
@@ -1032,6 +1055,38 @@ int folio_mkclean(struct folio *folio)
EXPORT_SYMBOL_GPL(folio_mkclean);
/**
+ * pfn_mkclean_range - Cleans the PTEs (including PMDs) mapped with range of
+ * [@pfn, @pfn + @nr_pages) at the specific offset (@pgoff)
+ * within the @vma of shared mappings. And since clean PTEs
+ * should also be readonly, write protects them too.
+ * @pfn: start pfn.
+ * @nr_pages: number of physically contiguous pages srarting with @pfn.
+ * @pgoff: page offset that the @pfn mapped with.
+ * @vma: vma that @pfn mapped within.
+ *
+ * Returns the number of cleaned PTEs (including PMDs).
+ */
+int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
+ struct vm_area_struct *vma)
+{
+ struct page_vma_mapped_walk pvmw = {
+ .pfn = pfn,
+ .nr_pages = nr_pages,
+ .pgoff = pgoff,
+ .vma = vma,
+ .flags = PVMW_SYNC,
+ };
+
+ if (invalid_mkclean_vma(vma, NULL))
+ return 0;
+
+ pvmw.address = vma_pgoff_address(pgoff, nr_pages, vma);
+ VM_BUG_ON_VMA(pvmw.address == -EFAULT, vma);
+
+ return page_vma_mkclean_one(&pvmw);
+}
+
+/**
* page_move_anon_rmap - move a page to our anon_vma
* @page: the page to move to our anon_vma
* @vma: the vma the page belongs to
@@ -1044,6 +1099,7 @@ EXPORT_SYMBOL_GPL(folio_mkclean);
void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
{
struct anon_vma *anon_vma = vma->anon_vma;
+ struct page *subpage = page;
page = compound_head(page);
@@ -1057,6 +1113,7 @@ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma)
* folio_test_anon()) will not see one without the other.
*/
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
+ SetPageAnonExclusive(subpage);
}
/**
@@ -1074,7 +1131,7 @@ static void __page_set_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
if (PageAnon(page))
- return;
+ goto out;
/*
* If the page isn't exclusively mapped into this vma,
@@ -1093,6 +1150,9 @@ static void __page_set_anon_rmap(struct page *page,
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
page->index = linear_page_index(vma, address);
+out:
+ if (exclusive)
+ SetPageAnonExclusive(page);
}
/**
@@ -1127,7 +1187,7 @@ static void __page_check_anon_rmap(struct page *page,
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- * @compound: charge the page as compound or small page
+ * @flags: the rmap flags
*
* The caller needs to hold the pte lock, and the page must be locked in
* the anon_vma case: to serialize mapping,index checking after setting,
@@ -1135,18 +1195,7 @@ static void __page_check_anon_rmap(struct page *page,
* (but PageKsm is never downgraded to PageAnon).
*/
void page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, bool compound)
-{
- do_page_add_anon_rmap(page, vma, address, compound ? RMAP_COMPOUND : 0);
-}
-
-/*
- * Special version of the above for do_swap_page, which often runs
- * into pages that are exclusively owned by the current process.
- * Everybody else should continue to use page_add_anon_rmap above.
- */
-void do_page_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, int flags)
+ struct vm_area_struct *vma, unsigned long address, rmap_t flags)
{
bool compound = flags & RMAP_COMPOUND;
bool first;
@@ -1165,6 +1214,8 @@ void do_page_add_anon_rmap(struct page *page,
} else {
first = atomic_inc_and_test(&page->_mapcount);
}
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first) {
int nr = compound ? thp_nr_pages(page) : 1;
@@ -1185,7 +1236,7 @@ void do_page_add_anon_rmap(struct page *page,
/* address might be in next vma when migration races vma_adjust */
else if (first)
__page_set_anon_rmap(page, vma, address,
- flags & RMAP_EXCLUSIVE);
+ !!(flags & RMAP_EXCLUSIVE));
else
__page_check_anon_rmap(page, vma, address);
@@ -1193,19 +1244,22 @@ void do_page_add_anon_rmap(struct page *page,
}
/**
- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
+ * page_add_new_anon_rmap - add mapping to a new anonymous page
* @page: the page to add the mapping to
* @vma: the vm area in which the mapping is added
* @address: the user virtual address mapped
- * @compound: charge the page as compound or small page
+ *
+ * If it's a compound page, it is accounted as a compound page. As the page
+ * is new, it's assume to get mapped exclusively by a single process.
*
* Same as page_add_anon_rmap but must only be called on *new* pages.
* This means the inc-and-test can be bypassed.
* Page does not have to be locked.
*/
void page_add_new_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address, bool compound)
+ struct vm_area_struct *vma, unsigned long address)
{
+ const bool compound = PageCompound(page);
int nr = compound ? thp_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
@@ -1218,8 +1272,6 @@ void page_add_new_anon_rmap(struct page *page,
__mod_lruvec_page_state(page, NR_ANON_THPS, nr);
} else {
- /* Anon THP always mapped first with PMD */
- VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
@@ -1425,7 +1477,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
@@ -1481,59 +1533,81 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
subpage = folio_page(folio,
pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+ if (folio_test_hugetlb(folio)) {
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * The try_to_unmap() is only passed a hugetlb page
+ * in the case where the hugetlb page is poisoned.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- /*
- * huge_pmd_unshare unmapped an entire PMD
- * page. There is no way of knowing exactly
- * which PMDs may be cached for this mm, so
- * we must flush them all. start/end were
- * already adjusted above to cover this range.
- */
- flush_cache_range(vma, range.start, range.end);
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
+ VM_BUG_ON_PAGE(!PageHWPoison(subpage), subpage);
+ /*
+ * huge_pmd_unshare may unmap an entire PMD page.
+ * There is no way of knowing exactly which PMDs may
+ * be cached for this mm, so we must flush them all.
+ * start/end were already adjusted above to cover this
+ * range.
+ */
+ flush_cache_range(vma, range.start, range.end);
+ if (!folio_test_anon(folio)) {
/*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
*/
- page_vma_mapped_walk_done(&pvmw);
- break;
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
}
- }
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- if (should_defer_flush(mm, flags)) {
+ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+ } else {
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
/*
- * We clear the PTE but do not flush so potentially
- * a remote CPU could still be writing to the folio.
- * If the entry was previously clean then the
- * architecture must guarantee that a clear->dirty
- * transition on a cached TLB entry is written through
- * and traps if the PTE is unmapped.
+ * Nuke the page table entry. When having to clear
+ * PageAnonExclusive(), we always have to flush.
*/
- pteval = ptep_get_and_clear(mm, address, pvmw.pte);
+ if (should_defer_flush(mm, flags) && !anon_exclusive) {
+ /*
+ * We clear the PTE but do not flush so potentially
+ * a remote CPU could still be writing to the folio.
+ * If the entry was previously clean then the
+ * architecture must guarantee that a clear->dirty
+ * transition on a cached TLB entry is written through
+ * and traps if the PTE is unmapped.
+ */
+ pteval = ptep_get_and_clear(mm, address, pvmw.pte);
- set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
- } else {
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+ } else {
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
}
+ /*
+ * Now the pte is cleared. If this pte was uffd-wp armed,
+ * we may want to replace a none pte with a marker pte if
+ * it's file-backed, so we don't lose the tracking info.
+ */
+ pte_install_uffd_wp_if_needed(vma, address, pvmw.pte, pteval);
+
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
folio_mark_dirty(folio);
@@ -1637,11 +1711,31 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
break;
}
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
+ swap_free(entry);
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ swap_free(entry);
set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
}
+ /*
+ * Note: We *don't* remember if the page was mapped
+ * exclusively in the swap pte if the architecture
+ * doesn't support __HAVE_ARCH_PTE_SWP_EXCLUSIVE. In
+ * that case, swapin code has to re-determine that
+ * manually and might detect the page as possibly
+ * shared, for example, if there are other references on
+ * the page or if the page is under writeback. We made
+ * sure that there are no GUP pins on the page that
+ * would rely on it, so for GUP pins this is fine.
+ */
if (list_empty(&mm->mmlist)) {
spin_lock(&mmlist_lock);
if (list_empty(&mm->mmlist))
@@ -1651,6 +1745,8 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
dec_mm_counter(mm, MM_ANONPAGES);
inc_mm_counter(mm, MM_SWAPENTS);
swp_pte = swp_entry_to_pte(entry);
+ if (anon_exclusive)
+ swp_pte = pte_swp_mkexclusive(swp_pte);
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
@@ -1741,7 +1837,7 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
pte_t pteval;
struct page *subpage;
- bool ret = true;
+ bool anon_exclusive, ret = true;
struct mmu_notifier_range range;
enum ttu_flags flags = (enum ttu_flags)(long)arg;
@@ -1791,7 +1887,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
VM_BUG_ON_FOLIO(folio_test_hugetlb(folio) ||
!folio_test_pmd_mappable(folio), folio);
- set_pmd_migration_entry(&pvmw, subpage);
+ if (set_pmd_migration_entry(&pvmw, subpage)) {
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
continue;
}
#endif
@@ -1802,44 +1902,53 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
subpage = folio_page(folio,
pte_pfn(*pvmw.pte) - folio_pfn(folio));
address = pvmw.address;
+ anon_exclusive = folio_test_anon(folio) &&
+ PageAnonExclusive(subpage);
- if (folio_test_hugetlb(folio) && !folio_test_anon(folio)) {
+ if (folio_test_hugetlb(folio)) {
/*
- * To call huge_pmd_unshare, i_mmap_rwsem must be
- * held in write mode. Caller needs to explicitly
- * do this outside rmap routines.
+ * huge_pmd_unshare may unmap an entire PMD page.
+ * There is no way of knowing exactly which PMDs may
+ * be cached for this mm, so we must flush them all.
+ * start/end were already adjusted above to cover this
+ * range.
*/
- VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
- /*
- * huge_pmd_unshare unmapped an entire PMD
- * page. There is no way of knowing exactly
- * which PMDs may be cached for this mm, so
- * we must flush them all. start/end were
- * already adjusted above to cover this range.
- */
- flush_cache_range(vma, range.start, range.end);
- flush_tlb_range(vma, range.start, range.end);
- mmu_notifier_invalidate_range(mm, range.start,
- range.end);
+ flush_cache_range(vma, range.start, range.end);
+ if (!folio_test_anon(folio)) {
/*
- * The ref count of the PMD page was dropped
- * which is part of the way map counting
- * is done for shared PMDs. Return 'true'
- * here. When there is no other sharing,
- * huge_pmd_unshare returns false and we will
- * unmap the actual page and drop map count
- * to zero.
+ * To call huge_pmd_unshare, i_mmap_rwsem must be
+ * held in write mode. Caller needs to explicitly
+ * do this outside rmap routines.
*/
- page_vma_mapped_walk_done(&pvmw);
- break;
+ VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
+
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
+ flush_tlb_range(vma, range.start, range.end);
+ mmu_notifier_invalidate_range(mm, range.start,
+ range.end);
+
+ /*
+ * The ref count of the PMD page was dropped
+ * which is part of the way map counting
+ * is done for shared PMDs. Return 'true'
+ * here. When there is no other sharing,
+ * huge_pmd_unshare returns false and we will
+ * unmap the actual page and drop map count
+ * to zero.
+ */
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
}
- }
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
- pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ /* Nuke the hugetlb page table entry */
+ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte);
+ } else {
+ flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
+ /* Nuke the page table entry. */
+ pteval = ptep_clear_flush(vma, address, pvmw.pte);
+ }
/* Set the dirty flag on the folio now the pte is gone. */
if (pte_dirty(pteval))
@@ -1853,6 +1962,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_entry_t entry;
pte_t swp_pte;
+ if (anon_exclusive)
+ BUG_ON(page_try_share_anon_rmap(subpage));
+
/*
* Store the pfn of the page in a special migration
* pte. do_swap_page() will wait until the migration
@@ -1861,6 +1973,8 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
entry = pte_to_swp_entry(pteval);
if (is_writable_device_private_entry(entry))
entry = make_writable_migration_entry(pfn);
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(pfn);
else
entry = make_readable_migration_entry(pfn);
swp_pte = swp_entry_to_pte(entry);
@@ -1920,7 +2034,22 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
pte_t swp_pte;
if (arch_unmap_one(mm, vma, address, pteval) < 0) {
- set_pte_at(mm, address, pvmw.pte, pteval);
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ else
+ set_pte_at(mm, address, pvmw.pte, pteval);
+ ret = false;
+ page_vma_mapped_walk_done(&pvmw);
+ break;
+ }
+ VM_BUG_ON_PAGE(pte_write(pteval) && folio_test_anon(folio) &&
+ !anon_exclusive, subpage);
+ if (anon_exclusive &&
+ page_try_share_anon_rmap(subpage)) {
+ if (folio_test_hugetlb(folio))
+ set_huge_pte_at(mm, address, pvmw.pte, pteval);
+ else
+ set_pte_at(mm, address, pvmw.pte, pteval);
ret = false;
page_vma_mapped_walk_done(&pvmw);
break;
@@ -1934,6 +2063,9 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
if (pte_write(pteval))
entry = make_writable_migration_entry(
page_to_pfn(subpage));
+ else if (anon_exclusive)
+ entry = make_readable_exclusive_migration_entry(
+ page_to_pfn(subpage));
else
entry = make_readable_migration_entry(
page_to_pfn(subpage));
@@ -1943,7 +2075,11 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma,
swp_pte = pte_swp_mksoft_dirty(swp_pte);
if (pte_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
- set_pte_at(mm, address, pvmw.pte, swp_pte);
+ if (folio_test_hugetlb(folio))
+ set_huge_swap_pte_at(mm, address, pvmw.pte,
+ swp_pte, vma_mmu_pagesize(vma));
+ else
+ set_pte_at(mm, address, pvmw.pte, swp_pte);
trace_set_migration_pte(address, pte_val(swp_pte),
compound_order(&folio->page));
/*
@@ -2148,7 +2284,7 @@ static bool folio_make_device_exclusive(struct folio *folio,
/**
* make_device_exclusive_range() - Mark a range for exclusive use by a device
- * @mm: mm_struct of assoicated target process
+ * @mm: mm_struct of associated target process
* @start: start of the region to mark for exclusive device access
* @end: end address of region
* @pages: returns the pages which were successfully marked for exclusive access
@@ -2210,12 +2346,12 @@ void __put_anon_vma(struct anon_vma *anon_vma)
}
static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
- const struct rmap_walk_control *rwc)
+ struct rmap_walk_control *rwc)
{
struct anon_vma *anon_vma;
if (rwc->anon_lock)
- return rwc->anon_lock(folio);
+ return rwc->anon_lock(folio, rwc);
/*
* Note: remove_migration_ptes() cannot use folio_lock_anon_vma_read()
@@ -2227,7 +2363,17 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
if (!anon_vma)
return NULL;
+ if (anon_vma_trylock_read(anon_vma))
+ goto out;
+
+ if (rwc->try_lock) {
+ anon_vma = NULL;
+ rwc->contended = true;
+ goto out;
+ }
+
anon_vma_lock_read(anon_vma);
+out:
return anon_vma;
}
@@ -2241,7 +2387,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct folio *folio,
* contained in the anon_vma struct it points to.
*/
static void rmap_walk_anon(struct folio *folio,
- const struct rmap_walk_control *rwc, bool locked)
+ struct rmap_walk_control *rwc, bool locked)
{
struct anon_vma *anon_vma;
pgoff_t pgoff_start, pgoff_end;
@@ -2289,7 +2435,7 @@ static void rmap_walk_anon(struct folio *folio,
* contained in the address_space struct it points to.
*/
static void rmap_walk_file(struct folio *folio,
- const struct rmap_walk_control *rwc, bool locked)
+ struct rmap_walk_control *rwc, bool locked)
{
struct address_space *mapping = folio_mapping(folio);
pgoff_t pgoff_start, pgoff_end;
@@ -2308,8 +2454,18 @@ static void rmap_walk_file(struct folio *folio,
pgoff_start = folio_pgoff(folio);
pgoff_end = pgoff_start + folio_nr_pages(folio) - 1;
- if (!locked)
+ if (!locked) {
+ if (i_mmap_trylock_read(mapping))
+ goto lookup;
+
+ if (rwc->try_lock) {
+ rwc->contended = true;
+ return;
+ }
+
i_mmap_lock_read(mapping);
+ }
+lookup:
vma_interval_tree_foreach(vma, &mapping->i_mmap,
pgoff_start, pgoff_end) {
unsigned long address = vma_address(&folio->page, vma);
@@ -2331,7 +2487,7 @@ done:
i_mmap_unlock_read(mapping);
}
-void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc)
{
if (unlikely(folio_test_ksm(folio)))
rmap_walk_ksm(folio, rwc);
@@ -2342,7 +2498,7 @@ void rmap_walk(struct folio *folio, const struct rmap_walk_control *rwc)
}
/* Like rmap_walk, but caller holds relevant rmap lock */
-void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
+void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc)
{
/* no ksm support for now */
VM_BUG_ON_FOLIO(folio_test_ksm(folio), folio);
@@ -2357,9 +2513,11 @@ void rmap_walk_locked(struct folio *folio, const struct rmap_walk_control *rwc)
* The following two functions are for anonymous (private mapped) hugepages.
* Unlike common anonymous pages, anonymous hugepages have no accounting code
* and no lru code, because we handle hugepages differently from common pages.
+ *
+ * RMAP_COMPOUND is ignored.
*/
-void hugepage_add_anon_rmap(struct page *page,
- struct vm_area_struct *vma, unsigned long address)
+void hugepage_add_anon_rmap(struct page *page, struct vm_area_struct *vma,
+ unsigned long address, rmap_t flags)
{
struct anon_vma *anon_vma = vma->anon_vma;
int first;
@@ -2368,8 +2526,11 @@ void hugepage_add_anon_rmap(struct page *page,
BUG_ON(!anon_vma);
/* address might be in next vma when migration races vma_adjust */
first = atomic_inc_and_test(compound_mapcount_ptr(page));
+ VM_BUG_ON_PAGE(!first && (flags & RMAP_EXCLUSIVE), page);
+ VM_BUG_ON_PAGE(!first && PageAnonExclusive(page), page);
if (first)
- __page_set_anon_rmap(page, vma, address, 0);
+ __page_set_anon_rmap(page, vma, address,
+ !!(flags & RMAP_EXCLUSIVE));
}
void hugepage_add_new_anon_rmap(struct page *page,