mm: support GUP-triggered unsharing of anonymous pages

Whenever GUP currently ends up taking a R/O pin on an anonymous page that might be shared -- mapped R/O and !PageAnonExclusive() -- any write fault on the page table entry will end up replacing the mapped anonymous page due to COW, resulting in the GUP pin no longer being consistent with the page actually mapped into the page table. The possible ways to deal with this situation are: (1) Ignore and pin -- what we do right now. (2) Fail to pin -- which would be rather surprising to callers and could break user space. (3) Trigger unsharing and pin the now exclusive page -- reliable R/O pins. We want to implement 3) because it provides the clearest semantics and allows for checking in unpin_user_pages() and friends for possible BUGs: when trying to unpin a page that's no longer exclusive, clearly something went very wrong and might result in memory corruptions that might be hard to debug. So we better have a nice way to spot such issues. To implement 3), we need a way for GUP to trigger unsharing: FAULT_FLAG_UNSHARE. FAULT_FLAG_UNSHARE is only applicable to R/O mapped anonymous pages and resembles COW logic during a write fault. However, in contrast to a write fault, GUP-triggered unsharing will, for example, still maintain the write protection. Let's implement FAULT_FLAG_UNSHARE by hooking into the existing write fault handlers for all applicable anonymous page types: ordinary pages, THP and hugetlb. * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that has been marked exclusive in the meantime by someone else, there is nothing to do. * If FAULT_FLAG_UNSHARE finds a R/O-mapped anonymous page that's not marked exclusive, it will try detecting if the process is the exclusive owner. If exclusive, it can be set exclusive similar to reuse logic during write faults via page_move_anon_rmap() and there is nothing else to do; otherwise, we either have to copy and map a fresh, anonymous exclusive page R/O (ordinary pages, hugetlb), or split the THP. This commit is heavily based on patches by Andrea. Link: https://lkml.kernel.org/r/20220428083441.37290-16-david@redhat.com Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: David Hildenbrand <david@redhat.com> Acked-by: Vlastimil Babka <vbabka@suse.cz> Co-developed-by: Andrea Arcangeli <aarcange@redhat.com> Cc: Christoph Hellwig <hch@lst.de> Cc: David Rientjes <rientjes@google.com> Cc: Don Dutile <ddutile@redhat.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: Jann Horn <jannh@google.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Khalid Aziz <khalid.aziz@oracle.com> Cc: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com> Cc: Liang Zhang <zhangliang5@huawei.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Michal Hocko <mhocko@kernel.org> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport <rppt@linux.ibm.com> Cc: Nadav Amit <namit@vmware.com> Cc: Oded Gabbay <oded.gabbay@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pedro Demarchi Gomes <pedrodemargomes@gmail.com> Cc: Peter Xu <peterx@redhat.com> Cc: Rik van Riel <riel@surriel.com> Cc: Roman Gushchin <guro@fb.com> Cc: Shakeel Butt <shakeelb@google.com> Cc: Yang Shi <shy828301@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: David Hildenbrand <david@redhat.com> 2022-05-09 18:20:45 -0700
committer: akpm <akpm@linux-foundation.org> 2022-05-09 18:20:45 -0700
commit: c89357e27f20dda3fff6791d27bb6c91eae99f4a (patch)
tree: 82fa0d770531b803f7df242b6e8ec8cc849a68aa /mm/memory.c
parent: 8909691b6c5a84b67573b23ee8bb917b005628f0 (diff)
1 files changed, 74 insertions, 33 deletions
diff --git a/mm/memory.c b/mm/memory.c
index 454ecc05ad85..a75040a47fcc 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2745,8 +2745,8 @@ static inline int pte_unmap_same(struct vm_fault *vmf)
 	return same;
 }
 
-static inline bool cow_user_page(struct page *dst, struct page *src,
-				 struct vm_fault *vmf)
+static inline bool __wp_page_copy_user(struct page *dst, struct page *src,
+				       struct vm_fault *vmf)
 {
 	bool ret;
 	void *kaddr;
@@ -2954,6 +2954,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 	struct page *page = vmf->page;
 	pte_t entry;
 
+	VM_BUG_ON(!(vmf->flags & FAULT_FLAG_WRITE));
 	VM_BUG_ON(PageAnon(page) && !PageAnonExclusive(page));
 
 	/*
@@ -2974,7 +2975,8 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
 }
 
 /*
- * Handle the case of a page which we actually need to copy to a new page.
+ * Handle the case of a page which we actually need to copy to a new page,
+ * either due to COW or unsharing.
  *
  * Called with mmap_lock locked and the old page referenced, but
  * without the ptl held.
@@ -2991,6 +2993,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
  */
 static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 {
+	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
 	struct vm_area_struct *vma = vmf->vma;
 	struct mm_struct *mm = vma->vm_mm;
 	struct page *old_page = vmf->page;
@@ -3013,7 +3016,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		if (!new_page)
 			goto oom;
 
-		if (!cow_user_page(new_page, old_page, vmf)) {
+		if (!__wp_page_copy_user(new_page, old_page, vmf)) {
 			/*
 			 * COW failed, if the fault was solved by other,
 			 * it's fine. If not, userspace would re-fault on
@@ -3055,7 +3058,14 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = pte_sw_mkyoung(entry);
-		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		if (unlikely(unshare)) {
+			if (pte_soft_dirty(vmf->orig_pte))
+				entry = pte_mksoft_dirty(entry);
+			if (pte_uffd_wp(vmf->orig_pte))
+				entry = pte_mkuffd_wp(entry);
+		} else {
+			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		}
 
 		/*
 		 * Clear the pte entry and flush it first, before updating the
@@ -3072,6 +3082,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 		 * mmu page tables (such as kvm shadow page tables), we want the
 		 * new page to be mapped directly into the secondary page table.
 		 */
+		BUG_ON(unshare && pte_write(entry));
 		set_pte_at_notify(mm, vmf->address, vmf->pte, entry);
 		update_mmu_cache(vma, vmf->address, vmf->pte);
 		if (old_page) {
@@ -3121,7 +3132,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
 			free_swap_cache(old_page);
 		put_page(old_page);
 	}
-	return page_copied ? VM_FAULT_WRITE : 0;
+	return (page_copied && !unshare) ? VM_FAULT_WRITE : 0;
 oom_free_new:
 	put_page(new_page);
 oom:
@@ -3221,18 +3232,22 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 }
 
 /*
- * This routine handles present pages, when users try to write
- * to a shared page. It is done by copying the page to a new address
- * and decrementing the shared-page counter for the old page.
+ * This routine handles present pages, when
+ * * users try to write to a shared page (FAULT_FLAG_WRITE)
+ * * GUP wants to take a R/O pin on a possibly shared anonymous page
+ *   (FAULT_FLAG_UNSHARE)
+ *
+ * It is done by copying the page to a new address and decrementing the
+ * shared-page counter for the old page.
  *
  * Note that this routine assumes that the protection checks have been
  * done by the caller (the low-level page fault routine in most cases).
- * Thus we can safely just mark it writable once we've done any necessary
- * COW.
+ * Thus, with FAULT_FLAG_WRITE, we can safely just mark it writable once we've
+ * done any necessary COW.
  *
- * We also mark the page dirty at this point even though the page will
- * change only once the write actually happens. This avoids a few races,
- * and potentially makes it more efficient.
+ * In case of FAULT_FLAG_WRITE, we also mark the page dirty at this point even
+ * though the page will change only once the write actually happens. This
+ * avoids a few races, and potentially makes it more efficient.
  *
  * We enter with non-exclusive mmap_lock (to exclude vma changes,
  * but allow concurrent faults), with pte both mapped and locked.
@@ -3241,23 +3256,35 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
 static vm_fault_t do_wp_page(struct vm_fault *vmf)
 	__releases(vmf->ptl)
 {
+	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
 	struct vm_area_struct *vma = vmf->vma;
 
-	if (userfaultfd_pte_wp(vma, *vmf->pte)) {
-		pte_unmap_unlock(vmf->pte, vmf->ptl);
-		return handle_userfault(vmf, VM_UFFD_WP);
-	}
+	VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE));
+	VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE));
 
-	/*
-	 * Userfaultfd write-protect can defer flushes. Ensure the TLB
-	 * is flushed in this case before copying.
-	 */
-	if (unlikely(userfaultfd_wp(vmf->vma) &&
-		     mm_tlb_flush_pending(vmf->vma->vm_mm)))
-		flush_tlb_page(vmf->vma, vmf->address);
+	if (likely(!unshare)) {
+		if (userfaultfd_pte_wp(vma, *vmf->pte)) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			return handle_userfault(vmf, VM_UFFD_WP);
+		}
+
+		/*
+		 * Userfaultfd write-protect can defer flushes. Ensure the TLB
+		 * is flushed in this case before copying.
+		 */
+		if (unlikely(userfaultfd_wp(vmf->vma) &&
+			     mm_tlb_flush_pending(vmf->vma->vm_mm)))
+			flush_tlb_page(vmf->vma, vmf->address);
+	}
 
 	vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte);
 	if (!vmf->page) {
+		if (unlikely(unshare)) {
+			/* No anonymous page -> nothing to do. */
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			return 0;
+		}
+
 		/*
 		 * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a
 		 * VM_PFNMAP VMA.
@@ -3320,8 +3347,16 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
 		page_move_anon_rmap(page, vma);
 		unlock_page(page);
 reuse:
+		if (unlikely(unshare)) {
+			pte_unmap_unlock(vmf->pte, vmf->ptl);
+			return 0;
+		}
 		wp_page_reuse(vmf);
 		return VM_FAULT_WRITE;
+	} else if (unshare) {
+		/* No anonymous page -> nothing to do. */
+		pte_unmap_unlock(vmf->pte, vmf->ptl);
+		return 0;
 	} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
 					(VM_WRITE|VM_SHARED))) {
 		return wp_page_shared(vmf);
@@ -4523,8 +4558,11 @@ static inline vm_fault_t create_huge_pmd(struct vm_fault *vmf)
 /* `inline' is required to avoid gcc 4.1.2 build error */
 static inline vm_fault_t wp_huge_pmd(struct vm_fault *vmf)
 {
+	const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE;
+
 	if (vma_is_anonymous(vmf->vma)) {
-		if (userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
+		if (likely(!unshare) &&
+		    userfaultfd_huge_pmd_wp(vmf->vma, vmf->orig_pmd))
 			return handle_userfault(vmf, VM_UFFD_WP);
 		return do_huge_pmd_wp_page(vmf);
 	}
@@ -4659,10 +4697,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
 		goto unlock;
 	}
-	if (vmf->flags & FAULT_FLAG_WRITE) {
+	if (vmf->flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) {
 		if (!pte_write(entry))
 			return do_wp_page(vmf);
-		entry = pte_mkdirty(entry);
+		else if (likely(vmf->flags & FAULT_FLAG_WRITE))
+			entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 	if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
@@ -4703,7 +4742,6 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
 		.pgoff = linear_page_index(vma, address),
 		.gfp_mask = __get_fault_gfp_mask(vma),
 	};
-	unsigned int dirty = flags & FAULT_FLAG_WRITE;
 	struct mm_struct *mm = vma->vm_mm;
 	pgd_t *pgd;
 	p4d_t *p4d;
@@ -4728,9 +4766,11 @@ retry_pud:
 		barrier();
 		if (pud_trans_huge(orig_pud) || pud_devmap(orig_pud)) {
 
-			/* NUMA case for anonymous PUDs would go here */
-
-			if (dirty && !pud_write(orig_pud)) {
+			/*
+			 * TODO once we support anonymous PUDs: NUMA case and
+			 * FAULT_FLAG_UNSHARE handling.
+			 */
+			if ((flags & FAULT_FLAG_WRITE) && !pud_write(orig_pud)) {
 				ret = wp_huge_pud(&vmf, orig_pud);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
@@ -4768,7 +4808,8 @@ retry_pud:
 			if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
 				return do_huge_pmd_numa_page(&vmf);
 
-			if (dirty && !pmd_write(vmf.orig_pmd)) {
+			if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
+			    !pmd_write(vmf.orig_pmd)) {
 				ret = wp_huge_pmd(&vmf);
 				if (!(ret & VM_FAULT_FALLBACK))
 					return ret;
author	David Hildenbrand <david@redhat.com>	2022-05-09 18:20:45 -0700
committer	akpm <akpm@linux-foundation.org>	2022-05-09 18:20:45 -0700
commit	c89357e27f20dda3fff6791d27bb6c91eae99f4a (patch)
tree	82fa0d770531b803f7df242b6e8ec8cc849a68aa /mm/memory.c
parent	8909691b6c5a84b67573b23ee8bb917b005628f0 (diff)