mm: handling Non-LRU pages returned by vm_normal_pages

With DEVICE_COHERENT, we'll soon have vm_normal_pages() return device-managed anonymous pages that are not LRU pages. Although they behave like normal pages for purposes of mapping in CPU page, and for COW. They do not support LRU lists, NUMA migration or THP. Callers to follow_page() currently don't expect ZONE_DEVICE pages, however, with DEVICE_COHERENT we might now return ZONE_DEVICE. Check for ZONE_DEVICE pages in applicable users of follow_page() as well. Link: https://lkml.kernel.org/r/20220715150521.18165-5-alex.sierra@amd.com Signed-off-by: Alex Sierra <alex.sierra@amd.com> Acked-by: Felix Kuehling <Felix.Kuehling@amd.com> [v2] Reviewed-by: Alistair Popple <apopple@nvidia.com> [v6] Cc: Christoph Hellwig <hch@lst.de> Cc: David Hildenbrand <david@redhat.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Ralph Campbell <rcampbell@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Alex Sierra <alex.sierra@amd.com> 2022-07-15 10:05:11 -0500
committer: akpm <akpm@linux-foundation.org> 2022-07-17 17:14:28 -0700
commit: 3218f8712d6bba1812efd5e0d66c1e15134f2a91 (patch)
tree: 65d912c917fc8ba5a86cc1ada0c866f25f06216a /mm
parent: f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c (diff)
9 files changed, 26 insertions, 15 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 60d742c33de3..a563de8234c1 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2910,7 +2910,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start,
 
 		if (IS_ERR(page))
 			continue;
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		if (!is_transparent_hugepage(page))
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 01e0d6336754..dea102170ab3 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -611,7 +611,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 			goto out;
 		}
 		page = vm_normal_page(vma, address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out;
 		}
@@ -1261,7 +1261,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 			writable = true;
 
 		page = vm_normal_page(vma, _address, pteval);
-		if (unlikely(!page)) {
+		if (unlikely(!page) || unlikely(is_zone_device_page(page))) {
 			result = SCAN_PAGE_NULL;
 			goto out_unmap;
 		}
@@ -1472,7 +1472,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 			goto abort;
 
 		page = vm_normal_page(vma, addr, *pte);
-
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			page = NULL;
 		/*
 		 * Note that uprobe, debugger, or MAP_PRIVATE may change the
 		 * page table, but the new page will not be a subpage of hpage.
@@ -1490,6 +1491,8 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 		if (pte_none(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
+		if (WARN_ON_ONCE(page && is_zone_device_page(page)))
+			goto abort;
 		page_remove_rmap(page, vma, false);
 	}
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 8d2dc501c92c..55f1d9634869 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
 		cond_resched();
 		page = follow_page(vma, addr,
 				FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE);
-		if (IS_ERR_OR_NULL(page))
+		if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
 			break;
 		if (PageKsm(page))
 			ret = handle_mm_fault(vma, addr,
@@ -560,7 +560,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
 		goto out;
 
 	page = follow_page(vma, addr, FOLL_GET);
-	if (IS_ERR_OR_NULL(page))
+	if (IS_ERR_OR_NULL(page) || is_zone_device_page(page))
 		goto out;
 	if (PageAnon(page)) {
 		flush_anon_page(vma, page, addr);
@@ -2308,7 +2308,7 @@ next_mm:
 			if (ksm_test_exit(mm))
 				break;
 			*page = follow_page(vma, ksm_scan.address, FOLL_GET);
-			if (IS_ERR_OR_NULL(*page)) {
+			if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) {
 				ksm_scan.address += PAGE_SIZE;
 				cond_resched();
 				continue;
diff --git a/mm/madvise.c b/mm/madvise.c
index e55108d4e4b2..5f0f0948a50e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -421,7 +421,7 @@ regular_page:
 			continue;
 
 		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		/*
@@ -639,7 +639,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		}
 
 		page = vm_normal_page(vma, addr, ptent);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 
 		/*
diff --git a/mm/memory.c b/mm/memory.c
index 580c62febe42..dce0b2e686eb 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -624,6 +624,14 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
 		if (is_zero_pfn(pfn))
 			return NULL;
 		if (pte_devmap(pte))
+		/*
+		 * NOTE: New users of ZONE_DEVICE will not set pte_devmap()
+		 * and will have refcounts incremented on their struct pages
+		 * when they are inserted into PTEs, thus they are safe to
+		 * return here. Legacy ZONE_DEVICE pages that set pte_devmap()
+		 * do not have refcounts. Example of legacy ZONE_DEVICE is
+		 * MEMORY_DEVICE_FS_DAX type in pmem or virtio_fs drivers.
+		 */
 			return NULL;
 
 		print_bad_pte(vma, addr, pte, NULL);
@@ -4693,7 +4701,7 @@ static vm_fault_t do_numa_page(struct vm_fault *vmf)
 	pte = pte_modify(old_pte, vma->vm_page_prot);
 
 	page = vm_normal_page(vma, vmf->address, pte);
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		goto out_map;
 
 	/* TODO: handle PTE-mapped THP */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index f4cd963550c1..88a5173c6ff0 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -523,7 +523,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 		/*
 		 * vm_normal_page() filters out zero pages, but there might
diff --git a/mm/migrate.c b/mm/migrate.c
index 7934eebf1689..1649270bc1a7 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1630,7 +1630,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
 		goto out;
 
 	err = -ENOENT;
-	if (!page)
+	if (!page || is_zone_device_page(page))
 		goto out;
 
 	err = 0;
@@ -1821,7 +1821,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
 		if (IS_ERR(page))
 			goto set_status;
 
-		if (page) {
+		if (page && !is_zone_device_page(page)) {
 			err = page_to_nid(page);
 			put_page(page);
 		} else {
diff --git a/mm/mlock.c b/mm/mlock.c
index 716caf851043..b14e929084cc 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -333,7 +333,7 @@ static int mlock_pte_range(pmd_t *pmd, unsigned long addr,
 		if (!pte_present(*pte))
 			continue;
 		page = vm_normal_page(vma, addr, *pte);
-		if (!page)
+		if (!page || is_zone_device_page(page))
 			continue;
 		if (PageTransCompound(page))
 			continue;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 996a97e213ad..5ef478b06a7d 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -127,7 +127,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 					continue;
 
 				page = vm_normal_page(vma, addr, oldpte);
-				if (!page || PageKsm(page))
+				if (!page || is_zone_device_page(page) || PageKsm(page))
 					continue;
 
 				/* Also skip shared copy-on-write pages */
author	Alex Sierra <alex.sierra@amd.com>	2022-07-15 10:05:11 -0500
committer	akpm <akpm@linux-foundation.org>	2022-07-17 17:14:28 -0700
commit	3218f8712d6bba1812efd5e0d66c1e15134f2a91 (patch)
tree	65d912c917fc8ba5a86cc1ada0c866f25f06216a /mm
parent	f25cbb7a95a24ff9a2a3bebd308e303942ae6b2c (diff)