nouveau/dmem: evict device private memory during release

When the module is unloaded or a GPU is unbound from the module it is possible for device private pages to still be mapped in currently running processes. This can lead to a hangs and RCU stall warnings when unbinding the device as memunmap_pages() will wait in an uninterruptible state until all device pages have been freed which may never happen. Fix this by migrating device mappings back to normal CPU memory prior to freeing the GPU memory chunks and associated device private pages. Link: https://lkml.kernel.org/r/66277601fb8fda9af408b33da9887192bf895bda.1664366292.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple <apopple@nvidia.com> Cc: Lyude Paul <lyude@redhat.com> Cc: Ben Skeggs <bskeggs@redhat.com> Cc: Ralph Campbell <rcampbell@nvidia.com> Cc: John Hubbard <jhubbard@nvidia.com> Cc: Alex Deucher <alexander.deucher@amd.com> Cc: Alex Sierra <alex.sierra@amd.com> Cc: Christian König <christian.koenig@amd.com> Cc: Dan Williams <dan.j.williams@intel.com> Cc: David Hildenbrand <david@redhat.com> Cc: Felix Kuehling <Felix.Kuehling@amd.com> Cc: "Huang, Ying" <ying.huang@intel.com> Cc: Jason Gunthorpe <jgg@nvidia.com> Cc: Matthew Wilcox <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: Yang Shi <shy828301@gmail.com> Cc: Zi Yan <ziy@nvidia.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
author: Alistair Popple <apopple@nvidia.com> 2022-09-28 22:01:21 +1000
committer: Andrew Morton <akpm@linux-foundation.org> 2022-10-12 18:51:50 -0700
commit: 249881232e1471d28b68f9a3829acc14d150cf5d (patch)
tree: ee662cc138106e06323c0f01658f86178bb9dfb3 /drivers
parent: d9b719394a1147614351961ac454589111c76e76 (diff)
1 files changed, 48 insertions, 0 deletions
diff --git a/drivers/gpu/drm/nouveau/nouveau_dmem.c b/drivers/gpu/drm/nouveau/nouveau_dmem.c
index 65f51fb6a70c..5fe209107246 100644
--- a/drivers/gpu/drm/nouveau/nouveau_dmem.c
+++ b/drivers/gpu/drm/nouveau/nouveau_dmem.c
@@ -367,6 +367,52 @@ nouveau_dmem_suspend(struct nouveau_drm *drm)
 	mutex_unlock(&drm->dmem->mutex);
 }
 
+/*
+ * Evict all pages mapping a chunk.
+ */
+static void
+nouveau_dmem_evict_chunk(struct nouveau_dmem_chunk *chunk)
+{
+	unsigned long i, npages = range_len(&chunk->pagemap.range) >> PAGE_SHIFT;
+	unsigned long *src_pfns, *dst_pfns;
+	dma_addr_t *dma_addrs;
+	struct nouveau_fence *fence;
+
+	src_pfns = kcalloc(npages, sizeof(*src_pfns), GFP_KERNEL);
+	dst_pfns = kcalloc(npages, sizeof(*dst_pfns), GFP_KERNEL);
+	dma_addrs = kcalloc(npages, sizeof(*dma_addrs), GFP_KERNEL);
+
+	migrate_device_range(src_pfns, chunk->pagemap.range.start >> PAGE_SHIFT,
+			npages);
+
+	for (i = 0; i < npages; i++) {
+		if (src_pfns[i] & MIGRATE_PFN_MIGRATE) {
+			struct page *dpage;
+
+			/*
+			 * _GFP_NOFAIL because the GPU is going away and there
+			 * is nothing sensible we can do if we can't copy the
+			 * data back.
+			 */
+			dpage = alloc_page(GFP_HIGHUSER | __GFP_NOFAIL);
+			dst_pfns[i] = migrate_pfn(page_to_pfn(dpage));
+			nouveau_dmem_copy_one(chunk->drm,
+					migrate_pfn_to_page(src_pfns[i]), dpage,
+					&dma_addrs[i]);
+		}
+	}
+
+	nouveau_fence_new(chunk->drm->dmem->migrate.chan, false, &fence);
+	migrate_device_pages(src_pfns, dst_pfns, npages);
+	nouveau_dmem_fence_done(&fence);
+	migrate_device_finalize(src_pfns, dst_pfns, npages);
+	kfree(src_pfns);
+	kfree(dst_pfns);
+	for (i = 0; i < npages; i++)
+		dma_unmap_page(chunk->drm->dev->dev, dma_addrs[i], PAGE_SIZE, DMA_BIDIRECTIONAL);
+	kfree(dma_addrs);
+}
+
 void
 nouveau_dmem_fini(struct nouveau_drm *drm)
 {
@@ -378,8 +424,10 @@ nouveau_dmem_fini(struct nouveau_drm *drm)
 	mutex_lock(&drm->dmem->mutex);
 
 	list_for_each_entry_safe(chunk, tmp, &drm->dmem->chunks, list) {
+		nouveau_dmem_evict_chunk(chunk);
 		nouveau_bo_unpin(chunk->bo);
 		nouveau_bo_ref(NULL, &chunk->bo);
+		WARN_ON(chunk->callocated);
 		list_del(&chunk->list);
 		memunmap_pages(&chunk->pagemap);
 		release_mem_region(chunk->pagemap.range.start,
author	Alistair Popple <apopple@nvidia.com>	2022-09-28 22:01:21 +1000
committer	Andrew Morton <akpm@linux-foundation.org>	2022-10-12 18:51:50 -0700
commit	249881232e1471d28b68f9a3829acc14d150cf5d (patch)
tree	ee662cc138106e06323c0f01658f86178bb9dfb3 /drivers
parent	d9b719394a1147614351961ac454589111c76e76 (diff)