diff options
author | Dave Airlie <airlied@redhat.com> | 2019-06-21 12:06:03 +1000 |
---|---|---|
committer | Dave Airlie <airlied@redhat.com> | 2019-06-21 12:18:16 +1000 |
commit | 031e610a6a21448a63dff7a0416e5e206724caac (patch) | |
tree | 3ae658895daeda054e9572c5583d2820b4d3f324 /mm | |
parent | 52d2d44eee8091e740d0d275df1311fb8373c9a9 (diff) | |
parent | 9bbfda544ed79e8e9abde27bfe2c85428d582e7b (diff) |
Merge branch 'vmwgfx-next' of git://people.freedesktop.org/~thomash/linux into drm-next
- The coherent memory changes including mm changes.
- Some vmwgfx debug fixes.
- Removal of vmwgfx legacy security checks.
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Thomas Hellstrom <VMware> <thomas@shipmail.org>
Link: https://patchwork.freedesktop.org/patch/msgid/20190619072531.4026-1-thomas@shipmail.org
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 3 | ||||
-rw-r--r-- | mm/Makefile | 1 | ||||
-rw-r--r-- | mm/as_dirty_helpers.c | 300 | ||||
-rw-r--r-- | mm/memory.c | 145 |
4 files changed, 413 insertions, 36 deletions
diff --git a/mm/Kconfig b/mm/Kconfig index f0c76ba47695..5006d0e6a5c7 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -765,4 +765,7 @@ config GUP_BENCHMARK config ARCH_HAS_PTE_SPECIAL bool +config AS_DIRTY_HELPERS + bool + endmenu diff --git a/mm/Makefile b/mm/Makefile index ac5e5ba78874..f5d412bbc2f7 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -104,3 +104,4 @@ obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o obj-$(CONFIG_MEMFD_CREATE) += memfd.o +obj-$(CONFIG_AS_DIRTY_HELPERS) += as_dirty_helpers.o diff --git a/mm/as_dirty_helpers.c b/mm/as_dirty_helpers.c new file mode 100644 index 000000000000..f600e31534fb --- /dev/null +++ b/mm/as_dirty_helpers.c @@ -0,0 +1,300 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/mm.h> +#include <linux/mm_types.h> +#include <linux/hugetlb.h> +#include <linux/bitops.h> +#include <linux/mmu_notifier.h> +#include <asm/cacheflush.h> +#include <asm/tlbflush.h> + +/** + * struct apply_as - Closure structure for apply_as_range + * @base: struct pfn_range_apply we derive from + * @start: Address of first modified pte + * @end: Address of last modified pte + 1 + * @total: Total number of modified ptes + * @vma: Pointer to the struct vm_area_struct we're currently operating on + */ +struct apply_as { + struct pfn_range_apply base; + unsigned long start; + unsigned long end; + unsigned long total; + struct vm_area_struct *vma; +}; + +/** + * apply_pt_wrprotect - Leaf pte callback to write-protect a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as + * + * The function write-protects a pte and records the range in + * virtual address space of touched ptes for efficient range TLB flushes. + * + * Return: Always zero. + */ +static int apply_pt_wrprotect(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + pte_t ptent = *pte; + + if (pte_write(ptent)) { + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_wrprotect(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + } + + return 0; +} + +/** + * struct apply_as_clean - Closure structure for apply_as_clean + * @base: struct apply_as we derive from + * @bitmap_pgoff: Address_space Page offset of the first bit in @bitmap + * @bitmap: Bitmap with one bit for each page offset in the address_space range + * covered. + * @start: Address_space page offset of first modified pte relative + * to @bitmap_pgoff + * @end: Address_space page offset of last modified pte relative + * to @bitmap_pgoff + */ +struct apply_as_clean { + struct apply_as base; + pgoff_t bitmap_pgoff; + unsigned long *bitmap; + pgoff_t start; + pgoff_t end; +}; + +/** + * apply_pt_clean - Leaf pte callback to clean a pte + * @pte: Pointer to the pte + * @token: Page table token, see apply_to_pfn_range() + * @addr: The virtual page address + * @closure: Pointer to a struct pfn_range_apply embedded in a + * struct apply_as_clean + * + * The function cleans a pte and records the range in + * virtual address space of touched ptes for efficient TLB flushes. + * It also records dirty ptes in a bitmap representing page offsets + * in the address_space, as well as the first and last of the bits + * touched. + * + * Return: Always zero. + */ +static int apply_pt_clean(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *closure) +{ + struct apply_as *aas = container_of(closure, typeof(*aas), base); + struct apply_as_clean *clean = container_of(aas, typeof(*clean), base); + pte_t ptent = *pte; + + if (pte_dirty(ptent)) { + pgoff_t pgoff = ((addr - aas->vma->vm_start) >> PAGE_SHIFT) + + aas->vma->vm_pgoff - clean->bitmap_pgoff; + pte_t old_pte = ptep_modify_prot_start(aas->vma, addr, pte); + + ptent = pte_mkclean(old_pte); + ptep_modify_prot_commit(aas->vma, addr, pte, old_pte, ptent); + + aas->total++; + aas->start = min(aas->start, addr); + aas->end = max(aas->end, addr + PAGE_SIZE); + + __set_bit(pgoff, clean->bitmap); + clean->start = min(clean->start, pgoff); + clean->end = max(clean->end, pgoff + 1); + } + + return 0; +} + +/** + * apply_as_range - Apply a pte callback to all PTEs pointing into a range + * of an address_space. + * @mapping: Pointer to the struct address_space + * @aas: Closure structure + * @first_index: First page offset in the address_space + * @nr: Number of incremental page offsets to cover + * + * Return: Number of ptes touched. Note that this number might be larger + * than @nr if there are overlapping vmas + */ +static unsigned long apply_as_range(struct address_space *mapping, + struct apply_as *aas, + pgoff_t first_index, pgoff_t nr) +{ + struct vm_area_struct *vma; + pgoff_t vba, vea, cba, cea; + unsigned long start_addr, end_addr; + struct mmu_notifier_range range; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, + first_index + nr - 1) { + unsigned long vm_flags = READ_ONCE(vma->vm_flags); + + /* + * We can only do advisory flag tests below, since we can't + * require the vm's mmap_sem to be held to protect the flags. + * Therefore, callers that strictly depend on specific mmap + * flags to remain constant throughout the operation must + * either ensure those flags are immutable for all relevant + * vmas or can't use this function. Fixing this properly would + * require the vma::vm_flags to be protected by a separate + * lock taken after the i_mmap_lock + */ + + /* Skip non-applicable VMAs */ + if ((vm_flags & (VM_SHARED | VM_WRITE)) != + (VM_SHARED | VM_WRITE)) + continue; + + /* Warn on and skip VMAs whose flags indicate illegal usage */ + if (WARN_ON((vm_flags & (VM_HUGETLB | VM_IO)) != VM_IO)) + continue; + + /* Clip to the vma */ + vba = vma->vm_pgoff; + vea = vba + vma_pages(vma); + cba = first_index; + cba = max(cba, vba); + cea = first_index + nr; + cea = min(cea, vea); + + /* Translate to virtual address */ + start_addr = ((cba - vba) << PAGE_SHIFT) + vma->vm_start; + end_addr = ((cea - vba) << PAGE_SHIFT) + vma->vm_start; + if (start_addr >= end_addr) + continue; + + aas->base.mm = vma->vm_mm; + aas->vma = vma; + aas->start = end_addr; + aas->end = start_addr; + + mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, + vma, vma->vm_mm, start_addr, end_addr); + mmu_notifier_invalidate_range_start(&range); + + /* Needed when we only change protection? */ + flush_cache_range(vma, start_addr, end_addr); + + /* + * We're not using tlb_gather_mmu() since typically + * only a small subrange of PTEs are affected. + */ + inc_tlb_flush_pending(vma->vm_mm); + + /* Should not error since aas->base.alloc == 0 */ + WARN_ON(apply_to_pfn_range(&aas->base, start_addr, + end_addr - start_addr)); + if (aas->end > aas->start) + flush_tlb_range(vma, aas->start, aas->end); + + mmu_notifier_invalidate_range_end(&range); + dec_tlb_flush_pending(vma->vm_mm); + } + i_mmap_unlock_read(mapping); + + return aas->total; +} + +/** + * apply_as_wrprotect - Write-protect all ptes in an address_space range + * @mapping: The address_space we want to write protect + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of ptes actually write-protected. Note that + * already write-protected ptes are not counted. + */ +unsigned long apply_as_wrprotect(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr) +{ + struct apply_as aas = { + .base = { + .alloc = 0, + .ptefn = apply_pt_wrprotect, + }, + .total = 0, + }; + + return apply_as_range(mapping, &aas, first_index, nr); +} +EXPORT_SYMBOL_GPL(apply_as_wrprotect); + +/** + * apply_as_clean - Clean all ptes in an address_space range + * @mapping: The address_space we want to clean + * @first_index: The first page offset in the range + * @nr: Number of incremental page offsets to cover + * @bitmap_pgoff: The page offset of the first bit in @bitmap + * @bitmap: Pointer to a bitmap of at least @nr bits. The bitmap needs to + * cover the whole range @first_index..@first_index + @nr. + * @start: Pointer to number of the first set bit in @bitmap. + * is modified as new bits are set by the function. + * @end: Pointer to the number of the last set bit in @bitmap. + * none set. The value is modified as new bits are set by the function. + * + * Note: When this function returns there is no guarantee that a CPU has + * not already dirtied new ptes. However it will not clean any ptes not + * reported in the bitmap. + * + * If a caller needs to make sure all dirty ptes are picked up and none + * additional are added, it first needs to write-protect the address-space + * range and make sure new writers are blocked in page_mkwrite() or + * pfn_mkwrite(). And then after a TLB flush following the write-protection + * pick up all dirty bits. + * + * WARNING: This function should only be used for address spaces whose + * vmas are marked VM_IO and that do not contain huge pages. + * To avoid interference with COW'd pages, vmas not marked VM_SHARED are + * simply skipped. + * + * Return: The number of dirty ptes actually cleaned. + */ +unsigned long apply_as_clean(struct address_space *mapping, + pgoff_t first_index, pgoff_t nr, + pgoff_t bitmap_pgoff, + unsigned long *bitmap, + pgoff_t *start, + pgoff_t *end) +{ + bool none_set = (*start >= *end); + struct apply_as_clean clean = { + .base = { + .base = { + .alloc = 0, + .ptefn = apply_pt_clean, + }, + .total = 0, + }, + .bitmap_pgoff = bitmap_pgoff, + .bitmap = bitmap, + .start = none_set ? nr : *start, + .end = none_set ? 0 : *end, + }; + unsigned long ret = apply_as_range(mapping, &clean.base, first_index, + nr); + + *start = clean.start; + *end = clean.end; + return ret; +} +EXPORT_SYMBOL_GPL(apply_as_clean); diff --git a/mm/memory.c b/mm/memory.c index ddf20bd0c317..462aa47f8878 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2032,18 +2032,17 @@ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long } EXPORT_SYMBOL(vm_iomap_memory); -static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pte_range(struct pfn_range_apply *closure, pmd_t *pmd, + unsigned long addr, unsigned long end) { pte_t *pte; int err; pgtable_t token; spinlock_t *uninitialized_var(ptl); - pte = (mm == &init_mm) ? + pte = (closure->mm == &init_mm) ? pte_alloc_kernel(pmd, addr) : - pte_alloc_map_lock(mm, pmd, addr, &ptl); + pte_alloc_map_lock(closure->mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; @@ -2054,86 +2053,109 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte++, token, addr, data); + err = closure->ptefn(pte++, token, addr, closure); if (err) break; } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - if (mm != &init_mm) + if (closure->mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } -static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pmd_range(struct pfn_range_apply *closure, pud_t *pud, + unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; - int err; + int err = 0; BUG_ON(pud_huge(*pud)); - pmd = pmd_alloc(mm, pud, addr); + pmd = pmd_alloc(closure->mm, pud, addr); if (!pmd) return -ENOMEM; + do { next = pmd_addr_end(addr, end); - err = apply_to_pte_range(mm, pmd, addr, next, fn, data); + if (!closure->alloc && pmd_none_or_clear_bad(pmd)) + continue; + err = apply_to_pte_range(closure, pmd, addr, next); if (err) break; } while (pmd++, addr = next, addr != end); return err; } -static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_pud_range(struct pfn_range_apply *closure, p4d_t *p4d, + unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; - int err; + int err = 0; - pud = pud_alloc(mm, p4d, addr); + pud = pud_alloc(closure->mm, p4d, addr); if (!pud) return -ENOMEM; + do { next = pud_addr_end(addr, end); - err = apply_to_pmd_range(mm, pud, addr, next, fn, data); + if (!closure->alloc && pud_none_or_clear_bad(pud)) + continue; + err = apply_to_pmd_range(closure, pud, addr, next); if (err) break; } while (pud++, addr = next, addr != end); return err; } -static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, - unsigned long addr, unsigned long end, - pte_fn_t fn, void *data) +static int apply_to_p4d_range(struct pfn_range_apply *closure, pgd_t *pgd, + unsigned long addr, unsigned long end) { p4d_t *p4d; unsigned long next; - int err; + int err = 0; - p4d = p4d_alloc(mm, pgd, addr); + p4d = p4d_alloc(closure->mm, pgd, addr); if (!p4d) return -ENOMEM; + do { next = p4d_addr_end(addr, end); - err = apply_to_pud_range(mm, p4d, addr, next, fn, data); + if (!closure->alloc && p4d_none_or_clear_bad(p4d)) + continue; + err = apply_to_pud_range(closure, p4d, addr, next); if (err) break; } while (p4d++, addr = next, addr != end); return err; } -/* - * Scan a region of virtual memory, filling in page tables as necessary - * and calling a provided function on each leaf page table. +/** + * apply_to_pfn_range - Scan a region of virtual memory, calling a provided + * function on each leaf page table entry + * @closure: Details about how to scan and what function to apply + * @addr: Start virtual address + * @size: Size of the region + * + * If @closure->alloc is set to 1, the function will fill in the page table + * as necessary. Otherwise it will skip non-present parts. + * Note: The caller must ensure that the range does not contain huge pages. + * The caller must also assure that the proper mmu_notifier functions are + * called before and after the call to apply_to_pfn_range. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. It is lacking support for huge pages and transparent huge pages. + * + * Return: Zero on success. If the provided function returns a non-zero status, + * the page table walk will terminate and that status will be returned. + * If @closure->alloc is set to 1, then this function may also return memory + * allocation errors arising from allocating page table memory. */ -int apply_to_page_range(struct mm_struct *mm, unsigned long addr, - unsigned long size, pte_fn_t fn, void *data) +int apply_to_pfn_range(struct pfn_range_apply *closure, + unsigned long addr, unsigned long size) { pgd_t *pgd; unsigned long next; @@ -2143,16 +2165,65 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (WARN_ON(addr >= end)) return -EINVAL; - pgd = pgd_offset(mm, addr); + pgd = pgd_offset(closure->mm, addr); do { next = pgd_addr_end(addr, end); - err = apply_to_p4d_range(mm, pgd, addr, next, fn, data); + if (!closure->alloc && pgd_none_or_clear_bad(pgd)) + continue; + err = apply_to_p4d_range(closure, pgd, addr, next); if (err) break; } while (pgd++, addr = next, addr != end); return err; } + +/** + * struct page_range_apply - Closure structure for apply_to_page_range() + * @pter: The base closure structure we derive from + * @fn: The leaf pte function to call + * @data: The leaf pte function closure + */ +struct page_range_apply { + struct pfn_range_apply pter; + pte_fn_t fn; + void *data; +}; + +/* + * Callback wrapper to enable use of apply_to_pfn_range for + * the apply_to_page_range interface + */ +static int apply_to_page_range_wrapper(pte_t *pte, pgtable_t token, + unsigned long addr, + struct pfn_range_apply *pter) +{ + struct page_range_apply *pra = + container_of(pter, typeof(*pra), pter); + + return pra->fn(pte, token, addr, pra->data); +} + +/* + * Scan a region of virtual memory, filling in page tables as necessary + * and calling a provided function on each leaf page table. + * + * WARNING: Do not use this function unless you know exactly what you are + * doing. It is lacking support for huge pages and transparent huge pages. + */ +int apply_to_page_range(struct mm_struct *mm, unsigned long addr, + unsigned long size, pte_fn_t fn, void *data) +{ + struct page_range_apply pra = { + .pter = {.mm = mm, + .alloc = 1, + .ptefn = apply_to_page_range_wrapper }, + .fn = fn, + .data = data + }; + + return apply_to_pfn_range(&pra.pter, addr, size); +} EXPORT_SYMBOL_GPL(apply_to_page_range); /* @@ -2238,7 +2309,7 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { lock_page(page); @@ -2515,7 +2586,7 @@ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); - if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) + if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)) return ret; return finish_mkwrite_fault(vmf); } @@ -2536,7 +2607,8 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf) pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & - (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY)))) { put_page(vmf->page); return tmp; } @@ -3601,7 +3673,8 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf) unlock_page(vmf->page); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || - (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { + (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY)))) { put_page(vmf->page); return tmp; } |